From 9066f874175600048ab4caffc0931948ed7e0cba Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 27 Apr 2022 14:15:37 +0300 Subject: [PATCH 001/775] cr-dump: do not report success to logs if post-dump script failed It can be confusing to see error from post-dump action script and non zero return from criu though at the same time see "Dumping finished successfully" in log. I believe it is logical to consider post-dump action script as a part of "dump" process so fail in it means that the whole dump failed. Signed-off-by: Pavel Tikhomirov --- criu/cr-dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index f58701e5c..60e90baed 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2049,7 +2049,7 @@ static int cr_dump_finish(int ret) close_service_fd(CR_PROC_FD_OFF); close_image_dir(); - if (ret) { + if (ret || post_dump_ret) { pr_err("Dumping FAILED.\n"); } else { write_stats(DUMP_STATS); From b26e1fdbf7bbcb98a6fe8c2f922e4aaf38c54f12 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 8 May 2022 16:04:26 +0700 Subject: [PATCH 002/775] mem: Skip pre-dumping on hugetlb mappings As private hugetlb mappings are not pre-mapped, the content of them is restored in the restorer which cannot use page_read->read_pages. As a result, we cannot recursively read the content of pre-dumped image in the parent directory and use preadv to read the content from the last dumped image only. Therefore, it may freeze while restoring when the content of mapping is in pre-dumped image in parent directory. We need to skip pre-dumping on hugetlb mappings to resolve the issue. 
Suggested-by: Alexander Mikhalitsyn Signed-off-by: Bui Quang Minh --- criu/mem.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/criu/mem.c b/criu/mem.c index 136439518..ab86a1f6d 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -246,6 +246,12 @@ prep_dump_pages_args(struct parasite_ctl *ctl, struct vm_area_list *vma_area_lis */ if (vma_entry_is(vma->e, VMA_AREA_AIORING) && skip_non_trackable) continue; + /* + * We totally ignore MAP_HUGETLB on pre-dump. + * See also generate_vma_iovs() comment. + */ + if ((vma->e->flags & MAP_HUGETLB) && skip_non_trackable) + continue; if (vma->e->prot & PROT_READ) continue; @@ -402,7 +408,14 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str has_parent = false; } - if (vma_entry_is(vma->e, VMA_AREA_AIORING)) { + /* + * We want to completely ignore these VMA types on the pre-dump: + * 1. VMA_AREA_AIORING because it is not soft-dirty trackable (kernel writes) + * 2. MAP_HUGETLB mappings because they are not premapped and we can't use + * parent images from pre-dump stages. Instead, the content is restored from + * the parasite context using full memory image. + */ + if (vma_entry_is(vma->e, VMA_AREA_AIORING) || vma->e->flags & MAP_HUGETLB) { if (pre_dump) return 0; has_parent = false; From c830643d86cccc8dc901f048e4aa96d8c77fc696 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 8 May 2022 16:19:45 +0700 Subject: [PATCH 003/775] Revert "ci: skip new hugetlb maps09/maps10 tests for pre-dump" This reverts commit 37ea8c5fcfef2108800b6d53054f3a7c4f710752. 
Signed-off-by: Bui Quang Minh --- test/jenkins/criu-dedup.sh | 2 +- test/jenkins/criu-lazy-migration.sh | 2 +- test/jenkins/criu-lazy-pages.sh | 2 +- test/jenkins/criu-pre-dump.sh | 5 ++--- test/jenkins/criu-remote-lazy-pages.sh | 2 +- test/jenkins/criu-snap.sh | 4 ++-- 6 files changed, 8 insertions(+), 9 deletions(-) diff --git a/test/jenkins/criu-dedup.sh b/test/jenkins/criu-dedup.sh index 842d218bd..edb1b653d 100755 --- a/test/jenkins/criu-dedup.sh +++ b/test/jenkins/criu-dedup.sh @@ -4,7 +4,7 @@ set -e source `dirname $0`/criu-lib.sh prep -./test/zdtm.py run --all --keep-going --report report --parallel 4 -f h --pre 2 --dedup -x maps04 -x maps007 -x maps09 -x maps10 || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 -f h --pre 2 --dedup -x maps04 -x maps007 || fail # Additionally run these tests as they touch a lot of # memory and it makes sense to additionally check it diff --git a/test/jenkins/criu-lazy-migration.sh b/test/jenkins/criu-lazy-migration.sh index b23f31c79..02a212e0d 100755 --- a/test/jenkins/criu-lazy-migration.sh +++ b/test/jenkins/criu-lazy-migration.sh @@ -15,7 +15,7 @@ LAZY_MIGRATE_EXCLUDE="-x fifo_loop -x file_locks -x ptrace_sig -x overmount_file --lazy-migrate $LAZY_EXCLUDE $LAZY_MIGRATE_EXCLUDE || fail # During pre-dump + lazy-pages we leave VM_NOHUGEPAGE set -LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02 -x maps09 -x maps10" +LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02" # lazy restore from images with pre-dumps ./test/zdtm.py run --all --keep-going --report report --parallel 4 -f uns \ diff --git a/test/jenkins/criu-lazy-pages.sh b/test/jenkins/criu-lazy-pages.sh index f62912090..9ef721739 100755 --- a/test/jenkins/criu-lazy-pages.sh +++ b/test/jenkins/criu-lazy-pages.sh @@ -12,7 +12,7 @@ source `dirname $0`/criu-lazy-common.sh --lazy-pages $LAZY_EXCLUDE || fail # During pre-dump + lazy-pages we leave VM_NOHUGEPAGE set -LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02 -x maps09 -x maps10" +LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02" # lazy 
restore from images with pre-dumps ./test/zdtm.py run --all --keep-going --report report --parallel 4 \ diff --git a/test/jenkins/criu-pre-dump.sh b/test/jenkins/criu-pre-dump.sh index b2972d941..137f7c23f 100755 --- a/test/jenkins/criu-pre-dump.sh +++ b/test/jenkins/criu-pre-dump.sh @@ -5,6 +5,5 @@ set -e source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump -# FIXME: https://github.com/checkpoint-restore/criu/issues/1868 -./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 -x 'maps04' -x 'maps09' -x 'maps10' || fail -./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --page-server -x 'maps04' -x 'maps09' -x 'maps10' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 -x 'maps04' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --page-server -x 'maps04' || fail diff --git a/test/jenkins/criu-remote-lazy-pages.sh b/test/jenkins/criu-remote-lazy-pages.sh index 48787f3f6..1c677e333 100755 --- a/test/jenkins/criu-remote-lazy-pages.sh +++ b/test/jenkins/criu-remote-lazy-pages.sh @@ -12,7 +12,7 @@ source `dirname $0`/criu-lazy-common.sh --remote-lazy-pages $LAZY_EXCLUDE -x maps04 || fail # During pre-dump + lazy-pages we leave VM_NOHUGEPAGE set -LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02 -x maps09 -x maps10" +LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02" # lazy restore from "remote" dump with pre-dumps ./test/zdtm.py run --all --keep-going --report report --parallel 4 \ diff --git a/test/jenkins/criu-snap.sh b/test/jenkins/criu-snap.sh index d8fdf02b3..b08c57f52 100755 --- a/test/jenkins/criu-snap.sh +++ b/test/jenkins/criu-snap.sh @@ -5,5 +5,5 @@ set -e source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump -./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps -x 'maps04' -x 'maps09' -x 'maps10' || fail -./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps --page-server -x 'maps04' -x 'maps09' -x 'maps10' 
|| fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps -x 'maps04' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps --page-server -x 'maps04' || fail From ced4ab4b0a2db401108268a2c1232a0ef479871c Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 10 May 2022 20:40:53 +0300 Subject: [PATCH 004/775] zdtm: skip zdtm/static/shm-hugetlb when hugetlb is not supported Reported-by: Mr. Jenkins (ppc64le) Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/shm-hugetlb.checkskip | 4 ++++ 1 file changed, 4 insertions(+) create mode 100755 test/zdtm/static/shm-hugetlb.checkskip diff --git a/test/zdtm/static/shm-hugetlb.checkskip b/test/zdtm/static/shm-hugetlb.checkskip new file mode 100755 index 000000000..df2370815 --- /dev/null +++ b/test/zdtm/static/shm-hugetlb.checkskip @@ -0,0 +1,4 @@ +#!/bin/bash + +# will fail with EOPNOTSUPP +cat /proc/sys/vm/nr_hugepages &> /dev/null From 295dc85ca0d8d478889a34f296a1fb4aa844f2dc Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 31 May 2022 11:25:03 +0300 Subject: [PATCH 005/775] github: use git-clang-format instead of make indent This allows us to only detect bad formatting in PR changes but not all the CRIU codebase. 
Signed-off-by: Pavel Tikhomirov --- .github/workflows/lint.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c3886c707..d32403d05 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -9,7 +9,7 @@ jobs: image: registry.fedoraproject.org/fedora:latest steps: - name: Install tools - run: sudo dnf -y install git make python3-flake8 ShellCheck clang-tools-extra which findutils codespell + run: sudo dnf -y install git make python3-flake8 ShellCheck clang-tools-extra which findutils codespell git-clang-format - uses: actions/checkout@v2 @@ -22,7 +22,12 @@ jobs: - name: Run make indent run: > - make indent && + if [ -z "${{github.base_ref}}" ]; then + make indent + else + git fetch origin ${{github.base_ref}} && + git clang-format --style file --extensions c,h --quiet origin/${{github.base_ref}} + fi && STATUS=$(git status --porcelain) && if [ ! -z "$STATUS" ]; then echo "FAIL: some files are not correctly formatted."; From 28358db13ba453d1292104e117790da076c91721 Mon Sep 17 00:00:00 2001 From: Ashutosh Mehra Date: Mon, 30 May 2022 15:57:07 -0400 Subject: [PATCH 006/775] Fix the check for mnt namespace in criu-ns criu-ns script incorrectly compares the pidns fd with mntns fd. Also reversed the condition in is_my_namespace function to align it with the function name. 
Signed-off-by: Ashutosh Mehra --- scripts/criu-ns | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/criu-ns b/scripts/criu-ns index 9fc58b640..1217c3dcd 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -153,9 +153,9 @@ def _set_namespace(fd): raise OSError(_errno, errno.errorcode[_errno]) -def is_my_namespace(fd): +def is_my_namespace(fd, ns): """Returns True if fd refers to current namespace""" - return os.stat('/proc/self/ns/pid').st_ino != os.fstat(fd).st_ino + return os.stat('/proc/self/ns/%s' % ns).st_ino == os.fstat(fd).st_ino def set_pidns(tpid, pid_idx): @@ -165,7 +165,7 @@ def set_pidns(tpid, pid_idx): pid namespace. """ ns_fd = os.open('/proc/%s/ns/pid' % tpid, os.O_RDONLY) - if is_my_namespace(ns_fd): + if not is_my_namespace(ns_fd, "pid"): for line in open('/proc/%s/status' % tpid): if not line.startswith('NSpid:'): continue @@ -190,7 +190,7 @@ def set_mntns(tpid): will be the same in target mntns. """ ns_fd = os.open('/proc/%s/ns/mnt' % tpid, os.O_RDONLY) - if is_my_namespace(ns_fd): + if not is_my_namespace(ns_fd, "mnt"): root_st = os.stat('/') cwd_st = os.stat('.') cwd_path = os.path.realpath('.') From 5cd7092fda7c37e46a9ad606824ca20a0bef4ae3 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 9 Jun 2022 12:17:06 +0300 Subject: [PATCH 007/775] sk-unix: make add_fake_unix_queuers earlier and rework find_queuer_for Before this patch, if we had a unixsk with incoming scm packets (with fds) and with the sender side fd closed, we got an error: Error (criu/sk-unix.c:1125): unix: Can't find sender for 0x1e First part of the problem is that unix_note_scm_rights() expects to see a "queuer" which would send scm packets to the unixsk, and there is none as the sender side is closed. Second part of the problem is that we already have "fake" queuers feature so that it already creates a unix socket pair and leaves other end open for later queuing packets. 
But function add_fake_unix_queuers() is called after unix_note_scm_rights() thus there is no chance to find queuer at the point of failure. Third part is that when we look for a queuer in find_queuer_for() we actually look for a socket for which we are a queuer and not for the socket which is a queuer for us, which is opposite to the name. For cases where both ends are alive both are queuers for each other so this was not important, but for our closed sender case it breaks. So let's reorder add_fake_unix_queuers() before unix_note_scm_rights() and make find_queuer_for() actually do what its name implies. This situation started to reproduce on Virtuozzo start/stop tests with the unixsk belonging to systemd, we suppose that this state where the sender fd side is closed happens rarely only on systemd start/stop, so we don't see it in regular suspend resume of long-living containers. Signed-off-by: Pavel Tikhomirov --- criu/cr-restore.c | 8 ++++---- criu/sk-unix.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9853c0585..398faf048 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -351,6 +351,10 @@ static int root_prepare_shared(void) if (ret) goto err; + ret = add_fake_unix_queuers(); + if (ret) + goto err; + /* * This should be called with all packets collected AND all * fdescs and fles prepared BUT post-prep-s not run. 
@@ -367,10 +371,6 @@ static int root_prepare_shared(void) if (ret) goto err; - ret = add_fake_unix_queuers(); - if (ret) - goto err; - show_saved_files(); err: return ret; diff --git a/criu/sk-unix.c b/criu/sk-unix.c index c6021bc1f..47e1b2962 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -1021,8 +1021,8 @@ static struct unix_sk_info *find_queuer_for(int id) struct unix_sk_info *ui; list_for_each_entry(ui, &unix_sockets, list) { - if (ui->queuer && ui->queuer->ue->id == id) - return ui; + if (ui->queuer && ui->ue->id == id) + return ui->queuer; } return NULL; From b117b211ab6b265b017c0e0189c8b1cedca1522f Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 9 Jun 2022 17:48:37 +0300 Subject: [PATCH 008/775] zdtm/scm: add scm09 test with closed sender fd Signed-off-by: Pavel Tikhomirov --- test/zdtm/static/Makefile | 2 ++ test/zdtm/static/scm00.c | 3 +++ test/zdtm/static/scm09.c | 1 + 3 files changed, 6 insertions(+) create mode 120000 test/zdtm/static/scm09.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 4a93659d4..a3c1ccf4b 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -199,6 +199,7 @@ TST_NOFILE := \ scm04 \ scm05 \ scm06 \ + scm09 \ aio00 \ aio01 \ fd \ @@ -593,6 +594,7 @@ vdso01: LDLIBS += -lrt scm01: CFLAGS += -DKEEP_SENT_FD scm02: CFLAGS += -DSEND_BOTH scm04: CFLAGS += -DSEPARATE +scm09: CFLAGS += -DCLOSE_SENDER_FD mntns_link_remap: CFLAGS += -DZDTM_LINK_REMAP mntns_shared_bind02: CFLAGS += -DSHARED_BIND02 mntns_root_bind02: CFLAGS += -DROOT_BIND02 diff --git a/test/zdtm/static/scm00.c b/test/zdtm/static/scm00.c index d66975582..670e6fd6a 100644 --- a/test/zdtm/static/scm00.c +++ b/test/zdtm/static/scm00.c @@ -105,6 +105,9 @@ int main(int argc, char **argv) p[1] = p[0]; p[0] = -1; #endif +#endif +#ifdef CLOSE_SENDER_FD + close(sk[0]); #endif test_daemon(); diff --git a/test/zdtm/static/scm09.c b/test/zdtm/static/scm09.c new file mode 120000 index 000000000..4cab0edd2 --- /dev/null +++ 
b/test/zdtm/static/scm09.c @@ -0,0 +1 @@ +scm00.c \ No newline at end of file From dd0217976ce0f314a2be036d267993a6279f15a2 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 12 May 2022 21:44:00 +0100 Subject: [PATCH 009/775] amdgpu: Add gitignore Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/.gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 plugins/amdgpu/.gitignore diff --git a/plugins/amdgpu/.gitignore b/plugins/amdgpu/.gitignore new file mode 100644 index 000000000..4e5c8f58e --- /dev/null +++ b/plugins/amdgpu/.gitignore @@ -0,0 +1,3 @@ +*.pb-c.c +*.pb-c.h +test_topology_remap From 91e971c4d941547419c191b95e0afb9ac0dae11f Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Thu, 12 May 2022 22:40:54 +0700 Subject: [PATCH 010/775] hugetlb: don't dump anonymous private hugetlb mapping using memfd approach Currently, the content of anonymous private hugetlb mapping is dumped in 2 different images: memfd approach and normal private mapping dumping. In memfd approach, we dump the content of the backing pseudo file (/anon_hugepage). This is incorrect and redundant since the mapping is private, the content of backing file may differ from the content of the mapping. With this commit, we remove the redundant memfd approach dump and only do the normal private mapping dump on anonymous hugetlb mapping. Run zdtm.py run -f h --keep-img always -t zdtm/static/maps09, du -h in the dumped image directory Before this commit 13M test/dump/zdtm/static/maps09/55/1 After this commit 8.5M test/dump/zdtm/static/maps09/55/1 The reduction in size is approximately 4MB which is the size of anonymous private hugetlb mapping in the test. 
Signed-off-by: Bui Quang Minh --- criu/hugetlb.c | 13 +++++++++++++ criu/include/hugetlb.h | 6 ++++++ criu/proc_parse.c | 7 +++---- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/criu/hugetlb.c b/criu/hugetlb.c index aa98662d8..866c4050f 100644 --- a/criu/hugetlb.c +++ b/criu/hugetlb.c @@ -35,6 +35,19 @@ int is_hugetlb_dev(dev_t dev, int *hugetlb_size_flag) return 0; } +int can_dump_with_memfd_hugetlb(dev_t dev, int *hugetlb_size_flag, const char *file_path, struct vma_area *vma) +{ + /* + * Dump the hugetlb backed mapping using memfd_hugetlb when it is not + * anonymous private mapping. + */ + if (kdat.has_memfd_hugetlb && is_hugetlb_dev(dev, hugetlb_size_flag) && + !((vma->e->flags & MAP_PRIVATE) && !strncmp(file_path, ANON_HUGEPAGE_PREFIX, ANON_HUGEPAGE_PREFIX_LEN))) + return 1; + + return 0; +} + unsigned long get_size_from_hugetlb_flag(int flag) { int i; diff --git a/criu/include/hugetlb.h b/criu/include/hugetlb.h index c0e83652b..9aee5bed3 100644 --- a/criu/include/hugetlb.h +++ b/criu/include/hugetlb.h @@ -4,6 +4,11 @@ #include #include +#include "vma.h" + +#define ANON_HUGEPAGE_PREFIX "/anon_hugepage" +#define ANON_HUGEPAGE_PREFIX_LEN (sizeof(ANON_HUGEPAGE_PREFIX) - 1) + enum hugepage_size { HUGETLB_16KB, HUGETLB_64KB, @@ -46,6 +51,7 @@ struct htlb_info { extern struct htlb_info hugetlb_info[HUGETLB_MAX]; int is_hugetlb_dev(dev_t dev, int *hugetlb_size_flag); +int can_dump_with_memfd_hugetlb(dev_t dev, int *hugetlb_size_flag, const char *file_path, struct vma_area *vma); unsigned long get_size_from_hugetlb_flag(int flag); #ifndef MFD_HUGETLB diff --git a/criu/proc_parse.c b/criu/proc_parse.c index b3badb6e4..6b41a81db 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -620,17 +620,16 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat pr_info("path: %s\n", file_path); vma_area->e->status |= VMA_AREA_SYSVIPC; } else { - /* Dump shmem dev, hugetlb dev (private and share) mappings the same way as memfd - * 
when possible. + /* We dump memfd backed mapping, both normal and hugepage anonymous share + * mapping using memfd approach when possible. */ if (is_memfd(st_buf->st_dev) || is_anon_shmem_map(st_buf->st_dev) || - (kdat.has_memfd_hugetlb && is_hugetlb_dev(st_buf->st_dev, &hugetlb_flag))) { + can_dump_with_memfd_hugetlb(st_buf->st_dev, &hugetlb_flag, file_path, vma_area)) { vma_area->e->status |= VMA_AREA_MEMFD; vma_area->e->flags |= hugetlb_flag; if (fault_injected(FI_HUGE_ANON_SHMEM_ID)) vma_area->e->shmid += FI_HUGE_ANON_SHMEM_ID_BASE; } else if (is_hugetlb_dev(st_buf->st_dev, &hugetlb_flag)) { - /* hugetlb mapping but memfd does not support HUGETLB */ vma_area->e->flags |= hugetlb_flag; vma_area->e->flags |= MAP_ANONYMOUS; From 49caf85b20ae0570b4ffd6e537cb0a47ac0899c5 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 21 Jun 2022 12:17:52 +0300 Subject: [PATCH 011/775] config: fail on --track-mem option if dirty tracking is not available Else we trigger BUG in task_reset_dirty_track(): Error (criu/mem.c:45): BUG at criu/mem.c:45 The check in kerndat_get_dirty_track() does not work right. https://github.com/checkpoint-restore/criu/issues/1917 Reported-by: @mrc1119 Signed-off-by: Pavel Tikhomirov --- criu/config.c | 5 +++++ criu/kerndat.c | 4 ---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/criu/config.c b/criu/config.c index 14a11f9c3..4023d807c 100644 --- a/criu/config.c +++ b/criu/config.c @@ -1115,6 +1115,11 @@ int check_options(void) } } + if (opts.track_mem && !kdat.has_dirty_track) { + pr_err("Tracking memory is not available. 
Consider omitting --track-mem option.\n"); + return 1; + } + if (check_namespace_opts()) { pr_err("Error: namespace flags conflict\n"); return 1; diff --git a/criu/kerndat.c b/criu/kerndat.c index b8b6bc95d..bc5dccab1 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -420,10 +420,6 @@ static int kerndat_get_dirty_track(void) } else { no_dt: pr_info("Dirty tracking support is OFF\n"); - if (opts.track_mem) { - pr_err("Tracking memory is not available\n"); - return -1; - } } return 0; From c7858ba42bed2e072ef3842046f4a6335e70ee44 Mon Sep 17 00:00:00 2001 From: Yuriy Vasiliev Date: Thu, 20 Jan 2022 17:13:59 +0100 Subject: [PATCH 012/775] infect: add SIGTSTP support Add SIGTSTP signal dump and restore. Add a corresponding field in the image, save it only if a task is in the stopped state. Restore task state by sending desired stop signal if it is present in the image. Fallback to SIGSTOP if it's absent. Signed-off-by: Yuriy Vasiliev --- Documentation/compel.txt | 5 ++- compel/include/uapi/infect.h | 3 ++ compel/src/lib/infect.c | 87 +++++++++++++++++++++++++++++------- criu/cr-dump.c | 5 +++ criu/cr-restore.c | 11 ++++- criu/include/pid.h | 4 ++ criu/proc_parse.c | 15 ++++++- criu/pstree.c | 1 + criu/seize.c | 8 +++- images/core.proto | 2 + 10 files changed, 121 insertions(+), 20 deletions(-) diff --git a/Documentation/compel.txt b/Documentation/compel.txt index a44ca22c6..506228f59 100644 --- a/Documentation/compel.txt +++ b/Documentation/compel.txt @@ -97,7 +97,10 @@ Following steps are performed to infect the victim process: - execute system call: *int compel_syscall(ctl, int syscall_nr, long *ret, int arg ...);* - infect victim: *int compel_infect(ctl, nr_thread, size_of_args_area);* - cure the victim: *int compel_cure(ctl);* //ctl pointer is freed by this call - - Resume victim: *int compel_resume_task(pid, orig_state, state);* + - Resume victim: *int compel_resume_task(pid, orig_state, state)* or + *int compel_resume_task_sig(pid, orig_state, state, 
stop_signo).* + //compel_resume_task_sig() could be used in case when victim is in stopped state. + stop_signo could be read by calling compel_parse_stop_signo(). *ctl* must be configured with blob information by calling *PREFIX_setup_c_header()*, with ctl as its argument. *PREFIX* is the argument given to *-p* when calling hgen, else it is deduced from file name. diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 3040a67a7..7073f343f 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -18,6 +18,7 @@ extern int __must_check compel_interrupt_task(int pid); struct seize_task_status { unsigned long long sigpnd; unsigned long long shdpnd; + unsigned long long sigblk; char state; int vpid; int ppid; @@ -30,7 +31,9 @@ extern int __must_check compel_wait_task(int pid, int ppid, struct seize_task_status *st, void *data); extern int __must_check compel_stop_task(int pid); +extern int __must_check compel_parse_stop_signo(int pid); extern int compel_resume_task(pid_t pid, int orig_state, int state); +extern int compel_resume_task_sig(pid_t pid, int orig_state, int state, int stop_signo); struct parasite_ctl; struct parasite_thread_ctl; diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index c78c02a6a..b99f23b36 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -92,6 +92,12 @@ static int parse_pid_status(int pid, struct seize_task_status *ss, void *data) continue; } + if (!strncmp(aux, "SigBlk:", 7)) { + if (sscanf(aux + 7, "%llx", &ss->sigblk) != 1) + goto err_parse; + + continue; + } } fclose(f); @@ -186,6 +192,29 @@ static int skip_sigstop(int pid, int nr_signals) return 0; } +#define SIG_MASK(sig) (1ULL << ((sig)-1)) + +#define SIG_IN_MASK(sig, mask) ((sig) > 0 && (sig) <= SIGMAX && (SIG_MASK(sig) & (mask))) + +#define SUPPORTED_STOP_MASK ((1ULL << (SIGSTOP - 1)) | (1ULL << (SIGTSTP - 1))) + +static inline int sig_stop(int sig) +{ + return SIG_IN_MASK(sig, SUPPORTED_STOP_MASK); +} + 
+int compel_parse_stop_signo(int pid) +{ + siginfo_t si; + + if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &si) < 0) { + pr_perror("SEIZE %d: can't parse stopped siginfo", pid); + return -1; + } + + return si.si_signo; +} + /* * This routine seizes task putting it into a special * state where we can manipulate the task via ptrace @@ -198,7 +227,7 @@ int compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_ void *data) { siginfo_t si; - int status, nr_sigstop; + int status, nr_stopsig; int ret = 0, ret2, wait_errno = 0; /* @@ -291,17 +320,32 @@ try_again: goto err; } - nr_sigstop = 0; - if (ss->sigpnd & (1 << (SIGSTOP - 1))) - nr_sigstop++; - if (ss->shdpnd & (1 << (SIGSTOP - 1))) - nr_sigstop++; - if (si.si_signo == SIGSTOP) - nr_sigstop++; + nr_stopsig = 0; + if (SIG_IN_MASK(SIGSTOP, ss->sigpnd)) + nr_stopsig++; + if (SIG_IN_MASK(SIGSTOP, ss->shdpnd)) + nr_stopsig++; - if (nr_sigstop) { - if (skip_sigstop(pid, nr_sigstop)) - goto err_stop; + if (SIG_IN_MASK(SIGTSTP, ss->sigpnd) && !SIG_IN_MASK(SIGTSTP, ss->sigblk)) + nr_stopsig++; + if (SIG_IN_MASK(SIGTSTP, ss->shdpnd) && !SIG_IN_MASK(SIGTSTP, ss->sigblk)) + nr_stopsig++; + + if (sig_stop(si.si_signo)) + nr_stopsig++; + + if (nr_stopsig) { + if (skip_sigstop(pid, nr_stopsig)) { + /* + * Make sure that the task is stopped by a supported stop signal and + * send it again to restore task state before criu intervention. 
+ */ + if (sig_stop(si.si_signo)) + kill(pid, si.si_signo); + else + kill(pid, SIGSTOP); + goto err; + } return COMPEL_TASK_STOPPED; } @@ -313,8 +357,6 @@ try_again: goto err; } -err_stop: - kill(pid, SIGSTOP); err: if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) pr_perror("Unable to detach from %d", pid); @@ -322,6 +364,11 @@ err: } int compel_resume_task(pid_t pid, int orig_st, int st) +{ + return compel_resume_task_sig(pid, orig_st, st, SIGSTOP); +} + +int compel_resume_task_sig(pid_t pid, int orig_st, int st, int stop_signo) { int ret = 0; @@ -345,8 +392,18 @@ int compel_resume_task(pid_t pid, int orig_st, int st) * task with STOP in queue that would get lost after * detach, so stop it again. */ - if (orig_st == COMPEL_TASK_STOPPED) - kill(pid, SIGSTOP); + if (orig_st == COMPEL_TASK_STOPPED) { + /* + * Check that stop_signo contain supported stop signal. + * If it isn't, then send SIGSTOP. It makes sense in the case + * when we get COMPEL_TASK_STOPPED from old image, + * where stop_signo was not yet supported. 
+ */ + if (sig_stop(stop_signo)) + kill(pid, stop_signo); + else + kill(pid, SIGSTOP); + } } else { pr_err("Unknown final state %d\n", st); ret = -1; diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 60e90baed..e60da88ed 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -781,6 +781,11 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item core->thread_core->creds->lsm_profile = dmpi(item)->thread_lsms[0]->profile; core->thread_core->creds->lsm_sockcreate = dmpi(item)->thread_lsms[0]->sockcreate; + if (core->tc->task_state == TASK_STOPPED) { + core->tc->has_stop_signo = true; + core->tc->stop_signo = item->pid->stop_signo; + } + ret = parasite_dump_thread_leader_seized(ctl, pid, core); if (ret) goto err; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 398faf048..279246c19 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1350,6 +1350,9 @@ static inline int fork_with_pid(struct pstree_item *item) item->pid->state = ca.core->tc->task_state; rsti(item)->cg_set = ca.core->tc->cg_set; + if (ca.core->tc->has_stop_signo) + item->pid->stop_signo = ca.core->tc->stop_signo; + if (item->pid->state != TASK_DEAD && !task_alive(item)) { pr_err("Unknown task state %d\n", item->pid->state); return -1; @@ -2104,8 +2107,14 @@ static void finalize_restore(void) xfree(ctl); - if ((item->pid->state == TASK_STOPPED) || (opts.final_state == TASK_STOPPED)) + if (opts.final_state == TASK_STOPPED) kill(item->pid->real, SIGSTOP); + else if (item->pid->state == TASK_STOPPED) { + if (item->pid->stop_signo > 0) + kill(item->pid->real, item->pid->stop_signo); + else + kill(item->pid->real, SIGSTOP); + } } } diff --git a/criu/include/pid.h b/criu/include/pid.h index 49cb2d322..b2b7a361a 100644 --- a/criu/include/pid.h +++ b/criu/include/pid.h @@ -31,6 +31,10 @@ struct pid { pid_t real; int state; /* TASK_XXX constants */ + /* If an item is in stopped state it has a signal number + * that caused task to stop. 
+ */ + int stop_signo; /* * The @virt pid is one which used in the image itself and keeps diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 6b41a81db..946b0fc40 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -1027,12 +1027,13 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) cr->s.sigpnd = 0; cr->s.shdpnd = 0; + cr->s.sigblk = 0; cr->s.seccomp_mode = SECCOMP_MODE_DISABLED; if (bfdopenr(&f)) return -1; - while (done < 13) { + while (done < 14) { str = breadline(&f); if (str == NULL) break; @@ -1143,13 +1144,23 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) goto err_parse; cr->s.sigpnd |= sigpnd; + done++; + continue; + } + if (!strncmp(str, "SigBlk:", 7)) { + unsigned long long sigblk = 0; + + if (sscanf(str + 7, "%llx", &sigblk) != 1) + goto err_parse; + cr->s.sigblk |= sigblk; + done++; continue; } } /* seccomp and nspids are optional */ - expected_done = (parsed_seccomp ? 11 : 10); + expected_done = (parsed_seccomp ? 12 : 11); if (kdat.has_nspid) expected_done++; if (done == expected_done) diff --git a/criu/pstree.c b/criu/pstree.c index f4d77b3a4..72c4a3502 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -222,6 +222,7 @@ struct pstree_item *__alloc_pstree_item(bool rst) item->pid->ns[0].virt = -1; item->pid->real = -1; item->pid->state = TASK_UNDEF; + item->pid->stop_signo = -1; item->born_sid = -1; item->pid->item = item; futex_init(&item->task_st); diff --git a/criu/seize.c b/criu/seize.c index 58564ca74..1333d6db9 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -615,6 +615,9 @@ static int collect_children(struct pstree_item *item) else processes_to_wait--; + if (ret == TASK_STOPPED) + c->pid->stop_signo = compel_parse_stop_signo(pid); + c->pid->real = pid; c->parent = item; c->pid->state = ret; @@ -646,7 +649,7 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) * the item->state is the state task was in when we seized one. 
*/ - compel_resume_task(item->pid->real, item->pid->state, st); + compel_resume_task_sig(item->pid->real, item->pid->state, st, item->pid->stop_signo); if (st == TASK_DEAD) return; @@ -950,6 +953,9 @@ int collect_pstree(void) else processes_to_wait--; + if (ret == TASK_STOPPED) + root_item->pid->stop_signo = compel_parse_stop_signo(pid); + pr_info("Seized task %d, state %d\n", pid, ret); root_item->pid->state = ret; diff --git a/images/core.proto b/images/core.proto index 35079f366..345bdca53 100644 --- a/images/core.proto +++ b/images/core.proto @@ -60,6 +60,8 @@ message task_core_entry { // Reserved for container relative start time //optional uint64 start_time = 19; optional uint64 blk_sigset_extended = 20[(criu).hex = true]; + + optional uint32 stop_signo = 21; } message task_kobj_ids_entry { From 6cef6e726a6a844bb5dea6ab3841bf30b9dba122 Mon Sep 17 00:00:00 2001 From: Yuriy Vasiliev Date: Tue, 18 Jan 2022 14:35:55 +0100 Subject: [PATCH 013/775] zdtm: add tests for SIGTSTP stopped03 check that stopped by SIGTSTP tasks are restored correctly. stopped04 check that stopped by SIGSTOP tasks which have blocked SIGTSTP and have SIGTSTP pending are restored correctly. 
Signed-off-by: Yuriy Vasiliev --- test/zdtm/static/Makefile | 2 + test/zdtm/static/stopped03.c | 161 +++++++++++++++++++++++++++++++++++ test/zdtm/static/stopped04.c | 135 +++++++++++++++++++++++++++++ 3 files changed, 298 insertions(+) create mode 100644 test/zdtm/static/stopped03.c create mode 100644 test/zdtm/static/stopped04.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index a3c1ccf4b..5a8a5f75c 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -184,6 +184,8 @@ TST_NOFILE := \ stopped01 \ stopped02 \ stopped12 \ + stopped03 \ + stopped04 \ rtc \ clean_mntns \ mntns_rw_ro_rw \ diff --git a/test/zdtm/static/stopped03.c b/test/zdtm/static/stopped03.c new file mode 100644 index 000000000..85c7177f7 --- /dev/null +++ b/test/zdtm/static/stopped03.c @@ -0,0 +1,161 @@ +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check, that stopped by SIGTSTP tasks are restored correctly"; +const char *test_author = "Yuriy Vasiliev "; + +#define STOP_SIGNO SIGTSTP +const char *stop_sigstr = "SIGTSTP"; +enum { + FUTEX_INITIALIZED = 0, + TEST_CRIU, + TEST_CHECK, + TEST_DONE, + TEST_EXIT, + TEST_EMERGENCY_ABORT, +}; + +struct shared { + futex_t fstate; + int status; + int code; +} * sh; + +static int new_pgrp(void) +{ + siginfo_t infop; + int ret = 1; + pid_t pid; + + /* + * Set the PGID to avoid creating an orphaned process group, + * which is not to be affected by terminal-generated stop signals. 
+ */ + setpgid(0, 0); + + pid = test_fork(); + if (pid < 0) + goto err_cr; + + if (pid == 0) { + /* wait for TEST_EXIT or TEST_EMERGENCY_ABORT*/ + futex_wait_while_lt(&sh->fstate, TEST_EXIT); + exit(0); + } + + if (kill(pid, STOP_SIGNO)) { + pr_perror("Unable to send %s", stop_sigstr); + goto err_cr; + } + + if (waitid(P_PID, pid, &infop, WNOWAIT | WSTOPPED) < 0) { + pr_perror("Unable to waitid %d", pid); + goto err_cont; + } + + sh->code = infop.si_code; + sh->status = infop.si_status; + + /* Return the control back to MAIN worker to do C/R */ + futex_set_and_wake(&sh->fstate, TEST_CRIU); + futex_wait_while_lt(&sh->fstate, TEST_CHECK); + + infop.si_code = 0; + infop.si_status = 0; + + if (waitid(P_PID, pid, &infop, WNOWAIT | WSTOPPED) < 0) { + pr_perror("Unable to waitid %d", pid); + goto err_cont; + } + + sh->code = infop.si_code; + sh->status = infop.si_status; + + futex_set_and_wake(&sh->fstate, TEST_DONE); + futex_wait_while_lt(&sh->fstate, TEST_EXIT); + + ret = 0; +err_cont: + kill(pid, SIGCONT); +err_cr: + if (ret) + futex_set_and_wake(&sh->fstate, TEST_EMERGENCY_ABORT); + if (pid > 0) + wait(NULL); + + return ret; +} + +int main(int argc, char **argv) +{ + int fail = 0; + pid_t pid; + + test_init(argc, argv); + + sh = mmap(NULL, sizeof(struct shared), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (sh == MAP_FAILED) { + pr_perror("Failed to alloc shared region"); + return 1; + } + + futex_set(&sh->fstate, FUTEX_INITIALIZED); + + pid = test_fork(); + if (pid < 0) { + fail = 1; + goto out; + } + + if (pid == 0) + exit(new_pgrp()); + + /* Wait until pgrp is ready to C/R */ + futex_wait_while_lt(&sh->fstate, TEST_CRIU); + if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) { + pr_err("Fail in child worker before C/R\n"); + fail = 1; + goto out; + } + + if (sh->code != CLD_STOPPED || sh->status != STOP_SIGNO) { + pr_err("Process is not in correct state before C/R." + " Expected stop signo: %d. 
Get stop signo: %d\n", + STOP_SIGNO, sh->status); + fail = 1; + goto out; + } + + test_daemon(); + test_waitsig(); + + futex_set_and_wake(&sh->fstate, TEST_CHECK); + futex_wait_while_lt(&sh->fstate, TEST_DONE); + if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) { + pr_err("Fail in child worker after C/R\n"); + goto out; + } + + if (sh->code != CLD_STOPPED || sh->status != STOP_SIGNO) { + fail = 1; + pr_err("Process is not in correct state after C/R." + " Expected stop signo: %d. Get stop signo: %d\n", + STOP_SIGNO, sh->status); + } + + if (!fail) + pass(); + + futex_set_and_wake(&sh->fstate, TEST_EXIT); +out: + if (pid > 0) + wait(NULL); + + munmap(sh, sizeof(struct shared)); + + return fail; +} diff --git a/test/zdtm/static/stopped04.c b/test/zdtm/static/stopped04.c new file mode 100644 index 000000000..237094ca4 --- /dev/null +++ b/test/zdtm/static/stopped04.c @@ -0,0 +1,135 @@ +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check, that stopped by SIGTSTP tasks are restored correctly"; +const char *test_author = "Yuriy Vasiliev "; + +const char *stop_sigstr = "SIGTSTP"; +enum { + FUTEX_INITIALIZED = 0, + TEST_CRIU, + TEST_DONE, + TEST_EXIT, + TEST_EMERGENCY_ABORT, +}; + +struct shared { + futex_t fstate; + int status; + int code; +} * sh; + +static int new_pgrp(void) +{ + sigset_t sigset; + siginfo_t infop; + int ret = 1; + pid_t pid; + + /* + * Set the PGID to avoid creating an orphaned process group, + * which is not to be affected by terminal-generated stop signals. 
+ */ + setpgid(0, 0); + + sigemptyset(&sigset); + sigaddset(&sigset, SIGTSTP); + sigprocmask(SIG_BLOCK, &sigset, NULL); + + pid = test_fork(); + if (pid < 0) + goto err_cr; + + if (pid == 0) { + /* wait for TEST_EXIT or TEST_EMERGENCY_ABORT*/ + futex_wait_while_lt(&sh->fstate, TEST_EXIT); + exit(0); + } + + if (kill(pid, SIGSTOP)) { + pr_perror("Unable to send %s", stop_sigstr); + goto err_cr; + } + + if (waitid(P_PID, pid, &infop, WNOWAIT | WSTOPPED) < 0) { + pr_perror("Unable to waitid %d", pid); + goto err_cont; + } + + if (kill(pid, SIGTSTP)) { + pr_perror("Unable to send %s", stop_sigstr); + goto err_cr; + } + + /* Return the control back to MAIN worker to do C/R */ + futex_set_and_wake(&sh->fstate, TEST_CRIU); + futex_wait_while_lt(&sh->fstate, TEST_EXIT); + + ret = 0; +err_cont: + kill(pid, SIGCONT); +err_cr: + if (ret) + futex_set_and_wake(&sh->fstate, TEST_EMERGENCY_ABORT); + if (pid > 0) + wait(NULL); + + return ret; +} + +int main(int argc, char **argv) +{ + int fail = 0; + pid_t pid; + + test_init(argc, argv); + + sh = mmap(NULL, sizeof(struct shared), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (sh == MAP_FAILED) { + pr_perror("Failed to alloc shared region"); + return 1; + } + + futex_set(&sh->fstate, FUTEX_INITIALIZED); + + pid = test_fork(); + if (pid < 0) { + fail = 1; + goto out; + } + + if (pid == 0) + exit(new_pgrp()); + + /* Wait until pgrp is ready to C/R */ + futex_wait_while_lt(&sh->fstate, TEST_CRIU); + if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) { + pr_err("Fail in child worker before C/R\n"); + fail = 1; + goto out; + } + + test_daemon(); + test_waitsig(); + + if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) { + pr_err("Fail in child worker after C/R\n"); + goto out; + } + + if (!fail) + pass(); + + futex_set_and_wake(&sh->fstate, TEST_EXIT); +out: + if (pid > 0) + wait(NULL); + + munmap(sh, sizeof(struct shared)); + + return fail; +} From 18fba412551df519ee41943e9e3967b039d1427b Mon Sep 17 00:00:00 2001 
From: Younes Manton Date: Mon, 30 May 2022 17:34:20 +0000 Subject: [PATCH 014/775] config/files-reg: Add opt to skip file r/w/x check on restore A file's r/w/x changing between checkpoint and restore does not necessarily imply that something is wrong. For example, if a process opens a file having perms rw- for reading and we change the perms to r--, the process can be restored and will function as expected. Therefore, this patch adds an option --skip-file-rwx-check to disable this check on restore. File validation is unaffected and should still function as expected with respect to the content of files. Signed-off-by: Younes Manton --- Documentation/criu.txt | 3 +++ criu/config.c | 1 + criu/cr-service.c | 3 +++ criu/crtools.c | 3 +++ criu/files-reg.c | 18 +++++++++++++++--- criu/include/cr_options.h | 1 + images/rpc.proto | 1 + lib/c/criu.c | 11 +++++++++++ lib/c/criu.h | 2 ++ 9 files changed, 40 insertions(+), 3 deletions(-) diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 8b128f63e..8d2e91443 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -668,6 +668,9 @@ The 'mode' may be one of the following: build-ID cannot be obtained, 'chksm-first' method will be used. This is the default if mode is unspecified. +*--skip-file-rwx-check*:: + Skip checking file permissions (r/w/x for u/g/o) on restore. 
+ *check* ~~~~~~~ Checks whether the kernel supports the features needed by *criu* to diff --git a/criu/config.c b/criu/config.c index 4023d807c..24c445c8b 100644 --- a/criu/config.c +++ b/criu/config.c @@ -696,6 +696,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "cgroup-yard", required_argument, 0, 1096 }, { "pre-dump-mode", required_argument, 0, 1097 }, { "file-validation", required_argument, 0, 1098 }, + BOOL_OPT("skip-file-rwx-check", &opts.skip_file_rwx_check), { "lsm-mount-context", required_argument, 0, 1099 }, { "network-lock", required_argument, 0, 1100 }, BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode), diff --git a/criu/cr-service.c b/criu/cr-service.c index a6eb9ebd3..1d9f0aca3 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -464,6 +464,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_shell_job) opts.shell_job = req->shell_job; + if (req->has_skip_file_rwx_check) + opts.skip_file_rwx_check = req->skip_file_rwx_check; + if (req->has_file_locks) opts.handle_file_locks = req->file_locks; diff --git a/criu/crtools.c b/criu/crtools.c index cc8d9179f..8bcbe8e38 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -504,6 +504,9 @@ usage: " --file-validation METHOD\n" " pass the validation method to be used; argument\n" " can be 'filesize' or 'buildid' (default).\n" + " --skip-file-rwx-check\n" + " Skip checking file permissions\n" + " (r/w/x for u/g/o) on restore.\n" "\n" "Check options:\n" " Without options, \"criu check\" checks availability of absolutely required\n" diff --git a/criu/files-reg.c b/criu/files-reg.c index 0249063c2..ce8788637 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -2199,9 +2199,21 @@ ext: if (!validate_file(tmp, &st, rfi)) goto err; - if (rfi->rfe->has_mode && (st.st_mode != rfi->rfe->mode)) { - pr_err("File %s has bad mode 0%o (expect 0%o)\n", rfi->path, (int)st.st_mode, rfi->rfe->mode); - goto err; + if (rfi->rfe->has_mode) { + mode_t curr_mode 
= st.st_mode; + mode_t saved_mode = rfi->rfe->mode; + + if (opts.skip_file_rwx_check) { + curr_mode &= ~(S_IRWXU | S_IRWXG | S_IRWXO); + saved_mode &= ~(S_IRWXU | S_IRWXG | S_IRWXO); + } + + if (curr_mode != saved_mode) { + pr_err("File %s has bad mode 0%o (expect 0%o)\n" + "File r/w/x checks can be skipped with the --skip-file-rwx-check option\n", + rfi->path, (int)curr_mode, saved_mode); + goto err; + } } /* diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index bf1a762cc..e544a2d9a 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -179,6 +179,7 @@ struct cr_options { bool lazy_pages; char *work_dir; int network_lock_method; + int skip_file_rwx_check; /* * When we scheduler for removal some functionality we first diff --git a/images/rpc.proto b/images/rpc.proto index a6cc5da48..3cf431639 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -138,6 +138,7 @@ message criu_opts { optional string lsm_mount_context = 63; optional criu_network_lock_method network_lock = 64 [default = IPTABLES]; optional bool mntns_compat_mode = 65; + optional bool skip_file_rwx_check = 66; /* optional bool check_mounts = 128; */ } diff --git a/lib/c/criu.c b/lib/c/criu.c index 7807d7bc5..8171f7a12 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -555,6 +555,17 @@ void criu_set_shell_job(bool shell_job) criu_local_set_shell_job(global_opts, shell_job); } +void criu_local_set_skip_file_rwx_check(criu_opts *opts, bool skip_file_rwx_check) +{ + opts->rpc->has_skip_file_rwx_check = true; + opts->rpc->skip_file_rwx_check = skip_file_rwx_check; +} + +void criu_set_skip_file_rwx_check(bool skip_file_rwx_check) +{ + criu_local_set_skip_file_rwx_check(global_opts, skip_file_rwx_check); +} + void criu_local_set_orphan_pts_master(criu_opts *opts, bool orphan_pts_master) { opts->rpc->has_orphan_pts_master = true; diff --git a/lib/c/criu.h b/lib/c/criu.h index 7cc6a199c..c32a8a646 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -78,6 +78,7 @@ void 
criu_set_tcp_close(bool tcp_close); void criu_set_weak_sysctls(bool val); void criu_set_evasive_devices(bool evasive_devices); void criu_set_shell_job(bool shell_job); +void criu_set_skip_file_rwx_check(bool skip_file_rwx_check); void criu_set_orphan_pts_master(bool orphan_pts_master); void criu_set_file_locks(bool file_locks); void criu_set_track_mem(bool track_mem); @@ -238,6 +239,7 @@ void criu_local_set_tcp_close(criu_opts *opts, bool tcp_close); void criu_local_set_weak_sysctls(criu_opts *opts, bool val); void criu_local_set_evasive_devices(criu_opts *opts, bool evasive_devices); void criu_local_set_shell_job(criu_opts *opts, bool shell_job); +void criu_local_set_skip_file_rwx_check(criu_opts *opts, bool skip_file_rwx_check); void criu_local_set_orphan_pts_master(criu_opts *opts, bool orphan_pts_master); void criu_local_set_file_locks(criu_opts *opts, bool file_locks); void criu_local_set_track_mem(criu_opts *opts, bool track_mem); From ad58553d904d457110dfaaad2328479961d439ad Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Fri, 3 Jun 2022 09:47:11 -0700 Subject: [PATCH 015/775] Add --skip-file-rwx-check opt test Add a simple test using tail to check that processes can't be restored by default when the r/w/x mode of an open file changes, unless --skip-file-rwx-check is used. 
Signed-off-by: Younes Manton --- scripts/ci/run-ci-tests.sh | 1 + test/Makefile | 2 +- test/others/skip-file-rwx-check/Makefile | 7 +++++ test/others/skip-file-rwx-check/run.sh | 37 ++++++++++++++++++++++++ 4 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 test/others/skip-file-rwx-check/Makefile create mode 100755 test/others/skip-file-rwx-check/run.sh diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 8d9de6e55..3760a65e3 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -260,6 +260,7 @@ if [ -n "$TRAVIS" ] || [ -n "$CIRCLECI" ]; then # Error (criu/tty.c:1014): tty: Don't have tty to inherit session from, aborting make -C test/others/shell-job/ run fi +make -C test/others/skip-file-rwx-check/ run make -C test/others/rpc/ run ./test/zdtm.py run -t zdtm/static/env00 --sibling diff --git a/test/Makefile b/test/Makefile index 8416b1961..e8fcffe3f 100644 --- a/test/Makefile +++ b/test/Makefile @@ -12,7 +12,7 @@ all: $(MAKE) zdtm-freezer .PHONY: all -TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job +TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job skip-file-rwx-check other: for t in $(TESTS); do \ diff --git a/test/others/skip-file-rwx-check/Makefile b/test/others/skip-file-rwx-check/Makefile new file mode 100644 index 000000000..419d592b7 --- /dev/null +++ b/test/others/skip-file-rwx-check/Makefile @@ -0,0 +1,7 @@ +.PHONY: run clean + +run: + ./run.sh + +clean: + rm -rf testfile *.img dump.log restore-expected-fail.log restore.log stats-dump stats-restore diff --git a/test/others/skip-file-rwx-check/run.sh b/test/others/skip-file-rwx-check/run.sh new file mode 100755 index 000000000..0803d78ec --- /dev/null +++ b/test/others/skip-file-rwx-check/run.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset +set -o pipefail +set -o xtrace + +source 
../env.sh + +make clean +touch testfile +chmod +w testfile +tail --follow testfile & +tailpid=$! +if ! "$criu" dump --tree=$tailpid --shell-job --verbosity=4 --log-file=dump.log +then + kill $tailpid + echo "Failed to dump process as expected" + echo FAIL + exit 1 +fi +chmod -w testfile +if "$criu" restore --restore-detached --shell-job --verbosity=4 --log-file=restore-expected-fail.log +then + kill $tailpid + echo "Unexpectedly restored process with reference to a file who's r/w/x perms changed when --skip-file-rwx-check option was not used" + echo FAIL + exit 1 +fi +if ! "$criu" restore --skip-file-rwx-check --restore-detached --shell-job --verbosity=4 --log-file=restore.log +then + echo "Failed to restore process with reference to a file who's r/w/x perms changed when --skip-file-rwx-check option was used" + echo FAIL + exit 1 +fi +kill $tailpid +echo PASS From e30d18f435e534f210cbaa3db0d97f0f586bdf0a Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 8 Jul 2022 12:36:57 +0000 Subject: [PATCH 016/775] rseq: fix headers conflict on Mariner GNU/Linux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. For some reason, the Mariner distribution headers do not correctly define the __GLIBC_HAVE_KERNEL_RSEQ compile-time constant. It remains undefined, but in fact the header files provide the corresponding rseq type declarations, which leads to a conflict. 2. Another issue is that they use uint*_t types instead of __u* types as in the original rseq.h.
This leads to compile time issues like this: format '%llx' expects argument of type 'long long unsigned int', but argument 5 has type 'uint64_t' {aka 'long unsigned int'} and we can't even replace %llx to %PRIx64 because it will break compilation on other distros (like Fedora) with analogical error: error: format ‘%lx’ expects argument of type ‘long unsigned int’, but argument 6 has type ‘__u64’ {aka ‘long long unsigned int’} Let's use our-own struct rseq copy fully equal to the kernel one, it's safe because this structure is a part of Linux Kernel ABI. Fixes #1934 Reported-by: Nikola Bojanic Signed-off-by: Alexander Mikhalitsyn --- Makefile.config | 3 ++- criu/cr-dump.c | 15 ++++++++------- criu/include/linux/rseq.h | 20 ++++++++++++++------ criu/include/pstree.h | 2 +- scripts/feature-tests.mak | 19 +++++++++++++++++++ 5 files changed, 44 insertions(+), 15 deletions(-) diff --git a/Makefile.config b/Makefile.config index d46d84f2d..d113e2246 100644 --- a/Makefile.config +++ b/Makefile.config @@ -78,7 +78,8 @@ export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ - SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW FSCONFIG MEMFD_CREATE OPENAT2 + SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW FSCONFIG MEMFD_CREATE \ + OPENAT2 NO_LIBC_RSEQ_DEFS # $1 - config name define gen-feature-test diff --git a/criu/cr-dump.c b/criu/cr-dump.c index e60da88ed..210f66232 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1039,7 +1039,7 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item) return 0; } -static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, struct rseq_cs *rseq_cs, +static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, struct criu_rseq_cs *rseq_cs, struct criu_rseq *rseq) { int ret; @@ -1070,10 +1070,11 @@ static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, st if (!rseq->rseq_cs) return 0; - ret = 
ptrace_peek_area(tid, rseq_cs, decode_pointer(rseq->rseq_cs), sizeof(struct rseq_cs)); + ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(rseq->rseq_cs), sizeof(struct criu_rseq_cs)); if (ret) { pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs struct\n", tid, - (unsigned long)rseq_cs, (unsigned long)rseq->rseq_cs, (unsigned long)sizeof(struct rseq_cs)); + (unsigned long)rseq_cs, (unsigned long)rseq->rseq_cs, + (unsigned long)sizeof(struct criu_rseq_cs)); return -1; } @@ -1088,7 +1089,7 @@ static int dump_thread_rseq(struct pstree_item *item, int i) CoreEntry *core = item->core[i]; RseqEntry **rseqep = &core->thread_core->rseq_entry; struct criu_rseq rseq = {}; - struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; + struct criu_rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; pid_t tid = item->threads[i].real; /* @@ -1154,7 +1155,7 @@ err: static int dump_task_rseq(pid_t pid, struct pstree_item *item) { int i; - struct rseq_cs *thread_rseq_cs; + struct criu_rseq_cs *thread_rseq_cs; /* if rseq() syscall isn't supported then nothing to dump */ if (!kdat.has_rseq) @@ -1179,7 +1180,7 @@ free_rseq: return -1; } -static bool task_in_rseq(struct rseq_cs *rseq_cs, uint64_t addr) +static bool task_in_rseq(struct criu_rseq_cs *rseq_cs, uint64_t addr) { return addr >= rseq_cs->start_ip && addr < rseq_cs->start_ip + rseq_cs->post_commit_offset; } @@ -1187,7 +1188,7 @@ static bool task_in_rseq(struct rseq_cs *rseq_cs, uint64_t addr) static int fixup_thread_rseq(struct pstree_item *item, int i) { CoreEntry *core = item->core[i]; - struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; + struct criu_rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; pid_t tid = item->threads[i].real; /* equivalent to (struct rseq)->rseq_cs is NULL */ diff --git a/criu/include/linux/rseq.h b/criu/include/linux/rseq.h index a47876e66..5ceefbf8e 100644 --- a/criu/include/linux/rseq.h +++ b/criu/include/linux/rseq.h @@ -9,7 +9,12 @@ #endif #endif -#ifndef 
__GLIBC_HAVE_KERNEL_RSEQ +#include +#include + +#include "common/config.h" + +#ifdef CONFIG_HAS_NO_LIBC_RSEQ_DEFS /* * linux/rseq.h * @@ -18,9 +23,6 @@ * Copyright (c) 2015-2018 Mathieu Desnoyers */ -#include -#include - enum rseq_cpu_id_state { RSEQ_CPU_ID_UNINITIALIZED = -1, RSEQ_CPU_ID_REGISTRATION_FAILED = -2, @@ -41,13 +43,20 @@ enum rseq_cs_flags { RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), }; +#endif /* CONFIG_HAS_NO_LIBC_RSEQ_DEFS */ +/* + * Let's use our own definition of struct rseq_cs because some distros + * (for example Mariner GNU/Linux) declares this structure their-own way. + * This makes trouble with inconsistency between printf formatters and + * struct rseq_cs field types. + */ /* * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always * contained within a single cache-line. It is usually declared as * link-time constant data. */ -struct rseq_cs { +struct criu_rseq_cs { /* Version of this structure. 
*/ __u32 version; /* enum rseq_cs_flags */ @@ -57,7 +66,6 @@ struct rseq_cs { __u64 post_commit_offset; __u64 abort_ip; } __attribute__((aligned(4 * sizeof(__u64)))); -#endif /* __GLIBC_HAVE_KERNEL_RSEQ */ /* * We have to have our own copy of struct rseq definition because diff --git a/criu/include/pstree.h b/criu/include/pstree.h index 8ae750e1a..1137046d4 100644 --- a/criu/include/pstree.h +++ b/criu/include/pstree.h @@ -63,7 +63,7 @@ struct dmp_info { struct parasite_ctl *parasite_ctl; struct parasite_thread_ctl **thread_ctls; uint64_t *thread_sp; - struct rseq_cs *thread_rseq_cs; + struct criu_rseq_cs *thread_rseq_cs; /* * Although we don't support dumping different struct creds in general, diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak index 592552cb8..014e893a8 100644 --- a/scripts/feature-tests.mak +++ b/scripts/feature-tests.mak @@ -196,3 +196,22 @@ int main(void) return 0; } endef + +define FEATURE_TEST_NO_LIBC_RSEQ_DEFS + +#ifdef __has_include +#if __has_include(\"sys/rseq.h\") +#include +#endif +#endif + +enum rseq_cpu_id_state { + RSEQ_CPU_ID_UNINITIALIZED = -1, + RSEQ_CPU_ID_REGISTRATION_FAILED = -2, +}; + +int main(void) +{ + return 0; +} +endef From c502d480f94231b4c8068ac90bbd7ab86612356e Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 10 May 2022 13:37:09 +0200 Subject: [PATCH 017/775] x86/compel/fault-inject: fixup mxcsr for PTRACE_SETFPREGS Error from: ./test/zdtm.py run -t zdtm/static/fpu00 --fault 134 -f h --norst (00.003111) Dumping GP/FPU registers for 56 (00.003121) Error (compel/arch/x86/src/lib/infect.c:310): Corrupting fpuregs for 56, seed 1651766595 (00.003125) Error (compel/arch/x86/src/lib/infect.c:314): Can't set FPU registers for 56: Invalid argument (00.003129) Error (compel/src/lib/infect.c:688): Can't obtain regs for thread 56 (00.003174) Error (criu/cr-dump.c:1564): Can't infect (pid: 56) with parasite See also: 145e9e0d8c6 ("x86/fpu: Fail ptrace() requests that try to set invalid MXCSR 
values") https://github.com/torvalds/linux/commit/145e9e0d8c6fada4a40f9fc65b34658077874d9c We decided to move from mxcsr cleaning up scheme and use mxcsr mask (0x0000ffbf) as kernel does. Thanks to Dmitry Safonov for pointing out. Tested-on: Intel(R) Xeon(R) CPU E3-1246 v3 @ 3.50GHz Reported-by: Mr. Jenkins Suggested-by: Dmitry Safonov Signed-off-by: Alexander Mikhalitsyn --- compel/arch/x86/src/lib/infect.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index 98e2512e7..c0e7a544a 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -245,6 +245,19 @@ static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) return 0; } +static inline void fixup_mxcsr(struct xsave_struct *xsave) +{ + /* + * Right now xsave->i387.mxcsr filled with the random garbage, + * let's make it valid by applying mask which allows all + * features, except the denormals-are-zero feature bit. + * + * See also fpu__init_system_mxcsr function: + * https://github.com/torvalds/linux/blob/8cb1ae19/arch/x86/kernel/fpu/init.c#L117 + */ + xsave->i387.mxcsr &= 0x0000ffbf; +} + /* See arch/x86/kernel/fpu/xstate.c */ static void validate_random_xstate(struct xsave_struct *xsave) { @@ -272,17 +285,6 @@ static void validate_random_xstate(struct xsave_struct *xsave) /* No reserved bits may be set */ memset(&hdr->reserved, 0, sizeof(hdr->reserved)); - - /* - * While using PTRACE_SETREGSET the kernel checks that - * "Reserved bits in MXCSR must be zero." - * if (mxcsr[0] & ~mxcsr_feature_mask) - * return -EINVAL; - * - * As the mxcsr_feature_mask depends on the CPU the easiest solution for - * this error injection test is to set mxcsr just to zero. - */ - xsave->i387.mxcsr = 0; } /* @@ -309,6 +311,8 @@ static int corrupt_extregs(pid_t pid) */ pr_err("Corrupting %s for %d, seed %u\n", use_xsave ? 
"xsave" : "fpuregs", pid, init_seed); + fixup_mxcsr(&ext_regs); + if (!use_xsave) { if (ptrace(PTRACE_SETFPREGS, pid, NULL, &ext_regs)) { pr_perror("Can't set FPU registers for %d", pid); From b30f3ee3d36ff5b2d386d7262340d9918f2199d1 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Thu, 21 Jul 2022 22:46:10 +0700 Subject: [PATCH 018/775] zdtm: Remove permission part check for skipping vsyscall vma Normally, vsyscall vma has VM_READ, VM_EXEC permission. However, when CONFIG_LEGACY_VSYSCALL_XONLY=y, that vma only has VM_EXEC. This commit removes the permission part when checking to skip vsyscall vma in x32 tests. Signed-off-by: Bui Quang Minh --- test/zdtm.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index c011c79c0..d264c4878 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -1651,6 +1651,15 @@ def get_visible_state(test): return files, maps, mounts +def has_vsyscall(maps): + vsyscall = u"ffffffffff600000-ffffffffff601000" + for i in maps: + if vsyscall in i: + return i + + return None + + def check_visible_state(test, state, opts): new = get_visible_state(test) @@ -1666,9 +1675,9 @@ def check_visible_state(test, state, opts): new_maps = new[1][pid] if os.getenv("COMPAT_TEST"): # the vsyscall vma isn't unmapped from x32 processes - vsyscall = u"ffffffffff600000-ffffffffff601000 r-xp" - if vsyscall in new_maps and vsyscall not in old_maps: - new_maps.remove(vsyscall) + entry = has_vsyscall(new_maps) + if entry and has_vsyscall(old_maps) is None: + new_maps.remove(entry) if old_maps != new_maps: print("%s: Old maps lost: %s" % (pid, old_maps - new_maps)) print("%s: New maps appeared: %s" % (pid, new_maps - old_maps)) From 70a9cd6fbf46d96b7bd49f38f3e013bd892a990e Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Thu, 21 Jul 2022 21:26:58 +0700 Subject: [PATCH 019/775] vdso-compat: Increase the reserved buffer for compat vdso On Arch Linux with 5.18.3-zen1-1-zen kernel, the vdso's size is 3 pages 
which exceeds the current 2-page reserved buffer. This commit simply increases the reserved buffer size to 4 pages. Fixes: https://github.com/checkpoint-restore/criu/issues/1916 Signed-off-by: Bui Quang Minh --- criu/vdso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/vdso.c b/criu/vdso.c index 1a51f1451..7de2fae78 100644 --- a/criu/vdso.c +++ b/criu/vdso.c @@ -479,7 +479,7 @@ out_close: return ret; } -#define COMPAT_VDSO_BUF_SZ (PAGE_SIZE * 2) +#define COMPAT_VDSO_BUF_SZ (PAGE_SIZE * 4) static int vdso_fill_compat_symtable(struct vdso_maps *native, struct vdso_maps *compat) { void *vdso_mmap; From d12e2364c412caca4c227c656a67b1b1404d6875 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 26 Jul 2022 13:20:58 +0300 Subject: [PATCH 020/775] zdtm: make root mount private in criu mntns If root mount in criu mntns is slave, it would be slave of host mount where criu is stored, so if someone mounts something in subdir of {criu-dir}/test/ on host while tests are running this mount can influence the test as it appears on top of root mount in criu mntns. 1) With mount-compat this mount can get into restored test mntns, which means wrong restore, as this mount was not there on dump. 2) With mount-v2 this mount would just fail container restore, as root container mount is mounted non-recursively to protect from unexpected mounts appearing after restore.
Signed-off-by: Pavel Tikhomirov --- test/zdtm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm.py b/test/zdtm.py index d264c4878..aefcb36a4 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -267,7 +267,7 @@ class ns_flavor: def init(self, l_bins, x_bins): subprocess.check_call( - ["mount", "--make-slave", "--bind", ".", self.root]) + ["mount", "--make-private", "--bind", ".", self.root]) self.root_mounted = True if not os.access(self.root + "/.constructed", os.F_OK): From 01b8d40ced03c38cb545bb83d568a5a951e0b609 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 27 Jul 2022 16:03:25 +0300 Subject: [PATCH 021/775] zdtm/mnt_root_ext: don't allow propagation from test mntns to criu mntns This test specifically wants to create external bind-mount of "/" from criu mntns to test mntns, and it wants "/" in criu mntns to be a shared mount so that "external" mount in the test mntns is its slave. This is to trigger specific dirname() resolution which happens only when sharing restore is involved for external mounts, and only if rootfs is involved. But initially I missed that when we create external mount in test's temporary mntns it creates a propagation in criu mntns on top of root mount. This mount may influence other tests restore as child mount in root mount converts to locked child mount in criu service mntns (for uns flavour) and when criu would restore root container mount it would fail with EINVAL on non recursive bind with locked children. To fix this mess we just need to prohibit propagating from tests temporary mntns to criu mntns by making mounts slave.
Fixes: #1941 Signed-off-by: Pavel Tikhomirov --- test/zdtm/static/mnt_root_ext.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/zdtm/static/mnt_root_ext.c b/test/zdtm/static/mnt_root_ext.c index 6a2eb068c..305e87262 100644 --- a/test/zdtm/static/mnt_root_ext.c +++ b/test/zdtm/static/mnt_root_ext.c @@ -51,6 +51,14 @@ int main(int argc, char **argv) return 1; } + /* + * Make mounts in temporary mntns slave, to prevent propagation to criu mntns + */ + if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL)) { + pr_perror("make rslave"); + return 1; + } + /* * Populate to the tests root host's rootfs subdir */ From fbded7978806d403727e4e19e597adb821d1ea31 Mon Sep 17 00:00:00 2001 From: Liang-Chun Chen Date: Tue, 26 Jul 2022 23:39:33 +0800 Subject: [PATCH 022/775] files-reg.c: modify the check of ghost_limit to support large sparse files files-reg.c checks whether the file size is larger than ghost_limit with st_size (in dump_ghost_remap), which can not deal with large ghost sparse file, since its actual file size is not the same as what st_size shows. Therefore, in this commit, I replace st_size with st_blocks, which shows the actual file size. (1 block = 512B), thus criu can deal with large ghost sparse file. Signed-off-by: Liang-Chun Chen --- criu/files-reg.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index ce8788637..c3761b5ed 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -29,6 +29,7 @@ * and checked. 
*/ #define BUILD_ID_MAP_SIZE 1048576 +#define ST_UNIT 512 #include "cr_options.h" #include "imgset.h" @@ -946,8 +947,8 @@ static int dump_ghost_remap(char *path, const struct stat *st, int lfd, u32 id, pr_info("Dumping ghost file for fd %d id %#x\n", lfd, id); - if (st->st_size > opts.ghost_limit) { - pr_err("Can't dump ghost file %s of %" PRIu64 " size, increase limit\n", path, st->st_size); + if (st->st_blocks * ST_UNIT > opts.ghost_limit) { + pr_err("Can't dump ghost file %s of %" PRIu64 " size, increase limit\n", path, st->st_blocks * ST_UNIT); return -1; } From 2d34b56024c232c721a20683f5e968f107287a23 Mon Sep 17 00:00:00 2001 From: Liang-Chun Chen Date: Wed, 27 Jul 2022 01:45:00 +0800 Subject: [PATCH 023/775] unlink_largefile.desc: remove crfail, since criu now can support unlink_largefile test In the past, the unlink_largefile test was expected to fail on a large ghost file. However, because it uses a sparse file, it now passes in current criu, since the large ghost sparse file issue was fixed. So the crfail flag of this test should be removed. Signed-off-by: Liang-Chun Chen --- test/zdtm/static/unlink_largefile.desc | 1 - 1 file changed, 1 deletion(-) delete mode 100644 test/zdtm/static/unlink_largefile.desc diff --git a/test/zdtm/static/unlink_largefile.desc b/test/zdtm/static/unlink_largefile.desc deleted file mode 100644 index ded89879a..000000000 --- a/test/zdtm/static/unlink_largefile.desc +++ /dev/null @@ -1 +0,0 @@ -{'flags': 'crfail'} From e62d541bde1ccd241f64034d28825b62a8bb2216 Mon Sep 17 00:00:00 2001 From: Liang-Chun Chen Date: Thu, 28 Jul 2022 13:09:29 +0800 Subject: [PATCH 024/775] zdtm: add two tests for large ghost sparse file ghost_holes_large00 is a test which creates a large ghost sparse file with 1GiB hole (pwrite can only handle 2GiB maximum on 32-bit system) and 8KiB data; criu should be able to handle this kind of situation. 
ghost_holes_large01 is a test which creates a large ghost sparse file with 1GiB hole and 2MiB data, since 2MiB is larger than the default ghost_limit(1MiB), criu should fail on this test. v2: fix overflow on 32-bit arch. Signed-off-by: Liang-Chun Chen --- test/zdtm/static/Makefile | 3 + test/zdtm/static/ghost_holes_large00.c | 152 ++++++++++++++++++++++ test/zdtm/static/ghost_holes_large01.c | 1 + test/zdtm/static/ghost_holes_large01.desc | 1 + 4 files changed, 157 insertions(+) create mode 100644 test/zdtm/static/ghost_holes_large00.c create mode 120000 test/zdtm/static/ghost_holes_large01.c create mode 100644 test/zdtm/static/ghost_holes_large01.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 5a8a5f75c..b28345400 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -306,6 +306,8 @@ TST_FILE = \ ghost_holes00 \ ghost_holes01 \ ghost_holes02 \ + ghost_holes_large00 \ + ghost_holes_large01 \ unlink_largefile \ mtime_mmap \ fifo \ @@ -609,6 +611,7 @@ unlink_fstat04: CFLAGS += -DUNLINK_FSTAT04 unlink_fstat041: CFLAGS += -DUNLINK_FSTAT041 -DUNLINK_FSTAT04 ghost_holes01: CFLAGS += -DTAIL_HOLE ghost_holes02: CFLAGS += -DHEAD_HOLE +ghost_holes_large01: CFLAGS += -DLIMIT sk-freebind-false: CFLAGS += -DZDTM_FREEBIND_FALSE selinux02: CFLAGS += -DUSING_SOCKCREATE stopped01: CFLAGS += -DZDTM_STOPPED_KILL diff --git a/test/zdtm/static/ghost_holes_large00.c b/test/zdtm/static/ghost_holes_large00.c new file mode 100644 index 000000000..1a9739f8e --- /dev/null +++ b/test/zdtm/static/ghost_holes_large00.c @@ -0,0 +1,152 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test ghost with one large hole(1GiB) in the middle"; +const char *test_author = "Liang-Chun Chen "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +/* Buffer that is suitable for data size */ +#ifdef LIMIT +#define BUFSIZE 1024 * 1024 +#else +#define 
BUFSIZE 4096 +#endif +static unsigned char buf[BUFSIZE]; + +#ifndef SEEK_DATA +#define SEEK_DATA 3 +#define SEEK_HOLE 4 +#endif + +#define DATA1_OFF 0 +#define HOLE_SIZE (1LL * 1 * 1024 * 1024 * 1024) +#define DATA2_OFF (BUFSIZE + HOLE_SIZE) +#define FILE_SIZE (2 * BUFSIZE + HOLE_SIZE) +#define ST_UNIT 512 + +int main(int argc, char **argv) +{ + int fd; + struct stat st; + uint32_t crc; + bool chk_hole = true; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink %s", filename); + goto failed; + } + + crc = ~0; + datagen(buf, BUFSIZE, &crc); + if (pwrite(fd, buf, BUFSIZE, DATA1_OFF) != BUFSIZE) { + pr_perror("can't write data1"); + goto failed; + } + + crc = ~0; + datagen(buf, BUFSIZE, &crc); + if (pwrite(fd, buf, BUFSIZE, DATA2_OFF) != BUFSIZE) { + pr_perror("can't write data2"); + goto failed; + } + + if (ftruncate(fd, FILE_SIZE)) { + pr_perror("Can't fixup file size"); + goto failed; + } + + if (lseek(fd, DATA1_OFF, SEEK_HOLE) != DATA1_OFF + BUFSIZE) { + test_msg("Won't check for hole\n"); + chk_hole = false; + } + + test_daemon(); + test_waitsig(); + + if (fstat(fd, &st) < 0) { + fail("can't stat after"); + goto failed; + } + + if (st.st_size != FILE_SIZE) { + fail("file size changed to %ld", (long)st.st_size); + goto failed; + } + + test_msg("file size OK\n"); + + if (st.st_blocks * ST_UNIT != 2 * BUFSIZE) { + fail("actual file size changed to %ld", (long)st.st_blocks * ST_UNIT); + goto failed; + } + + test_msg("actual file size OK\n"); + + /* Data 1 */ + if (pread(fd, buf, BUFSIZE, DATA1_OFF) != BUFSIZE) { + fail("pread1 fail"); + goto failed; + } + + crc = ~0; + if (datachk(buf, BUFSIZE, &crc)) { + fail("datachk1 fail"); + goto failed; + } + + test_msg("Data1 OK\n"); + + /* Data 2 */ + if (pread(fd, buf, BUFSIZE, DATA2_OFF) != BUFSIZE) { + fail("pread2 fail"); + goto failed; + } + + crc = ~0; + if 
(datachk(buf, BUFSIZE, &crc)) { + fail("datachk2 fail"); + goto failed; + } + + test_msg("Data2 OK\n"); + + /* Hole */ + if (chk_hole) { + if (lseek(fd, DATA1_OFF, SEEK_HOLE) != DATA1_OFF + BUFSIZE) { + fail("Begin of mid hole not found"); + goto failed; + } + if (lseek(fd, DATA1_OFF + BUFSIZE, SEEK_DATA) != DATA2_OFF) { + fail("End of mid hole not found"); + goto failed; + } + test_msg("Mid hole OK\n"); + } + + close(fd); + pass(); + return 0; + +failed: + close(fd); + return 1; +} diff --git a/test/zdtm/static/ghost_holes_large01.c b/test/zdtm/static/ghost_holes_large01.c new file mode 120000 index 000000000..1b90363d4 --- /dev/null +++ b/test/zdtm/static/ghost_holes_large01.c @@ -0,0 +1 @@ +ghost_holes_large00.c \ No newline at end of file diff --git a/test/zdtm/static/ghost_holes_large01.desc b/test/zdtm/static/ghost_holes_large01.desc new file mode 100644 index 000000000..8e6a476bd --- /dev/null +++ b/test/zdtm/static/ghost_holes_large01.desc @@ -0,0 +1 @@ +{'flags': 'crfail'} \ No newline at end of file From da4803beae6791d6430881ba6bd7aa4a5bb35524 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 4 Aug 2022 16:56:30 +0100 Subject: [PATCH 025/775] MAINTAINERS: Add Radostin (myself) to maintainers I've been contributing to CRIU for some time and I'm hoping that my familiarity with the project would be sufficient to self-nominate as a maintainer. I would like to help with code reviews, submitting patches, implementing new features, and maintaining the project in general. 
Signed-off-by: Radostin Stoyanov --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index bb153f1ab..7d53d0dc1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4,3 +4,4 @@ Mike Rapoport Dmitry Safonov <0x7f454c46@gmail.com> Adrian Reber Pavel Tikhomirov +Radostin Stoyanov From eb4ecb3cfd158b9c79af6f33c42c7a64137b2626 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 23 Jul 2022 18:23:34 +0100 Subject: [PATCH 026/775] ci: unset XDG_RUNTIME_DIR when invoking podman We need to pass environment variables from the CI environment to distinguish between CI environments. However, when `sudo -E` is used to run Podman it results in the XDG_RUNTIME_DIR environment variable being set incorrectly that prevents Podman from running. This patch fixes the following error in the GitHub Action virtual environment: error running container: error from /usr/bin/crun creating container for [/bin/sh -c /bin/prepare-for-fedora-rawhide.sh]: sd-bus call: Connection reset by peer Fixes: #1942 Signed-off-by: Radostin Stoyanov --- .github/workflows/fedora-rawhide-test.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/fedora-rawhide-test.yml b/.github/workflows/fedora-rawhide-test.yml index 00bc3b2bd..b6d94d23e 100644 --- a/.github/workflows/fedora-rawhide-test.yml +++ b/.github/workflows/fedora-rawhide-test.yml @@ -9,4 +9,8 @@ jobs: steps: - uses: actions/checkout@v2 - name: Run Fedora Rawhide Test - run: sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined" + # We need to pass environment variables from the CI environment to + # distinguish between CI environments. However, we need to make sure that + # XDG_RUNTIME_DIR environment variable is not set due to a bug in Podman. 
+ # FIXME: https://github.com/containers/podman/issues/14920 + run: sudo -E XDG_RUNTIME_DIR= make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined" From e8a6765d1ef16935a55fd688bb49e0535922f3f1 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 31 Jul 2022 16:07:30 +0000 Subject: [PATCH 027/775] criu: fix conflicting headers There are several changes in glibc 2.36 that make sys/mount.h header incompatible with kernel headers: https://sourceware.org/glibc/wiki/Release/2.36#Usage_of_.3Clinux.2Fmount.h.3E_and_.3Csys.2Fmount.h.3E This patch removes conflicting includes for `` and updates the content of `criu/include/linux/mount.h` to match `/usr/include/sys/mount.h`. In addition, inline definitions sys_*() functions have been moved from "linux/mount.h" to "syscall.h" to avoid conflicts with `uapi/compel/plugins/std/syscall.h` and ``. The include for `` has been replaced with local include to avoid conflicts with ``. Fixes: #1949 Signed-off-by: Radostin Stoyanov --- Makefile.config | 2 +- criu/cgroup.c | 1 + criu/cr-check.c | 2 +- criu/cr-restore.c | 3 ++- criu/include/aio.h | 2 +- criu/include/linux/aio_abi.h | 14 +++++++++++ criu/include/linux/mount.h | 48 +++++++++++++++++++----------------- criu/include/syscall.h | 17 +++++++++++++ criu/pie/parasite.c | 2 +- criu/util.c | 1 + scripts/feature-tests.mak | 13 ---------- 11 files changed, 64 insertions(+), 41 deletions(-) create mode 100644 criu/include/linux/aio_abi.h create mode 100644 criu/include/syscall.h diff --git a/Makefile.config b/Makefile.config index d113e2246..270ec61c0 100644 --- a/Makefile.config +++ b/Makefile.config @@ -78,7 +78,7 @@ export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ - SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW FSCONFIG MEMFD_CREATE \ + SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW MEMFD_CREATE \ OPENAT2 NO_LIBC_RSEQ_DEFS # $1 - config name 
diff --git a/criu/cgroup.c b/criu/cgroup.c index e05b0832e..325df6a1d 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -27,6 +27,7 @@ #include "images/cgroup.pb-c.h" #include "kerndat.h" #include "linux/mount.h" +#include "syscall.h" /* * This structure describes set of controller groups diff --git a/criu/cr-check.c b/criu/cr-check.c index f589a91da..0ca80192c 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -21,7 +21,6 @@ #include #include #include -#include #include "../soccr/soccr.h" @@ -52,6 +51,7 @@ #include "net.h" #include "restorer.h" #include "uffd.h" +#include "linux/aio_abi.h" #include "images/inventory.pb-c.h" diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 279246c19..d11d28173 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -22,7 +22,6 @@ #include #include "common/compiler.h" -#include "linux/mount.h" #include "linux/rseq.h" #include "clone-noasan.h" @@ -86,6 +85,8 @@ #include #include "compel/include/asm/syscall.h" +#include "linux/mount.h" + #include "protobuf.h" #include "images/sa.pb-c.h" #include "images/timer.pb-c.h" diff --git a/criu/include/aio.h b/criu/include/aio.h index d1655739d..38e704020 100644 --- a/criu/include/aio.h +++ b/criu/include/aio.h @@ -1,7 +1,7 @@ #ifndef __CR_AIO_H__ #define __CR_AIO_H__ -#include +#include "linux/aio_abi.h" #include "images/mm.pb-c.h" unsigned int aio_estimate_nr_reqs(unsigned int size); int dump_aio_ring(MmEntry *mme, struct vma_area *vma); diff --git a/criu/include/linux/aio_abi.h b/criu/include/linux/aio_abi.h new file mode 100644 index 000000000..d9ce78720 --- /dev/null +++ b/criu/include/linux/aio_abi.h @@ -0,0 +1,14 @@ +#ifndef __LINUX__AIO_ABI_H +#define __LINUX__AIO_ABI_H + +typedef __kernel_ulong_t aio_context_t; + +/* read() from /dev/aio returns these structures. 
*/ +struct io_event { + __u64 data; /* the data field from the iocb */ + __u64 obj; /* what iocb this event came from */ + __s64 res; /* result code for this event */ + __s64 res2; /* secondary result */ +}; + +#endif /* __LINUX__AIO_ABI_H */ diff --git a/criu/include/linux/mount.h b/criu/include/linux/mount.h index 9a3a28b10..0d55a588c 100644 --- a/criu/include/linux/mount.h +++ b/criu/include/linux/mount.h @@ -4,32 +4,34 @@ #include "common/config.h" #include "compel/plugins/std/syscall-codes.h" -#ifdef CONFIG_HAS_FSCONFIG -#include -#else +/* Copied from /usr/include/sys/mount.h */ + +#ifndef FSCONFIG_CMD_CREATE +/* The type of fsconfig call made. */ enum fsconfig_command { - FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ - FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ - FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ - FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ - FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ - FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ - FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ + FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ +#define FSCONFIG_SET_FLAG FSCONFIG_SET_FLAG + FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ +#define FSCONFIG_SET_STRING FSCONFIG_SET_STRING + FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ +#define FSCONFIG_SET_BINARY FSCONFIG_SET_BINARY + FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ +#define FSCONFIG_SET_PATH FSCONFIG_SET_PATH + FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ +#define FSCONFIG_SET_PATH_EMPTY FSCONFIG_SET_PATH_EMPTY + FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ +#define FSCONFIG_SET_FD FSCONFIG_SET_FD + FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ +#define 
FSCONFIG_CMD_CREATE FSCONFIG_CMD_CREATE FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ +#define FSCONFIG_CMD_RECONFIGURE FSCONFIG_CMD_RECONFIGURE }; +#endif // FSCONFIG_CMD_CREATE + +#ifndef MS_MGC_VAL +/* Magic mount flag number. Has to be or-ed to the flag values. */ +#define MS_MGC_VAL 0xc0ed0000 /* Magic flag number to indicate "new" flags */ +#define MS_MGC_MSK 0xffff0000 /* Magic flag number mask */ #endif -static inline int sys_fsopen(const char *fsname, unsigned int flags) -{ - return syscall(__NR_fsopen, fsname, flags); -} -static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) -{ - return syscall(__NR_fsconfig, fd, cmd, key, value, aux); -} -static inline int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) -{ - return syscall(__NR_fsmount, fd, flags, attr_flags); -} - #endif diff --git a/criu/include/syscall.h b/criu/include/syscall.h new file mode 100644 index 000000000..c38d6d971 --- /dev/null +++ b/criu/include/syscall.h @@ -0,0 +1,17 @@ +#ifndef __CR_SYSCALL_H__ +#define __CR_SYSCALL_H__ + +static inline int sys_fsopen(const char *fsname, unsigned int flags) +{ + return syscall(__NR_fsopen, fsname, flags); +} +static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) +{ + return syscall(__NR_fsconfig, fd, cmd, key, value, aux); +} +static inline int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) +{ + return syscall(__NR_fsmount, fd, flags, attr_flags); +} + +#endif /* __CR_SYSCALL_H__ */ \ No newline at end of file diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index e7eb1fcb6..f75fe13bb 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -14,6 +13,7 @@ #include "int.h" #include "types.h" #include +#include "linux/mount.h" #include "parasite.h" #include "fcntl.h" #include "prctl.h" diff --git a/criu/util.c 
b/criu/util.c index 5f69465b4..060ca3bd4 100644 --- a/criu/util.c +++ b/criu/util.c @@ -40,6 +40,7 @@ #include "mem.h" #include "namespaces.h" #include "criu-log.h" +#include "syscall.h" #include "clone-noasan.h" #include "cr_options.h" diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak index 014e893a8..fb5d2ef7a 100644 --- a/scripts/feature-tests.mak +++ b/scripts/feature-tests.mak @@ -137,19 +137,6 @@ ENTRY(main) END(main) endef -define FEATURE_TEST_FSCONFIG - -#include - -int main(void) -{ - if (FSCONFIG_CMD_CREATE > 0) - return 0; - return 0; -} - -endef - define FEATURE_TEST_NFTABLES_LIB_API_0 #include From 750acec25fde0a21d833957f214dc1eb9eb3d107 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 5 Aug 2022 21:44:11 +0100 Subject: [PATCH 028/775] Revert "ci: Switch to non overlaysfs tests" This reverts commit 8bb05e3bf3fe96ce93071e22330c2701e86b9a55. The following bug has been fixed: https://bugs.launchpad.net/ubuntu/+source/linux-azure/+bug/1967924 Signed-off-by: Radostin Stoyanov --- scripts/ci/docker-test.sh | 5 +---- scripts/ci/podman-test.sh | 6 +----- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index f36b4e458..d4b11bd55 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -21,10 +21,7 @@ add-apt-repository \ . 
/etc/lsb-release -# overlayfs with current Ubuntu kernel breaks CRIU -# https://bugs.launchpad.net/ubuntu/+source/linux-azure/+bug/1967924 -# Use devicemapper storage drive as a work-around -echo '{ "experimental": true, "storage-driver": "devicemapper" }' > /etc/docker/daemon.json +echo '{ "experimental": true }' > /etc/docker/daemon.json CRIU_LOG='/criu.log' mkdir -p /etc/criu diff --git a/scripts/ci/podman-test.sh b/scripts/ci/podman-test.sh index 414004514..973d2d722 100755 --- a/scripts/ci/podman-test.sh +++ b/scripts/ci/podman-test.sh @@ -25,11 +25,7 @@ make install popd rm -rf "${tmp_dir}" -# overlayfs with current Ubuntu kernel breaks CRIU -# https://bugs.launchpad.net/ubuntu/+source/linux-azure/+bug/1967924 -# Use VFS storage drive as a work-around -export STORAGE_DRIVER=vfs -podman --storage-driver vfs info +podman info # shellcheck disable=SC2016 podman run --name cr -d docker.io/library/alpine /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done' From f9bc0a750a48ed10c0da289111f2efafe9ef9b20 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 5 Aug 2022 22:00:16 +0100 Subject: [PATCH 029/775] docker-test: use containerd installed from package In commits [1, 2] the version of containerd installed by default in the GitHub CI virtual environment was replaced with the latest release from GitHub as a workaround to a bug in containerd. This bug has been fixed some time ago and the current default version of containerd (1.6.6) does not require this workaround. However, with the latest release, the containerd binaries uploaded on GitHub have been built for Ubuntu 22.04 [3]. 
Our tests are still running on Ubuntu 20.04 and this results in the following error: /usr/bin/containerd: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.34' not found (required by /usr/bin/containerd) /usr/bin/containerd: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.32' not found (required by /usr/bin/containerd) [1] https://github.com/checkpoint-restore/criu/commit/046cad8 [2] https://github.com/checkpoint-restore/criu/commit/81a68ad [3] https://github.com/containerd/containerd/commit/6b2dc9a37 Signed-off-by: Radostin Stoyanov --- scripts/ci/docker-test.sh | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index d4b11bd55..63941437e 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -21,24 +21,14 @@ add-apt-repository \ . /etc/lsb-release +# docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json +service docker restart CRIU_LOG='/criu.log' mkdir -p /etc/criu echo "log-file=$CRIU_LOG" > /etc/criu/runc.conf -service docker stop -systemctl stop containerd.service - -# Always use the latest containerd release. -# Restore with containerd versions after v1.2.14 and before v1.5.0-beta.0 are broken. 
-# https://github.com/checkpoint-restore/criu/issues/1223 -CONTAINERD_DOWNLOAD_URL=$(curl -s https://api.github.com/repos/containerd/containerd/releases/latest | grep '"browser_download_url":.*/containerd-.*-linux-amd64.tar.gz.$' | cut -d\" -f4) -wget -nv "$CONTAINERD_DOWNLOAD_URL" -O - | tar -xz -C /usr/ - -systemctl restart containerd.service -service docker restart - export SKIP_CI_TEST=1 ./run-ci-tests.sh From a1262f55fbd7a5a3a4f700bd4b9ae29584517861 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 5 Aug 2022 14:11:49 +0100 Subject: [PATCH 030/775] cr-check: fix check for apparmor stacking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The feature check for AppArmor stacking was introduced in commit: 8723e3f998d1ec5f125e6600436a96f7ff9c1631 check: add a feature test for apparmor_stacking However, on systems that don't support AppArmor, this check always fails. As a result, `criu check --all` shows the following message: Looks good but some kernel features are missing which, depending on your process tree, may cause dump or restore failure. 
Reported-by: André Rösti (@andrej) Signed-off-by: Radostin Stoyanov --- criu/cr-check.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index 0ca80192c..0f09b902a 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1478,13 +1478,15 @@ int cr_check(void) ret |= check_newifindex(); ret |= check_pidfd_store(); ret |= check_ns_pid(); - ret |= check_apparmor_stacking(); ret |= check_network_lock_nftables(); ret |= check_sockopt_buf_lock(); ret |= check_memfd_hugetlb(); ret |= check_move_mount_set_group(); ret |= check_openat2(); ret |= check_ptrace_get_rseq_conf(); + + if (kdat.lsm == LSMTYPE__APPARMOR) + ret |= check_apparmor_stacking(); } /* From 1f9bd82a55e2c125831931d169add46c9df79996 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Sat, 6 Aug 2022 22:03:15 +0100 Subject: [PATCH 031/775] cr-check: optimize check for apparmor stacking The result of check_aa_ns_dumping() is stored in kdat. Instead of doing the same check twice - once on kerndat_init(), and again in check_apparmor_stacking(), we can check the stored value. Suggested-by: Pavel Tikhomirov Signed-off-by: Radostin Stoyanov --- criu/cr-check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index 0f09b902a..6c95ffb25 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -104,7 +104,7 @@ out: static int check_apparmor_stacking(void) { - if (!check_aa_ns_dumping()) + if (!kdat.apparmor_ns_dumping_enabled) return -1; return 0; From cacddf19dad339f963b0b01f7174091b90c49e5d Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Wed, 20 Jul 2022 14:36:28 +0300 Subject: [PATCH 032/775] cr-restore: rseq: dynamically handle *libc with rseq Before this patch we assumed that CRIU is compiled against the same GLibc as it runs with. But as we see from real world examples like #1935 it's not always true. 
The idea of this patch is to detect rseq configuration for the main CRIU process and use it to unregister rseq for all further child processes. It's correct, because we restore pstree using clone*() syscalls, don't use exec*() (!) syscalls, so rseq gets inherited in the kernel and rseq configuration remains the same for all children processes. This will prevent issues like this: https://github.com/checkpoint-restore/criu/issues/1935 Suggested-by: Florian Weimer Signed-off-by: Alexander Mikhalitsyn --- criu/cr-restore.c | 16 ++++++++-------- criu/include/kerndat.h | 2 ++ criu/kerndat.c | 25 +++++++++++++++++++++++-- 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index d11d28173..5b5b41dfc 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -3103,14 +3103,14 @@ static void prep_libc_rseq_info(struct rst_rseq_param *rseq) #else static void prep_libc_rseq_info(struct rst_rseq_param *rseq) { - /* - * TODO: handle built-in rseq on other libc'ies like musl - * We can do that using get_rseq_conf kernel feature. - * - * For now we just assume that other libc libraries are - * not registering rseq by default. 
- */ - rseq->rseq_abi_pointer = 0; + if (!kdat.has_rseq || !kdat.has_ptrace_get_rseq_conf) { + rseq->rseq_abi_pointer = 0; + return; + } + + rseq->rseq_abi_pointer = kdat.libc_rseq_conf.rseq_abi_pointer; + rseq->rseq_abi_size = kdat.libc_rseq_conf.rseq_abi_size; + rseq->signature = kdat.libc_rseq_conf.signature; } #endif diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 83d867e75..a3959c992 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -7,6 +7,7 @@ #include "asm/kerndat.h" #include "util-vdso.h" #include "hugetlb.h" +#include struct stat; @@ -82,6 +83,7 @@ struct kerndat_s { bool has_openat2; bool has_rseq; bool has_ptrace_get_rseq_conf; + struct __ptrace_rseq_configuration libc_rseq_conf; }; extern struct kerndat_s kdat; diff --git a/criu/kerndat.c b/criu/kerndat.c index bc5dccab1..0f7d5fc8f 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -923,6 +923,7 @@ static int kerndat_has_ptrace_get_rseq_conf(void) pid_t pid; int len; struct __ptrace_rseq_configuration rseq; + int ret = 0; pid = fork_and_ptrace_attach(NULL); if (pid < 0) @@ -930,6 +931,9 @@ static int kerndat_has_ptrace_get_rseq_conf(void) len = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, pid, sizeof(rseq), &rseq); if (len != sizeof(rseq)) { + if (kdat.has_ptrace_get_rseq_conf) + ret = 1; /* we should update kdat */ + kdat.has_ptrace_get_rseq_conf = false; pr_info("ptrace(PTRACE_GET_RSEQ_CONFIGURATION) is not supported\n"); goto out; @@ -940,16 +944,27 @@ static int kerndat_has_ptrace_get_rseq_conf(void) * we need to pay attention to that and, possibly, make changes on the CRIU side. 
*/ if (rseq.flags != 0) { + if (kdat.has_ptrace_get_rseq_conf) + ret = 1; /* we should update kdat */ + kdat.has_ptrace_get_rseq_conf = false; pr_err("ptrace(PTRACE_GET_RSEQ_CONFIGURATION): rseq.flags != 0\n"); } else { + if (!kdat.has_ptrace_get_rseq_conf) + ret = 1; /* we should update kdat */ + kdat.has_ptrace_get_rseq_conf = true; + + if (memcmp(&kdat.libc_rseq_conf, &rseq, sizeof(rseq))) + ret = 1; /* we should update kdat */ + + kdat.libc_rseq_conf = rseq; } out: kill(pid, SIGKILL); waitpid(pid, NULL, 0); - return 0; + return ret; } int kerndat_sockopt_buf_lock(void) @@ -1472,6 +1487,12 @@ int kerndat_try_load_new(void) if (ret < 0) return ret; + ret = kerndat_has_ptrace_get_rseq_conf(); + if (ret < 0) { + pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); + return ret; + } + /* New information is found, we need to save to the cache */ if (ret) kerndat_save_cache(); @@ -1657,7 +1678,7 @@ int kerndat_init(void) pr_err("kerndat_has_rseq failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_has_ptrace_get_rseq_conf()) { + if (!ret && (kerndat_has_ptrace_get_rseq_conf() < 0)) { pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); ret = -1; } From f7972a3f0468e32231af6914e2e9c9e07ac53ae6 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Wed, 20 Jul 2022 15:17:35 +0300 Subject: [PATCH 033/775] cr-restore: rseq: use glibc-specific way to unregister only as fallback Let's use dynamic approach to detect built-in *libc rseq in all cases, and "old" static approach as a fallback path if the user kernel lacks support of ptrace_get_rseq_conf feature. 
Suggested-by: Florian Weimer Signed-off-by: Alexander Mikhalitsyn --- criu/cr-restore.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 5b5b41dfc..919d10ab5 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -3088,7 +3088,6 @@ static int prep_rseq(struct rst_rseq_param *rseq, ThreadCoreEntry *tc) return 0; } -#if defined(__GLIBC__) && defined(RSEQ_SIG) static void prep_libc_rseq_info(struct rst_rseq_param *rseq) { if (!kdat.has_rseq) { @@ -3096,15 +3095,14 @@ static void prep_libc_rseq_info(struct rst_rseq_param *rseq) return; } - rseq->rseq_abi_pointer = encode_pointer(__criu_thread_pointer() + __rseq_offset); - rseq->rseq_abi_size = __rseq_size; - rseq->signature = RSEQ_SIG; -} + if (!kdat.has_ptrace_get_rseq_conf) { +#if defined(__GLIBC__) && defined(RSEQ_SIG) + rseq->rseq_abi_pointer = encode_pointer(__criu_thread_pointer() + __rseq_offset); + rseq->rseq_abi_size = __rseq_size; + rseq->signature = RSEQ_SIG; #else -static void prep_libc_rseq_info(struct rst_rseq_param *rseq) -{ - if (!kdat.has_rseq || !kdat.has_ptrace_get_rseq_conf) { rseq->rseq_abi_pointer = 0; +#endif return; } @@ -3112,7 +3110,6 @@ static void prep_libc_rseq_info(struct rst_rseq_param *rseq) rseq->rseq_abi_size = kdat.libc_rseq_conf.rseq_abi_size; rseq->signature = kdat.libc_rseq_conf.signature; } -#endif static rlim_t decode_rlim(rlim_t ival) { From 49319cd579f7c1250ed71c78b756e92aae2b9d7f Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 9 Aug 2022 09:42:37 -0700 Subject: [PATCH 034/775] Add Alexander Mikhalitsyn to maintainers Alex implemented a few complex features and maintains our CI system. 
Signed-off-by: Andrei Vagin --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 7d53d0dc1..8fee8e571 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5,3 +5,4 @@ Dmitry Safonov <0x7f454c46@gmail.com> Adrian Reber Pavel Tikhomirov Radostin Stoyanov +Alexander Mikhalitsyn From 2642b657da4eaffa7ed0f72094b1805581fa3ae0 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 11 Aug 2022 09:51:34 +0100 Subject: [PATCH 035/775] docker-test: handle race condition error There is a race condition in docker/containerd that causes docker to occasionally fail when starting a container from a checkpoint immediately after the checkpoint has been created. This problem is unrelated to criu and has been reported in https://github.com/moby/moby/issues/42900 Signed-off-by: Radostin Stoyanov --- scripts/ci/docker-test.sh | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index 63941437e..ca93ed77c 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -75,17 +75,37 @@ checkpoint_container () { docker wait cr } -restore_container () { - CHECKPOINT_NAME=$1 - - docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log || { +print_logs () { cat "$(grep log 'log file:' | sed 's/log file:\s*//')" || true docker logs cr || true cat $CRIU_LOG || true dmesg docker ps exit 1 - } +} + +declare -i max_restore_container_tries=3 +current_iteration= + +restore_container () { + CHECKPOINT_NAME=$1 + + docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log || { + # FIXME: There is a race condition in docker/containerd that causes + # docker to occasionally fail when starting a container from a + # checkpoint immediately after the checkpoint has been created. 
+ # https://github.com/moby/moby/issues/42900 + if [ "$current_iteration" -gt "$max_restore_container_tries" ]; then + print_logs + fi + grep -Eq '^Error response from daemon: failed to upload checkpoint to containerd: commit failed: content sha256:.*: already exists$' log && { + ((current_iteration+=1)) + echo "Retry container restore: $current_iteration" + sleep 1; + restore_container "$CHECKPOINT_NAME" + } || + print_logs + } && current_iteration=0 } # Scenario: Create multiple containers and checkpoint and restore them once From a202ec271d6cc91265f717e94dd2138ce3d71c03 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 30 Mar 2022 19:27:20 -0700 Subject: [PATCH 036/775] ci/cirrus: add CentOS Stream 9 Mostly a copy-paste from the CentOS 8 task, with a few differences: - Use dnf instead of yum - Enable crb instead of powertools - Different way of installing EPEL - No need to switch to python3 as this is the default - junit_xml is now available as an rpm Signed-off-by: Kir Kolyshkin --- .cirrus.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/.cirrus.yml b/.cirrus.yml index 2b6903ddc..6a5d75149 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -19,6 +19,34 @@ task: build_script: | make -C scripts/ci vagrant-fedora-no-vdso +task: + name: CentOS Stream 9 based test + environment: + HOME: "/root" + CIRRUS_WORKING_DIR: "/tmp/criu" + + compute_engine_instance: + image_project: centos-cloud + image: family/centos-stream-9 + platform: linux + cpu: 4 + memory: 8G + + setup_script: | + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto + dnf config-manager --set-enabled crb # Same as CentOS 8 powertools + dnf -y install epel-release epel-next-release + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-future python-protobuf 
python-junit_xml python-flake8 xmlto + systemctl stop sssd + # Even with selinux in permissive mode the selinux tests will be executed. + # The Cirrus CI user runs as a service from selinux point of view and is + # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0). + # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode. + setenforce 0 + + build_script: | + make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" + task: name: Vagrant Fedora Rawhide based test environment: From c089159a464ed32684a7754160eea84feca4b547 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 12 Apr 2022 18:19:41 -0700 Subject: [PATCH 037/775] ci/cirrus: centos 8 job nits 1. Rename CentOS 8 to CentOS Stream 8 (which it is). 2. Install junit_xml from the repo rather than via pip. Signed-off-by: Kir Kolyshkin --- .cirrus.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 6a5d75149..03ed79748 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -69,7 +69,7 @@ task: make -C scripts/ci vagrant-fedora-rawhide task: - name: CentOS 8 based test + name: CentOS Stream 8 based test environment: HOME: "/root" CIRRUS_WORKING_DIR: "/tmp/criu" @@ -85,7 +85,7 @@ task: ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm dnf-plugins-core yum config-manager --set-enabled powertools - yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-future python3-protobuf xmlto + yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel 
protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-future python3-protobuf python3-junit_xml xmlto alternatives --set python /usr/bin/python3 systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed @@ -93,7 +93,6 @@ task: # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0) # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode setenforce 0 - pip3 install junit_xml build_script: | make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" From d7477dac03940f714b845bf39f3b57b4145bbb97 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 7 Aug 2022 16:27:22 -0700 Subject: [PATCH 038/775] compel: set TRACESYSGOOD to distinguish breakpoints from syscalls When delivering system call traps, set bit 7 in the signal number (i.e., deliver SIGTRAP|0x80). This makes it easy for the tracer to distinguish normal traps from those caused by a system call. 
Signed-off-by: Andrei Vagin --- compel/include/ptrace.h | 2 ++ compel/include/uapi/infect.h | 4 ++-- compel/src/lib/infect.c | 27 ++++++++++++++------------- compel/src/lib/ptrace.c | 2 +- criu/cr-restore.c | 13 ++++++++----- 5 files changed, 27 insertions(+), 21 deletions(-) diff --git a/compel/include/ptrace.h b/compel/include/ptrace.h index bf2701e63..00013f937 100644 --- a/compel/include/ptrace.h +++ b/compel/include/ptrace.h @@ -5,6 +5,8 @@ #include #include +#define PTRACE_SYSCALL_TRAP 0x80 + #define PTRACE_SI_EVENT(_si_code) (((_si_code)&0xFFFF) >> 8) extern int ptrace_get_regs(pid_t pid, user_regs_struct_t *regs); diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 7073f343f..19d4da2b1 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -80,9 +80,9 @@ enum trace_flags { TRACE_EXIT, }; -extern int __must_check compel_stop_on_syscall(int tasks, int sys_nr, int sys_nr_compat, enum trace_flags trace); +extern int __must_check compel_stop_on_syscall(int tasks, int sys_nr, int sys_nr_compat); -extern int __must_check compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp); +extern int __must_check compel_stop_pie(pid_t pid, void *addr, bool no_bp); extern int __must_check compel_unmap(struct parasite_ctl *ctl, unsigned long addr); diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index b99f23b36..7d7865480 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -304,6 +304,11 @@ try_again: goto try_again; } + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACESYSGOOD)) { + pr_perror("Unable to set PTRACE_O_TRACESYSGOOD for %d", pid); + return -1; + } + if (ss->seccomp_mode != SECCOMP_MODE_DISABLED && ptrace_suspend_seccomp(pid) < 0) goto err; @@ -1366,7 +1371,6 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) pid_t pid = ctl->rpid; user_regs_struct_t regs; int status, ret = 0; - enum trace_flags flag; /* stop getting chld from parasite -- we're about 
to step-by-step it */ if (restore_child_handler(ctl)) @@ -1407,11 +1411,11 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) return -1; /* Go to sigreturn as closer as we can */ - ret = compel_stop_pie(pid, ctl->sigreturn_addr, &flag, ctl->ictx.flags & INFECT_NO_BREAKPOINTS); + ret = compel_stop_pie(pid, ctl->sigreturn_addr, ctl->ictx.flags & INFECT_NO_BREAKPOINTS); if (ret < 0) return ret; - if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag)) + if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1))) return -1; if (ptrace_flush_breakpoints(pid)) @@ -1546,7 +1550,7 @@ int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) if (ret) goto err; - ret = compel_stop_on_syscall(1, __NR(munmap, 0), __NR(munmap, 1), TRACE_ENTER); + ret = compel_stop_on_syscall(1, __NR(munmap, 0), __NR(munmap, 1)); /* * Don't touch extended registers here: they were restored @@ -1558,7 +1562,7 @@ err: return ret; } -int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) +int compel_stop_pie(pid_t pid, void *addr, bool no_bp) { int ret; @@ -1575,7 +1579,6 @@ int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) * PIE will stop on a breakpoint, next * stop after that will be syscall enter. 
*/ - *tf = TRACE_EXIT; return 0; } @@ -1588,14 +1591,12 @@ int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) pr_perror("Unable to restart the %d process", pid); return -1; } - - *tf = TRACE_ENTER; return 0; } static bool task_is_trapped(int status, pid_t pid) { - if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) + if (WIFSTOPPED(status) && (WSTOPSIG(status) & ~PTRACE_SYSCALL_TRAP) == SIGTRAP) return true; pr_err("Task %d is in unexpected state: %x\n", pid, status); @@ -1629,15 +1630,13 @@ static inline int is_required_syscall(user_regs_struct_t *regs, pid_t pid, const * sys_nr - the required syscall number * sys_nr_compat - the required compatible syscall number */ -int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat, enum trace_flags trace) +int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat) { + enum trace_flags trace = tasks > 1 ? TRACE_ALL : TRACE_ENTER; user_regs_struct_t regs; int status, ret; pid_t pid; - if (tasks > 1) - trace = TRACE_ALL; - /* Stop all threads on the enter point in sys_rt_sigreturn */ while (tasks) { pid = wait4(-1, &status, __WALL, NULL); @@ -1651,6 +1650,8 @@ int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat, pr_debug("%d was trapped\n", pid); + if ((WSTOPSIG(status) & PTRACE_SYSCALL_TRAP) == 0) + goto goon; if (trace == TRACE_EXIT) { trace = TRACE_ENTER; pr_debug("`- Expecting exit\n"); diff --git a/compel/src/lib/ptrace.c b/compel/src/lib/ptrace.c index 49b685d70..717ee2839 100644 --- a/compel/src/lib/ptrace.c +++ b/compel/src/lib/ptrace.c @@ -23,7 +23,7 @@ int ptrace_suspend_seccomp(pid_t pid) { - if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) { + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD) < 0) { pr_perror("suspending seccomp failed"); return -1; } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 919d10ab5..9a1b23999 100644 --- 
a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1964,6 +1964,10 @@ static int attach_to_tasks(bool root_seized) return -1; } + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACESYSGOOD)) { + pr_perror("Unable to set PTRACE_O_TRACESYSGOOD for %d", pid); + return -1; + } /* * Suspend seccomp if necessary. We need to do this because * although seccomp is restored at the very end of the @@ -2028,7 +2032,7 @@ static int restore_rseq_cs(void) return 0; } -static int catch_tasks(bool root_seized, enum trace_flags *flag) +static int catch_tasks(bool root_seized) { struct pstree_item *item; @@ -2058,7 +2062,7 @@ static int catch_tasks(bool root_seized, enum trace_flags *flag) return -1; } - ret = compel_stop_pie(pid, rsti(item)->breakpoint, flag, fault_injected(FI_NO_BREAKPOINTS)); + ret = compel_stop_pie(pid, rsti(item)->breakpoint, fault_injected(FI_NO_BREAKPOINTS)); if (ret < 0) return -1; } @@ -2225,7 +2229,6 @@ static void reap_zombies(void) static int restore_root_task(struct pstree_item *init) { - enum trace_flags flag = TRACE_ALL; int ret, fd, mnt_ns_fd = -1; int root_seized = 0; struct pstree_item *item; @@ -2440,7 +2443,7 @@ skip_ns_bouncing: timing_stop(TIME_RESTORE); - if (catch_tasks(root_seized, &flag)) { + if (catch_tasks(root_seized)) { pr_err("Can't catch all tasks\n"); goto out_kill_network_unlocked; } @@ -2450,7 +2453,7 @@ skip_ns_bouncing: __restore_switch_stage(CR_STATE_COMPLETE); - ret = compel_stop_on_syscall(task_entries->nr_threads, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); + ret = compel_stop_on_syscall(task_entries->nr_threads, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1)); if (ret) { pr_err("Can't stop all tasks on rt_sigreturn\n"); goto out_kill_network_unlocked; From 719fea2fc9d4df405bc708b34d1e53f6943375ec Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 7 Aug 2022 16:36:15 -0700 Subject: [PATCH 039/775] compel: clear a breakpoint right after it's been triggered Breakpoints are used to stop as close as possible to a 
target system call. First, we don't need it after this point. Second, PTRACE_CONT can't pass through a breakpoint on arm64. Signed-off-by: Andrei Vagin --- compel/src/lib/infect.c | 15 +++++++++++---- criu/cr-restore.c | 21 --------------------- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 7d7865480..6413a1860 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -1418,9 +1418,6 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1))) return -1; - if (ptrace_flush_breakpoints(pid)) - return -1; - /* * All signals are unblocked now. The kernel notifies about leaving * syscall before starting to deliver signals. All parasite code are @@ -1650,8 +1647,18 @@ int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat) pr_debug("%d was trapped\n", pid); - if ((WSTOPSIG(status) & PTRACE_SYSCALL_TRAP) == 0) + if ((WSTOPSIG(status) & PTRACE_SYSCALL_TRAP) == 0) { + /* + * On some platforms such as ARM64, it is impossible to + * pass through a breakpoint, so let's clear it right + * after it has been triggered. 
+ */ + if (ptrace_flush_breakpoints(pid)) { + pr_err("Unable to clear breakpoints\n"); + return -1; + } goto goon; + } if (trace == TRACE_EXIT) { trace = TRACE_ENTER; pr_debug("`- Expecting exit\n"); diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9a1b23999..9c480be78 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2071,24 +2071,6 @@ static int catch_tasks(bool root_seized) return 0; } -static int clear_breakpoints(void) -{ - struct pstree_item *item; - int ret = 0, i; - - if (fault_injected(FI_NO_BREAKPOINTS)) - return 0; - - for_each_pstree_item(item) { - if (!task_alive(item)) - continue; - for (i = 0; i < item->nr_threads; i++) - ret |= ptrace_flush_breakpoints(item->threads[i].real); - } - - return ret; -} - static void finalize_restore(void) { struct pstree_item *item; @@ -2459,9 +2441,6 @@ skip_ns_bouncing: goto out_kill_network_unlocked; } - if (clear_breakpoints()) - pr_err("Unable to flush breakpoints\n"); - finalize_restore(); /* * Some external devices such as GPUs might need a very late From b7953c6c7f020ea4a2f4221c5dd9c8314a96f733 Mon Sep 17 00:00:00 2001 From: "fu.lin" Date: Fri, 15 Apr 2022 13:00:04 +0800 Subject: [PATCH 040/775] compel: switch breakpoint functions to non-inline at arm64 platform Signed-off-by: fu.lin Signed-off-by: Andrei Vagin --- .../aarch64/src/lib/include/uapi/asm/breakpoints.h | 11 ++--------- compel/arch/aarch64/src/lib/infect.c | 10 ++++++++++ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h index 5f090490d..796aec016 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h @@ -2,14 +2,7 @@ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT -static inline int ptrace_set_breakpoint(pid_t pid, void *addr) -{ - return 0; -} - -static inline int ptrace_flush_breakpoints(pid_t pid) -{ - 
return 0; -} +int ptrace_set_breakpoint(pid_t pid, void *addr); +int ptrace_flush_breakpoints(pid_t pid); #endif diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index bd1ed0da3..316ff73e7 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -176,3 +176,13 @@ unsigned long compel_task_size(void) break; return task_size; } + +int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} From bb73e1cf5ae230c9448230c3e9e21ae2850c944a Mon Sep 17 00:00:00 2001 From: "fu.lin" Date: Sun, 7 Aug 2022 16:52:39 -0700 Subject: [PATCH 041/775] breakpoint: implement hw breakpoint for arm64 platform The x86 port implements hardware breakpoints to accelerate the syscall tracing procedure instead of using `ptrace(PTRACE_SYSCALL)`. The arm64 has the same capability according to Arm's documentation [1]. The Arm Architecture Reference Manual [2] illustrates the usage in detail: - D2.8 Breakpoint Instruction exceptions - D2.9 Breakpoint exceptions - D13.3.2 DBGBCR_EL1, Debug Breakpoint Control Registers, n Note: [1]: https://developer.arm.com/documentation/102120/0100 [2]: https://developer.arm.com/documentation/ddi0487/latest Signed-off-by: fu.lin Signed-off-by: Andrei Vagin --- .../src/lib/include/uapi/asm/breakpoints.h | 34 +++++++ compel/arch/aarch64/src/lib/infect.c | 91 ++++++++++++++++++- 2 files changed, 124 insertions(+), 1 deletion(-) diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h index 796aec016..8a61b268f 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h @@ -2,6 +2,40 @@ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT +#include +#include + +struct hwbp_cap { + char arch; + char bp_count; +}; + +/* copied from `linux/arch/arm64/include/asm/hw_breakpoint.h` */ +/* Lengths */ +#define ARM_BREAKPOINT_LEN_1 
0x1 +#define ARM_BREAKPOINT_LEN_2 0x3 +#define ARM_BREAKPOINT_LEN_3 0x7 +#define ARM_BREAKPOINT_LEN_4 0xf +#define ARM_BREAKPOINT_LEN_5 0x1f +#define ARM_BREAKPOINT_LEN_6 0x3f +#define ARM_BREAKPOINT_LEN_7 0x7f +#define ARM_BREAKPOINT_LEN_8 0xff + +/* Privilege Levels */ +#define AARCH64_BREAKPOINT_EL1 1 +#define AARCH64_BREAKPOINT_EL0 2 + +/* Breakpoint */ +#define ARM_BREAKPOINT_EXECUTE 0 + +/* Watchpoints */ +#define ARM_BREAKPOINT_LOAD 1 +#define ARM_BREAKPOINT_STORE 2 +#define AARCH64_ESR_ACCESS_MASK (1 << 6) + +#define DISABLE_HBP 0 +#define ENABLE_HBP 1 + int ptrace_set_breakpoint(pid_t pid, void *addr); int ptrace_flush_breakpoints(pid_t pid); diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 316ff73e7..7b75da890 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -2,7 +2,9 @@ #include #include #include +#include #include + #include #include "common/page.h" #include "uapi/compel/asm/infect-types.h" @@ -10,6 +12,7 @@ #include "errno.h" #include "infect.h" #include "infect-priv.h" +#include "asm/breakpoints.h" unsigned __page_size = 0; unsigned __page_shift = 0; @@ -177,12 +180,98 @@ unsigned long compel_task_size(void) return task_size; } +static struct hwbp_cap *ptrace_get_hwbp_cap(pid_t pid) +{ + static struct hwbp_cap info; + static int available = -1; + + if (available == -1) { + unsigned int val; + struct iovec iovec = { + .iov_base = &val, + .iov_len = sizeof(val), + }; + + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_HW_BREAK, &iovec) < 0) + available = 0; + else { + info.arch = (char)((val >> 8) & 0xff); + info.bp_count = (char)(val & 0xff); + + available = (info.arch != 0); + } + } + + return available == 1 ? 
&info : NULL; +} + int ptrace_set_breakpoint(pid_t pid, void *addr) { - return 0; + struct hwbp_cap *info = ptrace_get_hwbp_cap(pid); + struct user_hwdebug_state regs = {}; + unsigned int ctrl = 0; + struct iovec iovec; + + if (info == NULL || info->bp_count == 0) + return 0; + + /* + * The struct is copied from `arch/arm64/include/asm/hw_breakpoint.h` in + * linux kernel: + * struct arch_hw_breakpoint_ctrl { + * __u32 __reserved : 19, + * len : 8, + * type : 2, + * privilege : 2, + * enabled : 1; + * }; + * + * The part of `struct arch_hw_breakpoint_ctrl` bits meaning is defined + * in the Arm Architecture Reference Manual, + * D13.3.2 DBGBCR_EL1, Debug Breakpoint Control Registers. + */ + ctrl = ARM_BREAKPOINT_LEN_4; + ctrl = (ctrl << 2) | ARM_BREAKPOINT_EXECUTE; + ctrl = (ctrl << 2) | AARCH64_BREAKPOINT_EL0; + ctrl = (ctrl << 1) | ENABLE_HBP; + regs.dbg_regs[0].addr = (__u64)addr; + regs.dbg_regs[0].ctrl = ctrl; + iovec.iov_base = &regs; + iovec.iov_len = (offsetof(struct user_hwdebug_state, dbg_regs) + sizeof(regs.dbg_regs[0])); + + if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_HW_BREAK, &iovec)) + return -1; + + if (ptrace(PTRACE_CONT, pid, NULL, NULL) != 0) { + pr_perror("Unable to restart the stopped tracee process %d", pid); + return -1; + } + + return 1; } int ptrace_flush_breakpoints(pid_t pid) { + struct hwbp_cap *info = ptrace_get_hwbp_cap(pid); + struct user_hwdebug_state regs = {}; + unsigned int ctrl = 0; + struct iovec iovec; + + if (info == NULL || info->bp_count == 0) + return 0; + + ctrl = ARM_BREAKPOINT_LEN_4; + ctrl = (ctrl << 2) | ARM_BREAKPOINT_EXECUTE; + ctrl = (ctrl << 2) | AARCH64_BREAKPOINT_EL0; + ctrl = (ctrl << 1) | DISABLE_HBP; + regs.dbg_regs[0].addr = 0ul; + regs.dbg_regs[0].ctrl = ctrl; + + iovec.iov_base = &regs; + iovec.iov_len = (offsetof(struct user_hwdebug_state, dbg_regs) + sizeof(regs.dbg_regs[0])); + + if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_HW_BREAK, &iovec)) + return -1; + return 0; } From dfe9d006add46d0cda5baaf32f12796b7faed528 Mon Sep 17 00:00:00 2001 From: 
"fu.lin" Date: Tue, 9 Aug 2022 12:18:00 -0700 Subject: [PATCH 042/775] breakpoint: enable breakpoints by default on amd64 and arm64 Signed-off-by: fu.lin Signed-off-by: Andrei Vagin --- compel/arch/aarch64/src/lib/infect.c | 12 ++++++++++++ compel/arch/x86/src/lib/infect.c | 11 +++++++++++ criu/include/fault-injection.h | 8 -------- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 7b75da890..d0189f003 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -207,6 +207,7 @@ static struct hwbp_cap *ptrace_get_hwbp_cap(pid_t pid) int ptrace_set_breakpoint(pid_t pid, void *addr) { + k_rtsigset_t block; struct hwbp_cap *info = ptrace_get_hwbp_cap(pid); struct user_hwdebug_state regs = {}; unsigned int ctrl = 0; @@ -242,6 +243,17 @@ int ptrace_set_breakpoint(pid_t pid, void *addr) if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_HW_BREAK, &iovec)) return -1; + /* + * FIXME(issues/1429): SIGTRAP can't be blocked, otherwise its handler + * will be reset to the default one. 
+ */ + ksigfillset(&block); + ksigdelset(&block, SIGTRAP); + if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { + pr_perror("Can't block signals for %d", pid); + return -1; + } + if (ptrace(PTRACE_CONT, pid, NULL, NULL) != 0) { pr_perror("Unable to restart the stopped tracee process %d", pid); return -1; diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index c0e7a544a..01959b95b 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -588,6 +588,7 @@ int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) int ptrace_set_breakpoint(pid_t pid, void *addr) { + k_rtsigset_t block; int ret; /* Set a breakpoint */ @@ -603,6 +604,16 @@ int ptrace_set_breakpoint(pid_t pid, void *addr) return -1; } + /* + * FIXME(issues/1429): SIGTRAP can't be blocked, otherwise its handler + * will be reset to the default one. + */ + ksigfillset(&block); + ksigdelset(&block, SIGTRAP); + if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { + pr_perror("Can't block signals for %d", pid); + return -1; + } ret = ptrace(PTRACE_CONT, pid, NULL, NULL); if (ret) { pr_perror("Unable to restart the stopped tracee process %d", pid); diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index f33918de8..69d670be9 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -24,14 +24,6 @@ enum faults { static inline bool __fault_injected(enum faults f, enum faults fi_strategy) { - /* - * Temporary workaround for Xen guests. Breakpoints degrade - * performance linearly, so until we find out the reason, - * let's disable them. 
- */ - if (f == FI_NO_BREAKPOINTS) - return true; - return fi_strategy == f; } From dca55d281a483bcdf6e156c4b7cdf73e14067b77 Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Wed, 6 Apr 2022 18:35:26 +0200 Subject: [PATCH 043/775] criu: fail migration if data was sent to an in-flight socket Before this change, CRIU would just lose that data upon migration. So it's better to fail migration in this case. To reproduce the bug one can: 1. Create an AF_UNIX socket and call listen on it. 2. Create a second AF_UNIX socket and call connect to the first one. 3. Send the data to the second socket. 4. Migrate. 5. Call accept on the first socket and then read. There would be no data available. It should be even possible to close the second socket before migration. This would cause accept to hang because CRIU totally misses a closed in-flight socket. Signed-off-by: Michal Clapinski --- criu/sk-unix.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/criu/sk-unix.c b/criu/sk-unix.c index 47e1b2962..873360bfa 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -497,9 +497,34 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) goto err; } + if (sk->wqlen != 0) { + /* + * There's no known way to get data out of the write + * queue of an icon socket. The only good solution for + * now is to fail the migration. + */ + pr_err("Non-empty write queue on an in-flight socket %#x\n", ue->ino); + goto err; + } + ue->peer = e->sk_desc->sd.ino; pr_debug("\t\tFixed inflight socket %u peer %u)\n", ue->ino, ue->peer); + } else if (ue->state == TCP_LISTEN) { + int i; + + for (i = 0; i < sk->nr_icons; i++) + if (sk->icons[i] == 0) { + /* + * Inode of an icon socket equal to 0 means + * it's already been closed. That means we have + * no simple way to check if it sent any data. + * The only good solution for now is to fail + * the migration. 
+ */ + pr_err("Found a closed in-flight socket to %#x\n", ue->ino); + goto err; + } } dump: if (dump_socket_opts(lfd, skopts)) From 83c606e023594f75d271091de1cfb49ceaa6baee Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 15 Apr 2022 15:46:34 -0700 Subject: [PATCH 044/775] zdtm: return 1 from pr_err, pr_perror, fail This allows to make test code more compact: if (ret == -1) { pr_perror("XXX"); return 1; } vs if (ret == -1) return pr_perror("XXX"); Signed-off-by: Andrei Vagin --- test/zdtm/lib/zdtmtst.h | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/test/zdtm/lib/zdtmtst.h b/test/zdtm/lib/zdtmtst.h index ed7c23ee2..d91886d25 100644 --- a/test/zdtm/lib/zdtmtst.h +++ b/test/zdtm/lib/zdtmtst.h @@ -126,11 +126,25 @@ extern int write_pidfile(int pid); /* message helpers */ extern int test_log_init(const char *outfile, const char *suffix); extern int zdtm_seccomp; -#define pr_err(format, arg...) test_msg("ERR: %s:%d: " format, __FILE__, __LINE__, ##arg) -#define pr_perror(format, arg...) \ - test_msg("ERR: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) -#define fail(format, arg...) \ - test_msg("FAIL: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) +#define pr_err(format, arg...) \ + ({ \ + test_msg("ERR: %s:%d: " format, __FILE__, __LINE__, ##arg); \ + 1; \ + }) + +#define pr_perror(format, arg...) \ + ({ \ + test_msg("ERR: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, \ + strerror(errno)); \ + 1; \ + }) + +#define fail(format, arg...) \ + ({ \ + test_msg("FAIL: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, \ + strerror(errno)); \ + 1; \ + }) #define skip(format, arg...) 
test_msg("SKIP: %s:%d: " format "\n", __FILE__, __LINE__, ##arg) #define pass() test_msg("PASS\n") From aeaff64452c1b30c419e782b286be7467ceea45e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 15 Apr 2022 15:41:25 -0700 Subject: [PATCH 045/775] test/unix: check C/R of unix listen queues Check that CRIU handles non-empty listen queues properly. Signed-off-by: Andrei Vagin [mclapinski@google.com: update test_doc and test_author] Signed-off-by: Michal Clapinski --- test/zdtm/static/Makefile | 8 ++ test/zdtm/static/sk-unix-listen01.c | 117 +++++++++++++++++++++++++ test/zdtm/static/sk-unix-listen02.c | 1 + test/zdtm/static/sk-unix-listen02.desc | 1 + test/zdtm/static/sk-unix-listen03.c | 1 + test/zdtm/static/sk-unix-listen03.desc | 1 + test/zdtm/static/sk-unix-listen04.c | 1 + test/zdtm/static/sk-unix-listen04.desc | 1 + 8 files changed, 131 insertions(+) create mode 100644 test/zdtm/static/sk-unix-listen01.c create mode 120000 test/zdtm/static/sk-unix-listen02.c create mode 100644 test/zdtm/static/sk-unix-listen02.desc create mode 120000 test/zdtm/static/sk-unix-listen03.c create mode 100644 test/zdtm/static/sk-unix-listen03.desc create mode 120000 test/zdtm/static/sk-unix-listen04.c create mode 100644 test/zdtm/static/sk-unix-listen04.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index b28345400..0ac22731b 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -352,6 +352,10 @@ TST_FILE = \ socket_close_data01 \ fifo_upon_unix_socket00 \ fifo_upon_unix_socket01 \ + sk-unix-listen01 \ + sk-unix-listen02 \ + sk-unix-listen03 \ + sk-unix-listen04 \ TST_DIR = \ cwd00 \ @@ -670,6 +674,10 @@ bpf_array: LDLIBS += -lbpf fifo_upon_unix_socket01: CFLAGS += -DFIFO_UPON_UNIX01 +sk-unix-listen02: CFLAGS += -DSK_UNIX_LISTEN02 +sk-unix-listen03: CFLAGS += -DSK_UNIX_LISTEN03 +sk-unix-listen04: CFLAGS += -DSK_UNIX_LISTEN02 -DSK_UNIX_LISTEN03 + $(LIB): force $(Q) $(MAKE) -C $(LIBDIR) diff --git a/test/zdtm/static/sk-unix-listen01.c 
b/test/zdtm/static/sk-unix-listen01.c new file mode 100644 index 000000000..5c9274acb --- /dev/null +++ b/test/zdtm/static/sk-unix-listen01.c @@ -0,0 +1,117 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test in-flight unix sockets with data in them\n"; +const char *test_author = "Andrei Vagin "; + +#define SK_DATA "packet" + +char *filename; +TEST_OPTION(filename, string, "socket file name", 1); + +#define TEST_MODE 0640 + +#ifdef ZDTM_UNIX_SEQPACKET +#define SOCK_TYPE SOCK_SEQPACKET +#else +#define SOCK_TYPE SOCK_STREAM +#endif + +int main(int argc, char *argv[]) +{ + struct sockaddr_un addr; + unsigned int addrlen; + int ssk, sk; + + char path[PATH_MAX]; + char *cwd; + int ret; + + test_init(argc, argv); + + cwd = get_current_dir_name(); + if (!cwd) + return pr_perror("get_current_dir_name"); + + snprintf(path, sizeof(path), "%s/%s", cwd, filename); + unlink(path); + + addr.sun_family = AF_UNIX; + addrlen = strlen(filename); + if (addrlen > sizeof(addr.sun_path)) + return pr_err("address is too long"); + memcpy(addr.sun_path, filename, addrlen); + addrlen += sizeof(addr.sun_family); + + ssk = socket(AF_UNIX, SOCK_TYPE, 0); + if (ssk == -1) + return pr_perror("socket"); + + sk = socket(AF_UNIX, SOCK_TYPE, 0); + if (sk < 0) + return pr_perror("socket"); + + ret = bind(ssk, (struct sockaddr *)&addr, addrlen); + if (ret) + return pr_perror("bind"); + + ret = listen(ssk, 16); + if (ret) + return pr_perror("listen"); + + if (connect(sk, (struct sockaddr *)&addr, addrlen)) + return pr_perror("connect"); + +#ifdef SK_UNIX_LISTEN02 + { + char buf[64]; + memset(buf, 0, sizeof(buf)); + write(sk, SK_DATA, sizeof(SK_DATA)); + } +#endif + +#ifdef SK_UNIX_LISTEN03 + close(sk); + sk = -1; +#endif + + test_daemon(); + test_waitsig(); + + if (sk != -1) + close(sk); + + ret = accept(ssk, NULL, NULL); + if (ret < 0) + return fail("accept"); + 
+#ifdef SK_UNIX_LISTEN02 + { + char buf[64]; + if (read(ret, &buf, sizeof(buf)) != sizeof(SK_DATA)) + return pr_perror("read"); + + if (strcmp(buf, SK_DATA)) + return fail("data corrupted"); + } +#endif + + close(ssk); + unlink(path); + + pass(); + return 0; +} diff --git a/test/zdtm/static/sk-unix-listen02.c b/test/zdtm/static/sk-unix-listen02.c new file mode 120000 index 000000000..1211f4666 --- /dev/null +++ b/test/zdtm/static/sk-unix-listen02.c @@ -0,0 +1 @@ +sk-unix-listen01.c \ No newline at end of file diff --git a/test/zdtm/static/sk-unix-listen02.desc b/test/zdtm/static/sk-unix-listen02.desc new file mode 100644 index 000000000..ded89879a --- /dev/null +++ b/test/zdtm/static/sk-unix-listen02.desc @@ -0,0 +1 @@ +{'flags': 'crfail'} diff --git a/test/zdtm/static/sk-unix-listen03.c b/test/zdtm/static/sk-unix-listen03.c new file mode 120000 index 000000000..1211f4666 --- /dev/null +++ b/test/zdtm/static/sk-unix-listen03.c @@ -0,0 +1 @@ +sk-unix-listen01.c \ No newline at end of file diff --git a/test/zdtm/static/sk-unix-listen03.desc b/test/zdtm/static/sk-unix-listen03.desc new file mode 100644 index 000000000..ded89879a --- /dev/null +++ b/test/zdtm/static/sk-unix-listen03.desc @@ -0,0 +1 @@ +{'flags': 'crfail'} diff --git a/test/zdtm/static/sk-unix-listen04.c b/test/zdtm/static/sk-unix-listen04.c new file mode 120000 index 000000000..1211f4666 --- /dev/null +++ b/test/zdtm/static/sk-unix-listen04.c @@ -0,0 +1 @@ +sk-unix-listen01.c \ No newline at end of file diff --git a/test/zdtm/static/sk-unix-listen04.desc b/test/zdtm/static/sk-unix-listen04.desc new file mode 100644 index 000000000..ded89879a --- /dev/null +++ b/test/zdtm/static/sk-unix-listen04.desc @@ -0,0 +1 @@ +{'flags': 'crfail'} From f78d3d821c07b91a0fdc1e371bbc36eafb56e003 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Wed, 24 Aug 2022 20:07:25 -0700 Subject: [PATCH 046/775] gitignore: Ignore top-level build dir only The entry "build/" will ignore any directory named "build" at any level of 
the source tree, including our scripts/build directory. We only want to ignore the top-level build directory created by `make install`. As the git manpage suggests, entries with slashes at the start or in the middle will only match at the same level as the .gitignore, hence use build/** instead. Signed-off-by: Younes Manton --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d5135f5f8..23894d631 100644 --- a/.gitignore +++ b/.gitignore @@ -42,4 +42,4 @@ lib/.crit-setup.files compel/include/asm include/common/asm include/common/config.h -build/ +build/** From 39b3de60b615404c47594f5e42eacb2e31405f3f Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Wed, 24 Aug 2022 19:48:52 -0700 Subject: [PATCH 047/775] ci: Rename openj9 Dockerfiles to hotspot We used to pull AdoptOpenJDK's OpenJ9 builds but switched to Eclipse Temurin, which uses the HotSpot VM instead of OpenJ9. Rename the corresponding Dockerfiles to hotspot. Signed-off-by: Younes Manton --- .../{Dockerfile.openj9-alpine => Dockerfile.hotspot-alpine} | 5 +---- .../{Dockerfile.openj9-ubuntu => Dockerfile.hotspot-ubuntu} | 0 2 files changed, 1 insertion(+), 4 deletions(-) rename scripts/build/{Dockerfile.openj9-alpine => Dockerfile.hotspot-alpine} (69%) rename scripts/build/{Dockerfile.openj9-ubuntu => Dockerfile.hotspot-ubuntu} (100%) diff --git a/scripts/build/Dockerfile.openj9-alpine b/scripts/build/Dockerfile.hotspot-alpine similarity index 69% rename from scripts/build/Dockerfile.openj9-alpine rename to scripts/build/Dockerfile.hotspot-alpine index f92011283..d6e6e5130 100644 --- a/scripts/build/Dockerfile.openj9-alpine +++ b/scripts/build/Dockerfile.hotspot-alpine @@ -1,6 +1,4 @@ -# FIXME: Replace with eclipse-temurin once Alpine support has been added. 
-# https://github.com/adoptium/containers/pull/60 -FROM adoptopenjdk/openjdk8-openj9:alpine +FROM docker.io/library/eclipse-temurin:8-alpine ARG CC=gcc RUN apk update && apk add \ @@ -29,4 +27,3 @@ WORKDIR /criu RUN make mrproper && make -j $(nproc) CC="$CC" ENTRYPOINT mvn -q -f test/javaTests/pom.xml test - diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu similarity index 100% rename from scripts/build/Dockerfile.openj9-ubuntu rename to scripts/build/Dockerfile.hotspot-ubuntu From 0178f2f990e899f5e2a13447554193017cee988b Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Wed, 24 Aug 2022 20:09:50 -0700 Subject: [PATCH 048/775] ci: Add Dockerfile for openj9 on Ubuntu Semeru builds (which use OpenJ9 instead of HotSpot) are the successors of AdoptOpenJDK's OpenJ9 builds. Signed-off-by: Younes Manton --- scripts/build/Dockerfile.openj9-ubuntu | 33 ++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 scripts/build/Dockerfile.openj9-ubuntu diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu new file mode 100644 index 000000000..2e35358ff --- /dev/null +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -0,0 +1,33 @@ +FROM docker.io/library/ibm-semeru-runtimes:open-8-jdk-focal +ARG CC=gcc + +COPY scripts/ci/apt-install /bin/apt-install + +RUN apt-install protobuf-c-compiler \ + libprotobuf-c-dev \ + libaio-dev \ + python3-future \ + libprotobuf-dev \ + protobuf-compiler \ + libcap-dev \ + libnl-3-dev \ + gdb \ + bash \ + python3-protobuf \ + python3-yaml \ + libnet-dev \ + libnl-route-3-dev \ + libbsd-dev \ + make \ + git \ + pkg-config \ + iptables \ + gcc \ + maven + +COPY . 
/criu +WORKDIR /criu + +RUN make mrproper && make -j $(nproc) CC="$CC" + +ENTRYPOINT mvn -q -f test/javaTests/pom.xml test From 7bc24688d6759ff1b22b8629e4edfe0c6a32b16b Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Wed, 24 Aug 2022 21:17:06 -0700 Subject: [PATCH 049/775] ci: Clean up and improve Java testing This patch changes top-level OpenJ9 filename and data references to Java to make them generic and launches tests against both HotSpot and OpenJ9 JVMs. Signed-off-by: Younes Manton --- .../{openj9-test.yml => java-test.yml} | 6 ++--- scripts/ci/Makefile | 4 +-- scripts/ci/java-test.sh | 25 +++++++++++++++++++ scripts/ci/openj9-test.sh | 20 --------------- 4 files changed, 30 insertions(+), 25 deletions(-) rename .github/workflows/{openj9-test.yml => java-test.yml} (54%) create mode 100755 scripts/ci/java-test.sh delete mode 100755 scripts/ci/openj9-test.sh diff --git a/.github/workflows/openj9-test.yml b/.github/workflows/java-test.yml similarity index 54% rename from .github/workflows/openj9-test.yml rename to .github/workflows/java-test.yml index 1d7a1eb6b..211953495 100644 --- a/.github/workflows/openj9-test.yml +++ b/.github/workflows/java-test.yml @@ -1,4 +1,4 @@ -name: OpenJ9 Test +name: Java Test on: [push, pull_request] @@ -7,5 +7,5 @@ jobs: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - - name: Run OpenJ9 Test - run: sudo make -C scripts/ci openj9-test + - name: Run Java Test + run: sudo make -C scripts/ci java-test diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 120f561e4..3a1634fb8 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -85,8 +85,8 @@ podman-test: # overlayfs behaves differently on Ubuntu and breaks CRIU # https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1857257 # Switch to devicemapper -openj9-test: restart-docker - ./openj9-test.sh +java-test: restart-docker + ./java-test.sh setup-vagrant: ./vagrant.sh setup diff --git a/scripts/ci/java-test.sh b/scripts/ci/java-test.sh new file mode 100755 
index 000000000..7cf704f07 --- /dev/null +++ b/scripts/ci/java-test.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +cd ../.. || exit 1 + +failures="" + +docker build -t criu-openj9-ubuntu-test:latest -f scripts/build/Dockerfile.openj9-ubuntu . +if ! docker run --rm --privileged criu-openj9-ubuntu-test:latest; then + failures="$failures openj9-ubuntu" +fi + +docker build -t criu-hotspot-alpine-test:latest -f scripts/build/Dockerfile.hotspot-alpine . +if ! docker run --rm --privileged criu-hotspot-alpine-test:latest; then + failures="$failures hotspot-alpine" +fi + +docker build -t criu-hotspot-ubuntu-test:latest -f scripts/build/Dockerfile.hotspot-ubuntu . +if ! docker run --rm --privileged criu-hotspot-ubuntu-test:latest; then + failures="$failures hotspot-ubuntu" +fi + +if [ -n "$failures" ]; then + echo "Tests failed on $failures" + exit 1 +fi diff --git a/scripts/ci/openj9-test.sh b/scripts/ci/openj9-test.sh deleted file mode 100755 index b8c07f180..000000000 --- a/scripts/ci/openj9-test.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -cd ../.. || exit 1 - -failures="" - -docker build -t criu-openj9-ubuntu-test:latest -f scripts/build/Dockerfile.openj9-ubuntu . -if ! docker run --rm --privileged criu-openj9-ubuntu-test:latest; then - failures="$failures ubuntu" -fi - -docker build -t criu-openj9-alpine-test:latest -f scripts/build/Dockerfile.openj9-alpine . -if ! 
docker run --rm --privileged criu-openj9-alpine-test:latest; then - failures="$failures alpine" -fi - -if [ -n "$failures" ]; then - echo "Tests failed on $failures" - exit 1 -fi From 40e1aaf5639d0e4b970914d0b435e908b070450a Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 24 Aug 2022 21:20:30 +0200 Subject: [PATCH 050/775] mount: add definition for FSOPEN_CLOEXEC A recent change in glibc introduced `enum fsconfig_command` [1] and as a result the compilation of criu fails with the following errors In file included from criu/pie/util.c:3: /usr/include/sys/mount.h:240:6: error: redeclaration of 'enum fsconfig_command' 240 | enum fsconfig_command | ^~~~~~~~~~~~~~~~ In file included from /usr/include/sys/mount.h:32: criu/include/linux/mount.h:11:6: note: originally defined here 11 | enum fsconfig_command { | ^~~~~~~~~~~~~~~~ /usr/include/sys/mount.h:242:3: error: redeclaration of enumerator 'FSCONFIG_SET_FLAG' 242 | FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ | ^~~~~~~~~~~~~~~~~ criu/include/linux/mount.h:12:9: note: previous definition of 'FSCONFIG_SET_FLAG' with type 'enum fsconfig_command' 12 | FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ | ^~~~~~~~~~~~~~~~~ /usr/include/sys/mount.h:244:3: error: redeclaration of enumerator 'FSCONFIG_SET_STRING' 244 | FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ | ^~~~~~~~~~~~~~~~~~~ criu/include/linux/mount.h:14:9: note: previous definition of 'FSCONFIG_SET_STRING' with type 'enum fsconfig_command' 14 | FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ | ^~~~~~~~~~~~~~~~~~~ /usr/include/sys/mount.h:246:3: error: redeclaration of enumerator 'FSCONFIG_SET_BINARY' 246 | FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ | ^~~~~~~~~~~~~~~~~~~ criu/include/linux/mount.h:16:9: note: previous definition of 'FSCONFIG_SET_BINARY' with type 'enum fsconfig_command' 16 | FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a 
binary blob value */ | ^~~~~~~~~~~~~~~~~~~ /usr/include/sys/mount.h:248:3: error: redeclaration of enumerator 'FSCONFIG_SET_PATH' 248 | FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ | ^~~~~~~~~~~~~~~~~ criu/include/linux/mount.h:18:9: note: previous definition of 'FSCONFIG_SET_PATH' with type 'enum fsconfig_command' 18 | FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ | ^~~~~~~~~~~~~~~~~ /usr/include/sys/mount.h:250:3: error: redeclaration of enumerator 'FSCONFIG_SET_PATH_EMPTY' 250 | FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ | ^~~~~~~~~~~~~~~~~~~~~~~ criu/include/linux/mount.h:20:9: note: previous definition of 'FSCONFIG_SET_PATH_EMPTY' with type 'enum fsconfig_command' 20 | FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ | ^~~~~~~~~~~~~~~~~~~~~~~ /usr/include/sys/mount.h:252:3: error: redeclaration of enumerator 'FSCONFIG_SET_FD' 252 | FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ | ^~~~~~~~~~~~~~~ criu/include/linux/mount.h:22:9: note: previous definition of 'FSCONFIG_SET_FD' with type 'enum fsconfig_command' 22 | FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ | ^~~~~~~~~~~~~~~ /usr/include/sys/mount.h:254:3: error: redeclaration of enumerator 'FSCONFIG_CMD_CREATE' 254 | FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ | ^~~~~~~~~~~~~~~~~~~ criu/include/linux/mount.h:24:9: note: previous definition of 'FSCONFIG_CMD_CREATE' with type 'enum fsconfig_command' 24 | FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ | ^~~~~~~~~~~~~~~~~~~ /usr/include/sys/mount.h:256:3: error: redeclaration of enumerator 'FSCONFIG_CMD_RECONFIGURE' 256 | FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ | ^~~~~~~~~~~~~~~~~~~~~~~~ criu/include/linux/mount.h:26:9: note: previous definition of 'FSCONFIG_CMD_RECONFIGURE' with type 'enum fsconfig_command' 26 | 
FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ This patch adds definition for FSOPEN_CLOEXEC to solve this problem. In particular, sys/mount.h includes ifndef check for FSOPEN_CLOEXEC surrounding `enum fsconfig_command`. [1] https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=7eae6a91e9b1670330c9f15730082c91c0b1d570 Reported-by: Younes Manton (@ymanton) Signed-off-by: Radostin Stoyanov --- criu/include/linux/mount.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/criu/include/linux/mount.h b/criu/include/linux/mount.h index 0d55a588c..fefafa89e 100644 --- a/criu/include/linux/mount.h +++ b/criu/include/linux/mount.h @@ -6,7 +6,7 @@ /* Copied from /usr/include/sys/mount.h */ -#ifndef FSCONFIG_CMD_CREATE +#ifndef FSOPEN_CLOEXEC /* The type of fsconfig call made. */ enum fsconfig_command { FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ @@ -26,7 +26,13 @@ enum fsconfig_command { FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ #define FSCONFIG_CMD_RECONFIGURE FSCONFIG_CMD_RECONFIGURE }; -#endif // FSCONFIG_CMD_CREATE + +#endif // FSOPEN_CLOEXEC + +/* fsopen flags. With the redundant definition, we check if the kernel, + * glibc value and our value still match. + */ +#define FSOPEN_CLOEXEC 0x00000001 #ifndef MS_MGC_VAL /* Magic mount flag number. Has to be or-ed to the flag values. */ From 9e91e62a7c16348793b7d402eae749a72e1f541e Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 22 Jun 2022 12:12:07 +0300 Subject: [PATCH 051/775] criu-ns: capture controlling tty When we are restoring in new pidns we specifically do setsid() from criu-ns init so that sids of restored tasks are non-zero in this pidns and on next dump CRIU would not have problems with zero sids, see [1]. 
But after this CRIU tries to inherit and set up a tty for the restored process, and it fails to set its process group via TIOCSPGRP to be a foreground group for its tty, because tty already is a controlling tty for other session (which we had before setsid). So to make it restore we need to reset tty to be a controlling tty of criu-ns init via TIOCSCTTY before calling criu. Else when restoring first time via criu-ns (from criu-ns dump) we get: Error (criu/tty.c:689): tty: Failed to set group 40816 on 0: Inappropriate ioctl for device https://github.com/checkpoint-restore/criu/issues/232 [1] v2: add why and what comment in code, set controlling tty only for --shell-job and fail if stdin is not a tty. Fixes: #1893 Signed-off-by: Pavel Tikhomirov --- scripts/criu-ns | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/criu-ns b/scripts/criu-ns index 1217c3dcd..d51e7772c 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -4,6 +4,8 @@ import ctypes.util import errno import sys import os +import fcntl +import termios # constants for unshare CLONE_NEWNS = 0x00020000 @@ -124,6 +126,16 @@ def wrap_restore(): criu_pid = os.fork() if criu_pid == 0: os.setsid() + # Set stdin tty to be a controlling tty of our new session, this is + # required by --shell-job option, as for it CRIU would try to set a + # process group of restored root task to be a foreground group on the + # terminal. 
+ if '--shell-job' in restore_args or '-j' in restore_args: + if os.isatty(sys.stdin.fileno()): + fcntl.ioctl(sys.stdin.fileno(), termios.TIOCSCTTY, 1) + else: + raise OSError(errno.EINVAL, 'The stdin is not a tty for a --shell-job') + _mount_new_proc() run_criu(restore_args) From 2039d73200b661432b33a8ec49c82ad2b6532cc0 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 22 Jun 2022 13:09:20 +0300 Subject: [PATCH 052/775] files-reg: skip failed mount lookup for shell-job's tty When we restore a shell-job we would inherit tty-s, so even if we don't have a right mount for it in container on dump, on restore it should just be right. Else when dumping second time via criu-ns we get: (00.005678) Error (criu/files-reg.c:1710): Can't lookup mount=29 for fd=0 path=/dev/pts/20 Fixes: #1893 Signed-off-by: Pavel Tikhomirov --- criu/files-reg.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index c3761b5ed..2e3d57c5e 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -14,6 +14,8 @@ #include #include +#include "tty.h" + #ifndef SEEK_DATA #define SEEK_DATA 3 #define SEEK_HOLE 4 @@ -1689,6 +1691,7 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) int ret; FileEntry fe = FILE_ENTRY__INIT; RegFileEntry rfe = REG_FILE_ENTRY__INIT; + bool skip_for_shell_job = false; if (!p->link) { if (fill_fdlink(lfd, p, &_link)) @@ -1708,11 +1711,15 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) mi = lookup_mnt_id(p->mnt_id); if (mi == NULL) { - pr_err("Can't lookup mount=%d for fd=%d path=%s\n", p->mnt_id, p->fd, link->name + 1); - return -1; + if (opts.shell_job && is_tty(p->stat.st_rdev, p->stat.st_dev)) { + skip_for_shell_job = true; + } else { + pr_err("Can't lookup mount=%d for fd=%d path=%s\n", p->mnt_id, p->fd, link->name + 1); + return -1; + } } - if (mnt_is_overmounted(mi)) { + if (!skip_for_shell_job && mnt_is_overmounted(mi)) { pr_err("Open files on overmounted 
mounts are not supported yet\n"); return -1; } @@ -1732,7 +1739,7 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) return -1; } - if (check_path_remap(link, p, lfd, id, mi->nsid)) + if (!skip_for_shell_job && check_path_remap(link, p, lfd, id, mi->nsid)) return -1; rfe.name = &link->name[1]; ext: From 86ac0f05ea271a7943a69ad081b291bfa033454e Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 12 Apr 2022 18:09:49 -0700 Subject: [PATCH 053/775] ci/gha/lint: install a recent shellcheck Instead of using shellcheck v0.7.2 from fedora repo, let's install the latest version (v0.8.0). This allows to remove some "shellcheck disable=..." annotations, and (I hope) better checking quality overall. While at it, remove findutils from dnf install as this package is already installed. Signed-off-by: Kir Kolyshkin --- .github/workflows/lint.yml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index d32403d05..3d42f3dcf 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -9,7 +9,18 @@ jobs: image: registry.fedoraproject.org/fedora:latest steps: - name: Install tools - run: sudo dnf -y install git make python3-flake8 ShellCheck clang-tools-extra which findutils codespell git-clang-format + run: sudo dnf -y install git make python3-flake8 xz clang-tools-extra which codespell git-clang-format + + # TODO: remove this and use ShellCheck from repo once F37 with ShellCheck 0.8.0 is out. 
+ - name: install shellcheck + env: + VERSION: v0.8.0 + BASEURL: https://github.com/koalaman/shellcheck/releases/download + SHA256: f4bce23c11c3919c1b20bcb0f206f6b44c44e26f2bc95f8aa708716095fa0651 + run: | + curl -sSfL --retry 5 $BASEURL/$VERSION/shellcheck-$VERSION.linux.x86_64.tar.xz | + tar xfJ - -C /usr/local/bin --strip 1 shellcheck-$VERSION/shellcheck + sha256sum --strict --check - <<<"$SHA256 /usr/local/bin/shellcheck" - uses: actions/checkout@v2 From 968eec0d591e5a7a00078c4ce5ac1c73e29cc68a Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 12 Apr 2022 16:06:18 -0700 Subject: [PATCH 054/775] scripts/ci/apt-install: fix (not ignore) shellcheck warning It is ok to quote $@, as it expands to "$1" "$2" ... Signed-off-by: Kir Kolyshkin --- scripts/ci/apt-install | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/ci/apt-install b/scripts/ci/apt-install index 5a790901a..45aca13f4 100755 --- a/scripts/ci/apt-install +++ b/scripts/ci/apt-install @@ -15,8 +15,7 @@ while true; do if [ "${install_retry_counter}" -gt "${max_apt_retries}" ]; then exit 1 fi - # shellcheck disable=SC2068 - apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-install-recommends $@ && break + apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-install-recommends "$@" && break # In case it is a network error let's wait a bit. echo "Retrying attempt ${install_retry_counter}" From 9d2948b239263b995e61fe56a1f4294b5015115d Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 12 Apr 2022 16:07:56 -0700 Subject: [PATCH 055/775] scripts/ci/asan.sh: fix, not ignore, shellcheck warning We can use globstar bash feature instead of find in this case. 
Signed-off-by: Kir Kolyshkin --- scripts/ci/asan.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/ci/asan.sh b/scripts/ci/asan.sh index 8113b9b19..deeeca0b9 100755 --- a/scripts/ci/asan.sh +++ b/scripts/ci/asan.sh @@ -1,7 +1,5 @@ #!/bin/bash -# shellcheck disable=2044 - set -x cat /proc/self/mountinfo @@ -13,7 +11,8 @@ chmod 0777 test/zdtm/static ./test/zdtm.py run -a --keep-going -k always --parallel 4 -x zdtm/static/rtc "$@" ret=$? -for i in $(find / -name 'asan.log*'); do +shopt -s globstar nullglob +for i in /**/asan.log*; do echo "$i" echo ======================================== cat "$i" From b1fb9f2f0b992b777e8aeb69638bfcc99e1670e9 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 12 Apr 2022 16:40:31 -0700 Subject: [PATCH 056/775] Fix, not ignore, shellcheck SC1091 warnings This is easy to fix (but we have to specify -x). Signed-off-by: Kir Kolyshkin --- Makefile | 8 ++++---- scripts/ci/docker-test.sh | 3 ++- test/others/config-file/run.sh | 2 +- test/others/crit/test.sh | 3 ++- test/others/criu-coredump/test.sh | 2 +- test/others/libcriu/run.sh | 2 +- 6 files changed, 11 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 436ebfd0d..537720339 100644 --- a/Makefile +++ b/Makefile @@ -423,10 +423,10 @@ lint: shellcheck --version shellcheck scripts/*.sh shellcheck scripts/ci/*.sh scripts/ci/apt-install - shellcheck test/others/crit/*.sh - shellcheck test/others/libcriu/*.sh - shellcheck test/others/crit/*.sh test/others/criu-coredump/*.sh - shellcheck test/others/config-file/*.sh + shellcheck -x test/others/crit/*.sh + shellcheck -x test/others/libcriu/*.sh + shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh + shellcheck -x test/others/config-file/*.sh codespell # Do not append \n to pr_perror or fail ! 
git --no-pager grep -E '^\s*\<(pr_perror|fail)\>.*\\n"' diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index ca93ed77c..eacfe136e 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -1,6 +1,6 @@ #!/bin/bash -# shellcheck disable=SC1091,SC2015 +# shellcheck disable=SC2015 set -x -e -o pipefail @@ -19,6 +19,7 @@ add-apt-repository \ ./apt-install docker-ce +# shellcheck source=/dev/null . /etc/lsb-release # docker checkpoint and restore is an experimental feature diff --git a/test/others/config-file/run.sh b/test/others/config-file/run.sh index 92195883e..26b835b45 100755 --- a/test/others/config-file/run.sh +++ b/test/others/config-file/run.sh @@ -11,7 +11,7 @@ set -xbm -#shellcheck disable=SC1091 +# shellcheck source=test/others/env.sh source ../env.sh if [ ! -d /etc/criu ]; then diff --git a/test/others/crit/test.sh b/test/others/crit/test.sh index 0d38043d7..7db88e0a9 100755 --- a/test/others/crit/test.sh +++ b/test/others/crit/test.sh @@ -1,8 +1,9 @@ #!/bin/bash -# shellcheck disable=SC1091,SC2002 +# shellcheck disable=SC2002 set -x +# shellcheck source=test/others/env.sh source ../env.sh images_list="" diff --git a/test/others/criu-coredump/test.sh b/test/others/criu-coredump/test.sh index dd774e298..9b6e56475 100755 --- a/test/others/criu-coredump/test.sh +++ b/test/others/criu-coredump/test.sh @@ -1,7 +1,7 @@ #!/bin/bash set -x -# shellcheck disable=SC1091 +# shellcheck source=test/others/env.sh source ../env.sh || exit 1 function gen_imgs { diff --git a/test/others/libcriu/run.sh b/test/others/libcriu/run.sh index 77bdfb87e..f7d363aab 100755 --- a/test/others/libcriu/run.sh +++ b/test/others/libcriu/run.sh @@ -9,7 +9,7 @@ TEST_LOG="${TEST_DIR}/test.log" DUMP_LOG="${TEST_DIR}/dump.log" RESTORE_LOG="${TEST_DIR}/restore.log" -# shellcheck disable=1091 +# shellcheck source=test/others/env.sh source "${MAIN_DIR}/../env.sh" || exit 1 echo "== Clean" From aeb6961f3d58e435f546cd8c7282c83fd6849ac2 Mon Sep 17 00:00:00 2001 
From: Kir Kolyshkin Date: Tue, 12 Apr 2022 17:07:43 -0700 Subject: [PATCH 057/775] scripts/ci/run-ci-tests: use bash arrays This is a preferred way of fixing SC2086 shellcheck warning. Note that since ZDTM_OPTS is passed as a string (via make or docker), we are converting it to an array using read -a. Remove all "shellcheck disable=SC2086" annotations. Signed-off-by: Kir Kolyshkin --- scripts/ci/run-ci-tests.sh | 55 +++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 3760a65e3..1b761ea56 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -1,14 +1,17 @@ #!/bin/bash set -x -e -CI_PKGS="protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev +CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor libnl-route-3-dev time flake8 libbsd-dev python3-yaml libperl-dev pkg-config python3-future python3-protobuf - python3-junit.xml" + python3-junit.xml) -X86_64_PKGS="gcc-multilib" +X86_64_PKGS=(gcc-multilib) + +# Convert from string to array. +IFS=" " read -r -a ZDTM_OPTS <<< "$ZDTM_OPTS" UNAME_M=$(uname -m) @@ -46,14 +49,14 @@ ci_prep () { else CC=gcc fi - CI_PKGS="$CI_PKGS $CC" + CI_PKGS+=("$CC") # Do not install x86_64 specific packages on other architectures if [ "$UNAME_M" = "x86_64" ]; then - CI_PKGS="$CI_PKGS $X86_64_PKGS" + CI_PKGS+=("${X86_64_PKGS[@]}") fi - scripts/ci/apt-install "$CI_PKGS" + scripts/ci/apt-install "${CI_PKGS[@]}" chmod a+x "$HOME" # zdtm uses an unversioned python binary to run the tests. @@ -69,9 +72,8 @@ test_stream() { # restorer and eventually close the page read. However, image-streamer expects the # whole image to be read and the image is not reopened, sent twice. These MAP_HUGETLB # test cases will result in EPIPE error at the moment. 
- STREAM_TEST_EXCLUDE="-x maps09 -x maps10" - # shellcheck disable=SC2086 - ./test/zdtm.py run --stream -p 2 --keep-going -a $STREAM_TEST_EXCLUDE $ZDTM_OPTS + STREAM_TEST_EXCLUDE=(-x maps09 -x maps10) + ./test/zdtm.py run --stream -p 2 --keep-going -a "${STREAM_TEST_EXCLUDE[@]}" "${ZDTM_OPTS[@]}" } print_header() { @@ -160,21 +162,20 @@ if [ "${COMPAT_TEST}x" = "yx" ] ; then # for 32-bit tests. A better way would involve launching docker.. # But it would require making zdtm.py aware of docker and launching # tests inside the CT. - INCOMPATIBLE_LIBS="libaio-dev libcap-dev libnl-3-dev libnl-route-3-dev" - IA32_PKGS="" + INCOMPATIBLE_LIBS=(libaio-dev libcap-dev libnl-3-dev libnl-route-3-dev) + IA32_PKGS=() REFUGE=64-refuge mkdir "$REFUGE" - for i in $INCOMPATIBLE_LIBS ; do + for i in "${INCOMPATIBLE_LIBS[@]}" ; do for j in $(dpkg --listfiles "$i" | grep '\.so$') ; do cp "$j" "$REFUGE/" done - IA32_PKGS="$IA32_PKGS $i:i386" + IA32_PKGS+=("$i:i386") done - # shellcheck disable=SC2086 - apt-get remove $INCOMPATIBLE_LIBS + apt-get remove "${INCOMPATIBLE_LIBS[@]}" dpkg --add-architecture i386 - scripts/ci/apt-install "$IA32_PKGS" + scripts/ci/apt-install "${IA32_PKGS[@]}" mkdir -p /usr/lib/x86_64-linux-gnu/ mv "$REFUGE"/* /usr/lib/x86_64-linux-gnu/ fi @@ -211,15 +212,12 @@ if [ "${STREAM_TEST}" = "1" ]; then exit 0 fi -# shellcheck disable=SC2086 -./test/zdtm.py run -a -p 2 --keep-going $ZDTM_OPTS +./test/zdtm.py run -a -p 2 --keep-going "${ZDTM_OPTS[@]}" if criu/criu check --feature move_mount_set_group; then - # shellcheck disable=SC2086 - ./test/zdtm.py run -a -p 2 --mntns-compat-mode --keep-going $ZDTM_OPTS + ./test/zdtm.py run -a -p 2 --mntns-compat-mode --keep-going "${ZDTM_OPTS[@]}" fi -# shellcheck disable=SC2086 -./test/zdtm.py run -a -p 2 --keep-going --criu-config $ZDTM_OPTS +./test/zdtm.py run -a -p 2 --keep-going --criu-config "${ZDTM_OPTS[@]}" # Newer kernels are blocking access to userfaultfd: # uffd: Set unprivileged_userfaultfd sysctl knob to 1 if kernel 
faults must be handled without obtaining CAP_SYS_PTRACE capability @@ -227,17 +225,14 @@ if [ -e /proc/sys/vm/unprivileged_userfaultfd ]; then echo 1 > /proc/sys/vm/unprivileged_userfaultfd fi -LAZY_EXCLUDE="-x maps04 -x cmdlinenv00 -x maps007" +LAZY_EXCLUDE=(-x maps04 -x cmdlinenv00 -x maps007) LAZY_TESTS='.*(maps0|uffd-events|lazy-thp|futex|fork).*' -LAZY_OPTS="-p 2 -T $LAZY_TESTS $LAZY_EXCLUDE $ZDTM_OPTS" +LAZY_OPTS=(-p 2 -T "$LAZY_TESTS" "${LAZY_EXCLUDE[@]}" "${ZDTM_OPTS[@]}") -# shellcheck disable=SC2086 -./test/zdtm.py run $LAZY_OPTS --lazy-pages -# shellcheck disable=SC2086 -./test/zdtm.py run $LAZY_OPTS --remote-lazy-pages -# shellcheck disable=SC2086 -./test/zdtm.py run $LAZY_OPTS --remote-lazy-pages --tls +./test/zdtm.py run "${LAZY_OPTS[@]}" --lazy-pages +./test/zdtm.py run "${LAZY_OPTS[@]}" --remote-lazy-pages +./test/zdtm.py run "${LAZY_OPTS[@]}" --remote-lazy-pages --tls bash -x ./test/jenkins/criu-fault.sh if [ "$UNAME_M" == "x86_64" ]; then From 75b859f23f8646c9ac242d7ac9c90b6b65fe9745 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 12 Apr 2022 17:10:27 -0700 Subject: [PATCH 058/775] scripts/ci: rm shellcheck disable annotations Those are no longer needed with shellcheck 0.8.0. 
Signed-off-by: Kir Kolyshkin --- scripts/ci/docker-test.sh | 2 -- scripts/ci/podman-test.sh | 1 - 2 files changed, 3 deletions(-) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index eacfe136e..beb7da6da 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -1,7 +1,5 @@ #!/bin/bash -# shellcheck disable=SC2015 - set -x -e -o pipefail ./apt-install \ diff --git a/scripts/ci/podman-test.sh b/scripts/ci/podman-test.sh index 973d2d722..e08fdf3bc 100755 --- a/scripts/ci/podman-test.sh +++ b/scripts/ci/podman-test.sh @@ -27,7 +27,6 @@ rm -rf "${tmp_dir}" podman info -# shellcheck disable=SC2016 podman run --name cr -d docker.io/library/alpine /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done' sleep 1 From 0a872ccf16e09afd89371d5c63666eba1f7859db Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 12 Apr 2022 17:49:50 -0700 Subject: [PATCH 059/775] scripts/protobuf-gen.sh: fix (not ignore) shellcheck warnings This basically replaces for x in $(sed ...); do with sed ... | while IFS= read -r x; do The only caveat is, sed program was amended to remove empty lines (there was one right above the PB_AUTOGEN_STOP). 
Signed-off-by: Kir Kolyshkin --- scripts/protobuf-gen.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/protobuf-gen.sh b/scripts/protobuf-gen.sh index 0c738f13a..25d2feaeb 100644 --- a/scripts/protobuf-gen.sh +++ b/scripts/protobuf-gen.sh @@ -1,15 +1,15 @@ #!/bin/bash -# shellcheck disable=SC2013,SC1004 - TR="y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/" -for x in $(sed -n '/PB_AUTOGEN_START/,/PB_AUTOGEN_STOP/ { +sed -n '/PB_AUTOGEN_START/,/PB_AUTOGEN_STOP/ { /PB_AUTOGEN_ST/d; + /^[ \t]*$/d; s/,.*$//; s/\tPB_//; p; - }' criu/include/protobuf-desc.h); do + }' criu/include/protobuf-desc.h | \ +while IFS= read -r x; do x_la=$(echo "$x" | sed $TR) x_uf=$(echo "$x" | sed -nr 's/^./&#\\\ /; From 16f1c147c89f4ce584dcc80c3b80259e5fc54925 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 12 Apr 2022 18:03:29 -0700 Subject: [PATCH 060/775] test/others/crit/test.sh: use bash array In fact an array (aptly named array) is already used in run_test2, so let's just make it an array right from the start. While at it, remove ls invocation. 
Signed-off-by: Kir Kolyshkin --- test/others/crit/test.sh | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/test/others/crit/test.sh b/test/others/crit/test.sh index 7db88e0a9..5d13066e7 100755 --- a/test/others/crit/test.sh +++ b/test/others/crit/test.sh @@ -6,7 +6,7 @@ set -x # shellcheck source=test/others/env.sh source ../env.sh -images_list="" +images_list=() function gen_imgs { PID=$(../loop) @@ -17,15 +17,15 @@ function gen_imgs { exit 1 fi - images_list=$(ls -1 ./*.img) - if [ -z "$images_list" ]; then + images_list=(./*.img) + if [ "${#images_list[@]}" -eq 0 ]; then echo "Failed to generate images" exit 1 fi } function run_test1 { - for x in $images_list + for x in "${images_list[@]}" do echo "=== $x" if [[ $x == *pages* ]]; then @@ -46,9 +46,7 @@ function run_test1 { function run_test2 { - mapfile -t array <<< "$images_list" - - PROTO_IN=${array[0]} + PROTO_IN="${images_list[0]}" JSON_IN=$(mktemp -p ./ tmp.XXXXXXXXXX.json) OUT=$(mktemp -p ./ tmp.XXXXXXXXXX.log) From debc9c16cc8ed4518bdfa99a7106677a8ee73a72 Mon Sep 17 00:00:00 2001 From: Liu Hua Date: Wed, 14 Sep 2022 15:42:07 +0800 Subject: [PATCH 061/775] seize: do not overwrite exit code from failpath Signed-off-by: Liu Hua --- criu/seize.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index 1333d6db9..f2af12a0b 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -535,8 +535,10 @@ static int freeze_processes(void) } err: - if (exit_code == 0 || origin_freezer_state == THAWED) - exit_code = freezer_write_state(fd, THAWED); + if (exit_code == 0 || origin_freezer_state == THAWED) { + if (freezer_write_state(fd, THAWED)) + exit_code = -1; + } if (close(fd)) { pr_perror("Unable to thaw tasks"); From 461fa7271584596bcc074abdb3c087c802d992c0 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Fri, 2 Sep 2022 07:01:20 -0700 Subject: [PATCH 062/775] compel: Add APIs to facilitate testing Starting the daemon is the first time we run 
code in the victim using the parasite stack. It's useful for testing to be able to infect the victim without starting the daemon so that we can inspect the victim's state, set up stack guards, and so on before stack-related corruption can happen. Add compel_infect_no_daemon() to infect the victim but not start the daemon and compel_start_daemon() to start the daemon after the victim is infected. Add compel_get_stack() to get the victim's main and thread parasite stacks. Signed-off-by: Younes Manton --- compel/include/uapi/infect.h | 5 +++++ compel/src/lib/infect.c | 29 +++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 19d4da2b1..3bd36dda1 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -41,9 +41,12 @@ struct parasite_thread_ctl; extern struct parasite_ctl __must_check *compel_prepare(int pid); extern struct parasite_ctl __must_check *compel_prepare_noctx(int pid); extern int __must_check compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size); +extern int __must_check compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, + unsigned long args_size); extern struct parasite_thread_ctl __must_check *compel_prepare_thread(struct parasite_ctl *ctl, int pid); extern void compel_release_thread(struct parasite_thread_ctl *); +extern int __must_check compel_start_daemon(struct parasite_ctl *ctl); extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl); extern int __must_check compel_cure_remote(struct parasite_ctl *ctl); extern int __must_check compel_cure_local(struct parasite_ctl *ctl); @@ -177,4 +180,6 @@ extern uint64_t compel_get_thread_ip(struct parasite_thread_ctl *tctl); void compel_set_leader_ip(struct parasite_ctl *ctl, uint64_t v); void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v); +extern void compel_get_stack(struct parasite_ctl 
*ctl, void **rstack, void **r_thread_stack); + #endif diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 6413a1860..5aab7aa3e 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -967,7 +967,7 @@ static int compel_map_exchange(struct parasite_ctl *ctl, unsigned long size) return ret; } -int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) +int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) { int ret; unsigned long p, map_exchange_size, parasite_size = 0; @@ -1079,15 +1079,23 @@ int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned l goto err; } - if (parasite_start_daemon(ctl)) - goto err; - return 0; err: return -1; } +int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) +{ + if (compel_infect_no_daemon(ctl, nr_threads, args_size)) + return -1; + + if (parasite_start_daemon(ctl)) + return -1; + + return 0; +} + struct parasite_thread_ctl *compel_prepare_thread(struct parasite_ctl *ctl, int pid) { struct parasite_thread_ctl *tctl; @@ -1427,6 +1435,11 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) return 0; } +int compel_start_daemon(struct parasite_ctl *ctl) +{ + return parasite_start_daemon(ctl); +} + int compel_stop_daemon(struct parasite_ctl *ctl) { if (ctl->daemonized) { @@ -1772,3 +1785,11 @@ void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v) { SET_REG_IP(tctl->th.regs, v); } + +void compel_get_stack(struct parasite_ctl *ctl, void **rstack, void **r_thread_stack) +{ + if (rstack) + *rstack = ctl->rstack; + if (r_thread_stack) + *r_thread_stack = ctl->r_thread_stack; +} From 556ab0deaf525f3d2f881fad4c366e3bb5b5bd1f Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Tue, 27 Sep 2022 07:10:03 -0700 Subject: [PATCH 063/775] compel: Fix infect test to not override failures Signed-off-by: Younes Manton return zero on chk success 
Signed-off-by: Pavel Tikhomirov Co-authored-by: Pavel Tikhomirov --- compel/test/infect/spy.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/compel/test/infect/spy.c b/compel/test/infect/spy.c index e7273b446..b10db4d47 100644 --- a/compel/test/infect/spy.c +++ b/compel/test/infect/spy.c @@ -94,15 +94,15 @@ static inline int chk(int fd, int val) int v = 0; if (read(fd, &v, sizeof(v)) != sizeof(v)) - return 0; + return 1; printf("%d, want %d\n", v, val); - return v == val; + return v != val; } int main(int argc, char **argv) { - int p_in[2], p_out[2], p_err[2], pid, i, pass = 1; + int p_in[2], p_out[2], p_err[2], pid, i, err = 0; /* * Prepare IO-s and fork the victim binary @@ -142,9 +142,11 @@ int main(int argc, char **argv) return 1; printf("Checking the victim alive\n"); - pass = chk(p_out[0], 1); - pass = chk(p_out[0], 42); - if (!pass) + err = chk(p_out[0], 1); + if (err) + return 1; + err = chk(p_out[0], 42); + if (err) return 1; /* @@ -176,14 +178,14 @@ int main(int argc, char **argv) printf("Checking the result\n"); /* These two came from parasite */ - pass = chk(p_out[0], 138); - pass = chk(p_out[0], 403); + err = chk(p_out[0], 138); + err |= chk(p_out[0], 403); /* These two came from post-infect */ - pass = chk(p_out[0], 1234); - pass = chk(p_out[0], 4096); + err |= chk(p_out[0], 1234); + err |= chk(p_out[0], 4096); - if (pass) + if (!err) printf("All OK\n"); else printf("Something went WRONG\n"); From 17ec53913254ca5fc10496ca755a8b7a186be035 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Tue, 30 Aug 2022 09:56:42 -0700 Subject: [PATCH 064/775] compel: Add test to check parasite stack setup Some ABIs allow functions to store data in caller frame, which means that we have to allocate an initial stack frame before executing code on the parasite stack. This test saves the contents of writable memory that follows the stack after the victim has been infected but before we start using the parasite stack. 
It later checks that the saved data matches the current contents of the two memory areas. This is done while the victim is halted so we expect a match unless executing parasite code caused memory corruption. The test doesn't detect cases where we corrupted memory by writing the same value. Signed-off-by: Younes Manton --- compel/test/Makefile | 8 +- compel/test/stack/.gitignore | 4 + compel/test/stack/Makefile | 32 +++ compel/test/stack/parasite.c | 38 ++++ compel/test/stack/spy.c | 405 ++++++++++++++++++++++++++++++++++ compel/test/stack/victim.c | 16 ++ test/zdtm/transition/Makefile | 1 + test/zdtm/transition/stack.c | 16 ++ 8 files changed, 518 insertions(+), 2 deletions(-) create mode 100644 compel/test/stack/.gitignore create mode 100644 compel/test/stack/Makefile create mode 100644 compel/test/stack/parasite.c create mode 100644 compel/test/stack/spy.c create mode 100644 compel/test/stack/victim.c create mode 100644 test/zdtm/transition/stack.c diff --git a/compel/test/Makefile b/compel/test/Makefile index 63fb76f80..f46a821ee 100644 --- a/compel/test/Makefile +++ b/compel/test/Makefile @@ -1,4 +1,4 @@ -all: fdspy infect rsys +all: fdspy infect rsys stack fdspy: $(Q) $(MAKE) -C fdspy @@ -10,8 +10,12 @@ infect: $(Q) $(MAKE) -C infect run .PHONY: infect - rsys: $(Q) $(MAKE) -C rsys $(Q) $(MAKE) -C rsys run .PHONY: rsys + +stack: + $(Q) $(MAKE) -C stack + $(Q) $(MAKE) -C stack run +.PHONY: stack diff --git a/compel/test/stack/.gitignore b/compel/test/stack/.gitignore new file mode 100644 index 000000000..0a554758d --- /dev/null +++ b/compel/test/stack/.gitignore @@ -0,0 +1,4 @@ +parasite.h +parasite.po +spy +victim diff --git a/compel/test/stack/Makefile b/compel/test/stack/Makefile new file mode 100644 index 000000000..bacfad962 --- /dev/null +++ b/compel/test/stack/Makefile @@ -0,0 +1,32 @@ +CC := gcc +CFLAGS ?= -O2 -g -Wall -Werror + +COMPEL := ../../../compel/compel-host + +all: victim spy + +run: + ./spy +.PHONY: run + +clean: + rm -f victim + rm -f spy + 
rm -f parasite.h + rm -f parasite.po + rm -f parasite.o + +victim: victim.c + $(CC) $(CFLAGS) -o $@ $^ + +spy: spy.c parasite.h + $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) + +parasite.h: parasite.po + $(COMPEL) hgen -o $@ -f $< + +parasite.po: parasite.o + ld $(shell $(COMPEL) ldflags) -o $@ $^ $(shell $(COMPEL) plugins) + +parasite.o: parasite.c + $(CC) $(CFLAGS) -c $(shell $(COMPEL) cflags) -o $@ $^ diff --git a/compel/test/stack/parasite.c b/compel/test/stack/parasite.c new file mode 100644 index 000000000..ad13bd25d --- /dev/null +++ b/compel/test/stack/parasite.c @@ -0,0 +1,38 @@ +#include + +#include +#include + +/* + * Stubs for std compel plugin. + */ +int parasite_trap_cmd(int cmd, void *args) +{ + return 0; +} +void parasite_cleanup(void) +{ +} + +#define PARASITE_CMD_INC PARASITE_USER_CMDS +#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 + +int parasite_daemon_cmd(int cmd, void *args) +{ + int v; + + switch (cmd) { + case PARASITE_CMD_INC: + v = (*(int *)args) + 1; + break; + case PARASITE_CMD_DEC: + v = (*(int *)args) - 1; + break; + default: + v = -1; + break; + } + + sys_write(1, &v, sizeof(int)); + return 0; +} diff --git a/compel/test/stack/spy.c b/compel/test/stack/spy.c new file mode 100644 index 000000000..9b7c9a7f0 --- /dev/null +++ b/compel/test/stack/spy.c @@ -0,0 +1,405 @@ +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "parasite.h" + +#define PARASITE_CMD_INC PARASITE_USER_CMDS +#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 + +#define err_and_ret(msg) \ + do { \ + fprintf(stderr, msg); \ + return -1; \ + } while (0) + +void *saved_data = NULL; + +#define SAVED_DATA_MAX page_size() + +void cleanup_saved_data(void) +{ + free(saved_data); +} + +static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) +{ + printf("\tLC%u: ", lvl); + vprintf(fmt, parms); +} + +static void *get_parasite_rstack_start(struct parasite_ctl 
*ctl) +{ + void *rstack, *r_thread_stack, *rstack_start; + + compel_get_stack(ctl, &rstack, &r_thread_stack); + + rstack_start = rstack; + if (r_thread_stack != NULL && r_thread_stack < rstack_start) + rstack_start = r_thread_stack; + + return rstack_start; +} + +static int page_writable(struct parasite_ctl *ctl, int pid, void *page) +{ + FILE *maps; + size_t maps_line_len = 0; + char *maps_line = NULL; + char victim_maps_path[6 + 11 + 5 + 1]; + int written; + int ret = 0; + + if (((uintptr_t)page & (page_size() - 1)) != 0) { + fprintf(stderr, "Page address not aligned\n"); + ret = -1; + goto done; + } + + written = snprintf(victim_maps_path, sizeof(victim_maps_path), "/proc/%d/maps", pid); + if (written < 0 || written >= sizeof(victim_maps_path)) { + fprintf(stderr, "Failed to create path string to victim's /proc/%d/maps file\n", pid); + ret = -1; + goto done; + } + + maps = fopen(victim_maps_path, "r"); + if (maps == NULL) { + perror("Can't open victim's /proc/$pid/maps"); + ret = -1; + goto done; + } + + while (getline(&maps_line, &maps_line_len, maps) != -1) { + unsigned long vmstart, vmend; + char r, w; + + if (sscanf(maps_line, "%lx-%lx %c%c", &vmstart, &vmend, &r, &w) < 4) { + fprintf(stderr, "Can't parse victim's /proc/%d/maps; line: %s\n", pid, maps_line); + ret = -1; + goto free_linebuf; + } + + if (page >= (void *)vmstart && page < (void *)vmend) { + if (w == 'w') { + if (r != 'r') { + fprintf(stderr, "Expecting writable memory to also be readable"); + ret = -1; + goto free_linebuf; + } + ret = 1; + } + break; + } + } + + if (errno) { + perror("Can't read victim's /proc/$pid/maps"); + ret = -1; + } + +free_linebuf: + free(maps_line); + fclose(maps); +done: + return ret; +} + +static void *read_proc_mem(int pid, void *offset, size_t len) +{ + char victim_mem_path[6 + 11 + 4 + 1]; + int written; + int fd; + void *data; + ssize_t mem_read; + + written = snprintf(victim_mem_path, sizeof(victim_mem_path), "/proc/%d/mem", pid); + if (written < 0 || written >= 
sizeof(victim_mem_path)) { + fprintf(stderr, "Failed to create path string to victim's /proc/%d/mem file\n", pid); + return NULL; + } + + fd = open(victim_mem_path, O_RDONLY); + if (fd < 0) { + perror("Failed to open victim's /proc/$pid/mem file"); + return NULL; + } + + data = malloc(len); + if (data == NULL) { + perror("Can't allocate memory to read victim's /proc/$pid/mem file"); + return NULL; + } + + mem_read = pread(fd, data, len, (off_t)offset); + if (mem_read == -1) { + perror("Failed to read victim's /proc/$pid/mem file"); + goto freebuf; + } + + return data; + +freebuf: + free(data); + return NULL; +} + +static int save_data_near_stack(struct parasite_ctl *ctl, int pid, void *stack, void **saved_data, + size_t *saved_data_size) +{ + size_t page_mask = page_size() - 1; + size_t saved_size = 0; + size_t stack_size_last_page = (uintptr_t)stack & page_mask; + void *next_page = stack; + + if (stack_size_last_page != 0) { + size_t empty_space_last_page = page_size() - stack_size_last_page; + saved_size = min(empty_space_last_page, (size_t)SAVED_DATA_MAX); + next_page += page_size() - stack_size_last_page; + } + + while (saved_size < SAVED_DATA_MAX && next_page != NULL) { + switch (page_writable(ctl, pid, next_page)) { + case 1: + saved_size = min((size_t)(saved_size + page_size()), (size_t)SAVED_DATA_MAX); + next_page += page_size(); + break; + case 0: + next_page = NULL; + break; + default: + return -1; + } + } + + if (saved_size > 0) { + void *sd; + + sd = read_proc_mem(pid, stack, saved_size); + if (sd == NULL) + return -1; + + *saved_data = sd; + } else { + *saved_data = NULL; + } + + *saved_data_size = saved_size; + + return 0; +} + +static int check_saved_data(struct parasite_ctl *ctl, int pid, void *stack, void *saved_data, size_t saved_data_size) +{ + if (saved_data != NULL) { + void *current_data; + + current_data = read_proc_mem(pid, stack, saved_data_size); + if (current_data == NULL) + return -1; + + if (memcmp(saved_data, current_data, 
saved_data_size) != 0) + return 1; + } + + return 0; +} + +static int do_infection(int pid) +{ + int state; + struct parasite_ctl *ctl; + struct infect_ctx *ictx; + int *arg; + void *stack; + size_t saved_data_size; + int saved_data_check; + + compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); + + printf("Stopping task\n"); + state = compel_stop_task(pid); + if (state < 0) + err_and_ret("Can't stop task\n"); + + printf("Preparing parasite ctl\n"); + ctl = compel_prepare(pid); + if (!ctl) + err_and_ret("Can't prepare for infection\n"); + + printf("Configuring contexts\n"); + + /* + * First -- the infection context. Most of the stuff + * is already filled by compel_prepare(), just set the + * log descriptor for parasite side, library cannot + * live w/o it. + */ + ictx = compel_infect_ctx(ctl); + ictx->log_fd = STDERR_FILENO; + + parasite_setup_c_header(ctl); + + printf("Infecting\n"); + if (compel_infect_no_daemon(ctl, 1, sizeof(int))) + err_and_ret("Can't infect victim\n"); + + if (atexit(cleanup_saved_data)) + err_and_ret("Can't register cleanup function with atexit\n"); + + stack = get_parasite_rstack_start(ctl); + if (save_data_near_stack(ctl, pid, stack, &saved_data, &saved_data_size)) + err_and_ret("Can't save data above stack\n"); + + if (compel_start_daemon(ctl)) + err_and_ret("Can't start daemon in victim\n"); + + /* + * Now get the area with arguments and run two + * commands one by one. 
+ */ + arg = compel_parasite_args(ctl, int); + + printf("Running cmd 1\n"); + *arg = 137; + if (compel_rpc_call_sync(PARASITE_CMD_INC, ctl)) + err_and_ret("Can't run parasite command 1\n"); + + printf("Running cmd 2\n"); + *arg = 404; + if (compel_rpc_call_sync(PARASITE_CMD_DEC, ctl)) + err_and_ret("Can't run parasite command 2\n"); + + saved_data_check = check_saved_data(ctl, pid, stack, saved_data, saved_data_size); + if (saved_data_check == -1) + err_and_ret("Could not check saved data\n"); + if (saved_data_check != 0) + err_and_ret("Saved data unexpectedly modified\n"); + + /* + * Done. Cure and resume the task. + */ + printf("Curing\n"); + if (compel_cure(ctl)) + err_and_ret("Can't cure victim\n"); + + if (compel_resume_task(pid, state, state)) + err_and_ret("Can't unseize task\n"); + + printf("Done\n"); + + return 0; +} + +static inline int chk(int fd, int val) +{ + int v = 0; + + if (read(fd, &v, sizeof(v)) != sizeof(v)) + return 1; + + printf("%d, want %d\n", v, val); + return v != val; +} + +int main(int argc, char **argv) +{ + int p_in[2], p_out[2], p_err[2], pid, i, err = 0; + + /* + * Prepare IO-s and fork the victim binary + */ + if (pipe(p_in) || pipe(p_out) || pipe(p_err)) { + perror("Can't make pipe"); + return -1; + } + + pid = vfork(); + if (pid == 0) { + close(p_in[1]); + dup2(p_in[0], 0); + close(p_in[0]); + close(p_out[0]); + dup2(p_out[1], 1); + close(p_out[1]); + close(p_err[0]); + dup2(p_err[1], 2); + close(p_err[1]); + execl("./victim", "victim", NULL); + exit(1); + } + + close(p_in[0]); + close(p_out[1]); + close(p_err[1]); + + /* + * Tell the little guy some numbers + */ + i = 1; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + i = 42; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + + printf("Checking the victim alive\n"); + err = chk(p_out[0], 1); + if (err) + return 1; + err = chk(p_out[0], 42); + if (err) + return 1; + + /* + * Now do the infection with parasite.c + */ + + printf("Infecting the 
victim\n"); + if (do_infection(pid)) + return 1; + + /* + * Tell the victim some more stuff to check it's alive + */ + i = 1234; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + i = 4096; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + + /* + * Stop the victim and check the infection went well + */ + printf("Closing victim stdin\n"); + close(p_in[1]); + printf("Waiting for victim to die\n"); + wait(NULL); + + printf("Checking the result\n"); + + /* These two came from parasite */ + err = chk(p_out[0], 138); + err |= chk(p_out[0], 403); + + /* These two came from post-infect */ + err |= chk(p_out[0], 1234); + err |= chk(p_out[0], 4096); + + if (!err) + printf("All OK\n"); + else + printf("Something went WRONG\n"); + + return 0; +} diff --git a/compel/test/stack/victim.c b/compel/test/stack/victim.c new file mode 100644 index 000000000..f94613fa1 --- /dev/null +++ b/compel/test/stack/victim.c @@ -0,0 +1,16 @@ +#include + +int main(int argc, char **argv) +{ + int i; + + while (1) { + if (read(0, &i, sizeof(i)) != sizeof(i)) + break; + + if (write(1, &i, sizeof(i)) != sizeof(i)) + break; + } + + return 0; +} diff --git a/test/zdtm/transition/Makefile b/test/zdtm/transition/Makefile index 98440f4e2..ab735bdd4 100644 --- a/test/zdtm/transition/Makefile +++ b/test/zdtm/transition/Makefile @@ -25,6 +25,7 @@ TST_NOFILE = \ pidfd_store_sk \ rseq01 \ rseq02 \ + stack \ TST_FILE = \ diff --git a/test/zdtm/transition/stack.c b/test/zdtm/transition/stack.c new file mode 100644 index 000000000..9548b9182 --- /dev/null +++ b/test/zdtm/transition/stack.c @@ -0,0 +1,16 @@ +#include "zdtmtst.h" + +const char *test_doc = "Tests that parasite code does not write past the start of the stack"; +const char *test_author = "Younes Manton "; + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + test_daemon(); + test_waitsig(); + + pass(); + + return 0; +} From a39d416568ffd7939c5e2aaa7649399e36cd854e Mon Sep 17 00:00:00 2001 From: Younes Manton 
Date: Tue, 30 Aug 2022 08:18:21 -0700 Subject: [PATCH 065/775] compel: Fix ppc64le parasite stack layout The ppc64le ABI allows functions to store data in caller frames. When initializing the stack pointer prior to executing parasite code we need to pre-allocate the minimum-sized stack frame before jumping to the parasite code. Signed-off-by: Younes Manton --- compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h | 5 +++++ compel/arch/ppc64/src/lib/infect.c | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h index eb12c9f7c..8cc94ba74 100644 --- a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h @@ -23,6 +23,11 @@ /* Copied from the Linux kernel header arch/powerpc/include/asm/ptrace.h */ #define USER_REDZONE_SIZE 512 +#if _CALL_ELF != 2 +#error Only supporting ABIv2. +#else +#define STACK_FRAME_MIN_SIZE 32 +#endif /* Copied from the Linux kernel source file arch/powerpc/kernel/signal_64.c */ #define TRAMP_SIZE 6 diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c index 61cd6e985..db999ce37 100644 --- a/compel/arch/ppc64/src/lib/infect.c +++ b/compel/arch/ppc64/src/lib/infect.c @@ -441,13 +441,13 @@ void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { /* - * OpenPOWER ABI requires that r12 is set to the calling function addressi + * OpenPOWER ABI requires that r12 is set to the calling function address * to compute the TOC pointer. 
*/ regs->gpr[12] = new_ip; regs->nip = new_ip; if (stack) - regs->gpr[1] = (unsigned long)stack; + regs->gpr[1] = (unsigned long)stack - STACK_FRAME_MIN_SIZE; regs->trap = 0; } From 4c7f91afffe10e0b562bb3a0be19fef55f0e7cec Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 1 Oct 2022 22:19:24 +0100 Subject: [PATCH 066/775] ci: enable EPEL for CentOS 7 python2-future, python2-junit_xml, python-flake8 and libbsd-devel are now provided from EPEL. Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.cirrus.yml b/.cirrus.yml index 03ed79748..c7ed5027a 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -111,6 +111,8 @@ task: memory: 8G setup_script: | + # EPEL is needed for python2-future, python2-junit_xml, python-flake8 and libbsd-devel. + yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto yum install -y findutils gcc git gnutls-devel iproute iptables libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make procps-ng protobuf-c-devel protobuf-devel protobuf-python python python-flake8 python-ipaddress python2-future python2-junit_xml python-yaml python-six sudo tar which e2fsprogs python2-pip rubygem-asciidoctor libselinux-devel # Even with selinux in permissive mode the selinux tests will be executed From 4b4bf0421b3a9881735a6fa7041e30f8fbefd85b Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 24 Jul 2020 16:03:45 +0000 Subject: [PATCH 067/775] non-root: add infrastructure to run as non-root The idea behind the rootless CRIU code is, that CRIU reads out its effective capabilities and stores that in the global opts structure. Different parts of CRIU can then, based on the existing capabilities, automatically enable or disable certain code paths. Currently at least CAP_CHECKPOINT_RESTORE is required. CRIU will not start without this capability. 
Signed-off-by: Adrian Reber --- criu/config.c | 3 +++ criu/cr-restore.c | 4 ++++ criu/include/cr_options.h | 17 ++++++++++++++++- criu/include/restorer.h | 3 +++ 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/criu/config.c b/criu/config.c index 24c445c8b..c078848ec 100644 --- a/criu/config.c +++ b/criu/config.c @@ -705,6 +705,9 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, #undef BOOL_OPT + if (argv && argv[0]) + SET_CHAR_OPTS(argv_0, argv[0]); + ret = pre_parse(argc, argv, usage_error, &no_default_config, &cfg_file); if (ret) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9c480be78..cd8705822 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -3748,6 +3748,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns prep_libc_rseq_info(&task_args->libc_rseq); + task_args->uid = opts.uid; + for (i = 0; i < CR_CAP_SIZE; i++) + task_args->cap_eff[i] = opts.cap_eff[i]; + /* * Fill up per-thread data. */ diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index e544a2d9a..6e85dff0a 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -1,10 +1,11 @@ #ifndef __CR_OPTIONS_H__ #define __CR_OPTIONS_H__ -#include #include #include "common/config.h" #include "common/list.h" +#include "int.h" +#include "image.h" /* Configuration and CLI parsing order defines */ #define PARSING_GLOBAL_CONF 1 @@ -210,6 +211,20 @@ struct cr_options { enum criu_mode mode; int mntns_compat_mode; + + /* Remember the program name passed to main() so we can use it in + * error messages elsewhere. + */ + char *argv_0; + /* + * This contains the eUID of the current CRIU user. It + * will only be set to a non-zero value if CRIU has + * the necessary capabilities to run as non root. 
+ * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN + */ + uid_t uid; + /* This contains the value from /proc/pid/status: CapEff */ + u32 cap_eff[CR_CAP_SIZE]; }; extern struct cr_options opts; diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 325804e44..d642765e3 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -235,6 +235,9 @@ struct task_restore_args { * unregister it before memory restoration procedure */ struct rst_rseq_param libc_rseq; + + uid_t uid; + u32 cap_eff[CR_CAP_SIZE]; } __aligned(64); /* From ce01f70d94daa6fe58c68407c3776ab8f7009427 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 24 Jul 2020 16:08:07 +0000 Subject: [PATCH 068/775] non-root: add functions to work with capabilities This adds the function check_caps() which checks if CRIU is running with at least CAP_CHECKPOINT_RESTORE. That is the minimum capability CRIU needs to do a minimal checkpoint and restore from it. In addition, helper functions are added to easily query for other capabilities for enhanced checkpoint/restore support. 
Co-authored-by: Younes Manton Signed-off-by: Adrian Reber Signed-off-by: Younes Manton --- criu/cr-check.c | 46 ++++++++++++++++++++++++++++++- criu/include/crtools.h | 1 + criu/include/util-caps.h | 58 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 criu/include/util-caps.h diff --git a/criu/cr-check.c b/criu/cr-check.c index 6c95ffb25..b90e6a9bf 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -29,7 +29,7 @@ #include "sockets.h" #include "crtools.h" #include "log.h" -#include "util-pie.h" +#include "util-caps.h" #include "prctl.h" #include "files.h" #include "sk-inet.h" @@ -1655,3 +1655,47 @@ static char *feature_name(int (*func)(void)) } return NULL; } + +static int pr_set_dumpable(int value) +{ + int ret = prctl(PR_SET_DUMPABLE, value, 0, 0, 0); + if (ret < 0) + pr_perror("Unable to set PR_SET_DUMPABLE"); + return ret; +} + +int check_caps(void) +{ + struct proc_status_creds creds; + int exit_code = -1; + + if (parse_pid_status(PROC_SELF, &creds.s, NULL)) + goto out; + + memcpy(&opts.cap_eff, &creds.cap_eff, sizeof(u32) * PROC_CAP_SIZE); + + if (!has_cap_checkpoint_restore(opts.cap_eff)) + goto out; + + /* For some things we need to know if we are running as root. */ + opts.uid = geteuid(); + + if (opts.uid) { + /* + * At his point we know we are running as non-root with the necessary + * capabilities available. Now we have to make the process dumpable + * so that /proc/self is not owned by root. 
+ */ + if (pr_set_dumpable(1)) + return -1; + } + + exit_code = 0; +out: + if (exit_code) { + pr_msg("CRIU needs to have the CAP_SYS_ADMIN or the CAP_CHECKPOINT_RESTORE capability: \n"); + pr_msg("setcap cap_checkpoint_restore+eip %s\n", opts.argv_0); + } + + return exit_code; +} diff --git a/criu/include/crtools.h b/criu/include/crtools.h index b9309654f..b54b9d929 100644 --- a/criu/include/crtools.h +++ b/criu/include/crtools.h @@ -26,6 +26,7 @@ extern int cr_pre_dump_tasks(pid_t pid); extern int cr_restore_tasks(void); extern int convert_to_elf(char *elf_path, int fd_core); extern int cr_check(void); +extern int check_caps(void); extern int cr_dedup(void); extern int cr_lazy_pages(bool daemon); diff --git a/criu/include/util-caps.h b/criu/include/util-caps.h new file mode 100644 index 000000000..7ccd162f5 --- /dev/null +++ b/criu/include/util-caps.h @@ -0,0 +1,58 @@ +#ifndef __CR_UTIL_CAPS_H__ +#define __CR_UTIL_CAPS_H__ + +#include + +#ifndef CAP_CHECKPOINT_RESTORE +#define CAP_CHECKPOINT_RESTORE 40 +#endif + +static inline bool has_capability(int cap, u32 *cap_eff) +{ + int mask = CAP_TO_MASK(cap); + int index = CAP_TO_INDEX(cap); + u32 effective; + + effective = cap_eff[index]; + + if (!(mask & effective)) { + pr_debug("Effective capability %d missing\n", cap); + return false; + } + + return true; +} + +static inline bool has_cap_checkpoint_restore(u32 *cap_eff) +{ + /* + * Everything guarded by CAP_CHECKPOINT_RESTORE is also + * guarded by CAP_SYS_ADMIN. Check for both capabilities. 
+ */ + if (has_capability(CAP_CHECKPOINT_RESTORE, cap_eff) || has_capability(CAP_SYS_ADMIN, cap_eff)) + return true; + + return false; +} + +static inline bool has_cap_net_admin(u32 *cap_eff) +{ + return has_capability(CAP_NET_ADMIN, cap_eff); +} + +static inline bool has_cap_sys_chroot(u32 *cap_eff) +{ + return has_capability(CAP_SYS_CHROOT, cap_eff); +} + +static inline bool has_cap_setuid(u32 *cap_eff) +{ + return has_capability(CAP_SETUID, cap_eff); +} + +static inline bool has_cap_sys_resource(u32 *cap_eff) +{ + return has_capability(CAP_SYS_RESOURCE, cap_eff); +} + +#endif /* __CR_UTIL_CAPS_H__ */ From 6a30c7d1ed5bdad67921e8e6262f839bd8c48d08 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Fri, 12 Aug 2022 11:56:53 -0700 Subject: [PATCH 069/775] non-root: enable non-root checkpoint/restore This commit enables checkpointing and restoring of applications as non-root. The first goal was to enable checkpoint and restore of the env00 and pthread00 test cases. This uses the information from opts.unprivileged and opts.cap_eff to skip certain code paths which do not work as non-root. 
Co-authored-by: Adrian Reber Signed-off-by: Younes Manton --- criu/cgroup.c | 6 ++++ criu/config.c | 1 + criu/cr-check.c | 71 ++++++++++++++++++++++++--------------- criu/cr-restore.c | 3 ++ criu/cr-service.c | 7 ++++ criu/crtools.c | 5 +++ criu/fdstore.c | 16 +++++++-- criu/files.c | 46 +++++++++++++++++++++---- criu/image.c | 3 +- criu/include/cr_options.h | 11 ++++-- criu/include/util.h | 2 ++ criu/namespaces.c | 11 +++--- criu/pie/restorer.c | 26 ++++++++------ criu/timens.c | 4 +++ criu/util.c | 22 ++++++++++++ images/rpc.proto | 1 + lib/c/criu.c | 11 ++++++ lib/c/criu.h | 1 + 18 files changed, 194 insertions(+), 53 deletions(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index 325df6a1d..d886ce9f2 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -734,6 +734,9 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_ unsigned int n_ctls = 0; struct cg_set *cs; + if (opts.unprivileged) + return 0; + if (item) pid = item->pid->real; else @@ -989,6 +992,9 @@ int dump_cgroups(void) CgroupEntry cg = CGROUP_ENTRY__INIT; int ret = -1; + if (opts.unprivileged) + return 0; + BUG_ON(!criu_cgset || !root_cgset); /* diff --git a/criu/config.c b/criu/config.c index c078848ec..9ba79c8ef 100644 --- a/criu/config.c +++ b/criu/config.c @@ -700,6 +700,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "lsm-mount-context", required_argument, 0, 1099 }, { "network-lock", required_argument, 0, 1100 }, BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode), + BOOL_OPT("unprivileged", &opts.unprivileged), {}, }; diff --git a/criu/cr-check.c b/criu/cr-check.c index b90e6a9bf..b54c79387 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "../soccr/soccr.h" @@ -515,6 +516,14 @@ static int check_ipc(void) { int ret; + /* + * Since kernel 5.16 sem_next_id can be accessed via CAP_CHECKPOINT_RESTORE, however + * for non-root users access() runs with an empty set of 
caps and will therefore always + * fail. + */ + if (opts.uid) + return 0; + ret = access("/proc/sys/kernel/sem_next_id", R_OK | W_OK); if (!ret) return 0; @@ -1039,10 +1048,14 @@ static int check_tcp(void) } val = 1; - ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val)); - if (ret < 0) { - pr_perror("Can't turn TCP repair mode ON"); - goto out; + if (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) { + ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val)); + if (ret < 0) { + pr_perror("Can't turn TCP repair mode ON"); + goto out; + } + } else { + pr_info("Not checking for TCP repair mode. Please set CAP_NET_ADMIN\n"); } optlen = sizeof(val); @@ -1394,9 +1407,6 @@ int cr_check(void) struct ns_id *ns; int ret = 0; - if (!is_root_user()) - return -1; - root_item = alloc_pstree_item(); if (root_item == NULL) return -1; @@ -1666,36 +1676,43 @@ static int pr_set_dumpable(int value) int check_caps(void) { - struct proc_status_creds creds; - int exit_code = -1; - - if (parse_pid_status(PROC_SELF, &creds.s, NULL)) + /* Read out effective capabilities and store in opts.cap_eff. */ + if (set_opts_cap_eff()) goto out; - memcpy(&opts.cap_eff, &creds.cap_eff, sizeof(u32) * PROC_CAP_SIZE); - + /* + * No matter if running as root or not. CRIU always needs + * at least these capabilities. + */ if (!has_cap_checkpoint_restore(opts.cap_eff)) goto out; /* For some things we need to know if we are running as root. */ opts.uid = geteuid(); - if (opts.uid) { - /* - * At his point we know we are running as non-root with the necessary - * capabilities available. Now we have to make the process dumpable - * so that /proc/self is not owned by root. - */ - if (pr_set_dumpable(1)) - return -1; + if (!opts.uid) { + /* CRIU is running as root. No further checks are necessary. 
*/ + return 0; } - exit_code = 0; + if (!opts.unprivileged) { + pr_msg("Running as non-root requires '--unprivileged'\n"); + pr_msg("Please consult the documentation for limitations when running as non-root\n"); + return -1; + } + + /* + * At his point we know we are running as non-root with the necessary + * capabilities available. Now we have to make the process dumpable + * so that /proc/self is not owned by root. + */ + if (pr_set_dumpable(1)) + return -1; + + return 0; out: - if (exit_code) { - pr_msg("CRIU needs to have the CAP_SYS_ADMIN or the CAP_CHECKPOINT_RESTORE capability: \n"); - pr_msg("setcap cap_checkpoint_restore+eip %s\n", opts.argv_0); - } + pr_msg("CRIU needs to have the CAP_SYS_ADMIN or the CAP_CHECKPOINT_RESTORE capability: \n"); + pr_msg("setcap cap_checkpoint_restore+eip %s\n", opts.argv_0); - return exit_code; + return -1; } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index cd8705822..d7d3d8edb 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1809,6 +1809,9 @@ static int restore_task_with_children(void *_arg) goto err; } + if (set_opts_cap_eff()) + goto err; + /* Wait prepare_userns */ if (restore_finish_ns_stage(CR_STATE_ROOT_TASK, CR_STATE_PREPARE_NAMESPACES) < 0) goto err; diff --git a/criu/cr-service.c b/criu/cr-service.c index 1d9f0aca3..73c48f5a6 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "version.h" #include "crtools.h" @@ -409,6 +410,12 @@ static int setup_opts_from_req(int sk, CriuOpts *req) pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); } + if (req->has_unprivileged) + opts.unprivileged = req->unprivileged; + + if (check_caps()) + return 1; + if (kerndat_init()) return 1; diff --git a/criu/crtools.c b/criu/crtools.c index 8bcbe8e38..ac05bc821 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -185,6 +185,9 @@ int main(int argc, char *argv[], char *envp[]) return cr_service_work(atoi(argv[optind + 1])); } 
+ if (check_caps()) + return 1; + if (opts.imgs_dir == NULL) SET_CHAR_OPTS(imgs_dir, "."); @@ -414,6 +417,8 @@ usage: " --network-lock METHOD\n" " network locking/unlocking method; argument\n" " can be 'nftables' or 'iptables' (default).\n" + " --unprivileged accept limitations when running as non-root\n" + " consult documentation for further details\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" diff --git a/criu/fdstore.c b/criu/fdstore.c index 6a7f73a59..03afa9f17 100644 --- a/criu/fdstore.c +++ b/criu/fdstore.c @@ -13,6 +13,8 @@ #include "rst-malloc.h" #include "log.h" #include "util.h" +#include "cr_options.h" +#include "util-caps.h" /* clang-format off */ static struct fdstore_desc { @@ -27,6 +29,8 @@ int fdstore_init(void) uint32_t buf[2] = { INT_MAX / 2, INT_MAX / 2 }; struct sockaddr_un addr; unsigned int addrlen; + int rcv_opt_name; + int snd_opt_name; struct stat st; int sk, ret; @@ -49,8 +53,16 @@ int fdstore_init(void) return -1; } - if (setsockopt(sk, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0], sizeof(buf[0])) < 0 || - setsockopt(sk, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1], sizeof(buf[1])) < 0) { + if (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) { + rcv_opt_name = SO_RCVBUFFORCE; + snd_opt_name = SO_SNDBUFFORCE; + } else { + rcv_opt_name = SO_RCVBUF; + snd_opt_name = SO_SNDBUF; + } + + if (setsockopt(sk, SOL_SOCKET, snd_opt_name, &buf[0], sizeof(buf[0])) < 0 || + setsockopt(sk, SOL_SOCKET, rcv_opt_name, &buf[1], sizeof(buf[1])) < 0) { pr_perror("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE"); close(sk); return -1; diff --git a/criu/files.c b/criu/files.c index 8a2250e19..38dc076d2 100644 --- a/criu/files.c +++ b/criu/files.c @@ -21,7 +21,7 @@ #include "image.h" #include "common/list.h" #include "rst-malloc.h" -#include "util-pie.h" +#include "util-caps.h" #include "common/lock.h" #include "sockets.h" #include "pstree.h" @@ -1346,10 +1346,35 @@ static int fchroot(int fd) return 
chroot("."); } +static int need_chroot(int saved_root) +{ + struct stat saved_root_stat, cur_root_stat; + int psd; + + if (fstat(saved_root, &saved_root_stat) == -1) { + pr_perror("Failed to stat saved root dir"); + return -1; + } + + psd = open_pid_proc(PROC_SELF); + if (psd < 0) { + pr_perror("Failed to open PROC_SELF"); + return -1; + } + + if (fstatat(psd, "root", &cur_root_stat, 0) == -1) { + pr_perror("Failed to stat current root dir"); + return -1; + } + + return saved_root_stat.st_ino != cur_root_stat.st_ino || saved_root_stat.st_dev != cur_root_stat.st_dev; +} + int restore_fs(struct pstree_item *me) { int dd_root = -1, dd_cwd = -1, ret, err = -1; struct rst_info *ri = rsti(me); + bool do_chroot = true; /* * First -- open both descriptors. We will not @@ -1368,15 +1393,24 @@ int restore_fs(struct pstree_item *me) goto out; } + /* + * In unprivileged mode chroot() may fail if we don't have + * sufficient privileges, therefore only do it if the process + * is actually chrooted. + */ + if (opts.unprivileged) + do_chroot = need_chroot(dd_root); + /* * Now do chroot/chdir. Chroot goes first as it calls chdir into * dd_root so we'd need to fix chdir after it anyway. 
*/ - - ret = fchroot(dd_root); - if (ret < 0) { - pr_perror("Can't change root"); - goto out; + if (do_chroot) { + ret = fchroot(dd_root); + if (ret < 0) { + pr_perror("Can't change root"); + goto out; + } } ret = fchdir(dd_cwd); diff --git a/criu/image.c b/criu/image.c index 353de48e8..3c2127ac6 100644 --- a/criu/image.c +++ b/criu/image.c @@ -226,7 +226,8 @@ int prepare_inventory(InventoryEntry *he) if (get_task_ids(&crt.i)) return -1; - he->has_root_cg_set = true; + if (!opts.unprivileged) + he->has_root_cg_set = true; if (dump_task_cgroup(NULL, &he->root_cg_set, NULL)) return -1; diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 6e85dff0a..eacaa03a6 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -2,6 +2,7 @@ #define __CR_OPTIONS_H__ #include +#include #include "common/config.h" #include "common/list.h" #include "int.h" @@ -223,8 +224,14 @@ struct cr_options { * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN */ uid_t uid; - /* This contains the value from /proc/pid/status: CapEff */ - u32 cap_eff[CR_CAP_SIZE]; + /* This contains the value from capget()->effective */ + u32 cap_eff[_LINUX_CAPABILITY_U32S_3]; + /* + * If CRIU should be running as non-root with the help of + * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN the user should + * explicitly request it as it comes with many limitations. 
+ */ + int unprivileged; }; extern struct cr_options opts; diff --git a/criu/include/util.h b/criu/include/util.h index 4e29c079e..3a0403113 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -386,6 +386,8 @@ extern int mount_detached_fs(const char *fsname); extern char *get_legacy_iptables_bin(bool ipv6); +extern int set_opts_cap_eff(void); + extern ssize_t read_all(int fd, void *buf, size_t size); extern ssize_t write_all(int fd, const void *buf, size_t size); diff --git a/criu/namespaces.c b/criu/namespaces.c index 7356fe8c2..286073ff6 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -28,6 +28,7 @@ #include "cgroup.h" #include "fdstore.h" #include "kerndat.h" +#include "util-caps.h" #include "protobuf.h" #include "util.h" @@ -1623,10 +1624,12 @@ int collect_namespaces(bool for_dump) int prepare_userns_creds(void) { - /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */ - if (setuid(0) || setgid(0) || setgroups(0, NULL)) { - pr_perror("Unable to initialize id-s"); - return -1; + if (!opts.unprivileged || has_cap_setuid(opts.cap_eff)) { + /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */ + if (setuid(0) || setgid(0) || setgroups(0, NULL)) { + pr_perror("Unable to initialize id-s"); + return -1; + } } /* diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index f80b68359..0e98cb3da 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -184,7 +184,7 @@ static int lsm_set_label(char *label, char *type, int procfd) return 0; } -static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_type) +static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_type, uid_t uid) { CredsEntry *ce = &args->creds; int b, i, ret; @@ -211,10 +211,12 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ * lose caps bits when changing xids. 
*/ - ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0); - if (ret) { - pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret); - return -1; + if (!uid) { + ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0); + if (ret) { + pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret); + return -1; + } } /* @@ -252,10 +254,12 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ * special state any longer. */ - ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); - if (ret) { - pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); - return -1; + if (!uid) { + ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); + if (ret) { + pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); + return -1; + } } /* @@ -634,7 +638,7 @@ long __export_restore_thread(struct thread_restore_args *args) if (restore_seccomp(args)) BUG(); - ret = restore_creds(args->creds_args, args->ta->proc_fd, args->ta->lsm_type); + ret = restore_creds(args->creds_args, args->ta->proc_fd, args->ta->lsm_type, args->ta->uid); ret = ret || restore_dumpable_flag(&args->ta->mm); ret = ret || restore_pdeath_sig(args); if (ret) @@ -1915,7 +1919,7 @@ long __export_restore_task(struct task_restore_args *args) * turning off TCP repair is CAP_SYS_NED_ADMIN protected, * thus restore* creds _after_ all of the above. 
*/ - ret = restore_creds(args->t->creds_args, args->proc_fd, args->lsm_type); + ret = restore_creds(args->t->creds_args, args->proc_fd, args->lsm_type, args->uid); ret = ret || restore_dumpable_flag(&args->mm); ret = ret || restore_pdeath_sig(args->t); ret = ret || restore_child_subreaper(args->child_subreaper); diff --git a/criu/timens.c b/criu/timens.c index 5803fc359..66c0c02a4 100644 --- a/criu/timens.c +++ b/criu/timens.c @@ -5,6 +5,7 @@ #include "proc_parse.h" #include "namespaces.h" #include "timens.h" +#include "cr_options.h" #include "protobuf.h" #include "images/timens.pb-c.h" @@ -57,6 +58,9 @@ int prepare_timens(int id) struct timespec ts; struct timespec prev_moff = {}, prev_boff = {}; + if (opts.unprivileged) + return 0; + img = open_image(CR_FD_TIMENS, O_RSTR, id); if (!img) return -1; diff --git a/criu/util.c b/criu/util.c index 060ca3bd4..b3b2b6659 100644 --- a/criu/util.c +++ b/criu/util.c @@ -41,6 +41,7 @@ #include "namespaces.h" #include "criu-log.h" #include "syscall.h" +#include "util-caps.h" #include "clone-noasan.h" #include "cr_options.h" @@ -1426,6 +1427,9 @@ void rlimit_unlimit_nofile(void) { struct rlimit new; + if (opts.unprivileged && !has_cap_sys_resource(opts.cap_eff)) + return; + new.rlim_cur = kdat.sysctl_nr_open; new.rlim_max = kdat.sysctl_nr_open; @@ -2064,3 +2068,21 @@ out: xfree(free_path); return mp_path; } + +int set_opts_cap_eff(void) +{ + struct __user_cap_header_struct cap_header; + struct __user_cap_data_struct cap_data[_LINUX_CAPABILITY_U32S_3]; + int i; + + cap_header.version = _LINUX_CAPABILITY_VERSION_3; + cap_header.pid = getpid(); + + if (capget(&cap_header, &cap_data[0])) + return -1; + + for (i = 0; i < _LINUX_CAPABILITY_U32S_3; i++) + memcpy(&opts.cap_eff[i], &cap_data[i].effective, sizeof(u32)); + + return 0; +} diff --git a/images/rpc.proto b/images/rpc.proto index 3cf431639..afd2c7b43 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -139,6 +139,7 @@ message criu_opts { optional criu_network_lock_method 
network_lock = 64 [default = IPTABLES]; optional bool mntns_compat_mode = 65; optional bool skip_file_rwx_check = 66; + optional bool unprivileged = 67; /* optional bool check_mounts = 128; */ } diff --git a/lib/c/criu.c b/lib/c/criu.c index 8171f7a12..fc8159999 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -566,6 +566,17 @@ void criu_set_skip_file_rwx_check(bool skip_file_rwx_check) criu_local_set_skip_file_rwx_check(global_opts, skip_file_rwx_check); } +void criu_local_set_unprivileged(criu_opts *opts, bool unprivileged) +{ + opts->rpc->has_unprivileged = true; + opts->rpc->unprivileged = unprivileged; +} + +void criu_set_unprivileged(bool unprivileged) +{ + criu_local_set_unprivileged(global_opts, unprivileged); +} + void criu_local_set_orphan_pts_master(criu_opts *opts, bool orphan_pts_master) { opts->rpc->has_orphan_pts_master = true; diff --git a/lib/c/criu.h b/lib/c/criu.h index c32a8a646..28a083d88 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -79,6 +79,7 @@ void criu_set_weak_sysctls(bool val); void criu_set_evasive_devices(bool evasive_devices); void criu_set_shell_job(bool shell_job); void criu_set_skip_file_rwx_check(bool skip_file_rwx_check); +void criu_set_unprivileged(bool unprivileged); void criu_set_orphan_pts_master(bool orphan_pts_master); void criu_set_file_locks(bool file_locks); void criu_set_track_mem(bool track_mem); From 47b07d0110e95ef6a4b6d2853cd483347bdcd89b Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Fri, 12 Aug 2022 11:58:01 -0700 Subject: [PATCH 070/775] non-root: Introduce unprivileged mode to kerndat This patch modifies how kerndat is handled in unprivileged mode. Initialization and functionality that can only be done as root is made separate from common code. The kerndat file's location is defined as $XDG_RUNTIME_DIR/criu.kdat in unprivileged mode. Since we expect that directory to be on tmpfs we maintain the same behavior as the root-mode kerndat which lives in /run. 
Co-authored-by: Adrian Reber Signed-off-by: Younes Manton --- criu/kerndat.c | 186 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 142 insertions(+), 44 deletions(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index 0f7d5fc8f..a209190ee 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -21,6 +21,7 @@ #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) #include #endif +#include #include "common/config.h" #include "int.h" @@ -51,6 +52,7 @@ #include "sched.h" #include "memfd.h" #include "mount-v2.h" +#include "util-caps.h" struct kerndat_s kdat = {}; @@ -1075,19 +1077,66 @@ static int kerndat_has_openat2(void) return 0; } -#define KERNDAT_CACHE_FILE KDAT_RUNDIR "/criu.kdat" -#define KERNDAT_CACHE_FILE_TMP KDAT_RUNDIR "/.criu.kdat" +#define KERNDAT_CACHE_NAME "criu.kdat" +#define KERNDAT_CACHE_FILE KDAT_RUNDIR "/" KERNDAT_CACHE_NAME +/* + * Returns: + * -1 if kdat_file was not written due to error + * 0 if kdat_file was written + * 1 if kdat_file was not written because cache directory undefined in env (non-root mode) + */ +static int get_kerndat_filename(char **kdat_file) +{ + int ret; + + /* + * Running as non-root, even with CAP_CHECKPOINT_RESTORE, does not + * allow to write to KDAT_RUNDIR which usually is only writable by root. + * Let's write criu.kdat file to XDG_RUNTIME_DIR for non-root cases. + * Note that XDG_RUNTIME_DIR is not always defined (e.g. when executing + * via su/sudo). + */ + if (opts.unprivileged) { + const char *cache_dir = getenv("XDG_RUNTIME_DIR"); + if (!cache_dir) { + pr_warn("$XDG_RUNTIME_DIR not set. 
Cannot find location for kerndat file\n"); + return 1; + } + ret = asprintf(kdat_file, "%s/%s", cache_dir, KERNDAT_CACHE_NAME); + } else { + ret = asprintf(kdat_file, "%s", KERNDAT_CACHE_FILE); + } + + if (unlikely(ret < 0)) { + pr_warn("Cannot allocate memory for kerndat file name\n"); + return -1; + } + + return 0; +} + +/* + * Returns: + * -1 if error + * 0 if cache was loaded + * 1 if cache does not exist or is stale or cache directory undefined in env (non-root mode) + */ static int kerndat_try_load_cache(void) { + cleanup_free char *kdat_file = NULL; int fd, ret; - fd = open(KERNDAT_CACHE_FILE, O_RDONLY); + ret = get_kerndat_filename(&kdat_file); + if (ret) + return ret; + + fd = open(kdat_file, O_RDONLY); if (fd < 0) { if (ENOENT == errno) - pr_debug("File %s does not exist\n", KERNDAT_CACHE_FILE); + pr_debug("File %s does not exist\n", kdat_file); else - pr_warn("Can't load %s\n", KERNDAT_CACHE_FILE); + pr_warn("Can't load %s\n", kdat_file); return 1; } @@ -1101,12 +1150,12 @@ static int kerndat_try_load_cache(void) close(fd); if (ret != sizeof(kdat) || kdat.magic1 != KDAT_MAGIC || kdat.magic2 != KDAT_MAGIC_2) { - pr_warn("Stale %s file\n", KERNDAT_CACHE_FILE); - unlink(KERNDAT_CACHE_FILE); + pr_warn("Stale %s file\n", kdat_file); + unlink(kdat_file); return 1; } - pr_info("Loaded kdat cache from %s\n", KERNDAT_CACHE_FILE); + pr_info("Loaded kdat cache from %s\n", kdat_file); return 0; } @@ -1114,8 +1163,20 @@ static void kerndat_save_cache(void) { int fd, ret; struct statfs s; + cleanup_free char *kdat_file = NULL; + cleanup_free char *kdat_file_tmp = NULL; - fd = open(KERNDAT_CACHE_FILE_TMP, O_CREAT | O_EXCL | O_WRONLY, 0600); + if (get_kerndat_filename(&kdat_file)) + return; + + ret = asprintf(&kdat_file_tmp, "%s.tmp", kdat_file); + + if (unlikely(ret < 0)) { + pr_warn("Cannot allocate memory for kerndat file name\n"); + return; + } + + fd = open(kdat_file_tmp, O_CREAT | O_EXCL | O_WRONLY, 0600); if (fd < 0) /* * It can happen that we race with some 
other criu @@ -1124,6 +1185,10 @@ static void kerndat_save_cache(void) */ return; + /* + * If running as root we store the cache file on a tmpfs (/run), + * because the file should be gone after reboot. + */ if (fstatfs(fd, &s) < 0 || s.f_type != TMPFS_MAGIC) { pr_warn("Can't keep kdat cache on non-tempfs\n"); close(fd); @@ -1137,20 +1202,21 @@ static void kerndat_save_cache(void) */ kdat.magic1 = KDAT_MAGIC; kdat.magic2 = KDAT_MAGIC_2; + ret = write(fd, &kdat, sizeof(kdat)); close(fd); if (ret == sizeof(kdat)) - ret = rename(KERNDAT_CACHE_FILE_TMP, KERNDAT_CACHE_FILE); + ret = rename(kdat_file_tmp, kdat_file); else { ret = -1; errno = EIO; } if (ret < 0) { - pr_perror("Couldn't save %s", KERNDAT_CACHE_FILE); + pr_perror("Couldn't save %s", kdat_file); unl: - unlink(KERNDAT_CACHE_FILE_TMP); + unlink(kdat_file_tmp); } } @@ -1158,6 +1224,14 @@ static int kerndat_uffd(void) { int uffd, err = 0; + if (opts.unprivileged) + /* + * If running as non-root uffd_open() fails with + * 'Operation not permitted'. Just ignore uffd for + * non-root for now. 
+ */ + return 0; + kdat.uffd_features = 0; uffd = uffd_open(0, &kdat.uffd_features, &err); @@ -1499,6 +1573,45 @@ int kerndat_try_load_new(void) return 0; } +static int root_only_init(void) +{ + int ret = 0; + + if (opts.unprivileged) + return 0; + + if (!ret && kerndat_loginuid()) { + pr_err("kerndat_loginuid failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_tun_netns()) { + pr_err("kerndat_tun_netns failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_socket_unix_file()) { + pr_err("kerndat_socket_unix_file failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_link_nsid()) { + pr_err("kerndat_link_nsid failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_socket_netns()) { + pr_err("kerndat_socket_netns failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_nftables_concat()) { + pr_err("kerndat_has_nftables_concat failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_move_mount_set_group()) { + pr_err("kerndat_has_move_mount_set_group failed when initializing kerndat.\n"); + ret = -1; + } + + return ret; +} + int kerndat_init(void) { int ret; @@ -1516,7 +1629,16 @@ int kerndat_init(void) memset(&kdat, 0, sizeof(kdat)); preload_socket_modules(); - preload_netfilter_modules(); + if (!opts.unprivileged) + /* + * This uses 'iptables -L' to implicitly load necessary modules. + * If the non nft backed iptables is used it does a + * openat(AT_FDCWD, "/run/xtables.lock", O_RDONLY|O_CREAT, 0600) = -1 EACCES + * which will fail as non-root. There are no capabilities to + * change this. 
The iptables nft backend fails with + * openat(AT_FDCWD, "/proc/net/ip_tables_names", O_RDONLY) = -1 EACCES + */ + preload_netfilter_modules(); if (check_pagemap()) { pr_err("check_pagemap failed when initializing kerndat.\n"); @@ -1554,10 +1676,14 @@ int kerndat_init(void) pr_err("get_ipv6 failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_loginuid()) { - pr_err("kerndat_loginuid failed when initializing kerndat.\n"); + if (!ret && kerndat_nsid()) { + pr_err("kerndat_nsid failed when initializing kerndat.\n"); ret = -1; } + + if (!ret && root_only_init()) + ret = -1; + if (!ret && kerndat_iptables_has_xtlocks()) { pr_err("kerndat_iptables_has_xtlocks failed when initializing kerndat.\n"); ret = -1; @@ -1570,22 +1696,6 @@ int kerndat_init(void) pr_err("kerndat_compat_restore failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_tun_netns()) { - pr_err("kerndat_tun_netns failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_socket_unix_file()) { - pr_err("kerndat_socket_unix_file failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_nsid()) { - pr_err("kerndat_nsid failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_link_nsid()) { - pr_err("kerndat_link_nsid failed when initializing kerndat.\n"); - ret = -1; - } if (!ret && kerndat_has_memfd_create()) { pr_err("kerndat_has_memfd_create failed when initializing kerndat.\n"); ret = -1; @@ -1616,10 +1726,6 @@ int kerndat_init(void) pr_err("kerndat_vdso_preserves_hint failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_socket_netns()) { - pr_err("kerndat_socket_netns failed when initializing kerndat.\n"); - ret = -1; - } if (!ret && kerndat_x86_has_ptrace_fpu_xsave_bug()) { pr_err("kerndat_x86_has_ptrace_fpu_xsave_bug failed when initializing kerndat.\n"); ret = -1; @@ -1644,7 +1750,7 @@ int kerndat_init(void) pr_err("has_time_namespace failed when initializing kerndat.\n"); ret = -1; } - 
if (!ret && kerndat_has_newifindex()) { + if (!ret && (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) && kerndat_has_newifindex()) { pr_err("kerndat_has_newifindex failed when initializing kerndat.\n"); ret = -1; } @@ -1658,18 +1764,10 @@ int kerndat_init(void) pr_err("kerndat_has_nspid failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_has_nftables_concat()) { - pr_err("kerndat_has_nftables_concat failed when initializing kerndat.\n"); - ret = -1; - } if (!ret && kerndat_sockopt_buf_lock()) { pr_err("kerndat_sockopt_buf_lock failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_has_move_mount_set_group()) { - pr_err("kerndat_has_move_mount_set_group failed when initializing kerndat.\n"); - ret = -1; - } if (!ret && kerndat_has_openat2()) { pr_err("kerndat_has_openat2 failed when initializing kerndat.\n"); ret = -1; From 251939992a6a8d41742f10ba55d34e876c47c2d0 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 3 May 2021 14:14:28 +0000 Subject: [PATCH 071/775] Documentation: add details about --unprivileged This adds the non-root section and information about the parameter --unprivileged to the man page. Co-authored-by: Anna Singleton Signed-off-by: Adrian Reber Signed-off-by: Anna Singleton --- Documentation/criu.txt | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 8d2e91443..3b68f16a4 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -155,6 +155,12 @@ not compatible with *--external* *dev*. notification message contains a file descriptor for the master pty +*--unprivileged*:: + This option tells *criu* to accept the limitations when running + as non-root. Running as non-root requires *criu* at least to have + *CAP_SYS_ADMIN* or *CAP_CHECKPOINT_RESTORE*. For details about running + *criu* as non-root please consult the *NON-ROOT* section. + *-V*, *--version*:: Print program version and exit. 
@@ -877,6 +883,32 @@ configuration file will overwrite all other configuration file settings or RPC options. *This can lead to undesired behavior of criu and should only be used carefully.* +NON-ROOT +-------- +*criu* can be used as non-root with either the *CAP_SYS_ADMIN* capability +or with the *CAP_CHECKPOINT_RESTORE* capability introduced in Linux kernel 5.9. +*CAP_CHECKPOINT_RESTORE* is the minimum that is required. + +*criu* also needs either *CAP_SYS_PTRACE* or a value of 0 in +*/proc/sys/kernel/yama/ptrace_scope* (see *ptrace*(2)) to be able to interrupt +the process for dumping. + +Running *criu* as non-root has many limitations and depending on the process +to checkpoint and restore it may not be possible. + +In addition to *CAP_CHECKPOINT_RESTORE* it is possible to give *criu* additional +capabilities to enable additional features in non-root mode. + +Currently *criu* can benefit from the following additional capabilities: + + - *CAP_NET_ADMIN* + - *CAP_SYS_CHROOT* + - *CAP_SETUID* + - *CAP_SYS_RESOURCE* + +Independent of the capabilities it is always necessary to use "*--unprivileged*" to +accept *criu*'s limitations in non-root mode. + EXAMPLES -------- To checkpoint a program with pid of *1234* and write all image files into From 6743d608cf0854c9b90aff54e78c7c2e141034f0 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 24 Jul 2020 16:18:31 +0000 Subject: [PATCH 072/775] non-root: extend zdtm.py to be able to run tests as non-root These are the minimal changes to make zdtm.py successfully run the env00 and pthread test case as non-root using the '--rootless' zdtm option. 
Co-authored-by: Younes Manton Signed-off-by: Adrian Reber Signed-off-by: Younes Manton --- test/zdtm.py | 56 +++++++++++++++++++++++++++++++++-------- test/zdtm/lib/test.c | 47 ++++++++++++++++++---------------- test/zdtm_ct.c | 60 ++++++++++++++++++++++++-------------------- 3 files changed, 103 insertions(+), 60 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index aefcb36a4..a311610c3 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -41,6 +41,8 @@ STREAMED_IMG_FILE_NAME = "img.criu" prev_line = None uuid = uuid.uuid4() +NON_ROOT_UID = 65534 + def alarm(*args): print("==== ALARM ====") @@ -392,10 +394,11 @@ class test_fail_expected_exc(Exception): class zdtm_test: - def __init__(self, name, desc, flavor, freezer): + def __init__(self, name, desc, flavor, freezer, rootless): self.__name = name self.__desc = desc self.__freezer = None + self.__rootless = rootless self.__make_action('cleanout') self.__pid = 0 self.__flavor = flavor @@ -439,6 +442,8 @@ class zdtm_test: wait_pid_die(int(self.__pid), self.__name, self.__timeout) def __add_wperms(self): + if os.getuid() != 0: + return # Add write perms for .out and .pid files for b in self._bins: p = os.path.dirname(b) @@ -457,6 +462,9 @@ class zdtm_test: env['ZDTM_NOTIFY_FDIN'] = "100" env['ZDTM_NOTIFY_FDOUT'] = "101" + if self.__rootless: + env['ZDTM_ROOTLESS'] = "1" + if not test_flag(self.__desc, 'suid'): # Numbers should match those in criu env['ZDTM_UID'] = "18943" @@ -618,11 +626,15 @@ class zdtm_test: ["make", "zdtm_ct"], env=dict(os.environ, MAKEFLAGS="")) if not os.access("zdtm/lib/libzdtmtst.a", os.F_OK): subprocess.check_call(["make", "-C", "zdtm/"]) + if opts['rootless']: + return subprocess.check_call( ["flock", "zdtm_mount_cgroups.lock", "./zdtm_mount_cgroups", str(uuid)]) @staticmethod def cleanup(): + if opts['rootless']: + return subprocess.check_call( ["flock", "zdtm_mount_cgroups.lock", "./zdtm_umount_cgroups", str(uuid)]) @@ -640,7 +652,9 @@ def load_module_from_file(name, path): class 
inhfd_test: - def __init__(self, name, desc, flavor, freezer): + def __init__(self, name, desc, flavor, freezer, rootless): + if rootless: + raise test_fail_exc("This kind of test does not currently support rootless mode") self.__name = os.path.basename(name) print("Load %s" % name) self.__fdtyp = load_module_from_file(self.__name, name) @@ -801,8 +815,8 @@ class inhfd_test: class groups_test(zdtm_test): - def __init__(self, name, desc, flavor, freezer): - zdtm_test.__init__(self, 'zdtm/lib/groups', desc, flavor, freezer) + def __init__(self, name, desc, flavor, freezer, rootless): + zdtm_test.__init__(self, 'zdtm/lib/groups', desc, flavor, freezer, rootless) if flavor.ns: self.__real_name = name with open(name) as fd: @@ -1039,6 +1053,7 @@ class criu: self.__dedup = bool(opts['dedup']) self.__mdedup = bool(opts['noauto_dedup']) self.__user = bool(opts['user']) + self.__rootless = bool(opts['rootless']) self.__leave_stopped = bool(opts['stop']) self.__stream = bool(opts['stream']) self.__show_stats = bool(opts['show_stats']) @@ -1138,6 +1153,9 @@ class criu: print("Run criu " + action) + if self.__rootless: + s_args += ["--unprivileged"] + strace = [] if self.__sat: fname = os.path.join(self.__ddir(), action + '.strace') @@ -1156,7 +1174,10 @@ class criu: if action == "restore": preexec = None else: - preexec = self.__user and self.set_user_id or None + if os.getuid(): + preexec = None + else: + preexec = self.__user and self.set_user_id or None __ddir = self.__ddir() @@ -1476,10 +1497,11 @@ class criu: except Exception: return False - return criu_cli.run( - "check", - ["--no-default-config", "--verbosity=0", "--feature", feature], - opts['criu_bin']) == 0 + args = ["--no-default-config", "--verbosity=0", "--feature", feature] + if opts['rootless']: + args += ["--unprivileged"] + + return criu_cli.run("check", args, opts['criu_bin']) == 0 @staticmethod def available(): @@ -1900,7 +1922,7 @@ def do_run_test(tname, tdesc, flavs, opts): if opts['dry_run']: continue 
flav = flavors[f](opts) - t = tclass(tname, tdesc, flav, fcg) + t = tclass(tname, tdesc, flav, fcg, opts['rootless']) cr_api = criu(opts) try: @@ -2051,7 +2073,8 @@ class Launcher: 'sat', 'script', 'rpc', 'criu_config', 'lazy_pages', 'join_ns', 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', - 'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode') + 'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode', + 'rootless') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2061,6 +2084,9 @@ class Launcher: logf = None log = None + if opts['rootless'] and os.getuid() == 0: + os.setgid(NON_ROOT_UID) + os.setuid(NON_ROOT_UID) sub = subprocess.Popen(["./zdtm_ct", "zdtm.py"], env=dict(os.environ, CR_CT_TEST_INFO=arg), stdout=log, @@ -2600,6 +2626,10 @@ def set_nr_hugepages(nr): with open("/proc/sys/vm/nr_hugepages", "w") as f: f.write("{}\n".format(nr)) return orig_hugepages + except PermissionError as err: + # EACCES is expected when running as non-root, otherwise re-raise the exception. 
+ if err.errno != errno.EACCES or os.getuid() == 0: + raise except OSError as err: if err.errno != errno.EOPNOTSUPP: raise @@ -2673,6 +2703,10 @@ def get_cli_args(): rp.add_argument("--freezecg", help="Use freeze cgroup (path:state)") rp.add_argument("--user", help="Run CRIU as regular user", action='store_true') + rp.add_argument( + "--rootless", + help="Run CRIU rootless (uid!=0) (needs CAP_CHECKPOINT_RESTORE)", + action='store_true') rp.add_argument("--rpc", help="Run CRIU via RPC rather than CLI", action='store_true') diff --git a/test/zdtm/lib/test.c b/test/zdtm/lib/test.c index 57eb42046..6291ea4a7 100644 --- a/test/zdtm/lib/test.c +++ b/test/zdtm/lib/test.c @@ -239,34 +239,37 @@ void test_init(int argc, char **argv) exit(1); } - val = getenv("ZDTM_GROUPS"); - if (val) { - char *tok = NULL; - unsigned int size = 0, groups[NGROUPS_MAX]; + val = getenv("ZDTM_ROOTLESS"); + if (!val) { + val = getenv("ZDTM_GROUPS"); + if (val) { + char *tok = NULL; + unsigned int size = 0, groups[NGROUPS_MAX]; - tok = strtok(val, " "); - while (tok) { - size++; - groups[size - 1] = atoi(tok); - tok = strtok(NULL, " "); + tok = strtok(val, " "); + while (tok) { + size++; + groups[size - 1] = atoi(tok); + tok = strtok(NULL, " "); + } + + if (setgroups(size, groups)) { + fprintf(stderr, "Can't set groups: %m"); + exit(1); + } } - if (setgroups(size, groups)) { - fprintf(stderr, "Can't set groups: %m"); + val = getenv("ZDTM_GID"); + if (val && (setgid(atoi(val)) == -1)) { + fprintf(stderr, "Can't set gid: %m"); exit(1); } - } - val = getenv("ZDTM_GID"); - if (val && (setgid(atoi(val)) == -1)) { - fprintf(stderr, "Can't set gid: %m"); - exit(1); - } - - val = getenv("ZDTM_UID"); - if (val && (setuid(atoi(val)) == -1)) { - fprintf(stderr, "Can't set gid: %m"); - exit(1); + val = getenv("ZDTM_UID"); + if (val && (setuid(atoi(val)) == -1)) { + fprintf(stderr, "Can't set gid: %m"); + exit(1); + } } if (prctl(PR_SET_DUMPABLE, 1)) { diff --git a/test/zdtm_ct.c b/test/zdtm_ct.c index 
0e8eeff8a..5e849b904 100644 --- a/test/zdtm_ct.c +++ b/test/zdtm_ct.c @@ -93,44 +93,50 @@ static int create_timens(void) int main(int argc, char **argv) { + uid_t uid; pid_t pid; int status; + uid = getuid(); + /* * pidns is used to avoid conflicts * mntns is used to mount /proc * net is used to avoid conflicts of parasite sockets */ - if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC)) - return 1; + if (!uid) + if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC)) + return 1; pid = fork(); if (pid == 0) { - if (create_timens()) - exit(1); - if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL)) { - fprintf(stderr, "mount(/, S_REC | MS_SLAVE)): %m"); - return 1; + if (!uid) { + if (create_timens()) + exit(1); + if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL)) { + fprintf(stderr, "mount(/, S_REC | MS_SLAVE)): %m"); + return 1; + } + umount2("/proc", MNT_DETACH); + umount2("/dev/pts", MNT_DETACH); + if (mount("zdtm_proc", "/proc", "proc", 0, NULL)) { + fprintf(stderr, "mount(/proc): %m"); + return 1; + } + if (mount("zdtm_devpts", "/dev/pts", "devpts", 0, "newinstance,ptmxmode=0666")) { + fprintf(stderr, "mount(pts): %m"); + return 1; + } + if (mount("zdtm_binfmt", "/proc/sys/fs/binfmt_misc", "binfmt_misc", 0, NULL)) { + fprintf(stderr, "mount(binfmt_misc): %m"); + return 1; + } + if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL)) { + fprintf(stderr, "mount(ptmx): %m"); + return 1; + } + if (system("ip link set up dev lo")) + return 1; } - umount2("/proc", MNT_DETACH); - umount2("/dev/pts", MNT_DETACH); - if (mount("zdtm_proc", "/proc", "proc", 0, NULL)) { - fprintf(stderr, "mount(/proc): %m"); - return 1; - } - if (mount("zdtm_devpts", "/dev/pts", "devpts", 0, "newinstance,ptmxmode=0666")) { - fprintf(stderr, "mount(pts): %m"); - return 1; - } - if (mount("zdtm_binfmt", "/proc/sys/fs/binfmt_misc", "binfmt_misc", 0, NULL)) { - fprintf(stderr, "mount(binfmt_misc): %m"); - return 1; - } - if (mount("/dev/pts/ptmx", 
"/dev/ptmx", NULL, MS_BIND, NULL)) { - fprintf(stderr, "mount(ptmx): %m"); - return 1; - } - if (system("ip link set up dev lo")) - return 1; execv(argv[1], argv + 1); fprintf(stderr, "execve: %m"); return 1; From 1cba559da4684649bb344bb60217cef70705af0a Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 24 Jul 2020 16:21:51 +0000 Subject: [PATCH 073/775] non-root: add non-root test case to cirrus runs Run env00 and pthread00 test as non-root as initial proof of concept. Signed-off-by: Adrian Reber --- .cirrus.yml | 21 +++++++++++++++++++++ scripts/ci/Makefile | 5 ++++- scripts/ci/vagrant.sh | 12 ++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index c7ed5027a..bad3a12b4 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -68,6 +68,27 @@ task: build_script: | make -C scripts/ci vagrant-fedora-rawhide +task: + name: Vagrant Fedora based test (non-root) + environment: + HOME: "/root" + CIRRUS_WORKING_DIR: "/tmp/criu" + + compute_engine_instance: + image_project: cirrus-images + image: family/docker-kvm + platform: linux + cpu: 4 + memory: 16G + nested_virtualization: true + + setup_script: | + scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + sudo kvm-ok + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto + build_script: | + make -C scripts/ci vagrant-fedora-non-root + task: name: CentOS Stream 8 based test environment: diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 3a1634fb8..30dd9ebeb 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -97,7 +97,10 @@ vagrant-fedora-no-vdso: setup-vagrant vagrant-fedora-rawhide: setup-vagrant ./vagrant.sh fedora-rawhide -.PHONY: setup-vagrant vagrant-fedora-no-vdso vagrant-fedora-rawhide +vagrant-fedora-non-root: setup-vagrant + ./vagrant.sh fedora-non-root + +.PHONY: setup-vagrant vagrant-fedora-no-vdso vagrant-fedora-rawhide vagrant-fedora-non-root %: $(MAKE) -C ../build 
$@$(target-suffix) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index af0f7335a..e23486f29 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -68,4 +68,16 @@ fedora-rawhide() { ssh default 'cd /vagrant; tar xf criu.tar; cd criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' } +fedora-non-root() { + ssh default uname -a + ssh default 'cd /vagrant; tar xf criu.tar; cd criu; make -j 4' + # Setting the capability should be the only line needed to run as non-root on Fedora + # In other environments either set /proc/sys/kernel/yama/ptrace_scope to 0 or grant cap_sys_ptrace to criu + ssh default 'sudo setcap cap_checkpoint_restore+eip /vagrant/criu/criu/criu' + # Run it once as non-root + ssh default 'cd /vagrant/criu; criu/criu check --unprivileged; ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h --rootless' + # Run it as root with '--rootless' + ssh default 'cd /vagrant/criu; sudo ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h; sudo chmod 777 test/dump/zdtm/static/{env00,pthread00}; sudo ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h --rootless' +} + $1 From 3db8d1a6c68556b0465f7513446f7ce97e3c29d7 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 20 Oct 2022 18:25:11 +0300 Subject: [PATCH 074/775] cgroup: add a comment to restore_cgroup_prop about path argument requirements In Virtuozzo we've faced an out-of-bounds access when calling this function on a short path string, which corrupted other memory and led to a segmentation fault. So it may be useful to have this comment in the code to avoid such misuse of this function in the future.
Signed-off-by: Pavel Tikhomirov --- criu/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/criu/cgroup.c b/criu/cgroup.c index d886ce9f2..6f6117c21 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -1303,6 +1303,10 @@ static int restore_perms(int fd, const char *path, CgroupPerms *perms) return 0; } +/* + * Note: The path string can be modified in this function, + * the length of path string should be at least PATH_MAX. + */ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *path, int off, bool split_lines, bool skip_fails) { From 840735aa08ce2a13d01ddb59112963d6c4327d7a Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 23 Oct 2022 14:16:21 +0700 Subject: [PATCH 075/775] ipc_sysctl: Prioritize restoring IPC variables using non usernsd approach Since commit https://github.com/torvalds/linux/commit/5563cabdde, user with enough capability can open IPC sysctl files and write to them. Therefore, we don't need to use usernsd process in the outside user namespace to help with that anymore. Furthermore, some later commits: https://github.com/torvalds/linux/commit/1f5c135ee5, https://github.com/torvalds/linux/commit/0889f44e28 bind the IPC namespace to the opened file descriptor of IPC sysctl at the open() time, the changed value does not depend on the IPC namespace of write() time anymore. This breaks the current usernsd approach. So, we prioritize opening/writing IPC sysctl files in the context of restored process directly without usernsd help. This approach succeeds in the newer kernel since the restored process has enough capabilities at this restore stage. With older kernel, the open() fails and we fallback to the usernsd approach. 
Signed-off-by: Bui Quang Minh --- criu/include/sysctl.h | 7 ++++--- criu/ipc_ns.c | 11 ++++++++--- criu/sysctl.c | 35 +++++++++++++++++++++++++++++++++-- 3 files changed, 45 insertions(+), 8 deletions(-) diff --git a/criu/include/sysctl.h b/criu/include/sysctl.h index ac7924dcd..cb3eba817 100644 --- a/criu/include/sysctl.h +++ b/criu/include/sysctl.h @@ -34,8 +34,9 @@ enum { /* * Some entries might be missing mark them as optional. */ -#define CTL_FLAGS_OPTIONAL 1 -#define CTL_FLAGS_HAS 2 -#define CTL_FLAGS_READ_EIO_SKIP 4 +#define CTL_FLAGS_OPTIONAL 1 +#define CTL_FLAGS_HAS 2 +#define CTL_FLAGS_READ_EIO_SKIP 4 +#define CTL_FLAGS_IPC_EACCES_SKIP 5 #endif /* __CR_SYSCTL_H__ */ diff --git a/criu/ipc_ns.c b/criu/ipc_ns.c index 4fe082fbb..7e95be8c5 100644 --- a/criu/ipc_ns.c +++ b/criu/ipc_ns.c @@ -292,6 +292,8 @@ static void pr_info_ipc_shm(const IpcShmEntry *shm) static int ipc_sysctl_req(IpcVarEntry *e, int op) { + int i; + struct sysctl_req req[] = { { "kernel/sem", e->sem_ctls, CTL_U32A(e->n_sem_ctls) }, { "kernel/msgmax", &e->msg_ctlmax, CTL_U32 }, @@ -332,6 +334,9 @@ static int ipc_sysctl_req(IpcVarEntry *e, int op) if (e->has_shm_next_id) req[nr++] = req[16]; + for (i = 0; i < nr; i++) + req[i].flags = CTL_FLAGS_IPC_EACCES_SKIP; + return sysctl_op(req, nr, op, CLONE_NEWIPC); } @@ -570,7 +575,7 @@ static int prepare_ipc_sem_desc(struct cr_img *img, const IpcSemEntry *sem) { int ret, id; struct sysctl_req req[] = { - { "kernel/sem_next_id", &sem->desc->id, CTL_U32 }, + { "kernel/sem_next_id", &sem->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct semid_ds semid; @@ -703,7 +708,7 @@ static int prepare_ipc_msg_queue(struct cr_img *img, const IpcMsgEntry *msq) { int ret, id; struct sysctl_req req[] = { - { "kernel/msg_next_id", &msq->desc->id, CTL_U32 }, + { "kernel/msg_next_id", &msq->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct msqid_ds msqid; @@ -841,7 +846,7 @@ static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm) { int 
ret, id, hugetlb_flag = 0; struct sysctl_req req[] = { - { "kernel/shm_next_id", &shm->desc->id, CTL_U32 }, + { "kernel/shm_next_id", &shm->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct shmid_ds shmid; diff --git a/criu/sysctl.c b/criu/sysctl.c index b06688712..99026acf4 100644 --- a/criu/sysctl.c +++ b/criu/sysctl.c @@ -203,6 +203,17 @@ static int __userns_sysctl_op(void *arg, int proc_fd, pid_t pid) * 2. forks a task * 3. setns()es to the UTS/IPC namespace of the caller * 4. write()s to the files and exits + * + * For the IPC namespace, since + * https://github.com/torvalds/linux/commit/5563cabdde, user with + * enough capability can open IPC sysctl files and write to it. Later + * commit https://github.com/torvalds/linux/commit/1f5c135ee5 and + * https://github.com/torvalds/linux/commit/0889f44e28 bind the IPC + * namespace at the open() time so the changed value does not depend + * on the IPC namespace at the write() time. Also, the permission check + * changes a little bit which makes the above approach unusable but we + * can simply use nonuserns version for restoring as IPC sysctl as the + * restored process currently has enough capability. 
*/ dir = open("/proc/sys", O_RDONLY, O_DIRECTORY); if (dir < 0) { @@ -335,9 +346,12 @@ out: return ret; } -static int __nonuserns_sysctl_op(struct sysctl_req *req, size_t nr_req, int op) +/* exit_code = 1 in case nonuserns failed but we want to fallback to userns approach */ +static int __nonuserns_sysctl_op(struct sysctl_req **orig_req, size_t *orig_nr_req, int op) { int ret, exit_code = -1; + struct sysctl_req *req = *orig_req; + size_t nr_req = *orig_nr_req; while (nr_req--) { int fd; @@ -351,6 +365,14 @@ static int __nonuserns_sysctl_op(struct sysctl_req *req, size_t nr_req, int op) req++; continue; } + if (errno == EACCES && (req->flags & CTL_FLAGS_IPC_EACCES_SKIP)) { + /* The remaining requests are restored using userns approach */ + *orig_req = req; + *orig_nr_req = nr_req + 1; + exit_code = 1; + goto out; + } + pr_perror("Can't open sysctl %s", req->name); goto out; } @@ -404,7 +426,16 @@ int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns) * so we can do those in process as well. */ if (!ns || ns & CLONE_NEWNET || op == CTL_READ) - return __nonuserns_sysctl_op(req, nr_req, op); + return __nonuserns_sysctl_op(&req, &nr_req, op); + + /* Try to use nonuserns for restoring IPC sysctl and fallback to + * userns approach when the returned code is 1. + */ + if (ns & CLONE_NEWIPC && op == CTL_WRITE) { + ret = __nonuserns_sysctl_op(&req, &nr_req, op); + if (ret <= 0) + return ret; + } /* * In order to avoid lots of opening of /proc/sys for each struct sysctl_req, From 8a336ab22673bd8dbc838076edfc75dce809e781 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 25 Oct 2022 17:36:58 +0200 Subject: [PATCH 076/775] Switch aarch64 builds to Cirrus CI It seems like drone.io no longer provides free aarch64/armhf CI runs. This switches the aarch64 CI runs to Cirrus CI. armhf CI runs have been dropped for now as they are not directly supported. 
Signed-off-by: Adrian Reber --- .cirrus.yml | 37 ++++++++++++++++++++++++ .drone.yml | 82 ----------------------------------------------------- 2 files changed, 37 insertions(+), 82 deletions(-) delete mode 100644 .drone.yml diff --git a/.cirrus.yml b/.cirrus.yml index bad3a12b4..dbfb899ff 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -148,3 +148,40 @@ task: build_script: | make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_IGNORE_TAINT=1 ZDTM_OPTS="-x zdtm/static/socket-raw -x zdtm/static/child_subreaper_existing_child -x zdtm/static/fifo_upon_unix_socket01 -x zdtm/static/overmount_sock -x zdtm/static/tempfs_overmounted" + +task: + name: aarch64 build GCC (native) + arm_container: + image: docker.io/library/ubuntu:jammy + cpu: 4 + memory: 4G + script: uname -a + build_script: | + scripts/ci/apt-install make + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto + make -C scripts/ci local + +task: + name: aarch64 build CLANG (native) + arm_container: + image: docker.io/library/ubuntu:jammy + cpu: 4 + memory: 4G + script: uname -a + build_script: | + scripts/ci/apt-install make + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto + make -C scripts/ci local CLANG=1 + +task: + name: aarch64 Fedora Rawhide + arm_container: + image: registry.fedoraproject.org/fedora:rawhide + cpu: 4 + memory: 4G + script: uname -a + build_script: | + scripts/ci/prepare-for-fedora-rawhide.sh + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto + make -C scripts/ci/ local CC=gcc SKIP_CI_PREP=1 SKIP_CI_TEST=1 CD_TO_TOP=1 + make -C test/zdtm -j 4 diff --git a/.drone.yml b/.drone.yml deleted file mode 100644 index 07eb8be65..000000000 --- a/.drone.yml +++ /dev/null @@ -1,82 +0,0 @@ ---- -kind: pipeline -type: docker -name: aarch64 build GCC (native) - -platform: - os: linux - arch: arm64 - -steps: -- name: build - image: ubuntu:focal - commands: - - 
scripts/ci/apt-install make - - make -C scripts/ci local - ---- -kind: pipeline -type: docker -name: aarch64 build CLANG (native) - -platform: - os: linux - arch: arm64 - -steps: -- name: build - image: ubuntu:focal - commands: - - scripts/ci/apt-install make - - make -C scripts/ci local CLANG=1 - ---- -kind: pipeline -type: docker -name: armhf build GCC (native) - -platform: - os: linux - arch: arm - -steps: -- name: build - # At the time of setting up focal did not work - image: ubuntu:bionic - commands: - - scripts/ci/apt-install make - - make -C scripts/ci local - ---- -kind: pipeline -type: docker -name: armhf build CLANG (native) - -platform: - os: linux - arch: arm - -steps: -- name: build - # At the time of setting up focal did not work - image: ubuntu:bionic - commands: - - scripts/ci/apt-install make - - make -C scripts/ci local CLANG=1 - ---- -kind: pipeline -type: docker -name: aarch64 Fedora Rawhide - -platform: - os: linux - arch: arm64 - -steps: -- name: build - image: registry.fedoraproject.org/fedora:rawhide - commands: - - scripts/ci/prepare-for-fedora-rawhide.sh - - make -C scripts/ci/ local CC=gcc SKIP_CI_PREP=1 SKIP_CI_TEST=1 CD_TO_TOP=1 - - make -C test/zdtm -j 4 From 167cfd366edc2a10e61ae8c1ceb8e75bfe422937 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 4 Sep 2022 14:46:33 +0700 Subject: [PATCH 077/775] cgroup-v2: Checkpoint and restore some global properties This commit supports checkpoint/restore some new global properties in cgroup-v2 cgroup.subtree_control cgroup.max.descendants cgroup.max.depth cgroup.freeze cgroup.type Only cgroup.subtree_control, cgroup.type need some more code to handle. cgroup.subtree_control value needs to be set with "+", "-" prefix and cgroup.type can only be written with value "threaded" if we want to make this controller threaded. cgroup.type is a special property because this property must be restored before any processes can move into this controller. 
Signed-off-by: Bui Quang Minh --- criu/cgroup-props.c | 17 ++++++ criu/cgroup.c | 108 ++++++++++++++++++++++++++++++++---- criu/include/cgroup-props.h | 1 + 3 files changed, 116 insertions(+), 10 deletions(-) diff --git a/criu/cgroup-props.c b/criu/cgroup-props.c index 5bed7dd9d..1b85c5b5a 100644 --- a/criu/cgroup-props.c +++ b/criu/cgroup-props.c @@ -35,12 +35,29 @@ static const char *____criu_global_props____[] = { "tasks", }; +/* cgroup2 global properties */ +// clang-format off +static const char *____criu_global_props_v2____[] = { + "cgroup.subtree_control", + "cgroup.max.descendants", + "cgroup.max.depth", + "cgroup.freeze", + "cgroup.type", +}; +// clang-format on + cgp_t cgp_global = { .name = "____criu_global_props____", .nr_props = ARRAY_SIZE(____criu_global_props____), .props = ____criu_global_props____, }; +cgp_t cgp_global_v2 = { + .name = "____criu_global_props_v2____", + .nr_props = ARRAY_SIZE(____criu_global_props_v2____), + .props = ____criu_global_props_v2____, +}; + typedef struct { struct list_head list; cgp_t cgp; diff --git a/criu/cgroup.c b/criu/cgroup.c index 6f6117c21..4f68836be 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -441,7 +441,15 @@ static int add_cgroup_properties(const char *fpath, struct cgroup_dir *ncd, stru pr_err("dumping known properties failed\n"); return -1; } + } + /* cgroup v2 */ + if (controller->controllers[0][0] == 0) { + if (dump_cg_props_array(fpath, ncd, &cgp_global_v2) < 0) { + pr_err("dumping global properties v2 failed\n"); + return -1; + } + } else { if (dump_cg_props_array(fpath, ncd, &cgp_global) < 0) { pr_err("dumping global properties failed\n"); return -1; @@ -1061,8 +1069,15 @@ static int ctrl_dir_and_opt(CgControllerEntry *ctl, char *dir, int ds, char *opt * it. We restore these properties as soon as the cgroup is created. 
*/ static const char *special_props[] = { - "cpuset.cpus", "cpuset.mems", "devices.list", "memory.kmem.limit_in_bytes", - "memory.swappiness", "memory.oom_control", "memory.use_hierarchy", NULL, + "cpuset.cpus", + "cpuset.mems", + "devices.list", + "memory.kmem.limit_in_bytes", + "memory.swappiness", + "memory.oom_control", + "memory.use_hierarchy", + "cgroup.type", + NULL, }; bool is_special_property(const char *prop) @@ -1303,6 +1318,65 @@ static int restore_perms(int fd, const char *path, CgroupPerms *perms) return 0; } +static int add_subtree_control_prop_prefix(char *input, char *output, char prefix) +{ + char *current, *next; + size_t len, off = 0; + + current = input; + do { + next = strchrnul(current, ' '); + len = next - current; + + output[off] = prefix; + off++; + memcpy(output + off, current, len); + off += len; + output[off] = ' '; + off++; + + current = next + 1; + } while (*next != '\0'); + + return off; +} + +static int restore_cgroup_subtree_control(const CgroupPropEntry *cg_prop_entry_p, int fd) +{ + char buf[1024]; + char line[1024]; + int ret, off = 0; + + ret = read(fd, buf, sizeof(buf) - 1); + if (ret < 0) { + pr_perror("read from cgroup.subtree_control"); + return ret; + } + /* Remove the trailing newline */ + buf[ret] = '\0'; + + /* Remove all current subsys in subtree_control */ + if (buf[0] != '\0') + off = add_subtree_control_prop_prefix(buf, line, '-'); + + /* Add subsys need to be restored in subtree_control */ + if (cg_prop_entry_p->value[0] != '\0') + off += add_subtree_control_prop_prefix(cg_prop_entry_p->value, line + off, '+'); + + /* Remove the trailing space */ + if (off != 0) { + off--; + line[off] = '\0'; + } + + if (write(fd, line, off) != off) { + pr_perror("write to cgroup.subtree_control"); + return -1; + } + + return 0; +} + /* * Note: The path string can be modified in this function, * the length of path string should be at least PATH_MAX. 
@@ -1310,8 +1384,9 @@ static int restore_perms(int fd, const char *path, CgroupPerms *perms) static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *path, int off, bool split_lines, bool skip_fails) { - int cg, fd, ret = -1; + int cg, fd, ret = -1, flag; CgroupPerms *perms = cg_prop_entry_p->perms; + int is_subtree_control = !strcmp(cg_prop_entry_p->name, "cgroup.subtree_control"); if (opts.manage_cgroups == CG_MODE_IGNORE) return 0; @@ -1328,8 +1403,13 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *pat pr_info("Restoring cgroup property value [%s] to [%s]\n", cg_prop_entry_p->value, path); + if (is_subtree_control) + flag = O_RDWR; + else + flag = O_WRONLY; + cg = get_service_fd(CGROUP_YARD); - fd = openat(cg, path, O_WRONLY); + fd = openat(cg, path, flag); if (fd < 0) { pr_perror("bad cgroup path: %s", path); return -1; @@ -1344,6 +1424,17 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *pat goto out; } + if (is_subtree_control) { + ret = restore_cgroup_subtree_control(cg_prop_entry_p, fd); + goto out; + } + + /* skip restoring cgroup.type if its value is not "threaded" */ + if (!strcmp(cg_prop_entry_p->name, "cgroup.type") && strcmp(cg_prop_entry_p->value, "threaded")) { + ret = 0; + goto out; + } + if (split_lines) { char *line = cg_prop_entry_p->value; char *next_line; @@ -1688,12 +1779,9 @@ static int prepare_cgroup_dirs(char **controllers, int n_controllers, char *paux return -1; for (j = 0; j < n_controllers; j++) { - if (!strcmp(controllers[j], "cpuset") || !strcmp(controllers[j], "memory") || - !strcmp(controllers[j], "devices")) { - if (restore_special_props(paux, off2, e) < 0) { - pr_err("Restoring special cpuset props failed!\n"); - return -1; - } + if (restore_special_props(paux, off2, e) < 0) { + pr_err("Restoring special cpuset props failed!\n"); + return -1; } } } else { diff --git a/criu/include/cgroup-props.h b/criu/include/cgroup-props.h index 
11b677548..10a7061b8 100644 --- a/criu/include/cgroup-props.h +++ b/criu/include/cgroup-props.h @@ -10,6 +10,7 @@ typedef struct { } cgp_t; extern cgp_t cgp_global; +extern cgp_t cgp_global_v2; extern const cgp_t *cgp_get_props(const char *name); extern bool cgp_should_skip_controller(const char *name); extern bool cgp_add_dump_controller(const char *name); From d7e8746598015cfeee0d114fed89cdb5531abd2c Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Tue, 13 Sep 2022 21:42:48 +0700 Subject: [PATCH 078/775] zdtm: Add write_value/read_value helpers into zdtm library Add write_value/read_value helpers, which write/read a buffer to/from files, to the zdtm library. Signed-off-by: Bui Quang Minh --- test/zdtm/lib/Makefile | 2 +- test/zdtm/lib/file.c | 46 +++++++++++++++++++++++++++++++++++++ test/zdtm/lib/zdtmtst.h | 3 +++ test/zdtm/static/cgroup04.c | 20 ---------------- 4 files changed, 50 insertions(+), 21 deletions(-) create mode 100644 test/zdtm/lib/file.c diff --git a/test/zdtm/lib/Makefile b/test/zdtm/lib/Makefile index 3ec58dfaf..949dc123a 100644 --- a/test/zdtm/lib/Makefile +++ b/test/zdtm/lib/Makefile @@ -4,7 +4,7 @@ CFLAGS += $(USERCFLAGS) LIB := libzdtmtst.a -LIBSRC := datagen.c msg.c parseargs.c test.c streamutil.c lock.c ns.c tcp.c unix.c fs.c sysctl.c mem.c +LIBSRC := datagen.c msg.c parseargs.c test.c streamutil.c lock.c ns.c tcp.c unix.c fs.c sysctl.c mem.c file.c PKG_CONFIG ?= pkg-config pkg-config-check = $(shell sh -c '$(PKG_CONFIG) $(1) && echo y') diff --git a/test/zdtm/lib/file.c b/test/zdtm/lib/file.c new file mode 100644 index 000000000..57d85421d --- /dev/null +++ b/test/zdtm/lib/file.c @@ -0,0 +1,46 @@ +#include +#include +#include "zdtmtst.h" + +int write_value(const char *path, const char *value) +{ + int fd, l; + + fd = open(path, O_WRONLY); + if (fd < 0) { + pr_perror("open %s", path); + return -1; + } + + l = write(fd, value, strlen(value)); + if (l < 0) { + pr_perror("failed to write %s to %s", value, path); + close(fd); + return -1; + } + +
close(fd); + return 0; +} + +int read_value(const char *path, char *value, int size) +{ + int fd, ret; + + fd = open(path, O_RDONLY); + if (fd < 0) { + pr_perror("open %s", path); + return -1; + } + + ret = read(fd, (void *)value, size); + if (ret < 0) { + pr_perror("read %s", path); + close(fd); + return -1; + } + + value[ret] = '\0'; + close(fd); + return 0; +} diff --git a/test/zdtm/lib/zdtmtst.h b/test/zdtm/lib/zdtmtst.h index d91886d25..105f3c11a 100644 --- a/test/zdtm/lib/zdtmtst.h +++ b/test/zdtm/lib/zdtmtst.h @@ -216,4 +216,7 @@ static inline void cleanup_closep(void *p) TEMP_FAILURE_RETRY(close(*pp)); } +extern int write_value(const char *path, const char *value); +extern int read_value(const char *path, char *value, int size); + #endif /* _VIMITESU_H_ */ diff --git a/test/zdtm/static/cgroup04.c b/test/zdtm/static/cgroup04.c index 5a424be12..8c40ffd6b 100644 --- a/test/zdtm/static/cgroup04.c +++ b/test/zdtm/static/cgroup04.c @@ -19,26 +19,6 @@ char *dirname; TEST_OPTION(dirname, string, "cgroup directory name", 1); static const char *cgname = "zdtmtst"; -int write_value(const char *path, const char *value) -{ - int fd, l; - - fd = open(path, O_WRONLY); - if (fd < 0) { - pr_perror("open %s", path); - return -1; - } - - l = write(fd, value, strlen(value)); - close(fd); - if (l < 0) { - pr_perror("failed to write %s to %s", value, path); - return -1; - } - - return 0; -} - int mount_and_add(const char *controller, const char *path, const char *prop, const char *value) { char aux[1024], paux[1024], subdir[1024]; From ad3936e81e1484d1abfbcec2c4ca1b41ae0d570b Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 4 Sep 2022 15:05:10 +0700 Subject: [PATCH 079/775] zdtm: Add test to check global properties of cgroup-v2 are preserved Check that CRIU can checkpoint/restore global properties in cgroup-v2 properly. 
Signed-off-by: Bui Quang Minh --- test/zdtm/static/Makefile | 1 + test/zdtm/static/cgroupv2_00.c | 86 ++++++++++++++++++++++++++ test/zdtm/static/cgroupv2_00.checkskip | 11 ++++ test/zdtm/static/cgroupv2_00.desc | 1 + test/zdtm/static/cgroupv2_00.hook | 16 +++++ 5 files changed, 115 insertions(+) create mode 100644 test/zdtm/static/cgroupv2_00.c create mode 100755 test/zdtm/static/cgroupv2_00.checkskip create mode 100644 test/zdtm/static/cgroupv2_00.desc create mode 100755 test/zdtm/static/cgroupv2_00.hook diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 0ac22731b..915e565bd 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -386,6 +386,7 @@ TST_DIR = \ cgroup02 \ cgroup03 \ cgroup04 \ + cgroupv2_00 \ cgroup_ifpriomap \ cgroup_ignore \ cgroup_stray \ diff --git a/test/zdtm/static/cgroupv2_00.c b/test/zdtm/static/cgroupv2_00.c new file mode 100644 index 000000000..2c6780e0c --- /dev/null +++ b/test/zdtm/static/cgroupv2_00.c @@ -0,0 +1,86 @@ +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that some cgroup-v2 properties in kernel controllers are preserved"; +const char *test_author = "Bui Quang Minh "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup-v2 directory name", 1); +const char *cgname = "subcg00"; + +int main(int argc, char **argv) +{ + char path[1024], aux[1024]; + int ret = -1; + + test_init(argc, argv); + + if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + return -1; + } + + if (mount("cgroup2", dirname, "cgroup2", 0, NULL)) { + pr_perror("Can't mount cgroup-v2"); + return -1; + } + + sprintf(path, "%s/%s", dirname, cgname); + if (mkdir(path, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + goto out; + } + + /* Make cpuset controllers available in children directory */ + sprintf(path, "%s/%s", dirname, "cgroup.subtree_control"); + sprintf(aux, "%s", "+cpuset"); + if (write_value(path, aux)) + goto out; + + sprintf(path, 
"%s/%s/%s", dirname, cgname, "cgroup.subtree_control"); + sprintf(aux, "%s", "+cpuset"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.type"); + sprintf(aux, "%s", "threaded"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.procs"); + sprintf(aux, "%d", getpid()); + if (write_value(path, aux)) + goto out; + + test_daemon(); + test_waitsig(); + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.subtree_control"); + if (read_value(path, aux, sizeof(aux))) + goto out; + + if (strcmp(aux, "cpuset\n")) { + fail("cgroup.subtree_control mismatches"); + goto out; + } + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.type"); + if (read_value(path, aux, sizeof(aux))) + goto out; + + if (strcmp(aux, "threaded\n")) { + fail("cgroup.type mismatches"); + goto out; + } + + pass(); + + ret = 0; + +out: + sprintf(path, "%s", dirname); + umount(path); + return ret; +} diff --git a/test/zdtm/static/cgroupv2_00.checkskip b/test/zdtm/static/cgroupv2_00.checkskip new file mode 100755 index 000000000..375ed3564 --- /dev/null +++ b/test/zdtm/static/cgroupv2_00.checkskip @@ -0,0 +1,11 @@ +#!/bin/bash + +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + grep -q "cpuset" /sys/fs/cgroup/cgroup.controllers && exit 0 +fi + +if [ -d /sys/fs/cgroup/unified ]; then + grep -q "cpuset" /sys/fs/cgroup/unified/cgroup.controllers && exit 0 +fi + +exit 1 diff --git a/test/zdtm/static/cgroupv2_00.desc b/test/zdtm/static/cgroupv2_00.desc new file mode 100644 index 000000000..4bfd4b265 --- /dev/null +++ b/test/zdtm/static/cgroupv2_00.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid', 'opts': '--manage-cgroups=full'} diff --git a/test/zdtm/static/cgroupv2_00.hook b/test/zdtm/static/cgroupv2_00.hook new file mode 100755 index 000000000..1002b1ec5 --- /dev/null +++ b/test/zdtm/static/cgroupv2_00.hook @@ -0,0 +1,16 @@ +#!/bin/bash + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 + 
+set -e +cgname="subcg00" +tname=$(mktemp -d cgclean.XXXXXX) +mount -t cgroup2 cgroup2 $tname + +echo "Cleaning $tname" +echo "-cpuset" > "$tname/$cgname/cgroup.subtree_control" + +set +e +rmdir "$tname/$cgname" +umount "$tname" +rmdir "$tname" From 17d1d8810e8ba7b5fa4a117a80b86a39cc98da2e Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 4 Sep 2022 15:31:13 +0700 Subject: [PATCH 080/775] cgroup-v2: Dump cgroup controllers of every threads in a process Currently, we assume all threads in a process are in the same cgroup controllers. However, with threaded controllers, threads in a process may be in different controllers. So we need to dump the cgroup controllers of every thread in a process and fix up the procfs cgroup parsing to parse from self/task/<tid>/cgroup. Signed-off-by: Bui Quang Minh --- criu/cgroup.c | 38 ++++++++++++++++++++--------- criu/cr-dump.c | 53 +++++++++++++++++++++++++++++++++++++++-- criu/image.c | 2 +- criu/include/cgroup.h | 8 +++++-- criu/include/parasite.h | 7 +++++- criu/parasite-syscall.c | 1 + criu/pie/parasite.c | 2 +- criu/proc_parse.c | 5 ++-- images/cgroup.proto | 1 + images/core.proto | 1 + 10 files changed, 98 insertions(+), 20 deletions(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index 4f68836be..b238b6402 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -174,6 +174,7 @@ struct cg_controller *new_controller(const char *name) nc->n_controllers = 1; nc->n_heads = 0; + nc->is_threaded = false; INIT_LIST_HEAD(&nc->heads); return nc; @@ -371,7 +372,8 @@ static void free_all_cgroup_props(struct cgroup_dir *ncd) ncd->n_properties = 0; } -static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const cgp_t *cgp) +static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const cgp_t *cgp, + struct cg_controller *controller) { int j; char buf[PATH_MAX]; @@ -422,6 +424,13 @@ static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const prop->value = new; } + /* + * Set the is_threaded flag if
cgroup.type's value is threaded, + * ignore all other values. + */ + if (!strcmp("cgroup.type", prop->name) && !strcmp("threaded", prop->value)) + controller->is_threaded = true; + pr_info("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name); list_add_tail(&prop->list, &ncd->properties); ncd->n_properties++; @@ -437,7 +446,7 @@ static int add_cgroup_properties(const char *fpath, struct cgroup_dir *ncd, stru for (i = 0; i < controller->n_controllers; ++i) { const cgp_t *cgp = cgp_get_props(controller->controllers[i]); - if (dump_cg_props_array(fpath, ncd, cgp) < 0) { + if (dump_cg_props_array(fpath, ncd, cgp, controller) < 0) { pr_err("dumping known properties failed\n"); return -1; } @@ -445,12 +454,12 @@ static int add_cgroup_properties(const char *fpath, struct cgroup_dir *ncd, stru /* cgroup v2 */ if (controller->controllers[0][0] == 0) { - if (dump_cg_props_array(fpath, ncd, &cgp_global_v2) < 0) { + if (dump_cg_props_array(fpath, ncd, &cgp_global_v2, controller) < 0) { pr_err("dumping global properties v2 failed\n"); return -1; } } else { - if (dump_cg_props_array(fpath, ncd, &cgp_global) < 0) { + if (dump_cg_props_array(fpath, ncd, &cgp_global, controller) < 0) { pr_err("dumping global properties failed\n"); return -1; } @@ -735,9 +744,9 @@ static int collect_cgroups(struct list_head *ctls) return 0; } -int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_cgroup_args *args) +int dump_thread_cgroup(const struct pstree_item *item, u32 *cg_id, struct parasite_dump_cgroup_args *args, int id) { - int pid; + int pid, tid; LIST_HEAD(ctls); unsigned int n_ctls = 0; struct cg_set *cs; @@ -750,8 +759,13 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_ else pid = getpid(); - pr_info("Dumping cgroups for %d\n", pid); - if (parse_task_cgroup(pid, args, &ctls, &n_ctls)) + if (id < 0) + tid = pid; + else + tid = item->threads[id].real; + + pr_info("Dumping cgroups for thread %d\n", tid); + if 
(parse_thread_cgroup(pid, tid, args, &ctls, &n_ctls)) return -1; cs = get_cg_set(&ctls, n_ctls, item); @@ -764,9 +778,10 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_ pr_info("Set %d is criu one\n", cs->id); } else { if (item == root_item) { - BUG_ON(root_cgset); - root_cgset = cs; - pr_info("Set %d is root one\n", cs->id); + if (!root_cgset) { + root_cgset = cs; + pr_info("Set %d is root one\n", cs->id); + } } else { struct cg_ctl *root, *stray; @@ -913,6 +928,7 @@ static int dump_controllers(CgroupEntry *cg) list_for_each_entry(cur, &cgroups, l) { cg_controller_entry__init(ce); + ce->is_threaded = cur->is_threaded; ce->cnames = cur->controllers; ce->n_cnames = cur->n_controllers; ce->n_dirs = cur->n_heads; diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 210f66232..e31b2f702 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -759,6 +759,7 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item pid_t pid = item->pid->real; int ret = -1; struct parasite_dump_cgroup_args cgroup_args, *info = NULL; + u32 *cg_set; BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN); @@ -804,13 +805,23 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item */ if (item->ids->has_cgroup_ns_id && !item->parent) { info = &cgroup_args; + strcpy(cgroup_args.thread_cgrp, "self/cgroup"); ret = parasite_dump_cgroup(ctl, &cgroup_args); if (ret) goto err; } - core->tc->has_cg_set = true; - ret = dump_task_cgroup(item, &core->tc->cg_set, info); + /* + * We don't support multithreads zombie tasks so there is + * no thread_core in zombie tasks, store the cg_set in + * task_core in these cases. 
+ */ + cg_set = &core->thread_core->cg_set; + if (item->pid->state == TASK_THREAD) { + core->tc->has_cg_set = true; + cg_set = &core->tc->cg_set; + } + ret = dump_thread_cgroup(item, cg_set, info, -1); if (ret) goto err; @@ -1409,6 +1420,38 @@ err: return ret; } +static int dump_task_cgroup(struct parasite_ctl *parasite_ctl, const struct pstree_item *item) +{ + struct parasite_dump_cgroup_args cgroup_args, *info; + int i; + + BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN); + for (i = 0; i < item->nr_threads; i++) { + CoreEntry *core = item->core[i]; + + /* Leader is already dumped */ + if (item->pid->real == item->threads[i].real) + continue; + + /* For now, we only need to dump the root task's cgroup ns, because we + * know all the tasks are in the same cgroup namespace because we don't + * allow nesting. + */ + info = NULL; + if (item->ids->has_cgroup_ns_id && !item->parent) { + info = &cgroup_args; + sprintf(cgroup_args.thread_cgrp, "self/task/%d/cgroup", item->threads[i].ns[0].virt); + if (parasite_dump_cgroup(parasite_ctl, &cgroup_args)) + return -1; + } + + if (dump_thread_cgroup(item, &core->thread_core->cg_set, info, i)) + return -1; + } + + return 0; +} + static int pre_dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) { pid_t pid = item->pid->real; @@ -1681,6 +1724,12 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err_cure; } + ret = dump_task_cgroup(parasite_ctl, item); + if (ret) { + pr_err("Dump cgroup of threads in process (pid: %d) failed with %d\n", pid, ret); + goto err_cure; + } + ret = compel_stop_daemon(parasite_ctl); if (ret) { pr_err("Can't stop daemon in parasite (pid: %d)\n", pid); diff --git a/criu/image.c b/criu/image.c index 3c2127ac6..9fb390ab7 100644 --- a/criu/image.c +++ b/criu/image.c @@ -228,7 +228,7 @@ int prepare_inventory(InventoryEntry *he) if (!opts.unprivileged) he->has_root_cg_set = true; - if (dump_task_cgroup(NULL, &he->root_cg_set, NULL)) + if 
(dump_thread_cgroup(NULL, &he->root_cg_set, NULL, -1)) return -1; he->root_ids = crt.i.ids; diff --git a/criu/include/cgroup.h b/criu/include/cgroup.h index 2e9b8933c..5a254559d 100644 --- a/criu/include/cgroup.h +++ b/criu/include/cgroup.h @@ -7,7 +7,7 @@ struct pstree_item; struct parasite_dump_cgroup_args; extern u32 root_cg_set; -int dump_task_cgroup(struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args); +int dump_thread_cgroup(const struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args, int id); int dump_cgroups(void); int prepare_task_cgroup(struct pstree_item *); int prepare_cgroup(void); @@ -60,6 +60,9 @@ struct cg_controller { /* for cgroup list in cgroup.c */ struct list_head l; + + /* controller is a threaded cgroup or not */ + int is_threaded; }; struct cg_controller *new_controller(const char *name); @@ -87,7 +90,8 @@ struct cg_ctl { */ struct list_head; struct parasite_dump_cgroup_args; -extern int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *l, unsigned int *n); +extern int parse_thread_cgroup(int pid, int tid, struct parasite_dump_cgroup_args *args, struct list_head *l, + unsigned int *n); extern void put_ctls(struct list_head *); int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups); diff --git a/criu/include/parasite.h b/criu/include/parasite.h index d2a06889f..787c927be 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -241,7 +241,12 @@ struct parasite_dump_cgroup_args { * * The string is null terminated. */ - char contents[1 << 12]; + char contents[(1 << 12) - 32]; + /* + * Contains the path to thread cgroup procfs. 
+ * "self/task//cgroup" + */ + char thread_cgrp[32]; }; #endif /* !__ASSEMBLY__ */ diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index ee4fa86f4..d3541d996 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -513,6 +513,7 @@ int parasite_dump_cgroup(struct parasite_ctl *ctl, struct parasite_dump_cgroup_a struct parasite_dump_cgroup_args *ca; ca = compel_parasite_args(ctl, struct parasite_dump_cgroup_args); + memcpy(ca->thread_cgrp, cgroup->thread_cgrp, sizeof(ca->thread_cgrp)); ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_CGROUP, ctl); if (ret) { pr_err("Parasite failed to dump /proc/self/cgroup\n"); diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index f75fe13bb..2303f41c3 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -745,7 +745,7 @@ static int parasite_dump_cgroup(struct parasite_dump_cgroup_args *args) return -1; } - cgroup = sys_openat(proc, "self/cgroup", O_RDONLY, 0); + cgroup = sys_openat(proc, args->thread_cgrp, O_RDONLY, 0); sys_close(proc); if (cgroup < 0) { pr_err("can't get /proc/self/cgroup fd\n"); diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 946b0fc40..abac5908b 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -2549,7 +2549,8 @@ err: return -1; } -int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *retl, unsigned int *n) +int parse_thread_cgroup(int pid, int tid, struct parasite_dump_cgroup_args *args, struct list_head *retl, + unsigned int *n) { FILE *f; int ret; @@ -2557,7 +2558,7 @@ int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct li unsigned int n_internal = 0; struct cg_ctl *intern, *ext; - f = fopen_proc(pid, "cgroup"); + f = fopen_proc(pid, "task/%d/cgroup", tid); if (!f) return -1; diff --git a/images/cgroup.proto b/images/cgroup.proto index ee0354124..5c7d16c6d 100644 --- a/images/cgroup.proto +++ b/images/cgroup.proto @@ -24,6 +24,7 @@ message cgroup_dir_entry { message cg_controller_entry { 
repeated string cnames = 1; repeated cgroup_dir_entry dirs = 2; + required bool is_threaded = 3; } message cg_member_entry { diff --git a/images/core.proto b/images/core.proto index 345bdca53..1ee32bfda 100644 --- a/images/core.proto +++ b/images/core.proto @@ -105,6 +105,7 @@ message thread_core_entry { optional string comm = 13; optional uint64 blk_sigset_extended = 14; optional rseq_entry rseq_entry = 15; + required uint32 cg_set = 16; } message task_rlimits_entry { From 20ea8a064785d4900c0e9e01e83c52c88572a9a1 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 4 Sep 2022 15:41:17 +0700 Subject: [PATCH 081/775] cgroup-v2: Restore threads in a process into correct threaded controllers As threads in a process may be in different threaded controllers, we need to move those threads to the correct controllers. Because the threads of a process are restored at a later stage in restorer.c, we need to create a cgroupd service to help move those threads into the correct controllers when they are restored. We cannot use usernsd as the code in the restorer does not know the address of an outside function to pass to userns_call. However, this cgroupd service still reuses a lot of code from usernsd. The main logic is that restored threads receive the cg_set number they belong to before the restorer stage in case their cg_set is different from the main thread's. When these threads are restored, they send the cg_set number and their thread ids through a unix socket to cgroupd. cgroupd receives the cg_set number and thread ids and moves those threads into the correct controllers. Thread ids are sent through SCM_CREDENTIALS of the unix socket so they are translated into the correct thread ids on the receiving end.
Signed-off-by: Bui Quang Minh --- criu/cgroup.c | 140 +++++++++++++++++++++++++++++++++++++- criu/cr-restore.c | 19 +++++- criu/include/cgroup.h | 2 + criu/include/namespaces.h | 17 +++++ criu/include/restorer.h | 2 + criu/include/servicefd.h | 1 + criu/namespaces.c | 65 +++++++++--------- criu/pie/restorer.c | 107 +++++++++++++++++++++++++++++ 8 files changed, 319 insertions(+), 34 deletions(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index b238b6402..918827d99 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "common/list.h" #include "xmalloc.h" @@ -55,6 +56,7 @@ static u32 cg_set_ids = 1; static LIST_HEAD(cgroups); static unsigned int n_cgroups; +static pid_t cgroupd_pid; static CgSetEntry *find_rst_set_by_id(u32 id) { @@ -1935,6 +1937,136 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) return 0; } +/* + * If a thread is a different cgroup set than the main thread in process, + * it means it is in a threaded controller. This daemon receives the cg_set + * number from the restored thread and move this thread to the correct + * cgroup controllers + */ +static int cgroupd(int sk) +{ + pr_info("cgroud: Daemon started\n"); + + while (1) { + struct unsc_msg um; + uns_call_t call; + pid_t tid; + int fd, cg_set, i; + CgSetEntry *cg_set_entry; + int ret; + + unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, NULL); + ret = recvmsg(sk, &um.h, 0); + if (ret <= 0) { + pr_perror("cgroupd: recv req error"); + return -1; + } + + unsc_msg_pid_fd(&um, &tid, &fd); + pr_debug("cgroupd: move process %d into cg_set %d\n", tid, cg_set); + + cg_set_entry = find_rst_set_by_id(cg_set); + if (!cg_set_entry) { + pr_err("cgroupd: No set found %d\n", cg_set); + return -1; + } + + for (i = 0; i < cg_set_entry->n_ctls; i++) { + int j, aux_off; + CgMemberEntry *ce = cg_set_entry->ctls[i]; + char aux[PATH_MAX]; + CgControllerEntry *ctrl = NULL; + + for (j = 0; j < n_controllers; j++) { + CgControllerEntry *cur = controllers[j]; + if 
(cgroup_contains(cur->cnames, cur->n_cnames, ce->name, NULL)) { + ctrl = cur; + break; + } + } + + if (!ctrl) { + pr_err("cgroupd: No cg_controller_entry found for %s/%s\n", ce->name, ce->path); + return -1; + } + + /* + * This is not a threaded controller, all threads in this + * process must be in this controller. Main thread has been + * restored, so this thread is in this controller already. + */ + if (!ctrl->is_threaded) + continue; + + aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0); + snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/cgroup.threads", ce->path); + + /* + * Cgroupd runs outside of the namespaces so we don't + * need to use userns_call here + */ + if (userns_move(aux, 0, tid)) { + pr_err("cgroupd: Can't move thread %d into %s/%s\n", tid, ce->name, ce->path); + return -1; + } + } + + /* + * We only want to send the cred which contains thread id back. + * The restored thread recvmsg(MSG_PEEK) until it gets its own + * thread id. + */ + unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, &tid); + if (sendmsg(sk, &um.h, 0) <= 0) { + pr_perror("cgroupd: send req error"); + return -1; + } + } + + return 0; +} + +int stop_cgroupd(void) +{ + if (cgroupd_pid) { + sigset_t blockmask, oldmask; + + /* + * Block the SIGCHLD signal to avoid triggering + * sigchld_handler() + */ + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + sigprocmask(SIG_BLOCK, &blockmask, &oldmask); + + kill(cgroupd_pid, SIGTERM); + waitpid(cgroupd_pid, NULL, 0); + + sigprocmask(SIG_SETMASK, &oldmask, NULL); + } + + return 0; +} + +static int prepare_cgroup_thread_sfd(void) +{ + int sk; + + sk = start_unix_cred_daemon(&cgroupd_pid, cgroupd); + if (sk < 0) { + pr_err("failed to start cgroupd\n"); + return -1; + } + + if (install_service_fd(CGROUPD_SK, sk) < 0) { + kill(cgroupd_pid, SIGKILL); + waitpid(cgroupd_pid, NULL, 0); + return -1; + } + + return 0; +} + static int rewrite_cgsets(CgroupEntry *cge, char **controllers, int n_controllers, char **dir_name, char 
*newroot) { size_t dirlen = strlen(*dir_name); @@ -2089,15 +2221,19 @@ int prepare_cgroup(void) n_controllers = ce->n_controllers; controllers = ce->controllers; - if (n_sets) + if (n_sets) { /* * We rely on the fact that all sets contain the same * set of controllers. This is checked during dump * with cg_set_compare(CGCMP_ISSUB) call. */ ret = prepare_cgroup_sfd(ce); - else + if (ret < 0) + return ret; + ret = prepare_cgroup_thread_sfd(); + } else { ret = 0; + } return ret; } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index d7d3d8edb..78f2a9701 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1349,7 +1349,12 @@ static inline int fork_with_pid(struct pstree_item *item) return -1; item->pid->state = ca.core->tc->task_state; - rsti(item)->cg_set = ca.core->tc->cg_set; + + /* Zombie task's cg_set is stored in task_core */ + if (item->pid->state == TASK_DEAD) + rsti(item)->cg_set = ca.core->tc->cg_set; + else + rsti(item)->cg_set = ca.core->thread_core->cg_set; if (ca.core->tc->has_stop_signo) item->pid->stop_signo = ca.core->tc->stop_signo; @@ -2376,6 +2381,10 @@ skip_ns_bouncing: if (ret < 0) goto out_kill; + ret = stop_cgroupd(); + if (ret < 0) + goto out_kill; + ret = move_veth_to_bridge(); if (ret < 0) goto out_kill; @@ -3812,6 +3821,13 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr; core_get_tls(tcore, &thread_args[i].tls); + if (rsti(current)->cg_set != tcore->thread_core->cg_set) { + thread_args[i].cg_set = tcore->thread_core->cg_set; + thread_args[i].cgroupd_sk = dup(get_service_fd(CGROUPD_SK)); + } else { + thread_args[i].cg_set = -1; + } + ret = prep_rseq(&thread_args[i].rseq, tcore->thread_core); if (ret) goto err; @@ -3906,6 +3922,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns close_service_fd(USERNSD_SK); close_service_fd(FDSTORE_SK_OFF); close_service_fd(RPC_SK_OFF); + 
close_service_fd(CGROUPD_SK); __gcov_flush(); diff --git a/criu/include/cgroup.h b/criu/include/cgroup.h index 5a254559d..93f61539c 100644 --- a/criu/include/cgroup.h +++ b/criu/include/cgroup.h @@ -96,4 +96,6 @@ extern void put_ctls(struct list_head *); int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups); +int stop_cgroupd(void); + #endif /* __CR_CGROUP_H__ */ diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h index e2ea6e17f..183a3b852 100644 --- a/criu/include/namespaces.h +++ b/criu/include/namespaces.h @@ -1,6 +1,8 @@ #ifndef __CR_NS_H__ #define __CR_NS_H__ +#include + #include "common/compiler.h" #include "files.h" #include "common/list.h" @@ -224,4 +226,19 @@ extern int add_ns_shared_cb(int (*actor)(void *data), void *data); extern struct ns_id *get_socket_ns(int lfd); extern struct ns_id *lookup_ns_by_kid(unsigned int kid, struct ns_desc *nd); +struct unsc_msg { + struct msghdr h; + /* + * 0th is the call address + * 1st is the flags + * 2nd is the optional (NULL in response) arguments + */ + struct iovec iov[3]; + char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; +}; + +extern void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid); +extern void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd); +extern int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)); + #endif /* __CR_NS_H__ */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h index d642765e3..bc0beb5cb 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -121,6 +121,8 @@ struct thread_restore_args { bool seccomp_force_tsync; char comm[TASK_COMM_LEN]; + int cg_set; + int cgroupd_sk; } __aligned(64); typedef long (*thread_restore_fcall_t)(struct thread_restore_args *args); diff --git a/criu/include/servicefd.h b/criu/include/servicefd.h index c6979de7f..4265d94ed 100644 --- a/criu/include/servicefd.h +++ b/criu/include/servicefd.h @@ -24,6 
+24,7 @@ enum sfd_type { */ ROOT_FD_OFF, /* Root of the namespace we dump/restore */ CGROUP_YARD, + CGROUPD_SK, /* Socket for cgroupd to fix up thread's cgroup controller */ USERNSD_SK, /* Socket for usernsd */ NS_FD_OFF, /* Node's net namespace fd */ TRANSPORT_FD_OFF, /* to transfer file descriptors */ diff --git a/criu/namespaces.c b/criu/namespaces.c index 286073ff6..0dc19d5b6 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -1218,20 +1217,9 @@ static int write_id_map(pid_t pid, UidGidExtent **extents, int n, char *id_map) return 0; } -struct unsc_msg { - struct msghdr h; - /* - * 0th is the call address - * 1st is the flags - * 2nd is the optional (NULL in response) arguments - */ - struct iovec iov[3]; - char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; -}; - static int usernsd_pid; -static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd) +inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid) { struct cmsghdr *ch; struct ucred *ucred; @@ -1269,7 +1257,10 @@ static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void ch->cmsg_type = SCM_CREDENTIALS; ucred = (struct ucred *)CMSG_DATA(ch); - ucred->pid = getpid(); + if (pid) + ucred->pid = *pid; + else + ucred->pid = getpid(); ucred->uid = getuid(); ucred->gid = getgid(); @@ -1284,7 +1275,7 @@ static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void } } -static void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) +void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) { struct cmsghdr *ch; struct ucred *ucred; @@ -1322,7 +1313,7 @@ static int usernsd(int sk) int flags, fd, ret; pid_t pid; - unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0); + unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0, NULL); if (recvmsg(sk, &um.h, 0) 
<= 0) { pr_perror("uns: recv req error"); return -1; @@ -1367,7 +1358,7 @@ static int usernsd(int sk) else fd = -1; - unsc_msg_init(&um, &call, &ret, NULL, 0, fd); + unsc_msg_init(&um, &call, &ret, NULL, 0, fd, NULL); if (sendmsg(sk, &um.h, 0) <= 0) { pr_perror("uns: send resp error"); return -1; @@ -1418,7 +1409,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, /* Send the request */ - unsc_msg_init(&um, &call, &flags, arg, arg_size, fd); + unsc_msg_init(&um, &call, &flags, arg, arg_size, fd, NULL); ret = sendmsg(sk, &um.h, 0); if (ret <= 0) { pr_perror("uns: send req error"); @@ -1433,7 +1424,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, /* Get the response back */ - unsc_msg_init(&um, &call, &res, NULL, 0, 0); + unsc_msg_init(&um, &call, &res, NULL, 0, 0, NULL); ret = recvmsg(sk, &um.h, 0); if (ret <= 0) { pr_perror("uns: recv resp error"); @@ -1454,14 +1445,11 @@ out: return ret; } -static int start_usernsd(void) +int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)) { int sk[2]; int one = 1; - if (!(root_ns_mask & CLONE_NEWUSER)) - return 0; - /* * Seqpacket to * @@ -1490,24 +1478,39 @@ static int start_usernsd(void) return -1; } - usernsd_pid = fork(); - if (usernsd_pid < 0) { - pr_perror("Can't fork usernsd"); + *pid = fork(); + if (*pid < 0) { + pr_perror("Can't unix daemon"); close(sk[0]); close(sk[1]); return -1; } - if (usernsd_pid == 0) { + if (*pid == 0) { int ret; - close(sk[0]); - ret = usernsd(sk[1]); + ret = daemon_func(sk[1]); exit(ret); } - close(sk[1]); - if (install_service_fd(USERNSD_SK, sk[0]) < 0) { + + return sk[0]; +} + +static int start_usernsd(void) +{ + int sk; + + if (!(root_ns_mask & CLONE_NEWUSER)) + return 0; + + sk = start_unix_cred_daemon(&usernsd_pid, usernsd); + if (sk < 0) { + pr_err("failed to start usernsd\n"); + return -1; + } + + if (install_service_fd(USERNSD_SK, sk) < 0) { kill(usernsd_pid, SIGKILL); waitpid(usernsd_pid, NULL, 0); 
return -1; diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 0e98cb3da..99cff1f7d 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "linux/userfaultfd.h" @@ -586,6 +587,103 @@ static void noinline rst_sigreturn(unsigned long new_sp, struct rt_sigframe *sig ARCH_RT_SIGRETURN(new_sp, sigframe); } +static int send_cg_set(int sk, int cg_set) +{ + struct cmsghdr *ch; + struct msghdr h; + /* + * 0th is the dummy call address for compatibility with userns helper + * 1st is the cg_set + */ + struct iovec iov[2]; + char cmsg[CMSG_SPACE(sizeof(struct ucred))] = {}; + int ret, *dummy = NULL; + struct ucred *ucred; + + iov[0].iov_base = &dummy; + iov[0].iov_len = sizeof(dummy); + iov[1].iov_base = &cg_set; + iov[1].iov_len = sizeof(cg_set); + + h.msg_iov = iov; + h.msg_iovlen = sizeof(iov) / sizeof(struct iovec); + h.msg_name = NULL; + h.msg_namelen = 0; + h.msg_flags = 0; + + h.msg_control = cmsg; + h.msg_controllen = sizeof(cmsg); + ch = CMSG_FIRSTHDR(&h); + ch->cmsg_len = CMSG_LEN(sizeof(struct ucred)); + ch->cmsg_level = SOL_SOCKET; + ch->cmsg_type = SCM_CREDENTIALS; + + ucred = (struct ucred *)CMSG_DATA(ch); + /* + * We still have privilege in this namespace so we can send + * thread id instead of pid of main thread, uid, gid as 0 + * since these 2 are ignored in cgroupd + */ + ucred->pid = sys_gettid(); + ucred->uid = 0; + ucred->gid = 0; + + ret = sys_sendmsg(sk, &h, 0); + if (ret < 0) { + pr_err("Unable to send packet to cgroupd %d\n", ret); + return -1; + } + + return 0; +} + +/* + * As this socket is shared among threads, recvmsg(MSG_PEEK) + * from the socket until getting its own thread id as an + * acknowledge of successful threaded cgroup fixup + */ +static int recv_cg_set_restore_ack(int sk) +{ + struct cmsghdr *ch; + struct msghdr h = {}; + char cmsg[CMSG_SPACE(sizeof(struct ucred))]; + struct ucred *cred; + int ret; + + h.msg_control = cmsg; + h.msg_controllen = sizeof(cmsg); + 
+ while (1) { + ret = sys_recvmsg(sk, &h, MSG_PEEK); + if (ret < 0) { + pr_err("Unable to peek from cgroupd %d\n", ret); + return -1; + } + + if (h.msg_controllen != sizeof(cmsg)) { + pr_err("The message from cgroupd is truncated\n"); + return -1; + } + + ch = CMSG_FIRSTHDR(&h); + cred = (struct ucred *)CMSG_DATA(ch); + if (cred->pid != sys_gettid()) + continue; + + /* + * Actual remove message from recv queue of socket + */ + ret = sys_recvmsg(sk, &h, 0); + if (ret < 0) { + pr_err("Unable to receive from cgroupd %d\n", ret); + return -1; + } + + break; + } + return 0; +} + /* * Threads restoration via sigreturn. Note it's locked * routine and calls for unlock at the end. @@ -613,6 +711,15 @@ long __export_restore_thread(struct thread_restore_args *args) rt_sigframe = (void *)&args->mz->rt_sigframe; + if (args->cg_set != -1) { + pr_info("Restore cg_set in thread cg_set: %d\n", args->cg_set); + if (send_cg_set(args->cgroupd_sk, args->cg_set)) + goto core_restore_end; + if (recv_cg_set_restore_ack(args->cgroupd_sk)) + goto core_restore_end; + sys_close(args->cgroupd_sk); + } + if (restore_thread_common(args)) goto core_restore_end; From 07d538cefcb6a72cea13ea7a2a34777f458e5b3b Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 4 Sep 2022 15:55:24 +0700 Subject: [PATCH 082/775] zdtm: Check threads are restored into correct threaded controllers This test creates a process with 2 threads in different threaded controllers and checks if CRIU restores these threads' cgroup controllers properly.
Signed-off-by: Bui Quang Minh --- test/zdtm/static/Makefile | 3 + test/zdtm/static/cgroupv2_01.c | 180 +++++++++++++++++++++++++ test/zdtm/static/cgroupv2_01.checkskip | 11 ++ test/zdtm/static/cgroupv2_01.desc | 1 + test/zdtm/static/cgroupv2_01.hook | 24 ++++ 5 files changed, 219 insertions(+) create mode 100644 test/zdtm/static/cgroupv2_01.c create mode 100755 test/zdtm/static/cgroupv2_01.checkskip create mode 100644 test/zdtm/static/cgroupv2_01.desc create mode 100755 test/zdtm/static/cgroupv2_01.hook diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 915e565bd..edac92c83 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -387,6 +387,7 @@ TST_DIR = \ cgroup03 \ cgroup04 \ cgroupv2_00 \ + cgroupv2_01 \ cgroup_ifpriomap \ cgroup_ignore \ cgroup_stray \ @@ -679,6 +680,8 @@ sk-unix-listen02: CFLAGS += -DSK_UNIX_LISTEN02 sk-unix-listen03: CFLAGS += -DSK_UNIX_LISTEN03 sk-unix-listen04: CFLAGS += -DSK_UNIX_LISTEN02 -DSK_UNIX_LISTEN03 +cgroupv2_01: LDLIBS += -pthread + $(LIB): force $(Q) $(MAKE) -C $(LIBDIR) diff --git a/test/zdtm/static/cgroupv2_01.c b/test/zdtm/static/cgroupv2_01.c new file mode 100644 index 000000000..f3a6d18ba --- /dev/null +++ b/test/zdtm/static/cgroupv2_01.c @@ -0,0 +1,180 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that cgroup-v2 threaded controllers"; +const char *test_author = "Bui Quang Minh "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup-v2 directory name", 1); +const char *cgname = "subcg01"; + +task_waiter_t t; + +#define gettid(code) syscall(__NR_gettid) + +void cleanup(void) +{ + char path[1024]; + + sprintf(path, "%s/%s/%s", dirname, cgname, "thread2"); + rmdir(path); + sprintf(path, "%s/%s/%s", dirname, cgname, "thread1"); + rmdir(path); + sprintf(path, "%s/%s", dirname, cgname); + rmdir(path); + sprintf(path, "%s", dirname); + umount(path); +} + +int is_in_cgroup(char *cgname) +{ + FILE *cgf; + char buffer[1024]; + + 
sprintf(buffer, "/proc/self/task/%ld/cgroup", gettid()); + cgf = fopen(buffer, "r"); + if (cgf == NULL) { + pr_err("Fail to open thread's cgroup procfs\n"); + return 0; + } + + while (fgets(buffer, sizeof(buffer), cgf)) { + if (strstr(buffer, cgname)) { + fclose(cgf); + return 1; + } + } + + fclose(cgf); + return 0; +} + +void *thread_func(void *arg) +{ + char path[1024], aux[1024]; + + sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread2", "cgroup.threads"); + sprintf(aux, "%ld", gettid()); + if (write_value(path, aux)) { + cleanup(); + exit(1); + } + + read_value(path, aux, sizeof(aux)); + + task_waiter_complete(&t, 1); + + /* Wait for restore */ + task_waiter_wait4(&t, 2); + + sprintf(path, "/%s/%s", cgname, "thread2"); + if (!is_in_cgroup(path)) { + fail("Thread2's cgroup is not restored"); + cleanup(); + exit(1); + } + + return NULL; +} + +int main(int argc, char **argv) +{ + char path[1024], aux[1024]; + pthread_t thread2; + int ret = 1; + + test_init(argc, argv); + task_waiter_init(&t); + + if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + return -1; + } + + if (mount("cgroup2", dirname, "cgroup2", 0, NULL)) { + pr_perror("Can't mount cgroup-v2"); + return -1; + } + + sprintf(path, "%s/%s", dirname, cgname); + if (mkdir(path, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + goto out; + } + + /* Make cpuset controllers available in children directory */ + sprintf(path, "%s/%s", dirname, "cgroup.subtree_control"); + sprintf(aux, "%s", "+cpuset"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.subtree_control"); + sprintf(aux, "%s", "+cpuset"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.procs"); + sprintf(aux, "%d", getpid()); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "thread1"); + if (mkdir(path, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + 
goto out; + } + + sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread1", "cgroup.type"); + sprintf(aux, "%s", "threaded"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "thread2"); + if (mkdir(path, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + goto out; + } + + sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread2", "cgroup.type"); + sprintf(aux, "%s", "threaded"); + if (write_value(path, aux)) + goto out; + + ret = pthread_create(&thread2, NULL, thread_func, NULL); + if (ret < 0) { + pr_err("pthread_create %s\n", strerror(ret)); + ret = 1; + goto out; + } + + sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread1", "cgroup.threads"); + sprintf(aux, "%ld", gettid()); + if (write_value(path, aux)) + goto out; + + task_waiter_wait4(&t, 1); + + test_daemon(); + test_waitsig(); + + task_waiter_complete(&t, 2); + + sprintf(path, "/%s/%s", cgname, "thread1"); + if (!is_in_cgroup(path)) { + fail("Main thread's cgroup is not restored"); + cleanup(); + exit(1); + } + pthread_join(thread2, NULL); + pass(); + + ret = 0; + +out: + cleanup(); + return ret; +} diff --git a/test/zdtm/static/cgroupv2_01.checkskip b/test/zdtm/static/cgroupv2_01.checkskip new file mode 100755 index 000000000..375ed3564 --- /dev/null +++ b/test/zdtm/static/cgroupv2_01.checkskip @@ -0,0 +1,11 @@ +#!/bin/bash + +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + grep -q "cpuset" /sys/fs/cgroup/cgroup.controllers && exit 0 +fi + +if [ -d /sys/fs/cgroup/unified ]; then + grep -q "cpuset" /sys/fs/cgroup/unified/cgroup.controllers && exit 0 +fi + +exit 1 diff --git a/test/zdtm/static/cgroupv2_01.desc b/test/zdtm/static/cgroupv2_01.desc new file mode 100644 index 000000000..4bfd4b265 --- /dev/null +++ b/test/zdtm/static/cgroupv2_01.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid', 'opts': '--manage-cgroups=full'} diff --git a/test/zdtm/static/cgroupv2_01.hook b/test/zdtm/static/cgroupv2_01.hook new file mode 100755 index 
000000000..2263fd014 --- /dev/null +++ b/test/zdtm/static/cgroupv2_01.hook @@ -0,0 +1,24 @@ +#!/bin/bash + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 + +set -e +cgname="subcg01" +tname=$(mktemp -d cgclean.XXXXXX) +mount -t cgroup2 cgroup2 $tname + +echo "Cleaning $tname" + +set +e +rmdir "$tname/$cgname/thread1" + +# When the test finishes, the cleanup() function removes this directory +# successfully because the thread in this controller exit and no other +# threads belong to this controller +if [ "$1" == "--pre-restore" ]; then + rmdir "$tname/$cgname/thread2" +fi + +rmdir "$tname/$cgname" +umount "$tname" +rmdir "$tname" From 2ebce92333cc6994f217575335327e4d2f0d1757 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Tue, 6 Sep 2022 22:04:08 +0700 Subject: [PATCH 083/775] ci: Make cpuset move to cgroup-v2 hierarchy As cgroupv2_00, cgroupv2_01 need cpuset in cgroup-v2 hierarchy to check CRIU handle cgroup-v2 properly, umount cpuset in cgroup-v1 to make it move to cgroup-v2. 
Signed-off-by: Bui Quang Minh --- scripts/ci/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 30dd9ebeb..48a1e1887 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -1,4 +1,10 @@ -local: +# Umount cpuset in cgroupv1 to make it move to cgroupv2 +cpuset-cgroupv2: + if [ -d /sys/fs/cgroup/cpuset ]; then \ + umount /sys/fs/cgroup/cpuset; \ + fi + +local: cpuset-cgroupv2 ./run-ci-tests.sh .PHONY: local From 516ebc4f581bab37457f07b2adfd06b4c7f91cec Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 7 Nov 2022 08:55:28 +0100 Subject: [PATCH 084/775] ci: Do not fail if latest epel repository definition is already installed Signed-off-by: Adrian Reber --- .cirrus.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index dbfb899ff..914ceb72c 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -104,7 +104,9 @@ task: setup_script: | ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto - yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm dnf-plugins-core + # Do not fail if latest epel repository definition is already installed + yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm || : + yum install -y dnf-plugins-core yum config-manager --set-enabled powertools yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-future python3-protobuf python3-junit_xml xmlto alternatives --set python /usr/bin/python3 @@ -133,7 +135,8 @@ task: setup_script: | # EPEL is needed for python2-future, python2-junit_xml, python-flake8 and libbsd-devel. 
- yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm + # Do not fail if latest epel repository definition is already installed + yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm || : ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto yum install -y findutils gcc git gnutls-devel iproute iptables libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make procps-ng protobuf-c-devel protobuf-devel protobuf-python python python-flake8 python-ipaddress python2-future python2-junit_xml python-yaml python-six sudo tar which e2fsprogs python2-pip rubygem-asciidoctor libselinux-devel # Even with selinux in permissive mode the selinux tests will be executed From 153614cb1d32f47a3727222b6c1dc39a0822a047 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 7 Nov 2022 08:26:10 +0100 Subject: [PATCH 085/775] ci: move cgroup unmounting to run-ci-tests.sh A previous commit added a cgroup cpuset unmounting to scripts/ci/Makefile. We are sometimes running in a container without the necessary privileges to unmount certain cgroups. This commit moves the cgroup unmounting to a place in run-ci-tests.sh which already requires privileged access and does not break unprivileged build-only CI runs. 
Signed-off-by: Adrian Reber --- scripts/ci/Makefile | 8 +------- scripts/ci/run-ci-tests.sh | 5 +++++ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 48a1e1887..30dd9ebeb 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -1,10 +1,4 @@ -# Umount cpuset in cgroupv1 to make it move to cgroupv2 -cpuset-cgroupv2: - if [ -d /sys/fs/cgroup/cpuset ]; then \ - umount /sys/fs/cgroup/cpuset; \ - fi - -local: cpuset-cgroupv2 +local: ./run-ci-tests.sh .PHONY: local diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 1b761ea56..7b64c6b06 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -144,6 +144,11 @@ time make unittest [ -n "$SKIP_CI_TEST" ] && exit 0 +# Umount cpuset in cgroupv1 to make it move to cgroupv2 +if [ -d /sys/fs/cgroup/cpuset ]; then + umount /sys/fs/cgroup/cpuset +fi + ulimit -c unlimited cgid=$$ From c1ae880eb4d6803367c87d4f673d8ae0aeae3a1b Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Tue, 8 Nov 2022 21:10:52 +0700 Subject: [PATCH 086/775] kerndat: Mark memfd_create(MFD_HUGETLB) unavailable when ENOSYS is returned Some users on Raspberry Pi report that the kerndat checking for memfd_create(MFD_HUGETLB) support returns ENOSYS even when memfd_create syscall is available. We currently treat this error as unexpected and return error. This commit marks the memfd_create(MFD_HUGETLB) as unavailable when ENOSYS is returned. 
Signed-off-by: Bui Quang Minh --- criu/kerndat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index a209190ee..5b567e79f 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -500,7 +500,7 @@ static bool kerndat_has_memfd_hugetlb(void) if (ret >= 0) { kdat.has_memfd_hugetlb = true; close(ret); - } else if (ret == -1 && (errno == EINVAL || errno == ENOENT)) { + } else if (ret == -1 && (errno == EINVAL || errno == ENOENT || errno == ENOSYS)) { kdat.has_memfd_hugetlb = false; } else { pr_perror("Unexpected error from memfd_create(\"\", MFD_HUGETLB)"); From f5e0f641a8c940029d953881f00552899ba53b2e Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Thu, 3 Nov 2022 22:04:53 +0700 Subject: [PATCH 087/775] cgroup: Remove redundant code that handles zombie tasks Zombie tasks are dumped in dump_zombies() so it is redundant to handle them in dump_one_task(). Deprecate cg_set in task_core_entry as this field must be per thread now. Signed-off-by: Bui Quang Minh --- criu/cr-dump.c | 9 --------- criu/cr-restore.c | 7 +++++-- images/core.proto | 1 + 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index e31b2f702..63eb627fc 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -811,16 +811,7 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item goto err; } - /* - * We don't support multithreads zombie tasks so there is - * no thread_core in zombie tasks, store the cg_set in - * task_core in these cases. 
- */ cg_set = &core->thread_core->cg_set; - if (item->pid->state == TASK_THREAD) { - core->tc->has_cg_set = true; - cg_set = &core->tc->cg_set; - } ret = dump_thread_cgroup(item, cg_set, info, -1); if (ret) goto err; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 78f2a9701..974202f16 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1350,9 +1350,12 @@ static inline int fork_with_pid(struct pstree_item *item) item->pid->state = ca.core->tc->task_state; - /* Zombie task's cg_set is stored in task_core */ + /* + * Zombie tasks' cgroup is not dumped/restored. + * cg_set == 0 is skipped in prepare_task_cgroup() + */ if (item->pid->state == TASK_DEAD) - rsti(item)->cg_set = ca.core->tc->cg_set; + rsti(item)->cg_set = 0; else rsti(item)->cg_set = ca.core->thread_core->cg_set; diff --git a/images/core.proto b/images/core.proto index 1ee32bfda..bc8b7a488 100644 --- a/images/core.proto +++ b/images/core.proto @@ -40,6 +40,7 @@ message task_core_entry { optional task_timers_entry timers = 7; optional task_rlimits_entry rlimits = 8; + /* This is deprecated, should be per-thread */ optional uint32 cg_set = 9; optional signal_queue_entry signals_s = 10; From c7211f52dbc08f748665b6f511b971ff48079d86 Mon Sep 17 00:00:00 2001 From: Mathias Gibbens Date: Thu, 17 Nov 2022 22:28:57 +0000 Subject: [PATCH 088/775] Remove execute bit from source file Signed-off-by: Mathias Gibbens --- images/core-mips.proto | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 images/core-mips.proto diff --git a/images/core-mips.proto b/images/core-mips.proto old mode 100755 new mode 100644 From fa2c585c2267725b59813063ee999b8b1e8afde3 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 9 Nov 2022 11:01:29 +0000 Subject: [PATCH 089/775] amdgpu: define __nmk_dir if missing This patch adds a missing definition for `__nmk_dir` in the Makefile for the amdgpu plugin. 
This definition is required, for example, when building the `test_topology_remap` target: make -C plugins/amdgpu/ test_topology_remap Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 367a52c99..64a923d38 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -12,6 +12,7 @@ LIBDRM_INC := -I/usr/include/libdrm DEPS_OK := amdgpu_plugin.so amdgpu_plugin_test DEPS_NOK := ; +__nmk_dir ?= ../../scripts/nmk/scripts/ include $(__nmk_dir)msg.mk CC := gcc From c48b5290dc57eb26171b70078a1827fae2ba4d35 Mon Sep 17 00:00:00 2001 From: Drew Wock Date: Mon, 21 Nov 2022 21:57:05 +0000 Subject: [PATCH 090/775] Fix warnings from -Wstrict-prototypes in clang 16.0.0 While building on a machine that has a HOL clang compiler, I ran into warnings regarding the changed line. It appears this warning is on by default because of anticipated changes to the C standard. Signed-off-by: Drew Wock --- criu/net.c | 2 +- criu/util.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/net.c b/criu/net.c index 2eff519c5..f29a166f8 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3366,7 +3366,7 @@ int collect_net_namespaces(bool for_dump) struct ns_desc net_ns_desc = NS_DESC_ENTRY(CLONE_NEWNET, "net"); -struct ns_id *net_get_root_ns() +struct ns_id *net_get_root_ns(void) { static struct ns_id *root_netns = NULL; diff --git a/criu/util.c b/criu/util.c index b3b2b6659..959e60938 100644 --- a/criu/util.c +++ b/criu/util.c @@ -1876,7 +1876,7 @@ int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args) uint64_t criu_run_id; -void util_init() +void util_init(void) { struct timespec tp; From 095b3e84b7cd6eee677586ae772471dfce17d0f7 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 23 Nov 2022 14:59:08 +0000 Subject: [PATCH 091/775] ci/lint: install ShellCheck with dnf The way ShellCheck is installed was changed in commit c056f99 
(ci/gha/lint: install a recent shellcheck) to use the latest version v0.8.0 and remove some of the "shellcheck disable=..." annotations. Since then, Fedora 37 has been released and the ShellCheck package has been updated to v0.8.0. Signed-off-by: Radostin Stoyanov --- .github/workflows/lint.yml | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 3d42f3dcf..4c05285e6 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -9,18 +9,7 @@ jobs: image: registry.fedoraproject.org/fedora:latest steps: - name: Install tools - run: sudo dnf -y install git make python3-flake8 xz clang-tools-extra which codespell git-clang-format - - # TODO: remove this and use ShellCheck from repo once F37 with ShellCheck 0.8.0 is out. - - name: install shellcheck - env: - VERSION: v0.8.0 - BASEURL: https://github.com/koalaman/shellcheck/releases/download - SHA256: f4bce23c11c3919c1b20bcb0f206f6b44c44e26f2bc95f8aa708716095fa0651 - run: | - curl -sSfL --retry 5 $BASEURL/$VERSION/shellcheck-$VERSION.linux.x86_64.tar.xz | - tar xfJ - -C /usr/local/bin --strip 1 shellcheck-$VERSION/shellcheck - sha256sum --strict --check - <<<"$SHA256 /usr/local/bin/shellcheck" + run: sudo dnf -y install git make python3-flake8 xz clang-tools-extra which codespell git-clang-format ShellCheck - uses: actions/checkout@v2 From d0c64b7b344e12a4cc13197bbbe4b37edb369f0a Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 23 Nov 2022 15:19:02 +0000 Subject: [PATCH 092/775] ci/alpine: remove symlink for /usr/bin/python The python3 package in Alpine has recently been updated to install symbolic link for /usr/bin/python. 
https://git.alpinelinux.org/aports/commit/main/python3?id=d91da210b1614eb75517d59b7f348fee01699f35 This causes the following error in CI: Step 10/11 : RUN ln -s /usr/bin/python3 /usr/bin/python ---> Running in a5a94be9dc93 ln: failed to create symbolic link '/usr/bin/python': File exists The command '/bin/sh -c ln -s /usr/bin/python3 /usr/bin/python' returned a non-zero code: 1 Signed-off-by: Radostin Stoyanov --- scripts/build/Dockerfile.alpine | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index cab72e8a1..eced46c22 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -45,7 +45,4 @@ RUN adduser -u 1000 -D test RUN pip3 install junit_xml -# For zdtm we need an unversioned python binary -RUN ln -s /usr/bin/python3 /usr/bin/python - RUN make -C test/zdtm From 14b9ec195f0acefe9497da271beb301700ecfaf5 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 24 Nov 2022 10:48:35 +0000 Subject: [PATCH 093/775] ci: fix make indent This patch applies the changes required by clang-format v15.0.5 for `make indent`. 
Signed-off-by: Radostin Stoyanov --- test/zdtm/static/stopped03.c | 2 +- test/zdtm/static/stopped04.c | 2 +- test/zdtm/transition/maps007.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/zdtm/static/stopped03.c b/test/zdtm/static/stopped03.c index 85c7177f7..9a373930f 100644 --- a/test/zdtm/static/stopped03.c +++ b/test/zdtm/static/stopped03.c @@ -23,7 +23,7 @@ struct shared { futex_t fstate; int status; int code; -} * sh; +} *sh; static int new_pgrp(void) { diff --git a/test/zdtm/static/stopped04.c b/test/zdtm/static/stopped04.c index 237094ca4..9bd968aa2 100644 --- a/test/zdtm/static/stopped04.c +++ b/test/zdtm/static/stopped04.c @@ -21,7 +21,7 @@ struct shared { futex_t fstate; int status; int code; -} * sh; +} *sh; static int new_pgrp(void) { diff --git a/test/zdtm/transition/maps007.c b/test/zdtm/transition/maps007.c index 8a605cfe0..35c196bc4 100644 --- a/test/zdtm/transition/maps007.c +++ b/test/zdtm/transition/maps007.c @@ -38,7 +38,7 @@ int main(int argc, char **argv) struct { futex_t delta; futex_t stop; - } * shm; + } *shm; uint32_t v; unsigned long long count = 0; int i; From f3fdce81a64b440d85611950647ad79dc6890393 Mon Sep 17 00:00:00 2001 From: Liang-Chun Chen Date: Tue, 6 Sep 2022 18:02:06 +0800 Subject: [PATCH 094/775] files-reg.c: fiemap algorithm for ghost file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to reduce the frequency of using system call, based on https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git/tree/misc/create_inode.c#n519, I created a new algorithm of dumping chunk via fiemap.(copy_file_to_chunks_fiemap) Also, I added another BOOL_OPT for users to determine which algorithm they want to use. Moreover, for those filesystem not supporting fiemap, criu will fall back to the original algorithm(SEEK_HOLE/SEEK_DATA). 
v2: don't call copy_chunk_from_file on outstanding extent; rearange headers to workaround "redeclaration of ‘enum fsconfig_command’" problem Signed-off-by: Liang-Chun Chen --- criu/config.c | 2 + criu/files-reg.c | 108 ++++++++++++++++++++++++++++++++++++-- criu/include/cr_options.h | 4 ++ 3 files changed, 110 insertions(+), 4 deletions(-) diff --git a/criu/config.c b/criu/config.c index 9ba79c8ef..9f02ae992 100644 --- a/criu/config.c +++ b/criu/config.c @@ -430,6 +430,7 @@ void init_opts(void) opts.pre_dump_mode = PRE_DUMP_SPLICE; opts.file_validation_method = FILE_VALIDATION_DEFAULT; opts.network_lock_method = NETWORK_LOCK_DEFAULT; + opts.ghost_fiemap = FIEMAP_DEFAULT; } bool deprecated_ok(char *what) @@ -701,6 +702,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "network-lock", required_argument, 0, 1100 }, BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode), BOOL_OPT("unprivileged", &opts.unprivileged), + BOOL_OPT("ghost-fiemap", &opts.ghost_fiemap), {}, }; diff --git a/criu/files-reg.c b/criu/files-reg.c index 2e3d57c5e..13e114cea 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -11,10 +11,13 @@ #include #include #include -#include +#include #include +#include +#include #include "tty.h" +#include "stats.h" #ifndef SEEK_DATA #define SEEK_DATA 3 @@ -32,6 +35,7 @@ */ #define BUILD_ID_MAP_SIZE 1048576 #define ST_UNIT 512 +#define EXTENT_MAX_COUNT 512 #include "cr_options.h" #include "imgset.h" @@ -221,6 +225,92 @@ static int copy_file_to_chunks(int fd, struct cr_img *img, size_t file_size) return 0; } +static int skip_outstanding(struct fiemap_extent *fe, size_t file_size) +{ + /* Returns 1 for an extent that starts at or beyond EOF (nothing to dump) */ + if (fe->fe_logical >= file_size) + return 1; + + /* Otherwise trim, in place, the part of the extent that sticks out past EOF */ + if (fe->fe_logical + fe->fe_length > file_size) + fe->fe_length = file_size - fe->fe_logical; + return 0; +} + +static int copy_file_to_chunks_fiemap(int fd, struct cr_img *img, size_t file_size) +{ + GhostChunkEntry ce 
= GHOST_CHUNK_ENTRY__INIT; + struct fiemap *fiemap_buf; + struct fiemap_extent *ext_buf; + int ext_buf_size, fie_buf_size; + off_t pos = 0; + unsigned int i; + int ret = 0; + int exit_code = 0; + + ext_buf_size = EXTENT_MAX_COUNT * sizeof(struct fiemap_extent); + fie_buf_size = sizeof(struct fiemap) + ext_buf_size; + + fiemap_buf = xzalloc(fie_buf_size); + if (!fiemap_buf) { + pr_perror("Out of memory when allocating fiemap"); + return -1; + } + + ext_buf = fiemap_buf->fm_extents; + fiemap_buf->fm_length = FIEMAP_MAX_OFFSET; + fiemap_buf->fm_flags |= FIEMAP_FLAG_SYNC; + fiemap_buf->fm_extent_count = EXTENT_MAX_COUNT; + + do { + fiemap_buf->fm_start = pos; + memzero(ext_buf, ext_buf_size); + ret = ioctl(fd, FS_IOC_FIEMAP, fiemap_buf); + if (ret < 0) { + if (errno == EOPNOTSUPP) { + exit_code = -EOPNOTSUPP; + } else { + exit_code = -1; + pr_perror("fiemap ioctl() failed"); + } + goto out; + } else if (fiemap_buf->fm_mapped_extents == 0) { + goto out; + } + + for (i = 0; i < fiemap_buf->fm_mapped_extents; i++) { + if (skip_outstanding(&fiemap_buf->fm_extents[i], file_size)) + goto out; /* extents come sorted by offset: the rest is past EOF too */ + + ce.len = fiemap_buf->fm_extents[i].fe_length; + ce.off = fiemap_buf->fm_extents[i].fe_logical; + + if (pb_write_one(img, &ce, PB_GHOST_CHUNK)) { + exit_code = -1; + goto out; + } + + if (copy_chunk_from_file(fd, img_raw_fd(img), ce.off, ce.len)) { + exit_code = -1; + goto out; + } + + if (fiemap_buf->fm_extents[i].fe_flags & FIEMAP_EXTENT_LAST) { + /* there are no extents left, break. */ + goto out; + } + } + + /* Resume the next FIEMAP query right after the last dumped chunk */ + pos = ce.len + ce.off; + + /* A full batch may be followed by more extents, continue until EOF. 
*/ + } while (fiemap_buf->fm_mapped_extents == EXTENT_MAX_COUNT && (size_t)pos < file_size); +out: + xfree(fiemap_buf); + return exit_code; +} + static int copy_chunk_to_file(int img, int fd, off_t off, size_t len) { int ret; @@ -913,10 +1003,20 @@ static int dump_ghost_file(int _fd, u32 id, const struct stat *st, dev_t phys_de goto err_out; } - if (gfe.chunks) - ret = copy_file_to_chunks(fd, img, st->st_size); - else + if (gfe.chunks) { + if (opts.ghost_fiemap) { + ret = copy_file_to_chunks_fiemap(fd, img, st->st_size); + if (ret == -EOPNOTSUPP) { + pr_debug("file system don't support fiemap\n"); + ret = copy_file_to_chunks(fd, img, st->st_size); + } + } else { + ret = copy_file_to_chunks(fd, img, st->st_size); + } + } else { ret = copy_file(fd, img_raw_fd(img), st->st_size); + } + close(fd); if (ret) goto err_out; diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index eacaa03a6..c7e98c756 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -95,6 +95,9 @@ enum FILE_VALIDATION_OPTIONS { /* This constant dictates which file validation method should be tried by default. */ #define FILE_VALIDATION_DEFAULT FILE_VALIDATION_BUILD_ID +/* This constant dictates that criu use fiemap to copy ghost file by default.*/ +#define FIEMAP_DEFAULT 1 + struct irmap; struct irmap_path_opt { @@ -167,6 +170,7 @@ struct cr_options { int enable_external_masters; bool aufs; /* auto-detected, not via cli */ bool overlayfs; + int ghost_fiemap; #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED bool has_binfmt_misc; /* auto-detected */ #endif From bdbccc315a3774cfcf9f160e856b7fad0495cea8 Mon Sep 17 00:00:00 2001 From: Liang-Chun Chen Date: Sun, 11 Sep 2022 15:26:29 +0800 Subject: [PATCH 095/775] zdtm: add two tests for highly sparse ghost file ghost_multi_hole00 and ghost_multi_hole01 are tests which create a ghost file with a lot of holes, there are 4K data and 4K hole inside every 8K length. 
The only difference between them is ghost-fiemap option, 01 is a test for the fiemap dumping algorithm, and we want to test the behavior of EXTENT_MAX_COUNT part, so the file size should be 8M, thus there will be 1024 chunks in the ghost file. In some file system, such as xfs, we somehow can not easily create highly sparse file as in ext4 or btrfs, therefore we need `fallocate` to forcibly create holes. Signed-off-by: Liang-Chun Chen --- test/zdtm/static/Makefile | 2 + test/zdtm/static/ghost_multi_hole00.c | 122 +++++++++++++++++++++++ test/zdtm/static/ghost_multi_hole00.desc | 1 + test/zdtm/static/ghost_multi_hole01.c | 1 + test/zdtm/static/ghost_multi_hole01.desc | 1 + 5 files changed, 127 insertions(+) create mode 100644 test/zdtm/static/ghost_multi_hole00.c create mode 100644 test/zdtm/static/ghost_multi_hole00.desc create mode 120000 test/zdtm/static/ghost_multi_hole01.c create mode 100644 test/zdtm/static/ghost_multi_hole01.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index edac92c83..000488133 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -308,6 +308,8 @@ TST_FILE = \ ghost_holes02 \ ghost_holes_large00 \ ghost_holes_large01 \ + ghost_multi_hole00 \ + ghost_multi_hole01 \ unlink_largefile \ mtime_mmap \ fifo \ diff --git a/test/zdtm/static/ghost_multi_hole00.c b/test/zdtm/static/ghost_multi_hole00.c new file mode 100644 index 000000000..0f78d4f14 --- /dev/null +++ b/test/zdtm/static/ghost_multi_hole00.c @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test ghost with a lot of holes(every 8K length contains only 4K data)"; +const char *test_author = "Liang-Chun Chen "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +/* Buffer that is suitable for hole size */ +#define BUFSIZE 4096 +static unsigned char buf4k[BUFSIZE]; + +#ifndef SEEK_DATA +#define SEEK_DATA 3 +#define 
SEEK_HOLE 4 +#endif + +#define FILE_SIZE (1 << 23) /* 8Mb */ + +#define FILE_INTERVAL (1 << 13) /* 8Kb */ + +int main(int argc, char **argv) +{ + int fd, off; + struct stat st; + uint32_t crc; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink %s", filename); + goto failed; + } + + for (off = 0; off < FILE_SIZE; off += FILE_INTERVAL) { + crc = ~0; + datagen(buf4k, BUFSIZE, &crc); + if (pwrite(fd, &buf4k, BUFSIZE, off) != BUFSIZE) { + perror("pwrite"); + goto failed; + } + + /* + * In some file system, such as xfs, + * only pwrite might not able to create highly sparse file, + * so we need to forcibly allocate hole inside the file. + */ + if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off + BUFSIZE, BUFSIZE)) { + perror("fallocate"); + goto failed; + } + } + + if (ftruncate(fd, FILE_SIZE)) { + pr_perror("Can't fixup file size"); + goto failed; + } + + test_daemon(); + test_waitsig(); + + if (fstat(fd, &st) < 0) { + fail("can't stat after"); + goto failed; + } + + if (st.st_size != FILE_SIZE) { + fail("file size changed to %ld", (long)st.st_size); + goto failed; + } + + test_msg("Size %u OK\n", FILE_SIZE); + + /* Data: the first 4K of every 8K interval must hold the generated pattern */ + for (off = 0; off < FILE_SIZE; off += FILE_INTERVAL) { + if (pread(fd, buf4k, BUFSIZE, off) != BUFSIZE) { + fail("pread failed @ %u", off / FILE_INTERVAL); + goto failed; + } + + crc = ~0; + if (datachk(buf4k, BUFSIZE, &crc)) { + fail("datachk failed @ %u", off / FILE_INTERVAL); + goto failed; + } + + test_msg("Data @%d OK\n", off / FILE_INTERVAL); + } + + /* Hole: the first hole found from each interval start must begin right after its 4K of data */ + for (off = 0; off < FILE_SIZE; off += FILE_INTERVAL) { + if (lseek(fd, off, SEEK_HOLE) != off + BUFSIZE) { + fail("failed to find hole @ %u", off / FILE_INTERVAL); + goto failed; + } + test_msg("Hole @%d OK\n", off / FILE_INTERVAL); + } + + close(fd); + pass(); + return 0; + +failed: + close(fd); + return 1; +} 
diff --git a/test/zdtm/static/ghost_multi_hole00.desc b/test/zdtm/static/ghost_multi_hole00.desc new file mode 100644 index 000000000..3981e8180 --- /dev/null +++ b/test/zdtm/static/ghost_multi_hole00.desc @@ -0,0 +1 @@ +{'dopts': '--ghost-limit 8M --no-ghost-fiemap'} diff --git a/test/zdtm/static/ghost_multi_hole01.c b/test/zdtm/static/ghost_multi_hole01.c new file mode 120000 index 000000000..c75006a6b --- /dev/null +++ b/test/zdtm/static/ghost_multi_hole01.c @@ -0,0 +1 @@ +ghost_multi_hole00.c \ No newline at end of file diff --git a/test/zdtm/static/ghost_multi_hole01.desc b/test/zdtm/static/ghost_multi_hole01.desc new file mode 100644 index 000000000..d1dc68a54 --- /dev/null +++ b/test/zdtm/static/ghost_multi_hole01.desc @@ -0,0 +1 @@ +{'dopts': '--ghost-limit 8M --ghost-fiemap'} From e5ccfbb240da1638f81db5fc0fa32998c8b9fca9 Mon Sep 17 00:00:00 2001 From: Shubham Verma Date: Fri, 2 Dec 2022 01:52:20 +0530 Subject: [PATCH 096/775] Fix typo in comment Signed-off-by: Shubham Verma --- test/zdtm/static/s390x_regs_check.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/zdtm/static/s390x_regs_check.c b/test/zdtm/static/s390x_regs_check.c index 40c480b3f..82dca0519 100644 --- a/test/zdtm/static/s390x_regs_check.c +++ b/test/zdtm/static/s390x_regs_check.c @@ -40,13 +40,13 @@ const char *test_author = "Michael Holzheu "; * * - Verify that "criu restore" sets the correct register sets * from "criu dump": - * $ zdtmp.py run -t zdtm/static/s390x_regs_check + * $ zdtm.py run -t zdtm/static/s390x_regs_check * * - Verify that dumpee continues running with correct registers after * parasite injection: - * $ zdtmp.py run --norst -t zdtm/static/s390x_regs_check - * $ zdtmp.py run --norst --pre 2 -t zdtm/static/s390x_regs_check - * $ zdtmp.py run --check-only -t zdtm/static/s390x_regs_check + * $ zdtm.py run --norst -t zdtm/static/s390x_regs_check + * $ zdtm.py run --norst --pre 2 -t zdtm/static/s390x_regs_check + * $ zdtm.py run --check-only -t 
zdtm/static/s390x_regs_check */ #define NR_THREADS 2 #define NR_THREADS_ALL (NR_THREADS + 1) From 2180e03b904e38dd2f246581b29373b06a7fdd58 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Mon, 14 Nov 2022 09:23:00 -0800 Subject: [PATCH 097/775] non-root: Rework socket bufs for unprivileged mode SO_SNDBUFFORCE/SO_RCVBUFFORCE require root or CAP_NET_ADMIN. We can use SO_SNDBUF/SO_RCVBUF in some cases and avoid needing elevated privileges. This patch renames sk_setbufs() to sk_setbufs_ns() and makes sk_setbufs() a general helper that sets socket send and receive buffer sizes. The helper tries to use SO_SNDBUFFORCE/SO_RCVBUFFORCE first and falls back to SO_SNDBUF/SO_RCVBUF if we're in unprivileged mode. The existing sk_setbufs_ns() which takes a pid parameter and is intended to be called via userns_call() is rewritten to call sk_setbufs(). Existing code that sets buffer sizes via setsockopt() is modified to call sk_setbufs() instead. Signed-off-by: Younes Manton --- criu/fdstore.c | 15 ++------------- criu/include/sockets.h | 1 + criu/pidfd-store.c | 5 ++--- criu/sockets.c | 32 ++++++++++++++++++++++++-------- 4 files changed, 29 insertions(+), 24 deletions(-) diff --git a/criu/fdstore.c b/criu/fdstore.c index 03afa9f17..d615ad15d 100644 --- a/criu/fdstore.c +++ b/criu/fdstore.c @@ -15,6 +15,7 @@ #include "util.h" #include "cr_options.h" #include "util-caps.h" +#include "sockets.h" /* clang-format off */ static struct fdstore_desc { @@ -29,8 +30,6 @@ int fdstore_init(void) uint32_t buf[2] = { INT_MAX / 2, INT_MAX / 2 }; struct sockaddr_un addr; unsigned int addrlen; - int rcv_opt_name; - int snd_opt_name; struct stat st; int sk, ret; @@ -53,17 +52,7 @@ int fdstore_init(void) return -1; } - if (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) { - rcv_opt_name = SO_RCVBUFFORCE; - snd_opt_name = SO_SNDBUFFORCE; - } else { - rcv_opt_name = SO_RCVBUF; - snd_opt_name = SO_SNDBUF; - } - - if (setsockopt(sk, SOL_SOCKET, snd_opt_name, &buf[0], sizeof(buf[0])) < 0 || - 
setsockopt(sk, SOL_SOCKET, rcv_opt_name, &buf[1], sizeof(buf[1])) < 0) { - pr_perror("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE"); + if (sk_setbufs(sk, buf)) { close(sk); return -1; } diff --git a/criu/include/sockets.h b/criu/include/sockets.h index 399d38664..c3e7c879a 100644 --- a/criu/include/sockets.h +++ b/criu/include/sockets.h @@ -27,6 +27,7 @@ struct socket_desc { extern int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *); extern int dump_socket_opts(int sk, SkOptsEntry *soe); extern int restore_socket_opts(int sk, SkOptsEntry *soe); +extern int sk_setbufs(int sk, uint32_t *bufs); extern void release_skopts(SkOptsEntry *); extern int restore_prepare_socket(int sk); extern void preload_socket_modules(void); diff --git a/criu/pidfd-store.c b/criu/pidfd-store.c index b15568e08..9fdc74cb7 100644 --- a/criu/pidfd-store.c +++ b/criu/pidfd-store.c @@ -13,6 +13,7 @@ #include "log.h" #include "util.h" #include "pidfd-store.h" +#include "sockets.h" struct pidfd_entry { pid_t pid; @@ -94,9 +95,7 @@ int init_pidfd_store_sk(pid_t pid, int sk) * This is similar to how fdstore_init() works. 
*/ if (addrlen == sizeof(sa_family_t)) { - if (setsockopt(pidfd_store_sk, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0], sizeof(buf[0])) < 0 || - setsockopt(pidfd_store_sk, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1], sizeof(buf[1])) < 0) { - pr_perror("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE"); + if (sk_setbufs(pidfd_store_sk, buf)) { goto err; } diff --git a/criu/sockets.c b/criu/sockets.c index db772707b..7708344d6 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -29,6 +29,7 @@ #include "pstree.h" #include "util.h" #include "fdstore.h" +#include "cr_options.h" #undef LOG_PREFIX #define LOG_PREFIX "sockets: " @@ -465,18 +466,33 @@ int do_restore_opt(int sk, int level, int name, void *val, int len) return 0; } -static int sk_setbufs(void *arg, int fd, pid_t pid) +int sk_setbufs(int sk, uint32_t *bufs) { - u32 *buf = (u32 *)arg; + uint32_t sndbuf = bufs[0], rcvbuf = bufs[1]; - if (restore_opt(fd, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0])) - return -1; - if (restore_opt(fd, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1])) - return -1; + if (setsockopt(sk, SOL_SOCKET, SO_SNDBUFFORCE, &sndbuf, sizeof(sndbuf)) || + setsockopt(sk, SOL_SOCKET, SO_RCVBUFFORCE, &rcvbuf, sizeof(rcvbuf))) { + if (opts.unprivileged) { + pr_info("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE, falling back to SO_SNDBUF/SO_RCVBUF\n"); + if (setsockopt(sk, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf)) || + setsockopt(sk, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf))) { + pr_perror("Unable to set socket SO_SNDBUF/SO_RCVBUF"); + return -1; + } + } else { + pr_perror("Unable to set socket SO_SNDBUFFORCE/SO_RCVBUFFORCE"); + return -1; + } + } return 0; } +static int sk_setbufs_ns(void *arg, int fd, pid_t pid) +{ + return sk_setbufs(fd, (uint32_t *)arg); +} + /* * Set sizes of buffers to maximum and prevent blocking * Caller of this fn should call other socket restoring @@ -489,7 +505,7 @@ int restore_prepare_socket(int sk) /* In kernel a bufsize has type int and a value is doubled. 
*/ u32 maxbuf[2] = { INT_MAX / 2, INT_MAX / 2 }; - if (userns_call(sk_setbufs, 0, maxbuf, sizeof(maxbuf), sk)) + if (userns_call(sk_setbufs_ns, 0, maxbuf, sizeof(maxbuf), sk)) return -1; /* Prevent blocking on restore */ @@ -517,7 +533,7 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) pr_info("%d restore sndbuf %d rcv buf %d\n", sk, soe->so_sndbuf, soe->so_rcvbuf); /* setsockopt() multiplies the input values by 2 */ - ret |= userns_call(sk_setbufs, 0, bufs, sizeof(bufs), sk); + ret |= userns_call(sk_setbufs_ns, 0, bufs, sizeof(bufs), sk); if (soe->has_so_buf_lock) { pr_debug("\trestore buf_lock %d for socket\n", soe->so_buf_lock); From 5a19c34322c9fe212dfd0b94bdbf19e44fd24eeb Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Mon, 21 Nov 2022 11:14:20 -0800 Subject: [PATCH 098/775] non-root: Don't dump socket option SO_MARK if 0 Restoring SO_MARK requires root or CAP_NET_ADMIN. If the value is 0 we will avoid dumping it so that we don't need to do a privileged call on restore. Signed-off-by: Younes Manton --- criu/sockets.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/criu/sockets.c b/criu/sockets.c index 7708344d6..c99fc7b50 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -647,8 +647,13 @@ int dump_socket_opts(int sk, SkOptsEntry *soe) ret |= dump_opt(sk, SOL_SOCKET, SO_PRIORITY, &soe->so_priority); soe->has_so_rcvlowat = true; ret |= dump_opt(sk, SOL_SOCKET, SO_RCVLOWAT, &soe->so_rcvlowat); - soe->has_so_mark = true; + /* + * Restoring SO_MARK requires root or CAP_NET_ADMIN. Avoid saving it + * in unprivileged mode if still has its default value. 
+ */ ret |= dump_opt(sk, SOL_SOCKET, SO_MARK, &soe->so_mark); + if (soe->so_mark != 0) + soe->has_so_mark = true; ret |= dump_opt(sk, SOL_SOCKET, SO_SNDTIMEO, &tv); soe->so_snd_tmo_sec = tv.tv_sec; From 5c60d35be4e51bd2654e74d5bb2d09772f4ccaae Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 12 Dec 2022 09:32:58 -0800 Subject: [PATCH 099/775] sockets: tiny style fix Signed-off-by: Andrei Vagin --- criu/sockets.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/criu/sockets.c b/criu/sockets.c index c99fc7b50..d17e0a986 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -652,8 +652,7 @@ int dump_socket_opts(int sk, SkOptsEntry *soe) * in unprivileged mode if still has its default value. */ ret |= dump_opt(sk, SOL_SOCKET, SO_MARK, &soe->so_mark); - if (soe->so_mark != 0) - soe->has_so_mark = true; + soe->has_so_mark = !!soe->so_mark; ret |= dump_opt(sk, SOL_SOCKET, SO_SNDTIMEO, &tv); soe->so_snd_tmo_sec = tv.tv_sec; From 9686693aa6646254e5d8b1e0b2e9e885aad97b9c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 13 Dec 2022 09:39:49 -0800 Subject: [PATCH 100/775] test/javaTests: update org.testng:testng (Maven) TestNG is vulnerable to Path Traversal Fixes https://github.com/checkpoint-restore/criu/security/dependabot/1. 
Signed-off-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- scripts/build/Dockerfile.hotspot-alpine | 2 +- scripts/build/Dockerfile.hotspot-ubuntu | 2 +- scripts/build/Dockerfile.openj9-ubuntu | 2 +- test/javaTests/pom.xml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/build/Dockerfile.hotspot-alpine b/scripts/build/Dockerfile.hotspot-alpine index d6e6e5130..cb9332fd0 100644 --- a/scripts/build/Dockerfile.hotspot-alpine +++ b/scripts/build/Dockerfile.hotspot-alpine @@ -1,4 +1,4 @@ -FROM docker.io/library/eclipse-temurin:8-alpine +FROM docker.io/library/eclipse-temurin:11-alpine ARG CC=gcc RUN apk update && apk add \ diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 8936adf81..350102818 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -1,4 +1,4 @@ -FROM docker.io/library/eclipse-temurin:8-focal +FROM docker.io/library/eclipse-temurin:11-focal ARG CC=gcc COPY scripts/ci/apt-install /bin/apt-install diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 2e35358ff..23db14e8d 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -1,4 +1,4 @@ -FROM docker.io/library/ibm-semeru-runtimes:open-8-jdk-focal +FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-focal ARG CC=gcc COPY scripts/ci/apt-install /bin/apt-install diff --git a/test/javaTests/pom.xml b/test/javaTests/pom.xml index faae44d1b..ddb6c89cf 100644 --- a/test/javaTests/pom.xml +++ b/test/javaTests/pom.xml @@ -38,7 +38,7 @@ org.testng testng - 6.3.1 + 7.7.0 From 21f5be91a90a420c7e183b61f0a64ab7f0f8f137 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 1 Jan 2023 17:18:52 -0800 Subject: [PATCH 101/775] cgroups: ignore EOPNOTSUPP on setting memory.kmem.limit_in_byte memory.kmem.limit_in_bytes has been deprecated. 
Look at e7c4184164f7 ("memcg, kmem: further deprecate kmem.limit_in_bytes") for more details. Signed-off-by: Andrei Vagin --- criu/cgroup.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index 918827d99..0c730713a 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -1402,7 +1402,7 @@ static int restore_cgroup_subtree_control(const CgroupPropEntry *cg_prop_entry_p static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *path, int off, bool split_lines, bool skip_fails) { - int cg, fd, ret = -1, flag; + int cg, fd, exit_code = -1, flag; CgroupPerms *perms = cg_prop_entry_p->perms; int is_subtree_control = !strcmp(cg_prop_entry_p->name, "cgroup.subtree_control"); @@ -1438,18 +1438,18 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *pat /* skip these two since restoring their values doesn't make sense */ if (!strcmp(cg_prop_entry_p->name, "cgroup.procs") || !strcmp(cg_prop_entry_p->name, "tasks")) { - ret = 0; + exit_code = 0; goto out; } if (is_subtree_control) { - ret = restore_cgroup_subtree_control(cg_prop_entry_p, fd); + exit_code = restore_cgroup_subtree_control(cg_prop_entry_p, fd); goto out; } /* skip restoring cgroup.type if its value is not "threaded" */ if (!strcmp(cg_prop_entry_p->name, "cgroup.type") && strcmp(cg_prop_entry_p->value, "threaded")) { - ret = 0; + exit_code = 0; goto out; } @@ -1471,21 +1471,28 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *pat } while (*next_line != '\0'); } else { size_t len = strlen(cg_prop_entry_p->value); + int ret; - if (write(fd, cg_prop_entry_p->value, len) != len) { + ret = write(fd, cg_prop_entry_p->value, len); + /* memory.kmem.limit_in_bytes has been deprecated. Look at + * 58056f77502f3 ("memcg, kmem: further deprecate + * kmem.limit_in_bytes") for more details. 
*/ + if (ret == -1 && errno == EOPNOTSUPP && + !strcmp(cg_prop_entry_p->name, "memory.kmem.limit_in_bytes")) + ret = len; + if (ret != len) { pr_perror("Failed writing %s to %s", cg_prop_entry_p->value, path); if (!skip_fails) goto out; } } - ret = 0; - + exit_code = 0; out: if (close(fd) != 0) pr_perror("Failed closing %s", path); - return ret; + return exit_code; } static CgroupPropEntry *freezer_state_entry; From a302b369408ab9a41f27d993e1e61c25bd3c90c2 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 5 Jan 2023 17:47:21 +0000 Subject: [PATCH 102/775] zdtm: fix 'zdtm.py list' command The command ./zdtm.py list currently fails with if opts['rootless']: ~~~~^^^^^^^^^^^^ KeyError: 'rootless' Signed-off-by: Radostin Stoyanov --- test/zdtm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index a311610c3..33859f61e 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -626,14 +626,14 @@ class zdtm_test: ["make", "zdtm_ct"], env=dict(os.environ, MAKEFLAGS="")) if not os.access("zdtm/lib/libzdtmtst.a", os.F_OK): subprocess.check_call(["make", "-C", "zdtm/"]) - if opts['rootless']: + if 'rootless' in opts and opts['rootless']: return subprocess.check_call( ["flock", "zdtm_mount_cgroups.lock", "./zdtm_mount_cgroups", str(uuid)]) @staticmethod def cleanup(): - if opts['rootless']: + if 'rootless' in opts and opts['rootless']: return subprocess.check_call( ["flock", "zdtm_mount_cgroups.lock", "./zdtm_umount_cgroups", str(uuid)]) From bb3f7bef66d4a951e132035ff89d44b01f65e8e5 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 4 Jan 2023 17:15:44 +0300 Subject: [PATCH 103/775] crtools: fix help message alignment for --network-lock Fixes: 2e30db5c3 ("criu: add --network-lock option to allow nftables alternative") Signed-off-by: Pavel Tikhomirov --- criu/crtools.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/criu/crtools.c b/criu/crtools.c index ac05bc821..4258ce388 100644 --- 
a/criu/crtools.c +++ b/criu/crtools.c @@ -414,9 +414,8 @@ usage: " --mntns-compat-mode Use mount engine in compatibility mode. By default criu\n" " tries to use mount-v2 mode with more reliable algorithm\n" " based on MOVE_MOUNT_SET_GROUP kernel feature\n" - " --network-lock METHOD\n" - " network locking/unlocking method; argument\n" - " can be 'nftables' or 'iptables' (default).\n" + " --network-lock METHOD network locking/unlocking method; argument\n" + " can be 'nftables' or 'iptables' (default).\n" " --unprivileged accept limitations when running as non-root\n" " consult documentation for further details\n" "\n" From 7c6eb0b85c3bfca03f8e56389691ee10275055ab Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Mon, 9 Jan 2023 13:28:48 +0100 Subject: [PATCH 104/775] asm: fix for_each_bit macro find_next_bit operates on a bit instead of byte positions/sizes. Signed-off-by: Michal Clapinski --- include/common/arch/ppc64/asm/bitops.h | 7 ++++--- include/common/arch/s390/asm/bitops.h | 7 ++++--- include/common/arch/x86/asm/bitops.h | 7 ++++--- include/common/asm-generic/bitops.h | 7 ++++--- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/include/common/arch/ppc64/asm/bitops.h b/include/common/arch/ppc64/asm/bitops.h index dbfa6be7f..54d55da16 100644 --- a/include/common/arch/ppc64/asm/bitops.h +++ b/include/common/arch/ppc64/asm/bitops.h @@ -46,6 +46,7 @@ #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) #define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] +#define BITMAP_SIZE(name) (sizeof(name) * CHAR_BIT) #define __stringify_in_c(...) #__VA_ARGS__ #define stringify_in_c(...) 
__stringify_in_c(__VA_ARGS__) " " @@ -202,8 +203,8 @@ found_middle: return result + __ffs(tmp); } -#define for_each_bit(i, bitmask) \ - for (i = find_next_bit(bitmask, sizeof(bitmask), 0); i < sizeof(bitmask); \ - i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), 0); i < BITMAP_SIZE(bitmask); \ + i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), i + 1)) #endif /* __CR_BITOPS_H__ */ diff --git a/include/common/arch/s390/asm/bitops.h b/include/common/arch/s390/asm/bitops.h index f396721e9..22547c544 100644 --- a/include/common/arch/s390/asm/bitops.h +++ b/include/common/arch/s390/asm/bitops.h @@ -10,6 +10,7 @@ #define __BITOPS_WORDS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG) #define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] +#define BITMAP_SIZE(name) (sizeof(name) * CHAR_BIT) static inline unsigned long *__bitops_word(unsigned long nr, volatile unsigned long *ptr) { @@ -143,8 +144,8 @@ static inline unsigned long find_next_bit(const unsigned long *addr, unsigned lo return _find_next_bit(addr, size, offset, 0UL); } -#define for_each_bit(i, bitmask) \ - for (i = find_next_bit(bitmask, sizeof(bitmask), 0); i < sizeof(bitmask); \ - i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), 0); i < BITMAP_SIZE(bitmask); \ + i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), i + 1)) #endif /* _S390_BITOPS_H */ diff --git a/include/common/arch/x86/asm/bitops.h b/include/common/arch/x86/asm/bitops.h index c13c1eb45..f3c7dbbdf 100644 --- a/include/common/arch/x86/asm/bitops.h +++ b/include/common/arch/x86/asm/bitops.h @@ -10,6 +10,7 @@ #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) #define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] +#define BITMAP_SIZE(name) (sizeof(name) * CHAR_BIT) #if __GNUC__ < 4 || (__GNUC__ == 4 && 
__GNUC_MINOR__ < 1) /* Technically wrong, but this avoids compilation errors on some gcc @@ -119,8 +120,8 @@ found_middle: return result + __ffs(tmp); } -#define for_each_bit(i, bitmask) \ - for (i = find_next_bit(bitmask, sizeof(bitmask), 0); i < sizeof(bitmask); \ - i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), 0); i < BITMAP_SIZE(bitmask); \ + i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), i + 1)) #endif /* __CR_BITOPS_H__ */ diff --git a/include/common/asm-generic/bitops.h b/include/common/asm-generic/bitops.h index 004da4c4e..d8f38091d 100644 --- a/include/common/asm-generic/bitops.h +++ b/include/common/asm-generic/bitops.h @@ -14,6 +14,7 @@ #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) #define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] +#define BITMAP_SIZE(name) (sizeof(name) * CHAR_BIT) #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) /* Technically wrong, but this avoids compilation errors on some gcc @@ -103,8 +104,8 @@ found_middle: return result + __ffs(tmp); } -#define for_each_bit(i, bitmask) \ - for (i = find_next_bit(bitmask, sizeof(bitmask), 0); i < sizeof(bitmask); \ - i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), 0); i < BITMAP_SIZE(bitmask); \ + i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), i + 1)) #endif /* __CR_GENERIC_BITOPS_H__ */ From aab709b602a142ffd72e0e00a9c575cc047e1fca Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 13 Dec 2018 18:15:49 +0300 Subject: [PATCH 105/775] log: Write more details in write_pidfile Signed-off-by: Cyrill Gorcunov Signed-off-by: Pavel Tikhomirov --- criu/log.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/criu/log.c b/criu/log.c index c4ce90ec0..e31f24e39 100644 --- a/criu/log.c +++ b/criu/log.c @@ -397,15 +397,28 @@ 
void print_on_level(unsigned int loglevel, const char *format, ...) int write_pidfile(int pid) { - int fd; + int fd, ret, exit_code = -1; fd = open(opts.pidfile, O_WRONLY | O_EXCL | O_CREAT, 0600); if (fd == -1) { - pr_perror("Can't open %s", opts.pidfile); + pr_perror("pidfile: Can't open %s", opts.pidfile); return -1; } - dprintf(fd, "%d", pid); + ret = dprintf(fd, "%d", pid); + if (ret < 0) { + pr_perror("pidfile: Can't write pid %d to %s", pid, opts.pidfile); + goto close; + } + + if (ret == 0) { + pr_err("pidfile: Can't write pid %d to %s\n", pid, opts.pidfile); + goto close; + } + + pr_debug("pidfile: Wrote pid %d to %s (%d bytes)\n", pid, opts.pidfile, ret); + exit_code = 0; +close: close(fd); - return 0; + return exit_code; } From c8b4fb9ba5aa4975c842d6b46daac41f3c6beeaf Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 12 Jan 2023 18:06:26 +0300 Subject: [PATCH 106/775] autofs: fix a frankenstein auto-created by clang-format Fixes: 93dd984ca ("Run 'make indent' on all C files") Signed-off-by: Pavel Tikhomirov --- criu/autofs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/criu/autofs.c b/criu/autofs.c index c662bea60..6a7d8db0d 100644 --- a/criu/autofs.c +++ b/criu/autofs.c @@ -431,8 +431,7 @@ static int access_autofs_mount(struct mount_info *pm) pr_err("failed to fork\n"); goto close_autofs_mnt; case 0: - /* We don't care about results. 
- * All we need is to "touch" */ + /* We don't care about results, all we need is to "touch" */ /* coverity[check_return] */ openat(autofs_mnt, mnt_path, O_RDONLY | O_NONBLOCK | O_DIRECTORY); _exit(0); From abfe0b5d2469f5697a9afa0bd8089ced95d3a246 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 16 Jan 2023 18:22:07 +0300 Subject: [PATCH 107/775] clang-format: add for_each_bit macros to ForEachMacros Signed-off-by: Pavel Tikhomirov --- .clang-format | 1 + scripts/fetch-clang-format.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/.clang-format b/.clang-format index 96ba5909f..b64ade9d9 100644 --- a/.clang-format +++ b/.clang-format @@ -71,6 +71,7 @@ FixNamespaceComments: false # Unknown to clang-format-4.0 # | sort | uniq ForEachMacros: - 'for_each_pstree_item' + - 'for_each_bit' - 'apei_estatus_for_each_section' - 'ata_for_each_dev' - 'ata_for_each_link' diff --git a/scripts/fetch-clang-format.sh b/scripts/fetch-clang-format.sh index 0e9545f2d..b93a804a1 100755 --- a/scripts/fetch-clang-format.sh +++ b/scripts/fetch-clang-format.sh @@ -10,6 +10,7 @@ curl -s "${URL}" | sed -e " s,ControlStatements,ControlStatementsExceptForEachMacros,g; s,ColumnLimit: 80,ColumnLimit: 120,g; s,Intended for clang-format >= 4,Intended for clang-format >= 11,g; + s,ForEachMacros:,ForEachMacros:\n - 'for_each_bit',g; s,ForEachMacros:,ForEachMacros:\n - 'for_each_pstree_item',g; s,\(AlignTrailingComments:.*\)$,\1\nAlignConsecutiveMacros: true,g; s,AlignTrailingComments: false,AlignTrailingComments: true,g; From 1bb84f96f53dc44c6d559ac2bdaaa58f439c5aa2 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 16 Jan 2023 18:30:32 +0300 Subject: [PATCH 108/775] tty: fix codding-style around for_each_bit call Wrapping "{" to next line after for-each macros is wrong.
Signed-off-by: Pavel Tikhomirov --- criu/tty.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/criu/tty.c b/criu/tty.c index 13f645f3a..199984ec0 100644 --- a/criu/tty.c +++ b/criu/tty.c @@ -398,8 +398,7 @@ static int tty_verify_active_pairs(void) { unsigned long i, unpaired_slaves = 0; - for_each_bit(i, tty_active_pairs) - { + for_each_bit(i, tty_active_pairs) { if ((i % 2) == 0) { if (test_bit(i + 1, tty_active_pairs)) { i++; From a918093cebd600b72d42d63aa92bcb80586c49b6 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 16 Jan 2023 16:48:26 +0000 Subject: [PATCH 109/775] scripts/ci: use Fedora 37 for vagrant based tests Signed-off-by: Adrian Reber --- scripts/ci/vagrant.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index e23486f29..a3e4b6937 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -7,8 +7,8 @@ set -e set -x VAGRANT_VERSION=2.2.19 -FEDORA_VERSION=35 -FEDORA_BOX_VERSION=35.20211026.0 +FEDORA_VERSION=37 +FEDORA_BOX_VERSION=37.20221105.0 setup() { if [ -n "$TRAVIS" ]; then From be61624f4504930716c5c365f5e7a0843fb224ba Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 11 Jan 2023 11:15:37 +0300 Subject: [PATCH 110/775] clang-format: rework make indent to check specific commits Previously "make indent" checked all files in criu source directory for coding style flaws.
We have several problems with it: - clang-format default format sometimes changes in new versions of the package and we need to reformat all our code base each time it happens - on different systems we may have different versions of clang-format and on latest criu-dev "make indent" may be still unhappy on your system - when we want to update clang-format rules ourselves we need to update all our code base each time - sometimes clang-format rules are not fitting all our cases, (e.g.: an option IndentGotoLabels works nice for simple C code, but is a no go for assembler and C macros) and putting "clang-format off" everywhere is a mess - sometimes we intentionally want to break clang-format rules (e.g.: we want to put function arguments on a new line separating them "logically" not "mechanically" following 120-char rule like clang-format does). This adds a BASE option for "make indent" where all commits in range BASE..HEAD would be checked with git-clang-format for coding style flaws. For instance when developing on top of criu-dev, one can use "make BASE=origin/criu-dev indent" to check all their commits for compliance with the clang-format rules. Default base is HEAD~1 to make last commit checked when "make indent" is called. The closest thing to the old behaviour would then be "make indent BASE=init", note that only committed files would be checked. Extra options to git-clang-format may be passed through OPTS variable. Also reuse "make indent" in github lint workflow. Signed-off-by: Pavel Tikhomirov --- .github/workflows/lint.yml | 16 ++++++---------- Makefile | 4 +++- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 4c05285e6..0194c9393 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -23,16 +23,12 @@ jobs: - name: Run make indent run: > if [ -z "${{github.base_ref}}" ]; then - make indent + if !
make indent OPTS=--diff; then + exit 1 + fi else git fetch origin ${{github.base_ref}} && - git clang-format --style file --extensions c,h --quiet origin/${{github.base_ref}} - fi && - STATUS=$(git status --porcelain) && - if [ ! -z "$STATUS" ]; then - echo "FAIL: some files are not correctly formatted."; - echo "$STATUS" - git diff - echo "FAIL: please run 'make indent'"; - exit 1; + if ! make indent OPTS=--diff BASE=origin/${{github.base_ref}}; then + exit 1 + fi fi diff --git a/Makefile b/Makefile index 537720339..ba1f77440 100644 --- a/Makefile +++ b/Makefile @@ -451,8 +451,10 @@ fetch-clang-format: .FORCE $(E) ".clang-format" $(Q) scripts/fetch-clang-format.sh +BASE ?= "HEAD~1" +OPTS ?= "--quiet" indent: - find . -name '*.[ch]' -type f -print0 | xargs --null --max-args 128 --max-procs 4 clang-format -i + git clang-format --style file --extensions c,h $(OPTS) $(BASE) .PHONY: indent include Makefile.install From d800ef65884841c17f134612ef5c36acefa6cdd7 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 21 Feb 2022 15:27:19 +0300 Subject: [PATCH 111/775] zdtm/lib: copy list.h Need it to use linux lists in zdtm. Also copy container_of from compiler.h to zdtmtst.h like we already do for e.g. __stack_aligned__ macro. Signed-off-by: Pavel Tikhomirov --- test/zdtm/lib/list.h | 389 ++++++++++++++++++++++++++++++++++++++++ test/zdtm/lib/zdtmtst.h | 6 + 2 files changed, 395 insertions(+) create mode 100644 test/zdtm/lib/list.h diff --git a/test/zdtm/lib/list.h b/test/zdtm/lib/list.h new file mode 100644 index 000000000..97d0f1e06 --- /dev/null +++ b/test/zdtm/lib/list.h @@ -0,0 +1,389 @@ +#ifndef __ZDTM_LIST_H__ +#define __ZDTM_LIST_H__ + +/* + * Double linked lists.
+ */ + +#include +#include "zdtmtst.h" + +#define POISON_POINTER_DELTA 0 +#define LIST_POISON1 ((void *)0x00100100 + POISON_POINTER_DELTA) +#define LIST_POISON2 ((void *)0x00200200 + POISON_POINTER_DELTA) + +struct list_head { + struct list_head *prev, *next; +}; + +#define LIST_HEAD_INIT(name) \ + { \ + &(name), &(name) \ + } +#define LIST_HEAD(name) struct list_head name = LIST_HEAD_INIT(name) + +static inline void INIT_LIST_HEAD(struct list_head *list) +{ + list->next = list; + list->prev = list; +} + +static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +static inline void __list_del(struct list_head *prev, struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +static inline void __list_del_entry(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = LIST_POISON1; + entry->prev = LIST_POISON2; +} + +static inline void list_replace(struct list_head *old, struct list_head *new) +{ + new->next = old->next; + new->next->prev = new; + new->prev = old->prev; + new->prev->next = new; +} + +static inline void list_replace_init(struct list_head *old, struct list_head *new) +{ + list_replace(old, new); + INIT_LIST_HEAD(old); +} + +static inline void list_del_init(struct list_head *entry) +{ + __list_del_entry(entry); + INIT_LIST_HEAD(entry); +} + +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del_entry(list); + list_add(list, head); +} + +static inline void list_move_tail(struct list_head *list, struct 
list_head *head) +{ + __list_del_entry(list); + list_add_tail(list, head); +} + +static inline int list_is_last(const struct list_head *list, const struct list_head *head) +{ + return list->next == head; +} + +static inline int list_is_first(const struct list_head *list, const struct list_head *head) +{ + return list->prev == head; +} + +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} + +static inline int list_empty_careful(const struct list_head *head) +{ + struct list_head *next = head->next; + return (next == head) && (next == head->prev); +} +static inline void list_rotate_left(struct list_head *head) +{ + struct list_head *first; + + if (!list_empty(head)) { + first = head->next; + list_move_tail(first, head); + } +} + +static inline int list_is_singular(const struct list_head *head) +{ + return !list_empty(head) && (head->next == head->prev); +} + +static inline void __list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) +{ + struct list_head *new_first = entry->next; + list->next = head->next; + list->next->prev = list; + list->prev = entry; + entry->next = list; + head->next = new_first; + new_first->prev = head; +} + +static inline void list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) +{ + if (list_empty(head)) + return; + if (list_is_singular(head) && (head->next != entry && head != entry)) + return; + if (entry == head) + INIT_LIST_HEAD(list); + else + __list_cut_position(list, head, entry); +} + +static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + + first->prev = prev; + prev->next = first; + + last->next = next; + next->prev = last; +} + +static inline void list_splice(const struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head, 
head->next); +} + +static inline void list_splice_tail(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head->prev, head); +} + +static inline void list_splice_init(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head, head->next); + INIT_LIST_HEAD(list); + } +} + +static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head->prev, head); + INIT_LIST_HEAD(list); + } +} + +#define list_entry(ptr, type, member) container_of(ptr, type, member) + +#define list_first_entry(ptr, type, member) list_entry((ptr)->next, type, member) + +#define list_for_each(pos, head) for (pos = (head)->next; pos != (head); pos = pos->next) + +#define list_for_each_prev(pos, head) for (pos = (head)->prev; pos != (head); pos = pos->prev) + +#define list_for_each_safe(pos, n, head) for (pos = (head)->next, n = pos->next; pos != (head); pos = n, n = pos->next) + +#define list_for_each_prev_safe(pos, n, head) \ + for (pos = (head)->prev, n = pos->prev; pos != (head); pos = n, n = pos->prev) + +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member); &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + +#define list_prepare_entry(pos, head, member) ((pos) ?: list_entry(head, typeof(*pos), member)) + +#define list_for_each_entry_continue(pos, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member); &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +#define list_for_each_entry_continue_reverse(pos, head, member) \ + for (pos = list_entry(pos->member.prev, 
typeof(*pos), member); &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + +#define list_for_each_entry_from(pos, head, member) \ + for (; &pos->member != (head); pos = list_entry(pos->member.next, typeof(*pos), member)) + +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +#define list_for_each_entry_safe_continue(pos, n, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +#define list_for_each_entry_safe_from(pos, n, head, member) \ + for (n = list_entry(pos->member.next, typeof(*pos), member); &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +#define list_for_each_entry_safe_reverse(pos, n, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member), \ + n = list_entry(pos->member.prev, typeof(*pos), member); \ + &pos->member != (head); pos = n, n = list_entry(n->member.prev, typeof(*n), member)) + +#define list_safe_reset_next(pos, n, member) n = list_entry(pos->member.next, typeof(*pos), member) + +/* + * Double linked lists with a single pointer list head. 
+ */ + +struct hlist_head { + struct hlist_node *first; +}; + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +#define HLIST_HEAD_INIT \ + { \ + .first = NULL \ + } +#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } +#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) + +static inline void INIT_HLIST_NODE(struct hlist_node *h) +{ + h->next = NULL; + h->pprev = NULL; +} + +static inline int hlist_unhashed(const struct hlist_node *h) +{ + return !h->pprev; +} + +static inline int hlist_empty(const struct hlist_head *h) +{ + return !h->first; +} + +static inline void __hlist_del(struct hlist_node *n) +{ + struct hlist_node *next = n->next; + struct hlist_node **pprev = n->pprev; + *pprev = next; + if (next) + next->pprev = pprev; +} + +static inline void hlist_del(struct hlist_node *n) +{ + __hlist_del(n); + n->next = LIST_POISON1; + n->pprev = LIST_POISON2; +} + +static inline void hlist_del_init(struct hlist_node *n) +{ + if (!hlist_unhashed(n)) { + __hlist_del(n); + INIT_HLIST_NODE(n); + } +} + +static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) +{ + struct hlist_node *first = h->first; + n->next = first; + if (first) + first->pprev = &n->next; + h->first = n; + n->pprev = &h->first; +} + +/* next must be != NULL */ +static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) +{ + n->pprev = next->pprev; + n->next = next; + next->pprev = &n->next; + *(n->pprev) = n; +} + +static inline void hlist_add_after(struct hlist_node *n, struct hlist_node *next) +{ + next->next = n->next; + n->next = next; + next->pprev = &n->next; + + if (next->next) + next->next->pprev = &next->next; +} + +/* after that we'll appear to be on some hlist and hlist_del will work */ +static inline void hlist_add_fake(struct hlist_node *n) +{ + n->pprev = &n->next; +} + +/* + * Move a list from one list head to another. Fixup the pprev + * reference of the first entry if it exists. 
+ */ +static inline void hlist_move_list(struct hlist_head *old, struct hlist_head *new) +{ + new->first = old->first; + if (new->first) + new->first->pprev = &new->first; + old->first = NULL; +} + +#define hlist_entry(ptr, type, member) container_of(ptr, type, member) + +#define hlist_for_each(pos, head) for (pos = (head)->first; pos; pos = pos->next) + +#define hlist_for_each_safe(pos, n, head) \ + for (pos = (head)->first; pos && ({ \ + n = pos->next; \ + 1; \ + }); \ + pos = n) + +#define hlist_entry_safe(ptr, type, member) (ptr) ? hlist_entry(ptr, type, member) : NULL + +#define hlist_for_each_entry(pos, head, member) \ + for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member); pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +#define hlist_for_each_entry_continue(pos, member) \ + for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member); pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +#define hlist_for_each_entry_from(pos, member) \ + for (; pos; pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +#define hlist_for_each_entry_safe(pos, n, head, member) \ + for (pos = hlist_entry_safe((head)->first, typeof(*pos), member); pos && ({ \ + n = pos->member.next; \ + 1; \ + }); \ + pos = hlist_entry_safe(n, typeof(*pos), member)) + +#endif /* __ZDTM_LIST_H__ */ diff --git a/test/zdtm/lib/zdtmtst.h b/test/zdtm/lib/zdtmtst.h index 105f3c11a..b0e25702e 100644 --- a/test/zdtm/lib/zdtmtst.h +++ b/test/zdtm/lib/zdtmtst.h @@ -219,4 +219,10 @@ static inline void cleanup_closep(void *p) extern int write_value(const char *path, const char *value); extern int read_value(const char *path, char *value, int size); +#define container_of(ptr, type, member) \ + ({ \ + const typeof(((type *)0)->member) *__mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); \ + }) + #endif /* _VIMITESU_H_ */ From 543501d5f88fda1ea3e059498798d02b3418e734 Mon Sep 17 00:00:00 2001 
From: Pavel Tikhomirov Date: Tue, 22 Feb 2022 11:49:10 +0300 Subject: [PATCH 112/775] zdtm/lib: copy xmalloc.h Need to use xzalloc in zdtm lib. Signed-off-by: Pavel Tikhomirov --- test/zdtm/lib/xmalloc.h | 68 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 test/zdtm/lib/xmalloc.h diff --git a/test/zdtm/lib/xmalloc.h b/test/zdtm/lib/xmalloc.h new file mode 100644 index 000000000..95e0d4043 --- /dev/null +++ b/test/zdtm/lib/xmalloc.h @@ -0,0 +1,68 @@ +#ifndef __ZDTM_XMALLOC_H__ +#define __ZDTM_XMALLOC_H__ + +#include +#include + +#ifndef pr_err +#error "Macro pr_err is needed." +#endif + +#define __xalloc(op, size, ...) \ + ({ \ + void *___p = op(__VA_ARGS__); \ + if (!___p) \ + pr_err("%s: Can't allocate %li bytes\n", __func__, (long)(size)); \ + ___p; \ + }) + +#define xstrdup(str) __xalloc(strdup, strlen(str) + 1, str) +#define xmalloc(size) __xalloc(malloc, size, size) +#define xzalloc(size) __xalloc(calloc, size, 1, size) +#define xrealloc(p, size) __xalloc(realloc, size, p, size) + +#define xfree(p) free(p) + +#define xrealloc_safe(pptr, size) \ + ({ \ + int __ret = -1; \ + void *new = xrealloc(*pptr, size); \ + if (new) { \ + *pptr = new; \ + __ret = 0; \ + } \ + __ret; \ + }) + +#define xmemdup(ptr, size) \ + ({ \ + void *new = xmalloc(size); \ + if (new) \ + memcpy(new, ptr, size); \ + new; \ + }) + +#define memzero_p(p) memset(p, 0, sizeof(*p)) +#define memzero(p, size) memset(p, 0, size) + +/* + * Helper for allocating trees with single xmalloc. + * This one advances the void *pointer on s bytes and + * returns the previous value. Use like this + * + * m = xmalloc(total_size); + * a = xptr_pull(&m, tree_root_t); + * a->b = xptr_pull(&m, leaf_a_t); + * a->c = xptr_pull(&m, leaf_c_t); + * ... 
+ */ +static inline void *xptr_pull_s(void **m, size_t s) +{ + void *ret = (*m); + (*m) += s; + return ret; +} + +#define xptr_pull(m, type) xptr_pull_s(m, sizeof(type)) + +#endif /* __CR_XMALLOC_H__ */ From 2837a13ef9f279bbe34b1030d4ee6e47ca010a4f Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 21 Feb 2022 15:35:41 +0300 Subject: [PATCH 113/775] zdtm: add mountinfo parsing to test lib For mount testing it is nice to be able to parse mountinfo from zdtm test itself, for instance to be able to compare mountinfo topology before and after c/r, or for anything else. So let's add a helper mntns_parse_mountinfo() which parses current mount namespace mountinfo. Signed-off-by: Pavel Tikhomirov --- test/zdtm/lib/Makefile | 2 +- test/zdtm/lib/mountinfo.c | 139 ++++++++++++++++++++++++++++++++++++++ test/zdtm/lib/mountinfo.h | 27 ++++++++ 3 files changed, 167 insertions(+), 1 deletion(-) create mode 100644 test/zdtm/lib/mountinfo.c create mode 100644 test/zdtm/lib/mountinfo.h diff --git a/test/zdtm/lib/Makefile b/test/zdtm/lib/Makefile index 949dc123a..90bd28f9e 100644 --- a/test/zdtm/lib/Makefile +++ b/test/zdtm/lib/Makefile @@ -4,7 +4,7 @@ CFLAGS += $(USERCFLAGS) LIB := libzdtmtst.a -LIBSRC := datagen.c msg.c parseargs.c test.c streamutil.c lock.c ns.c tcp.c unix.c fs.c sysctl.c mem.c file.c +LIBSRC := datagen.c msg.c parseargs.c test.c streamutil.c lock.c ns.c tcp.c unix.c fs.c sysctl.c mem.c file.c mountinfo.c PKG_CONFIG ?= pkg-config pkg-config-check = $(shell sh -c '$(PKG_CONFIG) $(1) && echo y') diff --git a/test/zdtm/lib/mountinfo.c b/test/zdtm/lib/mountinfo.c new file mode 100644 index 000000000..3e9d44581 --- /dev/null +++ b/test/zdtm/lib/mountinfo.c @@ -0,0 +1,139 @@ +#include +#include + +#include "mountinfo.h" +#include "fs.h" +#include "xmalloc.h" + +/* + * mountinfo contains mangled paths. space, tab and back slash were replaced + * with usual octal escape. This function replaces these symbols back. 
+ */ +static void cure_path(char *path) +{ + int i, len, off = 0; + + if (strchr(path, '\\') == NULL) /* fast path */ + return; + + len = strlen(path); + for (i = 0; i < len; i++) { + if (!strncmp(path + i, "\\040", 4)) { + path[i - off] = ' '; + goto replace; + } else if (!strncmp(path + i, "\\011", 4)) { + path[i - off] = '\t'; + goto replace; + } else if (!strncmp(path + i, "\\134", 4)) { + path[i - off] = '\\'; + goto replace; + } + if (off) + path[i - off] = path[i]; + continue; + replace: + off += 3; + i += 3; + } + path[len - off] = 0; +} + +static struct mountinfo_zdtm *mountinfo_zdtm_alloc(struct mntns_zdtm *mntns) +{ + struct mountinfo_zdtm *new; + + new = xzalloc(sizeof(struct mountinfo_zdtm)); + if (new) + list_add_tail(&new->list, &mntns->mountinfo_list); + return new; +} + +static void mountinfo_zdtm_free(struct mountinfo_zdtm *mountinfo) +{ + list_del(&mountinfo->list); + xfree(mountinfo->mountpoint); + xfree(mountinfo->root); + xfree(mountinfo->fstype); + xfree(mountinfo); +} + +static void mountinfo_zdtm_free_all(struct mntns_zdtm *mntns) +{ + struct mountinfo_zdtm *mountinfo, *tmp; + + list_for_each_entry_safe(mountinfo, tmp, &mntns->mountinfo_list, list) + mountinfo_zdtm_free(mountinfo); +} + +#define BUF_SIZE 4096 +char buf[BUF_SIZE]; + +int mntns_parse_mountinfo(struct mntns_zdtm *mntns) +{ + FILE *f; + int ret; + + INIT_LIST_HEAD(&mntns->mountinfo_list); + + f = fopen("/proc/self/mountinfo", "r"); + if (!f) { + pr_perror("Failed to open mountinfo"); + return -1; + } + + while (fgets(buf, BUF_SIZE, f)) { + struct mountinfo_zdtm *new; + unsigned int kmaj, kmin; + char *str, *hyphen, *shared, *master; + int n; + + new = mountinfo_zdtm_alloc(mntns); + if (!new) { + pr_perror("Failed to alloc mountinfo_zdtm"); + goto free; + } + + ret = sscanf(buf, "%i %i %u:%u %ms %ms %*s %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, + &new->root, &new->mountpoint, &n); + if (ret != 6) { + pr_perror("Failed to parse mountinfo line \"%s\"", buf); + goto 
free; + } + cure_path(new->root); + cure_path(new->mountpoint); + new->s_dev = MKKDEV(kmaj, kmin); + + str = buf + n; + hyphen = strstr(buf, " - "); + if (!hyphen) { + pr_perror("Failed to find \" - \" in mountinfo line \"%s\"", buf); + goto free; + } + *hyphen++ = '\0'; + + shared = strstr(str, "shared:"); + if (shared) + new->shared_id = atoi(shared + 7); + master = strstr(str, "master:"); + if (master) + new->master_id = atoi(master + 7); + + ret = sscanf(hyphen, "- %ms", &new->fstype); + if (ret != 1) { + pr_perror("Failed to parse fstype in mountinfo tail \"%s\"", hyphen); + goto free; + } + } + + fclose(f); + return 0; +free: + mountinfo_zdtm_free_all(mntns); + fclose(f); + return -1; +} + +void mntns_free_all(struct mntns_zdtm *mntns) +{ + mountinfo_zdtm_free_all(mntns); +} diff --git a/test/zdtm/lib/mountinfo.h b/test/zdtm/lib/mountinfo.h new file mode 100644 index 000000000..8b7459d21 --- /dev/null +++ b/test/zdtm/lib/mountinfo.h @@ -0,0 +1,27 @@ +#ifndef __ZDTM_MOUNTINFO__ +#define __ZDTM_MOUNTINFO__ + +#include "list.h" + +struct mountinfo_zdtm { + int mnt_id; + int parent_mnt_id; + char *mountpoint; + char *root; + unsigned int s_dev; + int shared_id; + int master_id; + char *fstype; + + /* list of all mounts */ + struct list_head list; +}; + +struct mntns_zdtm { + struct list_head mountinfo_list; +}; + +extern int mntns_parse_mountinfo(struct mntns_zdtm *mntns); +extern void mntns_free_all(struct mntns_zdtm *mntns); + +#endif From ba09fad3914bb3e89df06c24a6398ab0b9bc656a Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 22 Feb 2022 16:41:44 +0300 Subject: [PATCH 114/775] zdtm: add mountinfo topology compare to test lib Now we can compare mount tree and sharing group tree topology before and after c/r with mntns_compare() helper. 
Algorithm here is: 1) build mount tree based on mnt_id and parent_mnt_id from mountinfo 2) sort mount tree children based on path comparison 3) at the same time set topology_id for mounts by DFS order and order mounts in list accordingly 4) build shared groups tree based on sharing_id and master_id 5) at the same time set topology_id for sharings as smallest topology_id of its mounts, also sharings are put in their list in order of their topology_id 6) walk sorted mounts lists for both namespaces simultaneously each pair of mounts should have matching ids and parent ids 7) walk sorted sharings lists for both namespaces simultaneously each pair of sharings should have matching ids and parent ids Signed-off-by: Pavel Tikhomirov --- test/zdtm/lib/mountinfo.c | 351 ++++++++++++++++++++++++++++++++++++++ test/zdtm/lib/mountinfo.h | 35 ++++ 2 files changed, 386 insertions(+) diff --git a/test/zdtm/lib/mountinfo.c b/test/zdtm/lib/mountinfo.c index 3e9d44581..d6ab67a3f 100644 --- a/test/zdtm/lib/mountinfo.c +++ b/test/zdtm/lib/mountinfo.c @@ -133,7 +133,358 @@ free: return -1; } +static struct mountinfo_topology *mountinfo_topology_alloc(struct mntns_zdtm *mntns, struct mountinfo_zdtm *mountinfo) +{ + struct mountinfo_topology *new; + + new = xzalloc(sizeof(struct mountinfo_topology)); + if (new) { + new->mountinfo = mountinfo; + new->topology_id = -1; + INIT_LIST_HEAD(&new->children); + INIT_LIST_HEAD(&new->siblings); + list_add_tail(&new->list, &mntns->topology_list); + INIT_LIST_HEAD(&new->sharing_list); + } + return new; +} + +static void mountinfo_topology_free(struct mountinfo_topology *topology) +{ + list_del(&topology->list); + xfree(topology); +} + +static void mountinfo_topology_free_all(struct mntns_zdtm *mntns) +{ + struct mountinfo_topology *topology, *tmp; + + list_for_each_entry_safe(topology, tmp, &mntns->topology_list, list) + mountinfo_topology_free(topology); +} + +static struct mountinfo_topology *mountinfo_topology_lookup_parent(struct mntns_zdtm
*mntns, + struct mountinfo_topology *topology) +{ + struct mountinfo_topology *parent; + + list_for_each_entry(parent, &mntns->topology_list, list) { + if (parent->mountinfo->mnt_id == topology->mountinfo->parent_mnt_id) + return parent; + } + + return NULL; +} + +static struct mountinfo_topology *mt_subtree_next(struct mountinfo_topology *mt, struct mountinfo_topology *root) +{ + if (!list_empty(&mt->children)) + return list_entry(mt->children.next, struct mountinfo_topology, siblings); + + while (mt->parent && mt != root) { + if (mt->siblings.next == &mt->parent->children) + mt = mt->parent; + else + return list_entry(mt->siblings.next, struct mountinfo_topology, siblings); + } + + return NULL; +} + +static void __mt_resort_siblings(struct mountinfo_topology *parent) +{ + LIST_HEAD(list); + + while (!list_empty(&parent->children)) { + struct mountinfo_topology *m, *p; + + m = list_first_entry(&parent->children, struct mountinfo_topology, siblings); + list_del(&m->siblings); + + list_for_each_entry(p, &list, siblings) + if (strcmp(p->mountinfo->mountpoint, m->mountinfo->mountpoint) < 0) + break; + + list_add_tail(&m->siblings, &p->siblings); + } + + list_splice(&list, &parent->children); +} + +static void mntns_mt_resort_siblings(struct mntns_zdtm *mntns) +{ + struct mountinfo_topology *mt = mntns->tree; + LIST_HEAD(mtlist); + int i = 0; + + while (1) { + /* Assign topology id to mt in dfs order */ + mt->topology_id = i++; + list_move_tail(&mt->list, &mtlist); + __mt_resort_siblings(mt); + mt = mt_subtree_next(mt, mntns->tree); + if (!mt) + break; + } + + /* Update mntns->topology_list in dfs order */ + list_splice(&mtlist, &mntns->topology_list); +} + +static struct sharing_group *sharing_group_find_or_alloc(struct mntns_zdtm *mntns, int shared_id, int master_id, + unsigned int s_dev) +{ + struct sharing_group *sg; + + list_for_each_entry(sg, &mntns->sharing_groups_list, list) { + if ((sg->shared_id == shared_id) && (sg->master_id == master_id)) { + if (sg->s_dev 
!= s_dev) { + pr_err("Sharing/devid inconsistency\n"); + return NULL; + } + return sg; + } + } + + sg = xzalloc(sizeof(struct sharing_group)); + if (!sg) + return NULL; + + sg->shared_id = shared_id; + sg->master_id = master_id; + sg->s_dev = s_dev; + sg->topology_id = -1; + + INIT_LIST_HEAD(&sg->children); + INIT_LIST_HEAD(&sg->siblings); + INIT_LIST_HEAD(&sg->mounts_list); + + list_add_tail(&sg->list, &mntns->sharing_groups_list); + + return sg; +} + +static void sharing_group_free(struct sharing_group *sg) +{ + list_del(&sg->list); + xfree(sg); +} + +static void sharing_group_free_all(struct mntns_zdtm *mntns) +{ + struct sharing_group *sg, *tmp; + + list_for_each_entry_safe(sg, tmp, &mntns->sharing_groups_list, list) + sharing_group_free(sg); +} + +static struct sharing_group *sharing_group_lookup_parent(struct mntns_zdtm *mntns, struct sharing_group *sg) +{ + struct sharing_group *parent; + + list_for_each_entry(parent, &mntns->sharing_groups_list, list) { + if (parent->shared_id == sg->master_id) + return parent; + } + + /* Create "external" sharing */ + parent = sharing_group_find_or_alloc(mntns, sg->master_id, 0, sg->s_dev); + if (parent) + return parent; + + return NULL; +} + +static int mntns_build_tree(struct mntns_zdtm *mntns) +{ + struct mountinfo_topology *topology, *parent, *tree = NULL; + struct mountinfo_zdtm *mountinfo; + struct sharing_group *sg, *sg_parent; + + INIT_LIST_HEAD(&mntns->topology_list); + + /* Prealloc mount tree */ + list_for_each_entry(mountinfo, &mntns->mountinfo_list, list) { + topology = mountinfo_topology_alloc(mntns, mountinfo); + if (!topology) + goto err; + } + + /* Build mount tree */ + list_for_each_entry(topology, &mntns->topology_list, list) { + parent = mountinfo_topology_lookup_parent(mntns, topology); + if (!parent) { + if (tree) { + pr_err("Bad mount tree with too roots %d and %d\n", tree->mountinfo->mnt_id, + parent->mountinfo->mnt_id); + goto err; + } + tree = topology; + } else { + topology->parent = parent; + 
list_add_tail(&topology->siblings, &parent->children); + } + } + mntns->tree = tree; + + /* Sort mounts by mountpoint */ + mntns_mt_resort_siblings(mntns); + + INIT_LIST_HEAD(&mntns->sharing_groups_list); + + /* Prealloc sharing groups */ + list_for_each_entry(topology, &mntns->topology_list, list) { + if (!topology->mountinfo->shared_id && !topology->mountinfo->master_id) + continue; + + /* + * Due to mntns->topology_list is sorted in dfs order + * sharing groups are also sorted the same + */ + sg = sharing_group_find_or_alloc(mntns, topology->mountinfo->shared_id, topology->mountinfo->master_id, + topology->mountinfo->s_dev); + if (!sg) + goto err; + + list_add_tail(&topology->sharing_list, &sg->mounts_list); + topology->sharing = sg; + + /* Set sharing group topology id to minimal topology id of it's mounts */ + if (sg->topology_id == -1 || topology->topology_id < sg->topology_id) + sg->topology_id = topology->topology_id; + } + + /* Build sharing group trees */ + list_for_each_entry(sg, &mntns->sharing_groups_list, list) { + if (sg->master_id) { + sg_parent = sharing_group_lookup_parent(mntns, sg); + sg->parent = sg_parent; + list_add(&sg->siblings, &sg_parent->children); + } + } + + return 0; +err: + mountinfo_topology_free_all(mntns); + sharing_group_free_all(mntns); + return -1; +} + +static int mountinfo_topology_list_compare(struct mntns_zdtm *mntns_a, struct mntns_zdtm *mntns_b) +{ + struct mountinfo_topology *topology_a, *topology_b; + + topology_a = list_first_entry(&mntns_a->topology_list, struct mountinfo_topology, list); + topology_b = list_first_entry(&mntns_b->topology_list, struct mountinfo_topology, list); + + while (&topology_a->list != &mntns_a->topology_list && &topology_b->list != &mntns_b->topology_list) { + if (topology_a->topology_id != topology_b->topology_id) { + pr_err("Mounts %d and %d have different topology id %d and %d\n", topology_a->mountinfo->mnt_id, + topology_b->mountinfo->mnt_id, topology_a->topology_id, 
topology_b->topology_id); + return -1; + } + + if (topology_a->parent && topology_b->parent) { + if (topology_a->parent->topology_id != topology_b->parent->topology_id) { + pr_err("Mounts %d and %d have different parent topology id %d and %d\n", + topology_a->mountinfo->mnt_id, topology_b->mountinfo->mnt_id, + topology_a->parent->topology_id, topology_b->parent->topology_id); + return -1; + } + } else if (topology_a->parent || topology_b->parent) { + pr_err("One of mounts %d and %d has parent and other doesn't\n", topology_a->mountinfo->mnt_id, + topology_b->mountinfo->mnt_id); + return -1; + } + + if (topology_a->sharing && topology_b->sharing) { + if (topology_a->sharing->topology_id != topology_b->sharing->topology_id) { + pr_err("Mounts %d and %d have different sharing topology id %d and %d\n", + topology_a->mountinfo->mnt_id, topology_b->mountinfo->mnt_id, + topology_a->sharing->topology_id, topology_b->sharing->topology_id); + return -1; + } + } else if (topology_a->sharing || topology_b->sharing) { + pr_err("One of mounts %d and %d has sharing and other doesn't\n", topology_a->mountinfo->mnt_id, + topology_b->mountinfo->mnt_id); + return -1; + } + + topology_a = list_entry(topology_a->list.next, struct mountinfo_topology, list); + topology_b = list_entry(topology_b->list.next, struct mountinfo_topology, list); + } + if (&topology_a->list != &mntns_a->topology_list || &topology_b->list != &mntns_b->topology_list) { + pr_err("Mount tree topology length mismatch\n"); + return -1; + } + + return 0; +} + +static int sharing_group_list_compare(struct mntns_zdtm *mntns_a, struct mntns_zdtm *mntns_b) +{ + struct sharing_group *sg_a, *sg_b; + + sg_a = list_first_entry(&mntns_a->sharing_groups_list, struct sharing_group, list); + sg_b = list_first_entry(&mntns_b->sharing_groups_list, struct sharing_group, list); + + while (&sg_a->list != &mntns_a->sharing_groups_list && &sg_b->list != &mntns_b->sharing_groups_list) { + if (sg_a->topology_id != sg_b->topology_id) { + 
pr_err("Sharings (%d,%d) and (%d,%d) have different sharing topology id %d and %d\n", + sg_a->shared_id, sg_a->master_id, sg_b->shared_id, sg_b->master_id, sg_a->topology_id, + sg_b->topology_id); + return -1; + } + + if (sg_a->parent && sg_b->parent) { + if (sg_a->parent->topology_id != sg_b->parent->topology_id) { + pr_err("Sharings (%d,%d) and (%d,%d) have different parent topology id %d and %d\n", + sg_a->shared_id, sg_a->master_id, sg_b->shared_id, sg_b->master_id, + sg_a->parent->topology_id, sg_b->parent->topology_id); + return -1; + } + } else if (sg_a->parent || sg_b->parent) { + pr_err("One of sharings (%d,%d) and (%d,%d) has parent and other doesn't\n", sg_a->shared_id, + sg_a->master_id, sg_b->shared_id, sg_b->master_id); + return -1; + } + + sg_a = list_entry(sg_a->list.next, struct sharing_group, list); + sg_b = list_entry(sg_b->list.next, struct sharing_group, list); + } + + if (&sg_a->list != &mntns_a->sharing_groups_list || &sg_b->list != &mntns_b->sharing_groups_list) { + pr_err("Mount tree sharing topology length mismatch\n"); + return -1; + } + + return 0; +} + +int mntns_compare(struct mntns_zdtm *mntns_a, struct mntns_zdtm *mntns_b) +{ + if (mntns_build_tree(mntns_a)) { + pr_err("Failed to build first mountinfo topology tree\n"); + return -1; + } + + if (mntns_build_tree(mntns_b)) { + pr_err("Failed to build second mountinfo topology tree\n"); + return -1; + } + + if (mountinfo_topology_list_compare(mntns_a, mntns_b)) + return -1; + + if (sharing_group_list_compare(mntns_a, mntns_b)) + return -1; + + return 0; +} + void mntns_free_all(struct mntns_zdtm *mntns) { mountinfo_zdtm_free_all(mntns); + mountinfo_topology_free_all(mntns); + sharing_group_free_all(mntns); } diff --git a/test/zdtm/lib/mountinfo.h b/test/zdtm/lib/mountinfo.h index 8b7459d21..b5a8f5bcd 100644 --- a/test/zdtm/lib/mountinfo.h +++ b/test/zdtm/lib/mountinfo.h @@ -19,9 +19,44 @@ struct mountinfo_zdtm { struct mntns_zdtm { struct list_head mountinfo_list; + struct list_head 
topology_list; + struct mountinfo_topology *tree; + struct list_head sharing_groups_list; +}; + +struct sharing_group { + int shared_id; + int master_id; + unsigned int s_dev; + + struct sharing_group *parent; + struct list_head children; + struct list_head siblings; + + int topology_id; + + struct list_head mounts_list; + + struct list_head list; +}; + +struct mountinfo_topology { + struct mountinfo_zdtm *mountinfo; + + struct mountinfo_topology *parent; + struct list_head children; + struct list_head siblings; + + int topology_id; + + struct sharing_group *sharing; + struct list_head sharing_list; + + struct list_head list; }; extern int mntns_parse_mountinfo(struct mntns_zdtm *mntns); extern void mntns_free_all(struct mntns_zdtm *mntns); +extern int mntns_compare(struct mntns_zdtm *mntns_a, struct mntns_zdtm *mntns_b); #endif From fb66727a25036b6d6244eb10234ea4a190fa9e1d Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 22 Feb 2022 11:43:59 +0300 Subject: [PATCH 115/775] zdtm: add mntns_compare check to mount_complex_sharing This way we can check that mount tree topology (including sharing groups) is the same before and after c/r. 
Signed-off-by: Pavel Tikhomirov --- test/zdtm/static/mount_complex_sharing.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/zdtm/static/mount_complex_sharing.c b/test/zdtm/static/mount_complex_sharing.c index b4463c41a..c6402d646 100644 --- a/test/zdtm/static/mount_complex_sharing.c +++ b/test/zdtm/static/mount_complex_sharing.c @@ -5,6 +5,7 @@ #include #include +#include "mountinfo.h" #include "zdtmtst.h" const char *test_doc = "Check complex sharing options for mounts"; @@ -211,6 +212,7 @@ static int mount_loop(void) int main(int argc, char **argv) { + struct mntns_zdtm mntns_before, mntns_after; int ret = 1; test_init(argc, argv); @@ -223,12 +225,23 @@ int main(int argc, char **argv) if (mount_loop()) goto err; + if (mntns_parse_mountinfo(&mntns_before)) + goto err; + test_daemon(); test_waitsig(); + if (mntns_parse_mountinfo(&mntns_after)) + goto err; + + if (mntns_compare(&mntns_before, &mntns_after)) + goto err; + pass(); ret = 0; err: + mntns_free_all(&mntns_before); + mntns_free_all(&mntns_after); if (ret) fail(); return ret; From 0bddecead055e3873f2d1a7fe07499fa39000a84 Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Tue, 17 Jan 2023 17:59:53 +0100 Subject: [PATCH 116/775] restorer: add logging on prctl PR_SET_MM_MAP failure This kernel feature contained some bugs initially. Those logs are useful in identifying what the underlying issue is and which kernel patch to backport.
Signed-off-by: Michal Clapinski --- criu/pie/restorer.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 99cff1f7d..efab729e8 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1813,6 +1813,24 @@ long __export_restore_task(struct task_restore_args *args) .exe_fd = args->fd_exe_link, }; ret = sys_prctl(PR_SET_MM, PR_SET_MM_MAP, (long)&prctl_map, sizeof(prctl_map), 0); + if (ret) { + pr_debug("prctl PR_SET_MM_MAP failed with %d\n", (int)ret); + pr_debug(" .start_code = %" PRIx64 "\n", prctl_map.start_code); + pr_debug(" .end_code = %" PRIx64 "\n", prctl_map.end_code); + pr_debug(" .start_data = %" PRIx64 "\n", prctl_map.start_data); + pr_debug(" .end_data = %" PRIx64 "\n", prctl_map.end_data); + pr_debug(" .start_stack = %" PRIx64 "\n", prctl_map.start_stack); + pr_debug(" .start_brk = %" PRIx64 "\n", prctl_map.start_brk); + pr_debug(" .brk = %" PRIx64 "\n", prctl_map.brk); + pr_debug(" .arg_start = %" PRIx64 "\n", prctl_map.arg_start); + pr_debug(" .arg_end = %" PRIx64 "\n", prctl_map.arg_end); + pr_debug(" .env_start = %" PRIx64 "\n", prctl_map.env_start); + pr_debug(" .env_end = %" PRIx64 "\n", prctl_map.env_end); + pr_debug(" .auxv_size = %" PRIu32 "\n", prctl_map.auxv_size); + for (i = 0; i < prctl_map.auxv_size / sizeof(uint64_t); i++) + pr_debug(" .auxv[%d] = %" PRIx64 "\n", i, prctl_map.auxv[i]); + pr_debug(" .exe_fd = %" PRIu32 "\n", prctl_map.exe_fd); + } if (ret == -EINVAL) { ret = sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE, (long)args->mm.mm_start_code, 0); ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE, (long)args->mm.mm_end_code, 0); From f73ba77269725d3e56802a7486f06bb5533aabba Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 9 Dec 2022 15:46:20 +0100 Subject: [PATCH 117/775] ci: switch from lgtm to codeql Signed-off-by: Adrian Reber --- .github/workflows/codeql.yml | 45 ++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 
.github/workflows/codeql.yml diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 000000000..2d1039a0e --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,45 @@ +name: "CodeQL" + +on: + push: + branches: [ "criu-dev", "master" ] + pull_request: + branches: [ "criu-dev" ] + schedule: + - cron: "11 6 * * 3" + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ python, cpp ] + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Install Packages (cpp) + if: ${{ matrix.language == 'cpp' }} + run: | + sudo scripts/ci/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev python3-yaml libnl-route-3-dev gnutls-dev + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + queries: +security-and-quality + + - name: Autobuild + uses: github/codeql-action/autobuild@v2 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 + with: + category: "/language:${{ matrix.language }}" From 7459d0204322a379f83f73f59336d43cc150c9dc Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 25 Jan 2023 10:57:18 +0300 Subject: [PATCH 118/775] lint: prohibit to use %m specifier in pr_* functions As our pr_* functions are complex and can call different system calls inside before actual printing (e.g. gettimeofday for timestamps) actual errno at the time of printing may be changed. 
Signed-off-by: Pavel Tikhomirov --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ba1f77440..eaa8a91e0 100644 --- a/Makefile +++ b/Makefile @@ -431,7 +431,7 @@ lint: # Do not append \n to pr_perror or fail ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>.*\\n"' # Do not use %m with pr_perror or fail - ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>.*%m' + ! git --no-pager grep -E '^\s*\<(pr_(err|perror|warn|debug|info|msg)|fail)\>.*%m' # Do not use errno with pr_perror or fail ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>\(".*".*errno' # End pr_(err|warn|msg|info|debug) with \n From 8cfda2748c7b9609b595aaf9b953167ba7756427 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 25 Jan 2023 11:01:56 +0300 Subject: [PATCH 119/775] log: remove all uses of %m specifier in pr_* functions As our pr_* functions are complex and can call different system calls inside before actual printing (e.g. gettimeofday for timestamps) actual errno at the time of printing may be changed. Let's just use %s + strerror(errno) instead of %m with pr_* functions to be explicit that errno to string transformation happens before calling anything else. Note: tcp_repair_off is called from pie with no pr_perror defined due to CR_NOGLIBC set and if I use errno variable there I get "Unexpected undefined symbol: `__errno_location'. External symbol in PIE?", so it seems there is no way to print errno there, so let's just skip it. 
Signed-off-by: Pavel Tikhomirov --- compel/include/log.h | 5 ++++- criu/fsnotify.c | 2 +- criu/include/sk-inet.h | 2 +- criu/kerndat.c | 4 ++-- criu/sk-unix.c | 7 ++++--- 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/compel/include/log.h b/compel/include/log.h index 0e33976b1..5250622c8 100644 --- a/compel/include/log.h +++ b/compel/include/log.h @@ -1,6 +1,9 @@ #ifndef COMPEL_LOG_H__ #define COMPEL_LOG_H__ +#include +#include + #include "uapi/compel/log.h" #ifndef LOG_PREFIX @@ -45,6 +48,6 @@ extern void compel_print_on_level(unsigned int loglevel, const char *format, ... #define pr_debug(fmt, ...) compel_print_on_level(COMPEL_LOG_DEBUG, LOG_PREFIX fmt, ##__VA_ARGS__) -#define pr_perror(fmt, ...) pr_err(fmt ": %m\n", ##__VA_ARGS__) +#define pr_perror(fmt, ...) pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) #endif /* COMPEL_LOG_H__ */ diff --git a/criu/fsnotify.c b/criu/fsnotify.c index 03711f0b2..8572dc2f3 100644 --- a/criu/fsnotify.c +++ b/criu/fsnotify.c @@ -183,7 +183,7 @@ static char *alloc_openable(unsigned int s_dev, unsigned long i_ino, FhEntry *f_ return path; } } else - pr_debug("\t\t\tnot openable as %s (%m)\n", __path); + pr_debug("\t\t\tnot openable as %s (%s)\n", __path, strerror(errno)); } err: diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h index 5dd2a6551..961d711ee 100644 --- a/criu/include/sk-inet.h +++ b/criu/include/sk-inet.h @@ -76,7 +76,7 @@ static inline void tcp_repair_off(int fd) ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux)); if (ret < 0) - pr_err("Failed to turn off repair mode on socket: %m\n"); + pr_err("Failed to turn off repair mode on socket\n"); } extern void tcp_locked_conn_add(struct inet_sk_info *); diff --git a/criu/kerndat.c b/criu/kerndat.c index 5b567e79f..5d99c575b 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1055,9 +1055,9 @@ static int kerndat_has_move_mount_set_group(void) exit_code = 0; out: if (umount2(tmpdir, MNT_DETACH)) - pr_warn("Fail to umount2 %s: %m\n", 
tmpdir); + pr_warn("Fail to umount2 %s: %s\n", tmpdir, strerror(errno)); if (rmdir(tmpdir)) - pr_warn("Fail to rmdir %s: %m\n", tmpdir); + pr_warn("Fail to rmdir %s: %s\n", tmpdir, strerror(errno)); return exit_code; } diff --git a/criu/sk-unix.c b/criu/sk-unix.c index 873360bfa..5c0f57523 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -221,7 +221,7 @@ int kerndat_socket_unix_file(void) } fd = ioctl(sk, SIOCUNIXFILE); if (fd < 0 && errno != ENOENT) { - pr_warn("Unable to open a socket file: %m\n"); + pr_warn("Unable to open a socket file: %s\n", strerror(errno)); kdat.sk_unix_file = false; close(sk); return 0; @@ -620,7 +620,8 @@ static int unix_resolve_name_old(int lfd, uint32_t id, struct unix_sk_desc *d, U snprintf(rpath, sizeof(rpath), ".%s", name); if (fstatat(mntns_root, rpath, &st, 0)) { if (errno != ENOENT) { - pr_warn("Can't stat socket %#" PRIx32 "(%s), skipping: %m (err %d)\n", id, rpath, errno); + pr_warn("Can't stat socket %#" PRIx32 "(%s), skipping: %s (err %d)\n", id, rpath, + strerror(errno), errno); goto skip; } @@ -669,7 +670,7 @@ static int unix_resolve_name(int lfd, uint32_t id, struct unix_sk_desc *d, UnixS fd = ioctl(lfd, SIOCUNIXFILE); if (fd < 0) { - pr_warn("Unable to get a socket file descriptor with SIOCUNIXFILE ioctl: %m\n"); + pr_warn("Unable to get a socket file descriptor with SIOCUNIXFILE ioctl: %s\n", strerror(errno)); goto fallback; } From 0a7c5fd1bd8d1e49e273b51ff39af473d6c68cbc Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 18 Jan 2023 14:15:58 +0300 Subject: [PATCH 120/775] string: use our own __strlcpy and __strlcat to remove bsd headers We see that libbsd redefines __has_include to be always true, which breaks such checks for rseq. The idea behind this patch is to remove the use of libbsd functions and always export our replacement functions.
Using __strlcat and __strlcpy everywhere in existing code: git grep --files-with-matches "strlcat" | xargs sed -i 's/strlcat/__strlcat/g' git grep --files-with-matches "strlcpy" | xargs sed -i 's/strlcpy/__strlcpy/g' Fixes: #2036 Suggested-by: Andrei Vagin Signed-off-by: Pavel Tikhomirov --- criu/apparmor.c | 8 ++++---- criu/cr-dump.c | 6 +++--- criu/cr-restore.c | 4 ++-- criu/files-reg.c | 4 ++-- criu/files.c | 2 +- criu/include/string.h | 13 ++----------- criu/log.c | 2 +- criu/net.c | 8 ++++---- criu/proc_parse.c | 4 ++-- criu/seize.c | 4 ++-- criu/string.c | 8 ++------ criu/tun.c | 8 ++++---- 12 files changed, 29 insertions(+), 42 deletions(-) diff --git a/criu/apparmor.c b/criu/apparmor.c index 67553c8f1..9de54ce40 100644 --- a/criu/apparmor.c +++ b/criu/apparmor.c @@ -108,7 +108,7 @@ static int collect_profile(char *path, int offset, char *dir, AaNamespace *ns) return -1; aa_policy__init(cur); - strlcat(path + my_offset, "name", PATH_MAX - my_offset); + __strlcat(path + my_offset, "name", PATH_MAX - my_offset); f = fopen(path, "r"); if (!f) { xfree(cur); @@ -124,7 +124,7 @@ static int collect_profile(char *path, int offset, char *dir, AaNamespace *ns) return -1; } - strlcpy(path + my_offset, "raw_data", PATH_MAX - my_offset); + __strlcpy(path + my_offset, "raw_data", PATH_MAX - my_offset); fd = open(path, O_RDONLY); if (fd < 0) { pr_perror("failed to open aa policy %s", path); @@ -520,13 +520,13 @@ static int write_aa_policy(AaNamespace *ns, char *path, int offset, char *rewrit tmp = *end; *end = 0; - strlcpy(namespace, rewrite_pos + 1, sizeof(namespace)); + __strlcpy(namespace, rewrite_pos + 1, sizeof(namespace)); *end = tmp; break; } default: - strlcpy(namespace, ns->name, sizeof(namespace)); + __strlcpy(namespace, ns->name, sizeof(namespace)); for (i = 0; i < ns->n_policies; i++) { if (strcmp(ns->policies[i]->name, rewrite_pos)) pr_warn("binary rewriting of apparmor policies not supported right now, not renaming %s to %s\n", diff --git a/criu/cr-dump.c 
b/criu/cr-dump.c index 63eb627fc..30713f96b 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -429,7 +429,7 @@ static int dump_filemap(struct vma_area *vma_area, int fd) if (vma_area->aufs_rpath) { struct fd_link aufs_link; - strlcpy(aufs_link.name, vma_area->aufs_rpath, sizeof(aufs_link.name)); + __strlcpy(aufs_link.name, vma_area->aufs_rpath, sizeof(aufs_link.name)); aufs_link.len = strlen(aufs_link.name); p.link = &aufs_link; } @@ -774,7 +774,7 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item if (ret < 0) goto err; - strlcpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN); + __strlcpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN); core->tc->flags = stat->flags; core->tc->task_state = item->pid->state; core->tc->exit_code = 0; @@ -919,7 +919,7 @@ static int dump_one_zombie(const struct pstree_item *item, const struct proc_pid if (!core) return -1; - strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN); + __strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN); core->tc->task_state = TASK_DEAD; core->tc->exit_code = pps->exit_code; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 974202f16..2f9bab414 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -3395,7 +3395,7 @@ static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo args = rst_mem_remap_ptr(this_pos, RM_PRIVATE); args->lsm_profile = lsm_profile; - strlcpy(args->lsm_profile, rendered, lsm_profile_len + 1); + __strlcpy(args->lsm_profile, rendered, lsm_profile_len + 1); xfree(rendered); } } else { @@ -3429,7 +3429,7 @@ static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo args = rst_mem_remap_ptr(this_pos, RM_PRIVATE); args->lsm_sockcreate = lsm_sockcreate; - strlcpy(args->lsm_sockcreate, rendered, lsm_sockcreate_len + 1); + __strlcpy(args->lsm_sockcreate, rendered, lsm_sockcreate_len + 1); xfree(rendered); } } else { diff --git a/criu/files-reg.c b/criu/files-reg.c index 
13e114cea..1bbfe44ba 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -507,7 +507,7 @@ static int nomntns_create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, stru if (ghost_apply_metadata(path, gfe)) return -1; - strlcpy(gf->remap.rpath, path + 1, PATH_MAX); + __strlcpy(gf->remap.rpath, path + 1, PATH_MAX); pr_debug("Remap rpath is %s\n", gf->remap.rpath); return 0; } @@ -638,7 +638,7 @@ static int open_remap_ghost(struct reg_file_info *rfi, RemapFilePathEntry *rpe) gf->remap.rmnt_id = rfi->rfe->mnt_id; if (S_ISDIR(gfe->mode)) - strlcpy(gf->remap.rpath, rfi->path, PATH_MAX); + __strlcpy(gf->remap.rpath, rfi->path, PATH_MAX); else ghost_path(gf->remap.rpath, PATH_MAX, rfi, rpe); diff --git a/criu/files.c b/criu/files.c index 38dc076d2..3b653e24b 100644 --- a/criu/files.c +++ b/criu/files.c @@ -302,7 +302,7 @@ static int fixup_overlayfs(struct fd_parms *p, struct fd_link *link) char buf[PATH_MAX]; int n; - strlcpy(buf, link->name, PATH_MAX); + __strlcpy(buf, link->name, PATH_MAX); n = snprintf(link->name, PATH_MAX, "%s/%s", m->ns_mountpoint, buf + 2); if (n >= PATH_MAX) { pr_err("Not enough space to replace %s\n", buf); diff --git a/criu/include/string.h b/criu/include/string.h index e11a42058..4c71d961c 100644 --- a/criu/include/string.h +++ b/criu/include/string.h @@ -3,18 +3,9 @@ #include -#ifdef CONFIG_HAS_LIBBSD -#include -#endif - #include "common/config.h" -#ifndef CONFIG_HAS_STRLCPY -extern size_t strlcpy(char *dest, const char *src, size_t size); -#endif - -#ifndef CONFIG_HAS_STRLCAT -extern size_t strlcat(char *dest, const char *src, size_t count); -#endif +extern size_t __strlcpy(char *dest, const char *src, size_t size); +extern size_t __strlcat(char *dest, const char *src, size_t count); #endif /* __CR_STRING_H__ */ diff --git a/criu/log.c b/criu/log.c index e31f24e39..47419313b 100644 --- a/criu/log.c +++ b/criu/log.c @@ -133,7 +133,7 @@ static void log_note_err(char *msg) */ mutex_lock(&first_err->l); if (first_err->s[0] == '\0') - 
strlcpy(first_err->s, msg, sizeof(first_err->s)); + __strlcpy(first_err->s, msg, sizeof(first_err->s)); mutex_unlock(&first_err->l); } } diff --git a/criu/net.c b/criu/net.c index f29a166f8..755a48377 100644 --- a/criu/net.c +++ b/criu/net.c @@ -1398,7 +1398,7 @@ static int move_veth(const char *netdev, struct ns_id *ns, struct net_link *link len_val = strlen(netdev); if (len_val >= IFNAMSIZ) return -1; - strlcpy(mvreq.ifnam, netdev, IFNAMSIZ); + __strlcpy(mvreq.ifnam, netdev, IFNAMSIZ); ret = userns_call(move_veth_cb, 0, &mvreq, sizeof(mvreq), ns->net.ns_fd); if (ret < 0) @@ -1528,7 +1528,7 @@ static int changeflags(int s, char *name, short flags) { struct ifreq ifr; - strlcpy(ifr.ifr_name, name, IFNAMSIZ); + __strlcpy(ifr.ifr_name, name, IFNAMSIZ); ifr.ifr_flags = flags; if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) { @@ -3483,7 +3483,7 @@ static int move_to_bridge(struct external *ext, void *arg) ret = -1; goto out; } - strlcpy(ifr.ifr_name, br, IFNAMSIZ); + __strlcpy(ifr.ifr_name, br, IFNAMSIZ); ret = ioctl(s, SIOCBRADDIF, &ifr); if (ret < 0) { pr_perror("Can't add interface %s to bridge %s", out, br); @@ -3495,7 +3495,7 @@ static int move_to_bridge(struct external *ext, void *arg) * $ ip link set dev up */ ifr.ifr_ifindex = 0; - strlcpy(ifr.ifr_name, out, IFNAMSIZ); + __strlcpy(ifr.ifr_name, out, IFNAMSIZ); ret = ioctl(s, SIOCGIFFLAGS, &ifr); if (ret < 0) { pr_perror("Can't get flags of interface %s", out); diff --git a/criu/proc_parse.c b/criu/proc_parse.c index abac5908b..a5f749d45 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -315,7 +315,7 @@ static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct if (is_memfd(vfi_dev)) { char tmp[PATH_MAX]; - strlcpy(tmp, fname, PATH_MAX); + __strlcpy(tmp, fname, PATH_MAX); strip_deleted(tmp, strlen(tmp)); /* @@ -890,7 +890,7 @@ int parse_pid_stat(pid_t pid, struct proc_pid_stat *s) *tok = '\0'; *p = '\0'; - strlcpy(s->comm, tok + 1, sizeof(s->comm)); + __strlcpy(s->comm, tok + 1, 
sizeof(s->comm)); n = sscanf(p + 1, " %c %d %d %d %d %d %u %lu %lu %lu %lu " diff --git a/criu/seize.c b/criu/seize.c index f2af12a0b..f8e3278ea 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -146,12 +146,12 @@ static int freezer_write_state(int fd, enum freezer_state new_state) if (new_state == THAWED) { if (cgroup_v2) state[0] = '0'; - else if (strlcpy(state, thawed, sizeof(state)) >= sizeof(state)) + else if (__strlcpy(state, thawed, sizeof(state)) >= sizeof(state)) return -1; } else if (new_state == FROZEN) { if (cgroup_v2) state[0] = '1'; - else if (strlcpy(state, frozen, sizeof(state)) >= sizeof(state)) + else if (__strlcpy(state, frozen, sizeof(state)) >= sizeof(state)) return -1; } else { return -1; diff --git a/criu/string.c b/criu/string.c index 7df0b3e09..7edd35363 100644 --- a/criu/string.c +++ b/criu/string.c @@ -6,7 +6,6 @@ #include "string.h" -#ifndef CONFIG_HAS_STRLCPY /** * strlcpy - Copy a %NUL terminated string into a sized buffer * @dest: Where to copy the string to @@ -18,7 +17,7 @@ * of course, the buffer size is zero). It does not pad * out the result like strncpy() does. */ -size_t strlcpy(char *dest, const char *src, size_t size) +size_t __strlcpy(char *dest, const char *src, size_t size) { size_t ret = strlen(src); @@ -29,16 +28,14 @@ size_t strlcpy(char *dest, const char *src, size_t size) } return ret; } -#endif -#ifndef CONFIG_HAS_STRLCAT /** * strlcat - Append a length-limited, %NUL-terminated string to another * @dest: The string to be appended to * @src: The string to append to it * @count: The size of the destination buffer. 
*/ -size_t strlcat(char *dest, const char *src, size_t count) +size_t __strlcat(char *dest, const char *src, size_t count) { size_t dsize = strlen(dest); size_t len = strlen(src); @@ -57,4 +54,3 @@ size_t strlcat(char *dest, const char *src, size_t count) dest[len] = 0; return res; } -#endif diff --git a/criu/tun.c b/criu/tun.c index 573137091..2e2cc32bf 100644 --- a/criu/tun.c +++ b/criu/tun.c @@ -121,7 +121,7 @@ static int list_tun_link(NetDeviceEntry *nde, unsigned ns_id) if (!tl) return -1; - strlcpy(tl->name, nde->name, sizeof(tl->name)); + __strlcpy(tl->name, nde->name, sizeof(tl->name)); /* * Keep tun-flags not only for persistency fixup (see * comment below), but also for TUNSETIFF -- we must @@ -153,7 +153,7 @@ static struct tun_link *__dump_tun_link_fd(int fd, char *name, unsigned ns_id, u tl = xmalloc(sizeof(*tl)); if (!tl) goto err; - strlcpy(tl->name, name, sizeof(tl->name)); + __strlcpy(tl->name, name, sizeof(tl->name)); tl->ns_id = ns_id; INIT_LIST_HEAD(&tl->l); @@ -241,7 +241,7 @@ static int open_tun_dev(char *name, unsigned int idx, unsigned flags) } memset(&ifr, 0, sizeof(ifr)); - strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); + __strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); ifr.ifr_flags = flags; if (ioctl(fd, TUNSETIFF, &ifr)) { @@ -393,7 +393,7 @@ static int tunfile_open(struct file_desc *d, int *new_fd) } memset(&ifr, 0, sizeof(ifr)); - strlcpy(ifr.ifr_name, tl->name, sizeof(ifr.ifr_name)); + __strlcpy(ifr.ifr_name, tl->name, sizeof(ifr.ifr_name)); ifr.ifr_flags = tl->rst.flags; if (ioctl(fd, TUNSETIFF, &ifr) < 0) { From a92dfb61ffa9a4fd43220595feb4724b59834b11 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 18 Jan 2023 14:41:32 +0300 Subject: [PATCH 121/775] string: define wrappers __setproctitle and __setproctitle_init to hide bsd headers We see that libbsd redefines __has_include to be always true, which breaks such checks for rseq.
The idea behind this patch is to put all uses of libbsd functions to separate c files and only export wrapper functions for them. Using __setproctitle and __setproctitle_init everywhere in existing code: git grep --files-with-matches "setproctitle" | xargs sed -i 's/setproctitle/__setproctitle/g' git grep --files-with-matches "setproctitle_init" | xargs sed -i 's/setproctitle_init/__setproctitle_init/g' Fixes: #2036 Suggested-by: Andrei Vagin Signed-off-by: Pavel Tikhomirov --- criu/Makefile.crtools | 1 + criu/cr-service.c | 16 +++++++------- criu/crtools.c | 2 +- criu/include/setproctitle.h | 16 ++------------ criu/setproctitle.c | 42 +++++++++++++++++++++++++++++++++++++ 5 files changed, 54 insertions(+), 23 deletions(-) create mode 100644 criu/setproctitle.c diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index 22108cce0..f58644917 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -74,6 +74,7 @@ obj-y += sk-unix.o obj-y += sockets.o obj-y += stats.o obj-y += string.o +obj-y += setproctitle.o obj-y += sysctl.o obj-y += sysfs_parse.o obj-y += timerfd.o diff --git a/criu/cr-service.c b/criu/cr-service.c index 73c48f5a6..314c309be 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -752,7 +752,7 @@ static int dump_using_req(int sk, CriuOpts *req) if (setup_opts_from_req(sk, req)) goto exit; - setproctitle("dump --rpc -t %d -D %s", req->pid, images_dir); + __setproctitle("dump --rpc -t %d -D %s", req->pid, images_dir); if (init_pidfd_store_hash()) goto pidfd_store_err; @@ -795,7 +795,7 @@ static int restore_using_req(int sk, CriuOpts *req) if (setup_opts_from_req(sk, req)) goto exit; - setproctitle("restore --rpc -D %s", images_dir); + __setproctitle("restore --rpc -D %s", images_dir); if (cr_restore_tasks()) goto exit; @@ -841,7 +841,7 @@ static int check(int sk, CriuOpts *req) } if (pid == 0) { - setproctitle("check --rpc"); + __setproctitle("check --rpc"); opts.mode = CR_CHECK; if (setup_opts_from_req(sk, req)) @@ -879,7 +879,7 @@ 
static int pre_dump_using_req(int sk, CriuOpts *req, bool single) if (setup_opts_from_req(sk, req)) goto cout; - setproctitle("pre-dump --rpc -t %d -D %s", req->pid, images_dir); + __setproctitle("pre-dump --rpc -t %d -D %s", req->pid, images_dir); if (init_pidfd_store_hash()) goto pidfd_store_err; @@ -957,7 +957,7 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) if (setup_opts_from_req(sk, req)) goto out_ch; - setproctitle("page-server --rpc --address %s --port %hu", opts.addr, opts.port); + __setproctitle("page-server --rpc --address %s --port %hu", opts.addr, opts.port); pr_debug("Starting page server\n"); @@ -1117,7 +1117,7 @@ static int handle_feature_check(int sk, CriuReq *msg) if (kerndat_init()) exit(1); - setproctitle("feature-check --rpc"); + __setproctitle("feature-check --rpc"); if ((msg->features->has_mem_track == 1) && (msg->features->mem_track == true)) feat.mem_track = kdat.has_dirty_track; @@ -1204,8 +1204,8 @@ static int handle_cpuinfo(int sk, CriuReq *msg) if (setup_opts_from_req(sk, msg->opts)) goto cout; - setproctitle("cpuinfo %s --rpc -D %s", msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ? "dump" : "check", - images_dir); + __setproctitle("cpuinfo %s --rpc -D %s", msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ? 
"dump" : "check", + images_dir); if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ret = cpuinfo_dump(); diff --git a/criu/crtools.c b/criu/crtools.c index 4258ce388..ca9361977 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -127,7 +127,7 @@ int main(int argc, char *argv[], char *envp[]) } cr_pb_init(); - setproctitle_init(argc, argv, envp); + __setproctitle_init(argc, argv, envp); if (argc < 2) goto usage; diff --git a/criu/include/setproctitle.h b/criu/include/setproctitle.h index bc634331b..a4873578a 100644 --- a/criu/include/setproctitle.h +++ b/criu/include/setproctitle.h @@ -1,19 +1,7 @@ #ifndef __CR_SETPROCTITLE_H__ #define __CR_SETPROCTITLE_H__ -#ifdef CONFIG_HAS_LIBBSD -#include -#else - -/* - * setproctitle_init is in the libbsd since v0.6.0. This macro allows to - * compile criu with libbsd<0.6.0. - */ -#ifndef CONFIG_HAS_SETPROCTITLE_INIT -#define setproctitle_init(argc, argv, envp) -#endif - -#define setproctitle(fmt, ...) -#endif +extern void __setproctitle_init(int argc, char *argv[], char *envp[]); +extern void __setproctitle(const char *fmt, ...); #endif /* __CR_SETPROCTITLE_H__ */ diff --git a/criu/setproctitle.c b/criu/setproctitle.c new file mode 100644 index 000000000..9e01678d2 --- /dev/null +++ b/criu/setproctitle.c @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +#ifdef CONFIG_HAS_LIBBSD +#include +#else + +#include "setproctitle.h" + +/* + * setproctitle_init is in the libbsd since v0.6.0. This macro allows to + * compile criu with libbsd<0.6.0. + */ +#ifndef CONFIG_HAS_SETPROCTITLE_INIT +#define setproctitle_init(argc, argv, envp) +#endif + +#define setproctitle(fmt, ...) +#endif + +void __setproctitle_init(int argc, char *argv[], char *envp[]) +{ + setproctitle_init(argc, argv, envp); +} + +#ifndef SPT_MAXTITLE +#define SPT_MAXTITLE 255 +#endif + +void __setproctitle(const char *fmt, ...) 
+{ + char buf[SPT_MAXTITLE + 1]; + va_list args; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + setproctitle("%s", buf); +} From 951c56917abbd947e9ed8ac738700b723179c435 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Mon, 12 Dec 2022 09:29:26 -0800 Subject: [PATCH 122/775] proc_parse: Set VMA_AREA_REGULAR where needed This patch sets VMA_AREA_REGULAR on hugetlb and anon shmem VMAs since they can be handled the same way as other kinds of regular memory. Co-authored-by: Ivanq Signed-off-by: Younes Manton --- criu/proc_parse.c | 1 + 1 file changed, 1 insertion(+) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index a5f749d45..345de2592 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -332,6 +332,7 @@ static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct } if (is_hugetlb_dev(vfi_dev, &hugetlb_flag) || is_anon_shmem_map(vfi_dev)) { + vma->e->status |= VMA_AREA_REGULAR; if (!(vma->e->flags & MAP_SHARED)) vma->e->status |= VMA_ANON_PRIVATE; else From 95e590b51291b415ff2e0df847e12c7b903e58f2 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Mon, 12 Dec 2022 10:49:32 -0800 Subject: [PATCH 123/775] shmem: Fall back to /proc/$pid/mem if no map_files If trying to open /proc/$pid/map_files/x-x for a given VMA fails with EPERM (can happen in unprivileged mode when running in a non-init user ns), fall back to reading the content from /proc/$pid/mem. 
Co-authored-by: Ivanq Signed-off-by: Younes Manton --- criu/shmem.c | 64 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 12 deletions(-) diff --git a/criu/shmem.c b/criu/shmem.c index 81e701586..b92076a05 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -724,7 +724,7 @@ static int next_data_segment(int fd, unsigned long pfn, unsigned long *next_data return 0; } -static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si) +static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si, bool seek_data_supported) { struct page_pipe *pp; struct page_xfer xfer; @@ -750,7 +750,8 @@ static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si) unsigned long pgaddr; int st = -1; - if (pfn >= next_hole_pfn && next_data_segment(fd, pfn, &next_data_pnf, &next_hole_pfn)) + if (seek_data_supported && pfn >= next_hole_pfn && + next_data_segment(fd, pfn, &next_data_pnf, &next_hole_pfn)) goto err_xfer; if (si->pstate_map && is_shmem_tracking_en()) { @@ -808,20 +809,59 @@ static int dump_one_shmem(struct shmem_info *si) { int fd, ret = -1; void *addr; + unsigned long cur, remaining; + bool seek_data_supported; pr_info("Dumping shared memory %ld\n", si->shmid); - fd = open_proc(si->pid, "map_files/%lx-%lx", si->start, si->end); - if (fd < 0) - goto err; + fd = __open_proc(si->pid, EPERM, O_RDONLY, "map_files/%lx-%lx", si->start, si->end); + if (fd >= 0) { + addr = mmap(NULL, si->size, PROT_READ, MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + pr_err("Can't map shmem 0x%lx (0x%lx-0x%lx)\n", si->shmid, si->start, si->end); + goto errc; + } - addr = mmap(NULL, si->size, PROT_READ, MAP_SHARED, fd, 0); - if (addr == MAP_FAILED) { - pr_err("Can't map shmem 0x%lx (0x%lx-0x%lx)\n", si->shmid, si->start, si->end); - goto errc; + seek_data_supported = true; + } else { + if (errno != EPERM || !opts.unprivileged) { + goto err; + } + + pr_debug("Could not access map_files/ link, falling back to /proc/$pid/mem\n"); + + fd = 
open_proc(si->pid, "mem"); + if (fd < 0) { + goto err; + } + + addr = mmap(NULL, si->size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (addr == MAP_FAILED) { + pr_err("Can't map empty space for shmem 0x%lx (0x%lx-0x%lx)\n", si->shmid, si->start, si->end); + goto errc; + } + + if (lseek(fd, si->start, SEEK_SET) < 0) { + pr_perror("Can't seek virtual memory"); + goto errc; + } + + cur = 0; + remaining = si->size; + do { + ret = read(fd, addr + cur, remaining); + if (ret <= 0) { + pr_perror("Can't read virtual memory"); + goto errc; + } + remaining -= ret; + cur += ret; + } while (remaining > 0); + + seek_data_supported = false; } - ret = do_dump_one_shmem(fd, addr, si); + ret = do_dump_one_shmem(fd, addr, si, seek_data_supported); munmap(addr, si->size); errc: @@ -849,7 +889,7 @@ int dump_one_memfd_shmem(int fd, unsigned long shmid, unsigned long size) goto err; } - ret = do_dump_one_shmem(fd, addr, &si); + ret = do_dump_one_shmem(fd, addr, &si, true); munmap(addr, size); err: @@ -875,7 +915,7 @@ int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid) if (fd < 0) return -1; - ret = do_dump_one_shmem(fd, addr, si); + ret = do_dump_one_shmem(fd, addr, si, true); close(fd); return ret; } From 80528dbf72b85c1cb20f774dbab7969abccaed48 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Mon, 12 Dec 2022 10:53:22 -0800 Subject: [PATCH 124/775] proc_parse: Don't bail out on is_memfd() VMAs Co-authored-by: Ivanq Signed-off-by: Younes Manton --- criu/proc_parse.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 345de2592..5ba3beb63 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -313,24 +313,6 @@ static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct vfi_dev = makedev(vfi->dev_maj, vfi->dev_min); - if (is_memfd(vfi_dev)) { - char tmp[PATH_MAX]; - __strlcpy(tmp, fname, PATH_MAX); - strip_deleted(tmp, strlen(tmp)); - - /* - * The error 
EPERM will be shown in the following pr_perror(). - * It comes from the previous open() call. - */ - pr_perror("Can't open mapped [%s]", tmp); - - /* - * TODO Perhaps we could do better than failing and dump the - * memory like what is being done in shmem.c - */ - return -1; - } - if (is_hugetlb_dev(vfi_dev, &hugetlb_flag) || is_anon_shmem_map(vfi_dev)) { vma->e->status |= VMA_AREA_REGULAR; if (!(vma->e->flags & MAP_SHARED)) From cec43025ac8475255668b8d5c4eca79765204447 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Fri, 16 Dec 2022 08:55:55 -0800 Subject: [PATCH 125/775] criu(8): Add info about unprivileged mode limitations Signed-off-by: Younes Manton --- Documentation/criu.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 3b68f16a4..294127050 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -906,6 +906,16 @@ Currently *criu* can benefit from the following additional capabilities: - *CAP_SETUID* - *CAP_SYS_RESOURCE* +Note that for some operations, having a capability in a namespace other than +the init namespace (i.e. the default/root namespace) is not sufficient. For +example, in order to read symlinks in proc/[pid]/map_files CRIU requires +CAP_CHECKPOINT_RESTORE in the init namespace; having CAP_CHECKPOINT_RESTORE +while running in another user namespace (e.g. in a container) does not allow +CRIU to read symlinks in /proc/[pid]/map_files. + +Without access to /proc/[pid]/map_files checkpointing/restoring processes +that have mapped deleted files may not be possible. + Independent of the capabilities it is always necessary to use "*--unprivileged*" to accept *criu*'s limitation in non-root mode. 
From 3837d31b5beb416603677af7465badede4773bca Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 26 Jan 2023 15:12:28 +0300 Subject: [PATCH 126/775] ci/lint: make push action have at least two commits depth We see that when lint is called for push action git has only one last commit which makes make indent with git-clang-format fail to operate. Fix it by increasing fetch depth to one more commit. Fixes: #2066 Fixes: d6db3333a ("clang-format: rework make indent to check specific commits") Signed-off-by: Pavel Tikhomirov --- .github/workflows/lint.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 0194c9393..a501af30e 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -23,6 +23,7 @@ jobs: - name: Run make indent run: > if [ -z "${{github.base_ref}}" ]; then + git fetch --deepen=1 && if ! make indent OPTS=--diff; then exit 1 fi From 757a2b46ce2780a9f73444e15c7a89e64b0e4465 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Thu, 26 Jan 2023 00:20:33 +0100 Subject: [PATCH 127/775] remap: Fix typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes: 237bd26982a1 ("remap: Rename global lock", 2017-05-18) Signed-off-by: Michał Mirosław --- criu/files-reg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index 1bbfe44ba..6f05b272c 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -86,7 +86,7 @@ static LIST_HEAD(ghost_files); /* * When opening remaps we first create a link on the remap * target, then open one, then unlink. In case the remap - * source has more than one instance, these tree steps + * source has more than one instance, these three steps * should be serialized with each other.
*/ static mutex_t *remap_open_lock; From 43fa4e76d2a1d4ade89c3751ac38b7d78f0b626f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Thu, 26 Jan 2023 01:19:57 +0100 Subject: [PATCH 128/775] remap: refactor goto jumping to a while loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the code a bit more readable by uncovering a while loop from an if() goto sequence. Signed-off-by: Michał Mirosław --- criu/files-reg.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index 6f05b272c..ed8b9c889 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -1138,7 +1138,6 @@ static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_i RegFileEntry rfe = REG_FILE_ENTRY__INIT; FownEntry fwn = FOWN_ENTRY__INIT; int mntns_root; - int ret; const struct stat *ost = &parms->stat; if (!opts.link_remap_ok) { @@ -1177,19 +1176,18 @@ static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_i mntns_root = mntns_get_root_fd(nsid); -again: - ret = linkat_hard(lfd, "", mntns_root, link_name, ost->st_uid, ost->st_gid, AT_EMPTY_PATH); - if (ret < 0 && errno == ENOENT) { + while (linkat_hard(lfd, "", mntns_root, link_name, ost->st_uid, ost->st_gid, AT_EMPTY_PATH) < 0) { + if (errno != ENOENT) { + pr_perror("Can't link remap to %s", path); + return -1; + } + /* Use grand parent, if parent directory does not exist.
*/ if (trim_last_parent(link_name) < 0) { pr_err("trim failed: @%s@\n", link_name); check_overlayfs_fallback(path, parms, fallback); return -1; } - goto again; - } else if (ret < 0) { - pr_perror("Can't link remap to %s", path); - return -1; } if (note_link_remap(link_name, nsid)) From 63159c14c0e91b884e4d1c3f29a03e6f320b179b Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Fri, 13 Jan 2023 19:38:07 +0300 Subject: [PATCH 129/775] mount: simplify code around mount_cr_time_mount Checking errno in outer function is really strange, also saving errno of mount syscall after calling pr_perror is completely wrong. So let's try to simplify things. Signed-off-by: Pavel Tikhomirov --- criu/mount.c | 61 ++++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/criu/mount.c b/criu/mount.c index 115e3d067..3369fea34 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -1723,38 +1723,49 @@ err: return NULL; } -/* Returns 1 in case of success, -errno in case of mount fail, and 0 on other errors */ +/* + * Returns: + * 0 - success + * -1 - error + * 1 - skip + */ static __maybe_unused int mount_cr_time_mount(struct ns_id *ns, unsigned int *s_dev, const char *source, const char *target, const char *type) { - int mnt_fd, cwd_fd, ret, exit_code = 0; + int mnt_fd, cwd_fd, exit_code = -1; struct stat st; - ret = switch_mnt_ns(ns->ns_pid, &mnt_fd, &cwd_fd); - if (ret < 0) { + if (switch_mnt_ns(ns->ns_pid, &mnt_fd, &cwd_fd)) { pr_err("Can't switch mnt_ns\n"); - goto out; + return -1; } - ret = mount(source, target, type, 0, NULL); - if (ret < 0) { - pr_perror("Unable to mount %s %s", source, target); - exit_code = -errno; - goto restore_ns; - } else { - if (stat(target, &st) < 0) { - pr_perror("Can't stat %s", target); - exit_code = 0; - } else { - *s_dev = MKKDEV(major(st.st_dev), minor(st.st_dev)); + if (mount(source, target, type, 0, NULL)) { + switch (errno) { + case EPERM: + case EBUSY: + case ENODEV: + case ENOENT: + 
pr_debug("Skipping %s as was unable to mount it: %s\n", type, strerror(errno)); exit_code = 1; + break; + default: + pr_perror("Unable to mount %s %s %s", type, source, target); } + goto restore_ns; } + if (stat(target, &st)) { + pr_perror("Can't stat %s", target); + goto restore_ns; + } + + *s_dev = MKKDEV(major(st.st_dev), minor(st.st_dev)); + exit_code = 0; restore_ns: - ret = restore_mnt_ns(mnt_fd, &cwd_fd); -out: - return ret < 0 ? 0 : exit_code; + if (restore_mnt_ns(mnt_fd, &cwd_fd)) + exit_code = -1; + return exit_code; } static int dump_one_fs(struct mount_info *mi) @@ -3978,16 +3989,10 @@ int collect_mnt_namespaces(bool for_dump) if (ns) { ret = mount_cr_time_mount(ns, &s_dev, "binfmt_misc", "/" BINFMT_MISC_HOME, "binfmt_misc"); - if (ret == -EPERM) - pr_info("Can't mount binfmt_misc: EPERM. Running in user_ns?\n"); - else if (ret < 0 && ret != -EBUSY && ret != -ENODEV && ret != -ENOENT) { - pr_err("Can't mount binfmt_misc: %d %s\n", ret, strerror(-ret)); + if (ret == -1) { goto err; - } else if (ret == 0) { - ret = -1; - goto err; - } else if (ret > 0 && !add_cr_time_mount(ns->mnt.mntinfo_tree, "binfmt_misc", BINFMT_MISC_HOME, - s_dev, false)) { + } else if (ret == 0 && !add_cr_time_mount(ns->mnt.mntinfo_tree, "binfmt_misc", BINFMT_MISC_HOME, + s_dev, false)) { ret = -1; goto err; } From fcdb753ed574700c0632afd5c6beb2baf81d77bf Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 16 Jan 2023 19:13:44 +0300 Subject: [PATCH 130/775] namespaces: cleanup switch_mnt_ns and restore_mnt_ns Simplify code a bit: make exit codes of those functions more transparent, rename ret to exit_code. 
Signed-off-by: Pavel Tikhomirov --- criu/namespaces.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/criu/namespaces.c b/criu/namespaces.c index 0dc19d5b6..b1b5303fa 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -284,7 +284,6 @@ int restore_ns(int rst, struct ns_desc *nd) int switch_mnt_ns(int pid, int *rst, int *cwd_fd) { - int ret; int fd; if (!cwd_fd) @@ -293,13 +292,12 @@ int switch_mnt_ns(int pid, int *rst, int *cwd_fd) fd = open(".", O_PATH); if (fd < 0) { pr_perror("unable to open current directory"); - return fd; + return -1; } - ret = switch_ns(pid, &mnt_ns_desc, rst); - if (ret < 0) { + if (switch_ns(pid, &mnt_ns_desc, rst)) { close(fd); - return ret; + return -1; } *cwd_fd = fd; @@ -308,23 +306,22 @@ int switch_mnt_ns(int pid, int *rst, int *cwd_fd) int restore_mnt_ns(int rst, int *cwd_fd) { - int ret = -1; + int exit_code = -1; - ret = restore_ns(rst, &mnt_ns_desc); - if (ret < 0) + if (restore_ns(rst, &mnt_ns_desc)) goto err_restore; - if (cwd_fd) { - ret = fchdir(*cwd_fd); - if (ret) - pr_perror("unable to restore current directory"); + if (cwd_fd && fchdir(*cwd_fd)) { + pr_perror("Unable to restore current directory"); + goto err_restore; } + exit_code = 0; err_restore: if (cwd_fd) close_safe(cwd_fd); - return ret; + return exit_code; } struct ns_id *ns_ids = NULL; From 7280e96a794f4d2ed610ac46c3b7b08a4b8523c3 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 9 Jan 2023 12:13:46 +0300 Subject: [PATCH 131/775] clang-format: use IndentGotoLabels to get rid of goto label indentation This is done to follow 'Linux kernel coding style', same change was added to .clang-format in linux kernel source recently: https://github.com/torvalds/linux/commit/d7f6604341c74 We don't change it in current code base but let's follow it in all future uses. 
Signed-off-by: Pavel Tikhomirov --- .clang-format | 1 + scripts/fetch-clang-format.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/.clang-format b/.clang-format index b64ade9d9..475638015 100644 --- a/.clang-format +++ b/.clang-format @@ -516,6 +516,7 @@ IncludeCategories: Priority: 1 IncludeIsMainRegex: '(Test)?$' IndentCaseLabels: false +IndentGotoLabels: false IndentPPDirectives: None # Unknown to clang-format-5.0 IndentWidth: 8 IndentWrappedFunctionNames: false diff --git a/scripts/fetch-clang-format.sh b/scripts/fetch-clang-format.sh index b93a804a1..b80175f05 100755 --- a/scripts/fetch-clang-format.sh +++ b/scripts/fetch-clang-format.sh @@ -14,4 +14,5 @@ curl -s "${URL}" | sed -e " s,ForEachMacros:,ForEachMacros:\n - 'for_each_pstree_item',g; s,\(AlignTrailingComments:.*\)$,\1\nAlignConsecutiveMacros: true,g; s,AlignTrailingComments: false,AlignTrailingComments: true,g; + s,\(IndentCaseLabels: false\),\1\nIndentGotoLabels: false,g; " > .clang-format From 6d7c0d007ef2ab12d175cdb1288803c0e90f3697 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 9 Feb 2023 10:32:38 +0000 Subject: [PATCH 132/775] compel/mips: fix parasite with GCC 12 This patch applies the '-ffreestanding' flag that was introduced with https://github.com/checkpoint-restore/criu/pull/1726 to MIPS. 
Fixes: #1725 Signed-off-by: Radostin Stoyanov --- compel/src/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/compel/src/main.c b/compel/src/main.c index 632354582..ef05a46d0 100644 --- a/compel/src/main.c +++ b/compel/src/main.c @@ -56,6 +56,7 @@ static const flags_t flags = { .cflags = COMPEL_CFLAGS_PIE, #elif defined CONFIG_MIPS .arch = "mips", + .cflags = COMPEL_CFLAGS_PIE, #else #error "CONFIG_ not defined, or unsupported ARCH" #endif From 144b467a05113e2475a5ac39c711267d83931fa1 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Thu, 2 Feb 2023 09:00:40 -0800 Subject: [PATCH 133/775] shmem: pr_err -> pr_perror Signed-off-by: Younes Manton --- criu/shmem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/shmem.c b/criu/shmem.c index b92076a05..8d539d4d0 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -818,7 +818,7 @@ static int dump_one_shmem(struct shmem_info *si) if (fd >= 0) { addr = mmap(NULL, si->size, PROT_READ, MAP_SHARED, fd, 0); if (addr == MAP_FAILED) { - pr_err("Can't map shmem 0x%lx (0x%lx-0x%lx)\n", si->shmid, si->start, si->end); + pr_perror("Can't map shmem 0x%lx (0x%lx-0x%lx)", si->shmid, si->start, si->end); goto errc; } @@ -837,7 +837,7 @@ static int dump_one_shmem(struct shmem_info *si) addr = mmap(NULL, si->size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (addr == MAP_FAILED) { - pr_err("Can't map empty space for shmem 0x%lx (0x%lx-0x%lx)\n", si->shmid, si->start, si->end); + pr_perror("Can't map empty space for shmem 0x%lx (0x%lx-0x%lx)", si->shmid, si->start, si->end); goto errc; } From d2abc9817ff856c2e2e66c172c5571830617e491 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Thu, 2 Feb 2023 09:07:15 -0800 Subject: [PATCH 134/775] shmem: Close fd when VMA is copied from /proc/$pid/mem If we don't have access to map_files and instead have to get the data from /proc/$pid/mem we can close and reset the fd before passing it to do_dump_one_shmem() which can then check it before trying to 
seek past holes, eliminating the need for a separate seek_data_supported boolean. Signed-off-by: Younes Manton --- criu/shmem.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/criu/shmem.c b/criu/shmem.c index 8d539d4d0..c13a39b66 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -724,7 +724,7 @@ static int next_data_segment(int fd, unsigned long pfn, unsigned long *next_data return 0; } -static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si, bool seek_data_supported) +static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si) { struct page_pipe *pp; struct page_xfer xfer; @@ -750,8 +750,7 @@ static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si, bool see unsigned long pgaddr; int st = -1; - if (seek_data_supported && pfn >= next_hole_pfn && - next_data_segment(fd, pfn, &next_data_pnf, &next_hole_pfn)) + if (fd >= 0 && pfn >= next_hole_pfn && next_data_segment(fd, pfn, &next_data_pnf, &next_hole_pfn)) goto err_xfer; if (si->pstate_map && is_shmem_tracking_en()) { @@ -810,7 +809,6 @@ static int dump_one_shmem(struct shmem_info *si) int fd, ret = -1; void *addr; unsigned long cur, remaining; - bool seek_data_supported; pr_info("Dumping shared memory %ld\n", si->shmid); @@ -821,8 +819,6 @@ static int dump_one_shmem(struct shmem_info *si) pr_perror("Can't map shmem 0x%lx (0x%lx-0x%lx)", si->shmid, si->start, si->end); goto errc; } - - seek_data_supported = true; } else { if (errno != EPERM || !opts.unprivileged) { goto err; @@ -858,14 +854,16 @@ static int dump_one_shmem(struct shmem_info *si) cur += ret; } while (remaining > 0); - seek_data_supported = false; + close(fd); + fd = -1; } - ret = do_dump_one_shmem(fd, addr, si, seek_data_supported); + ret = do_dump_one_shmem(fd, addr, si); munmap(addr, si->size); errc: - close(fd); + if (fd >= 0) + close(fd); err: return ret; } @@ -889,7 +887,7 @@ int dump_one_memfd_shmem(int fd, unsigned long shmid, unsigned long size) goto err; } - ret 
= do_dump_one_shmem(fd, addr, &si, true); + ret = do_dump_one_shmem(fd, addr, &si); munmap(addr, size); err: @@ -915,7 +913,7 @@ int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid) if (fd < 0) return -1; - ret = do_dump_one_shmem(fd, addr, si, true); + ret = do_dump_one_shmem(fd, addr, si); close(fd); return ret; } From d7da4a69af3cbf9aea97444cee3d8718a1c8eeb9 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Thu, 2 Feb 2023 09:43:37 -0800 Subject: [PATCH 135/775] ci: Add maps00 test in unprivileged mode in user namespace CAP_CHECKPOINT_RESTORE does not give access to /proc/$pid/map_files in user namespaces. In order to test that CRIU in unprivileged mode can dump and restore anonymous shared memory pages we will run the maps00 tests in a user namespace. Signed-off-by: Younes Manton --- scripts/build/Dockerfile.alpine | 4 +++- scripts/ci/run-ci-tests.sh | 15 +++++++++++++++ scripts/ci/vagrant.sh | 5 +++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index eced46c22..19b08315f 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -21,7 +21,9 @@ RUN apk update && apk add \ py3-pip \ py3-protobuf \ python3 \ - sudo + sudo \ + libcap-utils \ + util-linux COPY . /criu WORKDIR /criu diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 7b64c6b06..5b9f6d929 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -289,6 +289,21 @@ ip net add test ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/transition/fork -t zdtm/static/ghost_holes00 -t zdtm/static/socket-tcp -t zdtm/static/msgque -k always ./test/crit-recode.py +# Rootless tests +# Check if cap_checkpoint_restore is supported and also if unshare -c is supported. 
+if capsh --supports=cap_checkpoint_restore && unshare -c /bin/true; then + make -C test/zdtm/ cleanout + rm -rf test/dump + setcap cap_checkpoint_restore,cap_sys_ptrace+eip criu/criu + # Run it as non-root in a user namespace. Since CAP_CHECKPOINT_RESTORE behaves differently in non-user namespaces (e.g. no access to map_files) this tests that we can dump and restore + # under those conditions. Note that the "... && true" part is necessary; we need at least one statement after the tests so that bash can reap zombies in the user namespace, + # otherwise it will exec the last statement and get replaced and nobody will be left to reap our zombies. + sudo --user=#65534 --group=#65534 unshare -Ucfpm --mount-proc -- bash -c "./test/zdtm.py run -t zdtm/static/maps00 -f h --rootless && true" + setcap -r criu/criu +else + echo "Skipping unprivileged mode tests" +fi + # more crit testing make -C test/others/crit run diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index a3e4b6937..f0996b01d 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -78,6 +78,11 @@ fedora-non-root() { ssh default 'cd /vagrant/criu; criu/criu check --unprivileged; ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h --rootless' # Run it as root with '--rootless' ssh default 'cd /vagrant/criu; sudo ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h; sudo chmod 777 test/dump/zdtm/static/{env00,pthread00}; sudo ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h --rootless' + # Run it as non-root in a user namespace. Since CAP_CHECKPOINT_RESTORE behaves differently in non-user namespaces (e.g. no access to map_files) this tests that we can dump and restore + # under those conditions. Note that the "... 
&& true" part is necessary; we need at least one statement after the tests so that bash can reap zombies in the user namespace, + # otherwise it will exec the last statement and get replaced and nobody will be left to reap our zombies. + # Note: selinux in Enforcing mode prevents us from calling clone3() or writing to ns_last_pid on restore; hence set to Permissive for the test and then set back. + ssh default 'cd /vagrant/criu; selinuxmode=`getenforce` && sudo setenforce Permissive && unshare -Ucfpm --mount-proc bash -c "./test/zdtm.py run -t zdtm/static/maps00 -f h --rootless && true" && sudo setenforce $selinuxmode' } $1 From 14e883656432f90f29450586da935ad3202cc7fb Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Mon, 13 Feb 2023 10:40:12 -0800 Subject: [PATCH 136/775] proc_parse: Handle btrfs files when map_files is not accessible If we can't access a map_files entry directly and instead have to follow the link and access the file via a filesystem path we need to properly deal with files on btrfs subvolumes. 
Signed-off-by: Younes Manton --- criu/proc_parse.c | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 5ba3beb63..bcb8256b4 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -338,22 +338,49 @@ static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct fd = open(fname, O_RDONLY); if (fd < 0) { pr_perror("Can't open mapped [%s]", fname); - return -1; + goto returnerr; } if (vma_stat(vma, fd)) { - close(fd); - return -1; + goto closefd; } - if (vma->vmst->st_dev != vfi_dev || vma->vmst->st_ino != vfi->ino) { - pr_err("Failed to resolve mapping %lx filename\n", (unsigned long)vma->e->start); - close(fd); - return -1; + if (vma->vmst->st_ino != vfi->ino) { + goto errmsg; + } + + /* + * If devices don't match it could be because file is on a btrfs subvolume, + * which means that device number returned by stat will not match what is + * seen in smaps and other places. To deal with that we need a more involved + * check. 
+ */ + if (vma->vmst->st_dev != vfi_dev) { + int mnt_id; + struct ns_id *ns; + + if (get_fd_mntid(fd, &mnt_id)) + goto errmsg; + + ns = lookup_nsid_by_mnt_id(mnt_id); + if (!ns) + goto errmsg; + + if (!phys_stat_dev_match(vma->vmst->st_dev, vfi_dev, ns, fname)) + goto errmsg; + + vma->mnt_id = mnt_id; } *vm_file_fd = fd; return 0; + +errmsg: + pr_err("Failed to resolve mapping %lx filename\n", (unsigned long)vma->e->start); +closefd: + close(fd); +returnerr: + return -1; } static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, struct vma_file_info *vfi, From f1c8d386b402554d85e1acfb86b211616ab95356 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Fri, 2 Dec 2022 11:21:54 +0300 Subject: [PATCH 137/775] kerndat: check if setsockopt IPV6_FREEBIND is supported Signed-off-by: Pavel Tikhomirov --- criu/cr-check.c | 10 ++++++++++ criu/include/kerndat.h | 1 + criu/kerndat.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/criu/cr-check.c b/criu/cr-check.c index b54c79387..bcbcf3f2b 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1375,6 +1375,14 @@ static int check_openat2(void) return 0; } +static int check_ipv6_freebind(void) +{ + if (!kdat.has_ipv6_freebind) + return -1; + + return 0; +} + static int (*chk_feature)(void); /* @@ -1494,6 +1502,7 @@ int cr_check(void) ret |= check_move_mount_set_group(); ret |= check_openat2(); ret |= check_ptrace_get_rseq_conf(); + ret |= check_ipv6_freebind(); if (kdat.lsm == LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); @@ -1614,6 +1623,7 @@ static struct feature_list feature_list[] = { { "move_mount_set_group", check_move_mount_set_group }, { "openat2", check_openat2 }, { "get_rseq_conf", check_ptrace_get_rseq_conf }, + { "ipv6_freebind", check_ipv6_freebind }, { NULL, NULL }, }; diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index a3959c992..0b2f715f3 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -84,6 +84,7 @@ struct 
kerndat_s { bool has_rseq; bool has_ptrace_get_rseq_conf; struct __ptrace_rseq_configuration libc_rseq_conf; + bool has_ipv6_freebind; }; extern struct kerndat_s kdat; diff --git a/criu/kerndat.c b/criu/kerndat.c index 5d99c575b..bc0c7ba05 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1541,6 +1541,34 @@ static int kerndat_has_nftables_concat(void) #endif } +#ifndef IPV6_FREEBIND +#define IPV6_FREEBIND 78 +#endif + +static int kerndat_has_ipv6_freebind(void) +{ + int sk, val; + + sk = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); + if (sk == -1) { + pr_perror("Unable to create a ipv6 dgram socket"); + return -1; + } + + val = 1; + if (setsockopt(sk, SOL_IPV6, IPV6_FREEBIND, &val, sizeof(int)) == -1) { + if (errno == ENOPROTOOPT) { + kdat.has_ipv6_freebind = false; + return 0; + } + pr_perror("Unable to setsockopt ipv6_freebind"); + return -1; + } + + kdat.has_ipv6_freebind = true; + return 0; +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. There are cases that we cannot determine the @@ -1780,6 +1808,10 @@ int kerndat_init(void) pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); ret = -1; } + if (!ret && (kerndat_has_ipv6_freebind() < 0)) { + pr_err("kerndat_has_ipv6_freebind failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); From 7d4d4915af687f10809ba39af1c06e3d1016e113 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 1 Dec 2022 12:08:43 +0300 Subject: [PATCH 138/775] sk-inet: save IP_FREEBIND option for SOCK_RAW sockets also The IP_FREEBIND option is supported for RAW sockets, why not save it while we do this for other ip sockets anyway? One difference is that for SOCK_RAW there is no fallback between IP_FREEBIND and IPV6_FREEBIND, see: https://github.com/torvalds/linux/blob/ef4d3ea40565a781c25847e9cb96c1bd9f462bc6/net/ipv6/ipv6_sockglue.c#L1497 So let's have explicit IPV6_FREEBIND for ipv6. 
Signed-off-by: Pavel Tikhomirov --- criu/sk-inet.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index e52b198c3..b14dd2ed4 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include "../soccr/soccr.h" @@ -388,6 +390,10 @@ static int dump_ip_raw_opts(int sk, int family, int proto, IpOptsRawEntry *r) return ret; } +#ifndef IPV6_FREEBIND +#define IPV6_FREEBIND 78 +#endif + static int dump_ip_opts(int sk, int family, int type, int proto, IpOptsEntry *ioe) { int ret = 0; @@ -398,12 +404,19 @@ static int dump_ip_opts(int sk, int family, int type, int proto, IpOptsEntry *io * and fetch additional options. */ ret |= dump_ip_raw_opts(sk, family, proto, ioe->raw); - } else { - /* Due to kernel code we can use SOL_IP instead of SOL_IPV6 */ - ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); - ioe->has_freebind = ioe->freebind; } + if (family == AF_INET6) { + if (kdat.has_ipv6_freebind) + ret |= dump_opt(sk, SOL_IPV6, IPV6_FREEBIND, &ioe->freebind); + else if (type != SOCK_RAW) + /* Due to kernel code we can use SOL_IP instead of SOL_IPV6 */ + ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); + } else { + ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); + } + ioe->has_freebind = ioe->freebind; + return ret; } @@ -787,8 +800,13 @@ int restore_ip_opts(int sk, int family, int proto, IpOptsEntry *ioe) { int ret = 0; - if (ioe->has_freebind) - ret |= restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); + if (family == AF_INET6) { + if (ioe->has_freebind) + ret |= restore_opt(sk, SOL_IPV6, IPV6_FREEBIND, &ioe->freebind); + } else { + if (ioe->has_freebind) + ret |= restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); + } if (ioe->raw) ret |= restore_ip_raw_opts(sk, family, proto, ioe->raw); From bd9b66c8c0143727124c10735e6c3688e6eec440 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 29 Nov 2022 
14:47:28 +0300 Subject: [PATCH 139/775] sk-inet: support IP_PKTINFO and IPV6_RECVPKTINFO options We see systemd-resolved relying on these options, and after migration the options are lost and systemd-resolved stops serving dns requests. The socket options make kernel add cmsg with destination address to packets, see more how systemd-resolved uses them: https://github.com/systemd/systemd/blob/00a60eaf5fcb3a0e415349aa649f2699550d26b0/src/resolve/resolved-manager.c#L826 Signed-off-by: Pavel Tikhomirov --- criu/sk-inet.c | 7 +++++++ images/sk-inet.proto | 2 ++ 2 files changed, 9 insertions(+) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index b14dd2ed4..4bd5abff1 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -412,10 +412,13 @@ static int dump_ip_opts(int sk, int family, int type, int proto, IpOptsEntry *io else if (type != SOCK_RAW) /* Due to kernel code we can use SOL_IP instead of SOL_IPV6 */ ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); + ret |= dump_opt(sk, SOL_IPV6, IPV6_RECVPKTINFO, &ioe->pktinfo); } else { ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); + ret |= dump_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); } ioe->has_freebind = ioe->freebind; + ioe->has_pktinfo = !!ioe->pktinfo; return ret; } @@ -803,9 +806,13 @@ int restore_ip_opts(int sk, int family, int proto, IpOptsEntry *ioe) if (family == AF_INET6) { if (ioe->has_freebind) ret |= restore_opt(sk, SOL_IPV6, IPV6_FREEBIND, &ioe->freebind); + if (ioe->has_pktinfo) + ret |= restore_opt(sk, SOL_IPV6, IPV6_RECVPKTINFO, &ioe->pktinfo); } else { if (ioe->has_freebind) ret |= restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); + if (ioe->has_pktinfo) + ret |= restore_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); } if (ioe->raw) diff --git a/images/sk-inet.proto b/images/sk-inet.proto index 594e29c66..ee1f0ae41 100644 --- a/images/sk-inet.proto +++ b/images/sk-inet.proto @@ -17,6 +17,8 @@ message ip_opts_entry { optional bool freebind = 1; // Fields 2 and 3 are reserved for vz7 use 
optional ip_opts_raw_entry raw = 4; + + optional bool pktinfo = 5; } message inet_sk_entry { From 4a8c02d6368c5010b83cb3973ce349e7bec932cc Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 29 Nov 2022 15:42:51 +0300 Subject: [PATCH 140/775] zdtm: Add tests for IP_PKTINFO and IP_FREEBIND sock options Just creates ipv4/ipv6 raw/dgram sockets with IP_PKTINFO and IP_FREEBIND socket options enabled/disabled and checks that these options persist. Signed-off-by: Pavel Tikhomirov --- test/zdtm/static/Makefile | 3 + test/zdtm/static/sock_ip_opts00.c | 110 +++++++++++++++++++++++++++ test/zdtm/static/sock_ip_opts00.desc | 1 + test/zdtm/static/sock_ip_opts01.c | 1 + test/zdtm/static/sock_ip_opts01.desc | 1 + 5 files changed, 116 insertions(+) create mode 100644 test/zdtm/static/sock_ip_opts00.c create mode 100644 test/zdtm/static/sock_ip_opts00.desc create mode 120000 test/zdtm/static/sock_ip_opts01.c create mode 120000 test/zdtm/static/sock_ip_opts01.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 000488133..4b3d2e341 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -123,6 +123,8 @@ TST_NOFILE := \ sock_opts00 \ sock_opts01 \ sock_opts02 \ + sock_ip_opts00 \ + sock_ip_opts01 \ sk-unix-unconn \ sk-unix-unconn-seqpacket \ ipc_namespace \ @@ -598,6 +600,7 @@ socket-tcp6-closed: CFLAGS += -D ZDTM_IPV6 socket-tcp6-closed: CFLAGS += -D ZDTM_IPV4V6 socket-tcp-closed-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK socket-tcp-skip-in-flight: CFLAGS += -D ZDTM_IPV4V6 +sock_ip_opts01: CFLAGS += -DZDTM_VAL_ZERO tun_ns: CFLAGS += -DTUN_NS mnt_ext_manual: CFLAGS += -D ZDTM_EXTMAP_MANUAL mntns_pivot_root_ro: CFLAGS += -DMNTNS_PIVOT_ROOT_RO diff --git a/test/zdtm/static/sock_ip_opts00.c b/test/zdtm/static/sock_ip_opts00.c new file mode 100644 index 000000000..08970c0da --- /dev/null +++ b/test/zdtm/static/sock_ip_opts00.c @@ -0,0 +1,110 @@ +#include +#include +#include + +#include +#include + +#include "zdtmtst.h" + +const char 
*test_doc = "Check that different ip socket options are restored"; +const char *test_author = "Pavel Tikhomirov "; + +#ifdef ZDTM_VAL_ZERO +#define IP_OPT_VAL 0 +#else +#define IP_OPT_VAL 1 +#endif + +struct sk_opt { + int level; + int opt; +}; + +struct sk_opt sk_opts_v4[] = { + { SOL_IP, IP_FREEBIND }, + { SOL_IP, IP_PKTINFO }, +}; + +#ifndef IPV6_FREEBIND +#define IPV6_FREEBIND 78 +#endif + +struct sk_opt sk_opts_v6[] = { + { SOL_IPV6, IPV6_FREEBIND }, + { SOL_IPV6, IPV6_RECVPKTINFO }, +}; + +struct sk_conf { + int domain; + int type; + int protocol; + int sk; +} sk_confs[] = { + { AF_INET, SOCK_DGRAM, IPPROTO_UDP }, + { AF_INET, SOCK_RAW, IPPROTO_UDP }, + { AF_INET6, SOCK_DGRAM, IPPROTO_UDP }, + { AF_INET6, SOCK_RAW, IPPROTO_UDP }, +}; + +int main(int argc, char **argv) +{ + struct sk_opt *opts; + int exit_code = 1; + int i, j, val; + socklen_t len; + int n_opts; + + test_init(argc, argv); + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + sk_confs[i].sk = socket(sk_confs[i].domain, sk_confs[i].type, sk_confs[i].protocol); + if (sk_confs[i].sk == -1) { + pr_perror("socket(%d,%d,%d) failed", sk_confs[i].domain, sk_confs[i].type, + sk_confs[i].protocol); + goto close; + } + } + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + opts = sk_confs[i].domain == AF_INET ? sk_opts_v4 : sk_opts_v6; + n_opts = sk_confs[i].domain == AF_INET ? ARRAY_SIZE(sk_opts_v4) : ARRAY_SIZE(sk_opts_v6); + + for (j = 0; j < n_opts; j++) { + val = IP_OPT_VAL; + if (setsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, sizeof(int)) == -1) { + pr_perror("setsockopt(%d, %d) failed", opts[j].level, opts[j].opt); + goto close; + } + } + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + opts = sk_confs[i].domain == AF_INET ? sk_opts_v4 : sk_opts_v6; + n_opts = sk_confs[i].domain == AF_INET ? 
ARRAY_SIZE(sk_opts_v4) : ARRAY_SIZE(sk_opts_v6); + + for (j = 0; j < n_opts; j++) { + len = sizeof(int); + if (getsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, &len) == -1) { + pr_perror("getsockopt(%d, %d) failed", opts[j].level, opts[j].opt); + goto close; + } + + if (val != IP_OPT_VAL) { + fail("Unexpected value socket(%d,%d,%d) opts(%d,%d)", sk_confs[i].domain, + sk_confs[i].type, sk_confs[i].protocol, opts[j].level, opts[j].opt); + goto close; + } + } + } + + pass(); + exit_code = 0; +close: + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) + close(sk_confs[i].sk); + return exit_code; +} diff --git a/test/zdtm/static/sock_ip_opts00.desc b/test/zdtm/static/sock_ip_opts00.desc new file mode 100644 index 000000000..2201f0298 --- /dev/null +++ b/test/zdtm/static/sock_ip_opts00.desc @@ -0,0 +1 @@ +{'flags': 'suid', 'feature': 'ipv6_freebind'} diff --git a/test/zdtm/static/sock_ip_opts01.c b/test/zdtm/static/sock_ip_opts01.c new file mode 120000 index 000000000..15526f808 --- /dev/null +++ b/test/zdtm/static/sock_ip_opts01.c @@ -0,0 +1 @@ +sock_ip_opts00.c \ No newline at end of file diff --git a/test/zdtm/static/sock_ip_opts01.desc b/test/zdtm/static/sock_ip_opts01.desc new file mode 120000 index 000000000..e2c29ca25 --- /dev/null +++ b/test/zdtm/static/sock_ip_opts01.desc @@ -0,0 +1 @@ +sock_ip_opts00.desc \ No newline at end of file From 50e42c9ddcc26e705c5059ff8d4713686f183c48 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 23 Feb 2023 12:55:01 +0000 Subject: [PATCH 141/775] Add documentation for --ghost-fiemap The --ghost-fiemap option was introduced with #1963. It enables an optimized algorithm based on fiemap ioctl that can reduce the number of syscalls used to checkpoint highly sparse ghost files. This option is enabled by default. It can be disabled with --no-ghost-fiemap when using SEEK_HOLE/SEEK_DATA is preferred. In addition, an automatic fallback to SEEK_HOLE/SEEK_DATA is used for filesystems that do not support fiemap.
Co-authored-by: Pavel Tikhomirov Signed-off-by: Radostin Stoyanov --- Documentation/criu.txt | 7 +++++++ criu/crtools.c | 1 + 2 files changed, 8 insertions(+) diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 294127050..0e7d19c4c 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -384,6 +384,13 @@ mount -t cgroup -o devices,freezer none devices,freezer 'size' may be postfixed with a *K*, *M* or *G*, which stands for kilo-, mega, and gigabytes, accordingly. +*--ghost-fiemap*:: + Enable an optimization based on fiemap ioctl that can reduce the + number of system calls used when checkpointing highly sparse ghost + files. This option is enabled by default, and it can be disabled + with *--no-ghost-fiemap*. An automatic fallback to SEEK_HOLE/SEEK_DATA + is used when fiemap is not supported. + *-j*, *--shell-job*:: Allow one to dump shell jobs. This implies the restored task will inherit session and process group ID from the *criu* itself. diff --git a/criu/crtools.c b/criu/crtools.c index ca9361977..94657f418 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -445,6 +445,7 @@ usage: " is inaccessible\n" " --link-remap allow one to link unlinked files back when possible\n" " --ghost-limit size limit max size of deleted file contents inside image\n" + " --ghost-fiemap enable dumping of deleted files using fiemap\n" " --action-script FILE add an external action script\n" " -j|--" OPT_SHELL_JOB " allow one to dump and restore shell jobs\n" " -l|--" OPT_FILE_LOCKS " handle file locks, for safety, only used for container\n" From fd7e97fcfd285b09f40df3622dd16b260e1e1de0 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 23 Feb 2023 17:12:32 +0800 Subject: [PATCH 142/775] lint: exclude tags file from codespell If we build tags for our repo: [criu]$ make tags GEN tags And then run codespell, we get an error: [criu]$ codespell ./tags:3755: struc ==> struct Let's exclude tags file from codespell search, this would add usability to `make 
lint`. Signed-off-by: Pavel Tikhomirov --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index eaa8a91e0..aabe28a92 100644 --- a/Makefile +++ b/Makefile @@ -427,7 +427,7 @@ lint: shellcheck -x test/others/libcriu/*.sh shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh shellcheck -x test/others/config-file/*.sh - codespell + codespell -S tags # Do not append \n to pr_perror or fail ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>.*\\n"' # Do not use %m with pr_perror or fail From fa4af04302a5f5cfb521e1ed7b67013ac2c9fb4a Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 10 May 2017 16:52:03 +0300 Subject: [PATCH 143/775] dump: Show task comm early When error happens on file dumping stage the only information about the task we are dumping is its PID. For debug purpose show task's @comm early. It proves useful when trying to understand which of dumped applications is "guilty" in broken dump when pid is not there anymore.
Signed-off-by: Cyrill Gorcunov Signed-off-by: Pavel Tikhomirov --- criu/cr-dump.c | 6 +++--- criu/include/seize.h | 3 +++ criu/seize.c | 46 +++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 30713f96b..249c02226 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1455,7 +1455,7 @@ static int pre_dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie vm_area_list_init(&vmas); pr_info("========================================\n"); - pr_info("Pre-dumping task (pid: %d)\n", pid); + pr_info("Pre-dumping task (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); pr_info("========================================\n"); /* @@ -1545,7 +1545,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) vm_area_list_init(&vmas); pr_info("========================================\n"); - pr_info("Dumping task (pid: %d)\n", pid); + pr_info("Dumping task (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); pr_info("========================================\n"); if (item->pid->state == TASK_DEAD) @@ -2113,7 +2113,7 @@ int cr_dump_tasks(pid_t pid) int ret = -1; pr_info("========================================\n"); - pr_info("Dumping processes (pid: %d)\n", pid); + pr_info("Dumping processes (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); pr_info("========================================\n"); /* diff --git a/criu/include/seize.h b/criu/include/seize.h index cf7366cb0..4545bf262 100644 --- a/criu/include/seize.h +++ b/criu/include/seize.h @@ -6,4 +6,7 @@ extern void pstree_switch_state(struct pstree_item *root_item, int st); extern const char *get_real_freezer_state(void); extern bool alarm_timeouted(void); +extern char *task_comm_info(pid_t pid, char *comm, size_t size); +extern char *__task_comm_info(pid_t pid); + #endif diff --git a/criu/seize.c b/criu/seize.c index f8e3278ea..0758410e5 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -24,6 +24,46 @@ 
#include "xmalloc.h" #include "util.h" +char *task_comm_info(pid_t pid, char *comm, size_t size) +{ + int ret = 0; + + if (!pr_quelled(LOG_INFO)) { + int saved_errno = errno; + char path[32]; + int fd; + + snprintf(path, sizeof(path), "/proc/%d/comm", pid); + fd = open(path, O_RDONLY); + if (fd >= 0) { + ssize_t n = read(fd, comm, size); + if (n > 0) + comm[n - 1] = '\0'; + else + ret = -1; + close(fd); + } else { + ret = -1; + } + errno = saved_errno; + } + + if (ret || (pr_quelled(LOG_INFO) && comm[0])) + comm[0] = '\0'; + + return comm; +} + +/* + * NOTE: Don't run simultaneously, it uses local static buffer! + */ +char *__task_comm_info(pid_t pid) +{ + static char comm[32]; + + return task_comm_info(pid, comm, sizeof(comm)); +} + #define NR_ATTEMPTS 5 static const char frozen[] = "FROZEN"; @@ -249,13 +289,13 @@ static int seize_cgroup_tree(char *root_path, enum freezer_state state) if (ret == 0) continue; if (errno != ESRCH) { - pr_perror("Unexpected error"); + pr_perror("Unexpected error for pid %d (comm %s)", pid, __task_comm_info(pid)); fclose(f); return -1; } if (!compel_interrupt_task(pid)) { - pr_debug("SEIZE %d: success\n", pid); + pr_debug("SEIZE %d (comm %s): success\n", pid, __task_comm_info(pid)); processes_to_wait++; } else if (state == FROZEN) { char buf[] = "/proc/XXXXXXXXXX/exe"; @@ -272,7 +312,7 @@ static int seize_cgroup_tree(char *root_path, enum freezer_state state) * before it compete exit procedure. The caller simply * should wait a bit and try freezing again. */ - pr_err("zombie found while seizing\n"); + pr_err("zombie %d (comm %s) found while seizing\n", pid, __task_comm_info(pid)); fclose(f); return -EAGAIN; } From 9b919ab748d4818636ecab50b7e52916695342e7 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 23 Feb 2023 11:11:36 +0000 Subject: [PATCH 144/775] crit: fix empty string comparison In Python 3 b'' == '' is False. 
This causes the info action to fail with File "/usr/lib/python3.11/site-packages/crit-3.17-py3.11.egg/pycriu/images/images.py", line 178, in count size, = struct.unpack('i', buf) ^^^^^^^^^^^^^^^^^^^^^^^ struct.error: unpack requires a buffer of 4 bytes Reported-by: Sankalp Acharya (@sankalp-12) Signed-off-by: Radostin Stoyanov --- lib/py/images/images.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/py/images/images.py b/lib/py/images/images.py index eda030a5c..df4f92ac9 100644 --- a/lib/py/images/images.py +++ b/lib/py/images/images.py @@ -98,7 +98,7 @@ class entry_handler: # Read payload pbuff = self.payload() buf = f.read(4) - if buf == b'': + if len(buf) == 0: break size, = struct.unpack('i', buf) pbuff.ParseFromString(f.read(size)) @@ -172,7 +172,7 @@ class entry_handler: while True: buf = f.read(4) - if buf == '': + if len(buf) == 0: break size, = struct.unpack('i', buf) f.seek(size, 1) @@ -195,7 +195,7 @@ class pagemap_handler: pbuff = pb.pagemap_head() while True: buf = f.read(4) - if buf == b'': + if len(buf) == 0: break size, = struct.unpack('i', buf) pbuff.ParseFromString(f.read(size)) @@ -422,7 +422,7 @@ class ipc_msg_queue_handler: messages = [] for x in range(0, entry['qnum']): buf = f.read(4) - if buf == '': + if len(buf) == 0: break size, = struct.unpack('i', buf) msg = pb.ipc_msg() @@ -455,7 +455,7 @@ class ipc_msg_queue_handler: pl_len = 0 for x in range(0, entry['qnum']): buf = f.read(4) - if buf == '': + if len(buf) == 0: break size, = struct.unpack('i', buf) msg = pb.ipc_msg() From 11c71656bdc5d62d5c351fa6114dac027ca9ede1 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 23 Feb 2023 16:39:13 +0000 Subject: [PATCH 145/775] ci: add test for crit info Signed-off-by: Radostin Stoyanov --- test/others/crit/test.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/others/crit/test.sh b/test/others/crit/test.sh index 5d13066e7..105aac72b 100755 --- a/test/others/crit/test.sh +++ 
b/test/others/crit/test.sh @@ -53,6 +53,9 @@ function run_test2 { # prepare ${CRIT} decode -i "${PROTO_IN}" -o "${JSON_IN}" + # show info about image + ${CRIT} info "${PROTO_IN}" + # proto in - json out decode cat "${PROTO_IN}" | ${CRIT} decode || exit 1 cat "${PROTO_IN}" | ${CRIT} decode -o "${OUT}" || exit 1 From 1ae9bac5488adcb55c4c589e3c3892ac6e6430ca Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 27 Feb 2023 12:14:21 +0800 Subject: [PATCH 146/775] dump: improve error printing and readability of task_comm_info This addresses Andrei's comments from https://github.com/checkpoint-restore/criu/pull/2064 - Add comment about '\n' fixing - Replace ret with more self-explanatory is_read - Print warnings if we failed to print comm for some reason Signed-off-by: Pavel Tikhomirov --- criu/seize.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index 0758410e5..91090ae1a 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -26,7 +26,7 @@ char *task_comm_info(pid_t pid, char *comm, size_t size) { - int ret = 0; + bool is_read = false; if (!pr_quelled(LOG_INFO)) { int saved_errno = errno; @@ -37,18 +37,21 @@ char *task_comm_info(pid_t pid, char *comm, size_t size) fd = open(path, O_RDONLY); if (fd >= 0) { ssize_t n = read(fd, comm, size); - if (n > 0) + if (n > 0) { + is_read = true; + /* Replace '\n' printed by kernel with '\0' */ comm[n - 1] = '\0'; - else - ret = -1; + } else { + pr_warn("Failed to read %s: %s\n", path, strerror(errno)); + } close(fd); } else { - ret = -1; + pr_warn("Failed to open %s: %s\n", path, strerror(errno)); } errno = saved_errno; } - if (ret || (pr_quelled(LOG_INFO) && comm[0])) + if (!is_read) comm[0] = '\0'; return comm; From 7b80353448827763f8537bea2734ec83a7d2643a Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 28 Feb 2023 12:42:49 +0100 Subject: [PATCH 147/775] mailmap: update my email Signed-off-by: Alexander Mikhalitsyn --- .mailmap | 2 ++ 1 file changed, 2 
insertions(+) diff --git a/.mailmap b/.mailmap index 6f046b972..8076f0bc9 100644 --- a/.mailmap +++ b/.mailmap @@ -6,3 +6,5 @@ Andrei Vagin Andrei Vagin Andrei Vagin Cyrill Gorcunov +Alexander Mikhalitsyn +Alexander Mikhalitsyn From 676b4579f957f480e74e6a49ef273db1c474334f Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 28 Feb 2023 13:16:27 +0800 Subject: [PATCH 148/775] zdtm/transition/epoll: don't rely on errno in case of zero return Checking errno in case read succeeded is undefined behaviour. Signed-off-by: Pavel Tikhomirov --- test/zdtm/transition/epoll.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/zdtm/transition/epoll.c b/test/zdtm/transition/epoll.c index fdd492ab2..803e50541 100644 --- a/test/zdtm/transition/epoll.c +++ b/test/zdtm/transition/epoll.c @@ -158,9 +158,11 @@ int main(int argc, char **argv) exit(1); } for (i = 0; i < rv; i++) { - while (read(events[i].data.fd, buf, buf_size) > 0) + int ret; + + while ((ret = read(events[i].data.fd, buf, buf_size)) > 0) ; - if (errno != EAGAIN && errno != 0 && errno) { + if (ret < 0 && errno != EAGAIN) { pr_perror("read error"); killall(); exit(1); From e7ab6fe635804c5d70500e242ffbe4c3fbbcb494 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 17 Oct 2019 16:00:40 +0300 Subject: [PATCH 149/775] restore: don't miss futex abort in restore_task_with_children Fixes: 37b99ebe5 ("files: Do setup_newborn_fds() later") Signed-off-by: Pavel Tikhomirov --- criu/cr-restore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 2f9bab414..195fa5639 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1790,7 +1790,7 @@ static int restore_task_with_children(void *_arg) } if (log_init_by_pid(vpid(current))) - return -1; + goto err; if (current->parent == NULL) { /* From 91ff24b47e9b96fcb4725434518250d0da688f80 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 25 Feb 2023 20:20:23 +0000 Subject: [PATCH 
150/775] ci: disable c/r of cgroups with podman This patch disables the checkpoint/restore of cgroups for the tests using Podman as a temporary workaround for https://github.com/checkpoint-restore/criu/issues/2091 Signed-off-by: Radostin Stoyanov --- scripts/ci/podman-test.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/ci/podman-test.sh b/scripts/ci/podman-test.sh index e08fdf3bc..5ba9d9396 100755 --- a/scripts/ci/podman-test.sh +++ b/scripts/ci/podman-test.sh @@ -25,6 +25,12 @@ make install popd rm -rf "${tmp_dir}" +# FIXME: Disable checkpoint/restore of cgroups +# https://github.com/checkpoint-restore/criu/issues/2091 +mkdir -p /etc/criu +echo "manage-cgroups ignore" > /etc/criu/runc.conf +sed -i 's/#runtime\s*=\s*.*/runtime = "runc"/' /usr/share/containers/containers.conf + podman info podman run --name cr -d docker.io/library/alpine /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done' From 85b5c1e4510a640629329257af7fdb4fa0ee1aa1 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 26 Feb 2023 12:13:48 +0000 Subject: [PATCH 151/775] ci/podman-test: drop crun installation script In a previous commit, we set the default runtime to runc and "manage-cgroups" to ignore. We remove the installation script for crun as it is not used with this test. 
Signed-off-by: Radostin Stoyanov --- scripts/ci/podman-test.sh | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/scripts/ci/podman-test.sh b/scripts/ci/podman-test.sh index 5ba9d9396..687acb8ff 100755 --- a/scripts/ci/podman-test.sh +++ b/scripts/ci/podman-test.sh @@ -11,20 +11,6 @@ make install PREFIX=/usr criu --version -# Install crun build dependencies -scripts/ci/apt-install libyajl-dev libseccomp-dev libsystemd-dev - -# Install crun from source to test libcriu integration -tmp_dir=$(mktemp -d -t ci-XXXXXXXXXX) -pushd "${tmp_dir}" -git clone --depth=1 https://github.com/containers/crun -cd crun -./autogen.sh && ./configure --prefix=/usr -make -j"$(nproc)" -make install -popd -rm -rf "${tmp_dir}" - # FIXME: Disable checkpoint/restore of cgroups # https://github.com/checkpoint-restore/criu/issues/2091 mkdir -p /etc/criu From a0cc95c03eff9988cf20d8e1910dc3ab3db15c5b Mon Sep 17 00:00:00 2001 From: Kouame Behouba Manasse Date: Fri, 24 Feb 2023 05:32:44 +0300 Subject: [PATCH 152/775] lib/py: reduce code duplication Refactor lib/py/images/images.py to reduce code duplication by extracting repetitive code into helper functions and private methods. This improves code readability and maintainability, as well as reducing the risk of bugs caused by duplicated code. Additionally, in Makefile, lib/py/images/images.py is added to the list of files to run by flake8 during CI. 
Fixes: #340 Signed-off-by: Kouame Behouba Manasse --- Makefile | 1 + lib/py/images/images.py | 104 ++++++++++++++++++++-------------------- 2 files changed, 53 insertions(+), 52 deletions(-) diff --git a/Makefile b/Makefile index aabe28a92..24318d692 100644 --- a/Makefile +++ b/Makefile @@ -417,6 +417,7 @@ lint: flake8 --config=scripts/flake8.cfg test/inhfd/*.py flake8 --config=scripts/flake8.cfg test/others/rpc/config_file.py flake8 --config=scripts/flake8.cfg lib/py/images/pb2dict.py + flake8 --config=scripts/flake8.cfg lib/py/images/images.py flake8 --config=scripts/flake8.cfg scripts/criu-ns flake8 --config=scripts/flake8.cfg scripts/crit-setup.py flake8 --config=scripts/flake8.cfg coredump/ diff --git a/lib/py/images/images.py b/lib/py/images/images.py index df4f92ac9..a1d76e7cf 100644 --- a/lib/py/images/images.py +++ b/lib/py/images/images.py @@ -69,6 +69,22 @@ class MagicException(Exception): self.magic = magic +def decode_base64_data(data): + """A helper function to decode base64 data.""" + if (sys.version_info > (3, 0)): + return base64.decodebytes(str.encode(data)) + else: + return base64.decodebytes(data) + + +def write_base64_data(f, data): + """A helper function to write base64 encoded data to a file.""" + if (sys.version_info > (3, 0)): + f.write(base64.decodebytes(str.encode(data))) + else: + f.write(base64.decodebytes(data)) + + # Generic class to handle loading/dumping criu images entries from/to bin # format to/from dict(json). 
class entry_handler: @@ -285,15 +301,9 @@ class ghost_file_handler: size = len(pb_str) f.write(struct.pack('i', size)) f.write(pb_str) - if (sys.version_info > (3, 0)): - f.write(base64.decodebytes(str.encode(item['extra']))) - else: - f.write(base64.decodebytes(item['extra'])) + write_base64_data(f, item['extra']) else: - if (sys.version_info > (3, 0)): - f.write(base64.decodebytes(str.encode(item['extra']))) - else: - f.write(base64.decodebytes(item['extra'])) + write_base64_data(f, item['extra']) def dumps(self, entries): f = io.BytesIO('') @@ -314,10 +324,7 @@ class pipes_data_extra_handler: return base64.encodebytes(data).decode('utf-8') def dump(self, extra, f, pload): - if (sys.version_info > (3, 0)): - data = base64.decodebytes(str.encode(extra)) - else: - data = base64.decodebytes(extra) + data = decode_base64_data(extra) f.write(data) def skip(self, f, pload): @@ -332,10 +339,7 @@ class sk_queues_extra_handler: return base64.encodebytes(data).decode('utf-8') def dump(self, extra, f, _unused): - if (sys.version_info > (3, 0)): - data = base64.decodebytes(str.encode(extra)) - else: - data = base64.decodebytes(extra) + data = decode_base64_data(extra) f.write(data) def skip(self, f, pload): @@ -356,12 +360,8 @@ class tcp_stream_extra_handler: return d def dump(self, extra, f, _unused): - if (sys.version_info > (3, 0)): - inq = base64.decodebytes(str.encode(extra['inq'])) - outq = base64.decodebytes(str.encode(extra['outq'])) - else: - inq = base64.decodebytes(extra['inq']) - outq = base64.decodebytes(extra['outq']) + inq = decode_base64_data(extra['inq']) + outq = decode_base64_data(extra['outq']) f.write(inq) f.write(outq) @@ -370,6 +370,7 @@ class tcp_stream_extra_handler: f.seek(0, os.SEEK_END) return pbuff.inq_len + pbuff.outq_len + class bpfmap_data_extra_handler: def load(self, f, pload): size = pload.keys_bytes + pload.values_bytes @@ -384,14 +385,13 @@ class bpfmap_data_extra_handler: f.seek(pload.bytes, os.SEEK_CUR) return pload.bytes + class 
ipc_sem_set_handler: def load(self, f, pbuff): entry = pb2dict.pb2dict(pbuff) size = sizeof_u16 * entry['nsems'] rounded = round_up(size, sizeof_u64) - s = array.array('H') - if s.itemsize != sizeof_u16: - raise Exception("Array size mismatch") + s = self._get_sem_array() s.frombytes(f.read(size)) f.seek(rounded - size, 1) return s.tolist() @@ -400,9 +400,7 @@ class ipc_sem_set_handler: entry = pb2dict.pb2dict(pbuff) size = sizeof_u16 * entry['nsems'] rounded = round_up(size, sizeof_u64) - s = array.array('H') - if s.itemsize != sizeof_u16: - raise Exception("Array size mismatch") + s = self._get_sem_array() s.fromlist(extra) if len(s) != entry['nsems']: raise Exception("Number of semaphores mismatch") @@ -415,23 +413,16 @@ class ipc_sem_set_handler: f.seek(round_up(size, sizeof_u64), os.SEEK_CUR) return size + def _get_sem_array(self): + s = array.array('H') + if s.itemsize != sizeof_u16: + raise Exception("Array size mismatch") + return s + class ipc_msg_queue_handler: def load(self, f, pbuff): - entry = pb2dict.pb2dict(pbuff) - messages = [] - for x in range(0, entry['qnum']): - buf = f.read(4) - if len(buf) == 0: - break - size, = struct.unpack('i', buf) - msg = pb.ipc_msg() - msg.ParseFromString(f.read(size)) - rounded = round_up(msg.msize, sizeof_u64) - data = f.read(msg.msize) - f.seek(rounded - msg.msize, 1) - messages.append(pb2dict.pb2dict(msg)) - messages.append(base64.encodebytes(data).decode('utf-8')) + messages, _ = self._read_messages(f, pbuff) return messages def dump(self, extra, f, pbuff): @@ -443,15 +434,17 @@ class ipc_msg_queue_handler: f.write(struct.pack('i', size)) f.write(msg_str) rounded = round_up(msg.msize, sizeof_u64) - if (sys.version_info > (3, 0)): - data = base64.decodebytes(str.encode(extra[i + 1])) - else: - data = base64.decodebytes(extra[i + 1]) + data = decode_base64_data(extra[i + 1]) f.write(data[:msg.msize]) f.write(b'\0' * (rounded - msg.msize)) def skip(self, f, pbuff): + _, pl_len = self._read_messages(f, pbuff, 
skip_data=True) + return pl_len + + def _read_messages(self, f, pbuff, skip_data=False): entry = pb2dict.pb2dict(pbuff) + messages = [] pl_len = 0 for x in range(0, entry['qnum']): buf = f.read(4) @@ -461,10 +454,17 @@ class ipc_msg_queue_handler: msg = pb.ipc_msg() msg.ParseFromString(f.read(size)) rounded = round_up(msg.msize, sizeof_u64) - f.seek(rounded, os.SEEK_CUR) pl_len += size + msg.msize - return pl_len + if skip_data: + f.seek(rounded, os.SEEK_CUR) + else: + data = f.read(msg.msize) + f.seek(rounded - msg.msize, 1) + messages.append(pb2dict.pb2dict(msg)) + messages.append(base64.encodebytes(data).decode('utf-8')) + + return messages, pl_len class ipc_shm_handler: @@ -560,7 +560,7 @@ handlers = { 'MEMFD_INODE': entry_handler(pb.memfd_inode_entry), 'BPFMAP_FILE': entry_handler(pb.bpfmap_file_entry), 'BPFMAP_DATA': entry_handler(pb.bpfmap_data_entry, - bpfmap_data_extra_handler()), + bpfmap_data_extra_handler()), 'APPARMOR': entry_handler(pb.apparmor_entry), } @@ -574,12 +574,12 @@ def __rhandler(f): try: m = magic.by_val[img_magic] - except: + except Exception: raise MagicException(img_magic) try: handler = handlers[m] - except: + except Exception: raise Exception("No handler found for image with magic " + m) return m, handler @@ -641,7 +641,7 @@ def dump(img, f): try: handler = handlers[m] - except: + except Exception: raise Exception("No handler found for image with such magic") handler.dump(img['entries'], f) From 3ca979f9a15019313efb451f72db74f01f24cb4d Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 1 Mar 2023 15:47:53 +0000 Subject: [PATCH 153/775] coredump: handle long command-lines This fixes errors with long command-lines: File "/home/criu/coredump/criu_coredump/coredump.py", line 320, in _gen_prpsinfo prpsinfo.pr_psargs = self._gen_cmdline(pid) ^^^^^^^^^^^^^^^^^^ ValueError: bytes too long (88, maximum length 80) Signed-off-by: Adrian Reber --- coredump/criu_coredump/coredump.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff 
--git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 53b143ec0..e2c56c01c 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -315,7 +315,9 @@ class coredump_generator: prpsinfo.pr_ppid = pstree["ppid"] prpsinfo.pr_pgrp = pstree["pgid"] prpsinfo.pr_sid = pstree["sid"] - prpsinfo.pr_psargs = self._gen_cmdline(pid) + # prpsinfo.pr_psargs has a limit of 80 characters which means it will + # fail here if the cmdline is longer than 80 + prpsinfo.pr_psargs = self._gen_cmdline(pid)[:80] if (sys.version_info > (3, 0)): prpsinfo.pr_fname = core["tc"]["comm"].encode() else: From edaec5d762be45fb95af3d3dd4e50b31175f3bb0 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 1 Mar 2023 15:48:46 +0000 Subject: [PATCH 154/775] coredump: report missing files without a backtrace New message: ERROR: Required file /usr/lib64/libcrypto.so.3.0.1 not found. Exiting Old message: File "/home/criu/coredump/criu_coredump/coredump.py", line 693, in _gen_mem_chunk f = open(fname, 'rb') FileNotFoundError: [Errno 2] No such file or directory: '/usr/lib64/libcrypto.so.3.0.1' Signed-off-by: Adrian Reber --- coredump/coredump.py | 8 +++++++- coredump/criu_coredump/coredump.py | 6 +++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/coredump/coredump.py b/coredump/coredump.py index 5e63d2138..88a1b374c 100644 --- a/coredump/coredump.py +++ b/coredump/coredump.py @@ -1,5 +1,6 @@ import argparse import os +import sys import criu_coredump @@ -34,7 +35,12 @@ def main(): opts = vars(parser.parse_args()) - coredump(opts) + try: + coredump(opts) + except SystemExit as error: + print('ERROR: %s' % error) + print('Exiting') + sys.exit(1) if __name__ == '__main__': diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index e2c56c01c..8ee402676 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -692,7 +692,11 @@ class coredump_generator: files = 
self.reg_files fname = next(filter(lambda x: x["id"] == shmid, files))["name"] - f = open(fname, 'rb') + try: + f = open(fname, 'rb') + except FileNotFoundError: + sys.exit('Required file %s not found.' % fname) + f.seek(off) start = vma["start"] From 1d4777e452e4d8776e6c8431d1c5e41ace0e46fe Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 1 Mar 2023 15:49:29 +0000 Subject: [PATCH 155/775] test: add long command-line to coredump test Signed-off-by: Adrian Reber --- test/others/criu-coredump/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/others/criu-coredump/test.sh b/test/others/criu-coredump/test.sh index 9b6e56475..eec2b817f 100755 --- a/test/others/criu-coredump/test.sh +++ b/test/others/criu-coredump/test.sh @@ -5,7 +5,7 @@ set -x source ../env.sh || exit 1 function gen_imgs { - PID=$(../loop) + PID=$(../loop with a very very very very very very very very very very very very long cmdline) if ! $CRIU dump -v4 -o dump.log -D ./ -t "$PID"; then echo "Failed to checkpoint process $PID" cat dump.log From 2982867185644e0ad5347cfad38dda8b4e0dc69b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 15 Jul 2019 17:34:13 +0300 Subject: [PATCH 156/775] pie/restorer: Fix fd leaking on error path Nothing serious since OS will close it anyway but still to be precise. 
Signed-off-by: Cyrill Gorcunov Signed-off-by: Pavel Tikhomirov --- criu/pie/restorer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index efab729e8..5e78e74d4 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1960,6 +1960,7 @@ long __export_restore_task(struct task_restore_args *args) } if (ret != thread_args[i].pid) { pr_err("Unable to create a thread: %ld\n", ret); + sys_close(fd); mutex_unlock(&task_entries_local->last_pid_mutex); goto core_restore_end; } From 42c4be2a924bfe83e27aef7da76144ad1833f785 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Fri, 3 Feb 2023 12:53:00 +0800 Subject: [PATCH 157/775] net: use get_legacy_iptables_bin also on restore Without this we might try to restore iptables legacy images with iptables nft. Signed-off-by: Pavel Tikhomirov --- criu/cr-check.c | 4 ++-- criu/include/util.h | 2 +- criu/net.c | 42 ++++++++++++++++++++++++++++++++++++++---- criu/util.c | 32 +++++++++++++++++--------------- 4 files changed, 58 insertions(+), 22 deletions(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index bcbcf3f2b..e4e590c4d 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1193,7 +1193,7 @@ static int check_ipt_legacy(void) char *ipt_legacy_bin; char *ip6t_legacy_bin; - ipt_legacy_bin = get_legacy_iptables_bin(false); + ipt_legacy_bin = get_legacy_iptables_bin(false, false); if (!ipt_legacy_bin) { pr_warn("Couldn't find iptables version which is using iptables legacy API\n"); return -1; @@ -1204,7 +1204,7 @@ static int check_ipt_legacy(void) if (!kdat.ipv6) return 0; - ip6t_legacy_bin = get_legacy_iptables_bin(true); + ip6t_legacy_bin = get_legacy_iptables_bin(true, false); if (!ip6t_legacy_bin) { pr_warn("Couldn't find ip6tables version which is using iptables legacy API\n"); return -1; diff --git a/criu/include/util.h b/criu/include/util.h index 3a0403113..4b4dfda95 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -384,7 +384,7 @@ static inline void 
print_stack_trace(pid_t pid) extern int mount_detached_fs(const char *fsname); -extern char *get_legacy_iptables_bin(bool ipv6); +extern char *get_legacy_iptables_bin(bool ipv6, bool restore); extern int set_opts_cap_eff(void); diff --git a/criu/net.c b/criu/net.c index 755a48377..230cc7433 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2039,10 +2039,10 @@ static inline int dump_iptables(struct cr_imgset *fds) * and iptables backend is nft to prevent duplicate dumps. */ #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) - iptables_cmd = get_legacy_iptables_bin(false); + iptables_cmd = get_legacy_iptables_bin(false, false); if (kdat.ipv6) - ip6tables_cmd = get_legacy_iptables_bin(true); + ip6tables_cmd = get_legacy_iptables_bin(true, false); #endif if (!iptables_cmd) { @@ -2360,9 +2360,19 @@ static int prepare_xtable_lock(void) static inline int restore_iptables(int pid) { + char *iptables_cmd = "iptables-restore"; + char *ip6tables_cmd = "ip6tables-restore"; + char comm[32]; int ret = -1; struct cr_img *img; +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) + iptables_cmd = get_legacy_iptables_bin(false, true); + + if (kdat.ipv6) + ip6tables_cmd = get_legacy_iptables_bin(true, true); +#endif + img = open_image(CR_FD_IPTABLES, O_RSTR, pid); if (img == NULL) return -1; @@ -2372,7 +2382,19 @@ static inline int restore_iptables(int pid) goto ipt6; } - ret = run_iptables_tool("iptables-restore -w", img_raw_fd(img), -1); + if (!iptables_cmd) { + pr_err("Can't restore iptables dump - no legacy version present\n"); + close_image(img); + return -1; + } + + if (snprintf(comm, sizeof(comm), "%s -w", iptables_cmd) >= sizeof(comm)) { + pr_err("Can't fit '%s -w' to buffer\n", iptables_cmd); + close_image(img); + return -1; + } + + ret = run_iptables_tool(comm, img_raw_fd(img), -1); close_image(img); if (ret) return ret; @@ -2383,7 +2405,19 @@ ipt6: if (empty_image(img)) goto out; - ret = 
run_iptables_tool("ip6tables-restore -w", img_raw_fd(img), -1); + if (!ip6tables_cmd) { + pr_err("Can't restore ip6tables dump - no legacy version present\n"); + close_image(img); + return -1; + } + + if (snprintf(comm, sizeof(comm), "%s -w", ip6tables_cmd) >= sizeof(comm)) { + pr_err("Can't fit '%s -w' to buffer\n", ip6tables_cmd); + close_image(img); + return -1; + } + + ret = run_iptables_tool(comm, img_raw_fd(img), -1); out: close_image(img); diff --git a/criu/util.c b/criu/util.c index 959e60938..db96cf938 100644 --- a/criu/util.c +++ b/criu/util.c @@ -1594,44 +1594,46 @@ err: return ret; } -char *get_legacy_iptables_bin(bool ipv6) +char *get_legacy_iptables_bin(bool ipv6, bool restore) { - static char iptables_bin[2][32]; + static char iptables_bin[2][2][32]; /* 0 - means we don't know yet, * -1 - not present, * 1 - present. */ - static int iptables_present[2] = { 0, 0 }; - char bins[2][2][32] = { { "iptables-save", "iptables-legacy-save" }, - { "ip6tables-save", "ip6tables-legacy-save" } }; + static int iptables_present[2][2] = { { 0, 0 }, { 0, 0 } }; + char bins[2][2][2][32] = { { { "iptables-save", "iptables-legacy-save" }, + { "iptables-restore", "iptables-legacy-restore" } }, + { { "ip6tables-save", "ip6tables-legacy-save" }, + { "ip6tables-restore", "ip6tables-legacy-restore" } } }; int ret; - if (iptables_present[ipv6] == -1) + if (iptables_present[ipv6][restore] == -1) return NULL; - if (iptables_present[ipv6] == 1) - return iptables_bin[ipv6]; + if (iptables_present[ipv6][restore] == 1) + return iptables_bin[ipv6][restore]; - memcpy(iptables_bin[ipv6], bins[ipv6][0], strlen(bins[ipv6][0]) + 1); - ret = is_iptables_nft(iptables_bin[ipv6]); + memcpy(iptables_bin[ipv6][restore], bins[ipv6][restore][0], strlen(bins[ipv6][restore][0]) + 1); + ret = is_iptables_nft(iptables_bin[ipv6][restore]); /* * iptables on host uses nft backend (or not installed), * let's try iptables-legacy */ if (ret < 0 || ret == 1) { - memcpy(iptables_bin[ipv6], bins[ipv6][1], 
strlen(bins[ipv6][1]) + 1); - ret = is_iptables_nft(iptables_bin[ipv6]); + memcpy(iptables_bin[ipv6][restore], bins[ipv6][restore][1], strlen(bins[ipv6][restore][1]) + 1); + ret = is_iptables_nft(iptables_bin[ipv6][restore]); if (ret < 0 || ret == 1) { - iptables_present[ipv6] = -1; + iptables_present[ipv6][restore] = -1; return NULL; } } /* we can come here with iptables-save or iptables-legacy-save */ - iptables_present[ipv6] = 1; + iptables_present[ipv6][restore] = 1; - return iptables_bin[ipv6]; + return iptables_bin[ipv6][restore]; } /* From 3f8e3220baccf91d8ffd98d7248132b12a3067ac Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 7 Mar 2023 11:04:15 +0000 Subject: [PATCH 158/775] CONTRIBUTING.md: document make lint / indent This patch documents how do we use `make lint` and `make indent` and adds a note about their integration with CI. Co-authored-by: Pavel Tikhomirov Signed-off-by: Radostin Stoyanov --- CONTRIBUTING.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 864caf93e..87da08b34 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -60,6 +60,36 @@ When you change the source code, please keep in mind the following code conventi Other conventions can be learned from the source code itself. In short, make sure your new code looks similar to what is already there. +The following command can be used to automatically run a code linter for Python files (flake8), Shell scripts (shellcheck), +text spelling (codespell), and a number of CRIU-specific checks (usage of print macros and EOL whitespace for C files). + +``` + make lint +``` + +In addition, we have adopted a [clang-format configuration file](https://www.kernel.org/doc/Documentation/process/clang-format.rst) +based on the kernel source tree. However, compliance with the clang-format autoformat rules is optional. If the automatic code formatting +results in decreased readability, we may choose to ignore these errors. 
+ +Run the following command to check if your changes are compliant with the clang-format rules: + +``` + make indent +``` + +This command is built upon the `git-clang-format` tool and supports two options `BASE` and `OPTS`. The `BASE` option allows you to +specify a range of commits to check for coding style issues. By default, it is set to `HEAD~1`, so that only the last commit is checked. +If you are developing on top of the criu-dev branch and want to check all your commits for compliance with the clang-format rules, you +can use `BASE=origin/criu-dev`. The `OPTS` option can be used to pass additional options to `git-clang-format`. For example, if you want +to check the last *N* commits for formatting errors, without applying the changes to the codebase you can use the following command. + +``` + make indent OPTS=--diff BASE=HEAD~N +``` + +Note that for pull requests, the "Run code linter" workflow runs these checks for all commits. If a clang-format error is detected +we need to review the suggested changes and decide if they should be fixed before merging. + ## Test your changes CRIU comes with an extensive test suite. To check whether your changes introduce any regressions, run From 4930c98020f5f13d9e02d891c84a7cf9bd765632 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 9 Mar 2021 13:24:20 +0000 Subject: [PATCH 159/775] x86/xsave: Set only used XFEATURE_* in xstate_bv Setting all features supported by the CPU in xstate_bv may bring it into dirty-upper-state as documented in specs, resulting in lower performance. Let's not do this and set only those that have been used by the dumpee. P.S. Of course it has to be a one-liner! 
Fixes: #1171 Signed-off-by: Dmitry Safonov Signed-off-by: Alexander Mikhalitsyn --- criu/arch/x86/crtools.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c index d10e51e48..912a4348b 100644 --- a/criu/arch/x86/crtools.c +++ b/criu/arch/x86/crtools.c @@ -433,7 +433,7 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) #define assign_array(dst, src, e) memcpy(dst.e, (src)->e, sizeof(dst.e)) #define assign_xsave(feature, xsave, member, area) \ do { \ - if (compel_fpu_has_feature(feature)) { \ + if (compel_fpu_has_feature(feature) && (xsave->xstate_bv & (1UL << feature))) { \ uint32_t off = compel_fpu_feature_offset(feature); \ void *to = &area[off]; \ void *from = xsave->member; \ From 156c8da33c53f680f7455f53e9e7d6003427ca87 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 7 Mar 2023 15:33:59 +0800 Subject: [PATCH 160/775] make: disable '-Wdangling-pointer' warning with gcc 12 The patch is similar to what has been done in linux kernel, as this warning effectively prevents us from adding list elements to local list head. 
See https://github.com/torvalds/linux/commit/49beadbd47c2 Else we have: CC criu/mount.o In file included from criu/include/cr_options.h:7, from criu/mount.c:13: In function '__list_add', inlined from 'list_add' at include/common/list.h:41:2, inlined from 'mnt_tree_for_each' at criu/mount.c:1977:2: include/common/list.h:35:19: error: storing the address of local variable 'postpone' in '((struct list_head *)((char *)start + 8))[24].prev' [-Werror=dangling-pointer=] 35 | new->prev = prev; | ~~~~~~~~~~^~~~~~ criu/mount.c: In function 'mnt_tree_for_each': criu/mount.c:1972:19: note: 'postpone' declared here 1972 | LIST_HEAD(postpone); | ^~~~~~~~ Signed-off-by: Pavel Tikhomirov --- Makefile | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Makefile b/Makefile index 24318d692..7d0f2350a 100644 --- a/Makefile +++ b/Makefile @@ -106,6 +106,15 @@ DEFINES += -D_GNU_SOURCE WARNINGS := -Wall -Wformat-security -Wdeclaration-after-statement -Wstrict-prototypes +# -Wdangling-pointer results in false warning when we add a list element to +# local list head variable. It is false positive because before leaving the +# function we always check that local list head variable is empty, thus +# insuring that pointer to it is not dangling anywhere, but gcc can't +# understand it. 
+# Note: There is similar problem with kernel list, where this warning is also +# disabled: https://github.com/torvalds/linux/commit/49beadbd47c2 +WARNINGS += -Wno-dangling-pointer -Wno-unknown-warning-option + CFLAGS-GCOV := --coverage -fno-exceptions -fno-inline -fprofile-update=atomic export CFLAGS-GCOV From 6e681afb69cb2b5222d127341243fa102d3fb7a7 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 28 Feb 2023 14:01:19 +0100 Subject: [PATCH 161/775] net: fail restore if nftables isn't supported but image is present Fixes: e1c4871 ("net: add nftables c/r") Signed-off-by: Alexander Mikhalitsyn --- criu/net.c | 53 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/criu/net.c b/criu/net.c index 230cc7433..78a5d3daf 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2425,29 +2425,18 @@ out: } #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) -static inline int restore_nftables(int pid) +static inline int do_restore_nftables(struct cr_img *img) { - int ret = -1; - struct cr_img *img; + int exit_code = -1; struct nft_ctx *nft; off_t img_data_size; char *buf; - img = open_image(CR_FD_NFTABLES, O_RSTR, pid); - if (img == NULL) - return -1; - if (empty_image(img)) { - /* Backward compatibility */ - pr_info("Skipping nft restore, no image\n"); - ret = 0; - goto image_close_out; - } - if ((img_data_size = img_raw_size(img)) < 0) - goto image_close_out; + goto out; if (read_img_str(img, &buf, img_data_size) < 0) - goto image_close_out; + goto out; nft = nft_ctx_new(NFT_CTX_DEFAULT); if (!nft) @@ -2465,18 +2454,44 @@ static inline int restore_nftables(int pid) #endif goto nft_ctx_free_out; - ret = 0; + exit_code = 0; nft_ctx_free_out: nft_ctx_free(nft); buf_free_out: xfree(buf); +out: + return exit_code; +} +#endif + +static inline int restore_nftables(int pid) +{ + int exit_code = -1; + struct cr_img *img; + + img = open_image(CR_FD_NFTABLES, O_RSTR, pid); + if (img == 
NULL) + return -1; + if (empty_image(img)) { + /* Backward compatibility */ + pr_info("Skipping nft restore, no image\n"); + exit_code = 0; + goto image_close_out; + } + +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) + if (!do_restore_nftables(img)) + exit_code = 0; +#else + pr_err("Unable to restore nftables. CRIU was built without libnftables support\n"); +#endif + image_close_out: close_image(img); - return ret; + return exit_code; } -#endif int read_net_ns_img(void) { @@ -2805,10 +2820,8 @@ static int prepare_net_ns_second_stage(struct ns_id *ns) ret = restore_rule(nsid); if (!ret) ret = restore_iptables(nsid); -#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) if (!ret) ret = restore_nftables(nsid); -#endif } if (!ret) From 529f29891349eadf89e7ff0c96c208b08cadb89e Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Thu, 16 Mar 2023 23:05:46 +0700 Subject: [PATCH 162/775] cgroup-v2: make new field is_threaded optional The new field is_threaded is currently marked as required which causes backward compatibility problem when using newer CRIU version to restore dumped image from older version. This commit makes this field optional and reworks the logic the skip fixing up threaded cgroup controllers if there is no information in dumped image. Signed-off-by: Bui Quang Minh --- criu/cgroup.c | 3 ++- images/cgroup.proto | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index 0c730713a..8243ac6d3 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -930,6 +930,7 @@ static int dump_controllers(CgroupEntry *cg) list_for_each_entry(cur, &cgroups, l) { cg_controller_entry__init(ce); + ce->has_is_threaded = true; ce->is_threaded = cur->is_threaded; ce->cnames = cur->controllers; ce->n_cnames = cur->n_controllers; @@ -2002,7 +2003,7 @@ static int cgroupd(int sk) * process must be in this controller. 
Main thread has been * restored, so this thread is in this controller already. */ - if (!ctrl->is_threaded) + if (!ctrl->has_is_threaded || !ctrl->is_threaded) continue; aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0); diff --git a/images/cgroup.proto b/images/cgroup.proto index 5c7d16c6d..02f226835 100644 --- a/images/cgroup.proto +++ b/images/cgroup.proto @@ -24,7 +24,7 @@ message cgroup_dir_entry { message cg_controller_entry { repeated string cnames = 1; repeated cgroup_dir_entry dirs = 2; - required bool is_threaded = 3; + optional bool is_threaded = 3; } message cg_member_entry { From 69befdde1858cbc1235096f21955ccefa4191b00 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Thu, 16 Mar 2023 23:09:18 +0700 Subject: [PATCH 163/775] cgroup-v2: make new field cg_set optional The new field cg_set is currently marked as required which causes backward compatibility problem when using newer CRIU version to restore dumped image from older version. This commit makes this field optional and reworks the logic to fallback to use cg_set from task_core when it is not in thread_core. 
Signed-off-by: Bui Quang Minh --- criu/cr-dump.c | 2 ++ criu/cr-restore.c | 12 ++++++++---- images/core.proto | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 249c02226..1c1962e8f 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -811,6 +811,7 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item goto err; } + core->thread_core->has_cg_set = true; cg_set = &core->thread_core->cg_set; ret = dump_thread_cgroup(item, cg_set, info, -1); if (ret) @@ -1436,6 +1437,7 @@ static int dump_task_cgroup(struct parasite_ctl *parasite_ctl, const struct pstr return -1; } + core->thread_core->has_cg_set = true; if (dump_thread_cgroup(item, &core->thread_core->cg_set, info, i)) return -1; } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 195fa5639..f02e95f6d 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1354,10 +1354,14 @@ static inline int fork_with_pid(struct pstree_item *item) * Zombie tasks' cgroup is not dumped/restored. 
* cg_set == 0 is skipped in prepare_task_cgroup() */ - if (item->pid->state == TASK_DEAD) + if (item->pid->state == TASK_DEAD) { rsti(item)->cg_set = 0; - else - rsti(item)->cg_set = ca.core->thread_core->cg_set; + } else { + if (ca.core->thread_core->has_cg_set) + rsti(item)->cg_set = ca.core->thread_core->cg_set; + else + rsti(item)->cg_set = ca.core->tc->cg_set; + } if (ca.core->tc->has_stop_signo) item->pid->stop_signo = ca.core->tc->stop_signo; @@ -3824,7 +3828,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr; core_get_tls(tcore, &thread_args[i].tls); - if (rsti(current)->cg_set != tcore->thread_core->cg_set) { + if (tcore->thread_core->has_cg_set && rsti(current)->cg_set != tcore->thread_core->cg_set) { thread_args[i].cg_set = tcore->thread_core->cg_set; thread_args[i].cgroupd_sk = dup(get_service_fd(CGROUPD_SK)); } else { diff --git a/images/core.proto b/images/core.proto index bc8b7a488..eddd1dc55 100644 --- a/images/core.proto +++ b/images/core.proto @@ -106,7 +106,7 @@ message thread_core_entry { optional string comm = 13; optional uint64 blk_sigset_extended = 14; optional rseq_entry rseq_entry = 15; - required uint32 cg_set = 16; + optional uint32 cg_set = 16; } message task_rlimits_entry { From 7cae16e971c7ba9b3c39230c168f728741156994 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 20 Jul 2020 13:32:42 +0300 Subject: [PATCH 164/775] mount: do collect_mntinfo of external mount namespace with no for_dump When we collect external mount namespace we don't want to dump mounts in it, so lets remove this flag. This way we can e.g. use for_dump in ->parse() callbacks to separate in-container mounts from others. This only affects rare case of `--ext-mount-map auto` but to be absolutely correct let's fix it anyway. 
Signed-off-by: Pavel Tikhomirov --- criu/mount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/mount.c b/criu/mount.c index 3369fea34..db9db63b2 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -826,7 +826,7 @@ static struct ns_id *find_ext_ns_id(void) for (ns = ns_ids; ns->next; ns = ns->next) if (ns->type == NS_CRIU && ns->nd == &mnt_ns_desc) { - if (!ns->mnt.mntinfo_list && !collect_mntinfo(ns, true)) + if (!ns->mnt.mntinfo_list && !collect_mntinfo(ns, false)) break; return ns; } From 34e2b022198c08aab83d5f569221ff1b4c2203da Mon Sep 17 00:00:00 2001 From: KKrypt Date: Wed, 15 Mar 2023 12:32:37 +0530 Subject: [PATCH 165/775] Optimized shell code with <'s (instead of cat + |) This patch optimizes shell code as reading a single file as input using a 'cat' command to a program. It is considered to be a Useless Use of Cat (UUOC). It's more efficient to simply use redirection. However, in some cases, even using the redirection operator '<' seems unnecessary. Signed-off-by: KKrypt --- test/jenkins/criu-fault.sh | 2 +- test/others/mem-snap/run-predump-2.sh | 2 +- test/others/mem-snap/run-predump.sh | 2 +- test/others/mem-snap/run-snap-auto-dedup.sh | 2 +- test/others/mem-snap/run-snap-dedup-on-restore.sh | 2 +- test/others/mem-snap/run-snap-dedup.sh | 2 +- test/others/mem-snap/run-snap-maps04.sh | 2 +- test/others/mem-snap/run-snap.sh | 2 +- test/others/mounts/mounts.sh | 2 +- test/others/mounts/run.sh | 4 ++-- test/others/ns_ext/run.sh | 4 ++-- test/others/ns_ext/run_pidns.sh | 4 ++-- test/others/unix-callback/run.sh | 2 +- 13 files changed, 16 insertions(+), 16 deletions(-) diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index f41073230..7f503e817 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -9,7 +9,7 @@ prep ./test/zdtm.py run -t zdtm/static/maps00 --fault 3 --report report -f h || fail # FIXME: fhandles looks broken on btrfs -cat /proc/self/mountinfo | grep -P "/.* / " | grep -q btrfs || 
NOBTRFS=$? +grep -P "/.* / " /proc/self/mountinfo | grep -q btrfs || NOBTRFS=$? if [ $NOBTRFS -eq 1 ] ; then ./test/zdtm.py run -t zdtm/static/inotify_irmap --fault 128 --pre 2 -f uns || fail fi diff --git a/test/others/mem-snap/run-predump-2.sh b/test/others/mem-snap/run-predump-2.sh index 46af8063b..5ef1422b4 100755 --- a/test/others/mem-snap/run-predump-2.sh +++ b/test/others/mem-snap/run-predump-2.sh @@ -28,7 +28,7 @@ function stop_test { wtime=1 cd ../../zdtm/static/ make maps04.stop - cat maps04.out | fgrep PASS || fail "Test failed" + fgrep PASS maps04.out || fail "Test failed" echo "OK" } diff --git a/test/others/mem-snap/run-predump.sh b/test/others/mem-snap/run-predump.sh index d06d2d8fc..06ba74737 100755 --- a/test/others/mem-snap/run-predump.sh +++ b/test/others/mem-snap/run-predump.sh @@ -72,6 +72,6 @@ ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to r cd ../../zdtm/static/ make mem-touch.stop -cat mem-touch.out | fgrep PASS || fail "Test failed" +fgrep PASS mem-touch.out || fail "Test failed" echo "Test PASSED" diff --git a/test/others/mem-snap/run-snap-auto-dedup.sh b/test/others/mem-snap/run-snap-auto-dedup.sh index f77aa1fcb..a3801f5b4 100755 --- a/test/others/mem-snap/run-snap-auto-dedup.sh +++ b/test/others/mem-snap/run-snap-auto-dedup.sh @@ -84,7 +84,7 @@ ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to r cd ../../zdtm/static/ make mem-touch.stop -cat mem-touch.out | fgrep PASS || fail "Test failed" +fgrep PASS mem-touch.out || fail "Test failed" if [[ $dedup_ok_2 -ne 0 || $dedup_ok_1 -ne 0 ]]; then fail "Dedup test failed" diff --git a/test/others/mem-snap/run-snap-dedup-on-restore.sh b/test/others/mem-snap/run-snap-dedup-on-restore.sh index 6ae050bc7..5dbb5bf44 100755 --- a/test/others/mem-snap/run-snap-dedup-on-restore.sh +++ b/test/others/mem-snap/run-snap-dedup-on-restore.sh @@ -78,7 +78,7 @@ fi cd ../../zdtm/static/ make mem-touch.stop -cat mem-touch.out | fgrep PASS || fail "Test 
failed" +fgrep PASS mem-touch.out || fail "Test failed" if [ $restore_dedup_ok -ne 0 ]; then fail "Dedup test failed" diff --git a/test/others/mem-snap/run-snap-dedup.sh b/test/others/mem-snap/run-snap-dedup.sh index 27fcd55a9..40db95325 100755 --- a/test/others/mem-snap/run-snap-dedup.sh +++ b/test/others/mem-snap/run-snap-dedup.sh @@ -90,7 +90,7 @@ ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to r cd ../../zdtm/static/ make mem-touch.stop -cat mem-touch.out | fgrep PASS || fail "Test failed" +fgrep PASS mem-touch.out || fail "Test failed" if [[ $dedup_ok_2 -ne 0 || $dedup_ok_1 -ne 0 ]]; then fail "Dedup test failed" diff --git a/test/others/mem-snap/run-snap-maps04.sh b/test/others/mem-snap/run-snap-maps04.sh index 2def909d9..267d51deb 100755 --- a/test/others/mem-snap/run-snap-maps04.sh +++ b/test/others/mem-snap/run-snap-maps04.sh @@ -58,7 +58,7 @@ ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log --auto-dedup -d -v4 || fa make -C ../../zdtm/static/ maps04.stop sleep 1 -cat "../zdtm/static/maps04.out" | fgrep PASS || fail "Test failed" +fgrep PASS "../zdtm/static/maps04.out" || fail "Test failed" size=$(du -sh -BK dump/1/pages-*.img | grep -Eo '[0-9]+' | head -1) if [ $size -ne 0 ] ; then diff --git a/test/others/mem-snap/run-snap.sh b/test/others/mem-snap/run-snap.sh index b97bd295e..c91cd0098 100755 --- a/test/others/mem-snap/run-snap.sh +++ b/test/others/mem-snap/run-snap.sh @@ -69,6 +69,6 @@ ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to r cd ../../zdtm/static/ make mem-touch.stop -cat mem-touch.out | fgrep PASS || fail "Test failed" +fgrep PASS mem-touch.out || fail "Test failed" echo "Test PASSED" diff --git a/test/others/mounts/mounts.sh b/test/others/mounts/mounts.sh index 19116d0cf..51ea69540 100755 --- a/test/others/mounts/mounts.sh +++ b/test/others/mounts/mounts.sh @@ -12,7 +12,7 @@ cd $INMNTNS mount --make-rprivate / -for i in `cat /proc/self/mounts | awk '{ print $2 }'`; do +for i 
in `awk '{ print $2 }' < /proc/self/mounts`; do [ '/' = "$i" ] && continue [ '/proc' = "$i" ] && continue [ '/dev' = "$i" ] && continue diff --git a/test/others/mounts/run.sh b/test/others/mounts/run.sh index 35927fb5e..d665a726a 100755 --- a/test/others/mounts/run.sh +++ b/test/others/mounts/run.sh @@ -12,12 +12,12 @@ kill -0 $pid || exit cat /proc/$pid/mountinfo | sort -k 4 echo "Suspend server" ${CRIU} dump -D dump -o dump.log -t $pid -v4 || { - cat dump/dump.log | grep Error + grep Error dump/dump.log exit 1 } echo "Resume server" ${CRIU} restore -d -D dump -o restore.log -v4 || { - cat dump/dump.log | grep Error + grep Error dump/dump.log exit 1 } cat /proc/$pid/mountinfo | sort -k 4 diff --git a/test/others/ns_ext/run.sh b/test/others/ns_ext/run.sh index 2e9a6fe86..e416f95e5 100755 --- a/test/others/ns_ext/run.sh +++ b/test/others/ns_ext/run.sh @@ -61,7 +61,7 @@ exec 33< $MNT1 exec 34< $MNT2 $CRIU dump -v4 -t $pid -o dump.log -D images --external $NS[$ino]:test_ns --external $NS[$ino2]:test_ns2 RESULT=$? -cat images/dump.log | grep -B 5 Error || echo ok +grep -B 5 Error images/dump.log || echo ok [ "$RESULT" != "0" ] && { echo "CRIU dump failed" echo FAIL @@ -70,7 +70,7 @@ cat images/dump.log | grep -B 5 Error || echo ok $CRIU restore -v4 -o restore.log -D images --inherit-fd fd[33]:test_ns --inherit-fd fd[34]:test_ns2 -d RESULT=$? -cat images/restore.log | grep -B 5 Error || echo ok +grep -B 5 Error images/restore.log || echo ok [ "$RESULT" != "0" ] && { echo "CRIU restore failed" echo FAIL diff --git a/test/others/ns_ext/run_pidns.sh b/test/others/ns_ext/run_pidns.sh index 7ac855a18..08c5bff8e 100755 --- a/test/others/ns_ext/run_pidns.sh +++ b/test/others/ns_ext/run_pidns.sh @@ -36,7 +36,7 @@ mkdir -p images_pidns echo "$CRIU dump -v4 -o dump.log -t $PID -D images_pidns --external $PIDNS:exti" $CRIU dump -v4 -o dump.log -t $PID -D images_pidns --external $PIDNS:exti RESULT=$? 
-cat images_pidns/dump.log | grep -B 5 Error || echo ok +grep -B 5 Error images_pidns/dump.log || echo ok [ "$RESULT" != "0" ] && { echo "CRIU dump failed" echo FAIL @@ -48,7 +48,7 @@ exec {pidns_fd}< /proc/self/ns/pid echo "$CRIU restore -v4 -o restore.log -D images_pidns --restore-detached --inherit-fd fd[$pidns_fd]:exti" $CRIU restore -v4 -o restore.log -D images_pidns --restore-detached --inherit-fd fd[$pidns_fd]:exti --pidfile test.pidfile RESULT=$? -cat images_pidns/restore.log | grep -B 5 Error || echo ok +grep -B 5 Error images_pidns/restore.log || echo ok [ "$RESULT" != "0" ] && { echo "CRIU restore failed" echo FAIL diff --git a/test/others/unix-callback/run.sh b/test/others/unix-callback/run.sh index ec5b7f54e..b15daa289 100755 --- a/test/others/unix-callback/run.sh +++ b/test/others/unix-callback/run.sh @@ -40,7 +40,7 @@ done ${CRIU} restore -D data -o restore.log -v4 --lib `pwd`/lib -d || exit 1 kill $pid while :; do - cat output | grep PASS && break + grep PASS output && break sleep 1 done From 65407616e0870d8d5d34b934157be500e0dd0661 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 15 Mar 2023 12:20:47 +0000 Subject: [PATCH 166/775] ci/archlinux: initialize machine ID When installing packages within Archlinux container, pacman fails with the following errors: (3/7) Creating temporary files... 
/usr/lib/tmpfiles.d/journal-nocow.conf:26: Failed to replace specifiers in '/var/log/journal/%m': No such file or directory /usr/lib/tmpfiles.d/systemd.conf:23: Failed to replace specifiers in '/run/log/journal/%m': No such file or directory /usr/lib/tmpfiles.d/systemd.conf:25: Failed to replace specifiers in '/run/log/journal/%m': No such file or directory /usr/lib/tmpfiles.d/systemd.conf:26: Failed to replace specifiers in '/run/log/journal/%m/*.journal*': No such file or directory /usr/lib/tmpfiles.d/systemd.conf:29: Failed to replace specifiers in '/var/log/journal/%m': No such file or directory /usr/lib/tmpfiles.d/systemd.conf:30: Failed to replace specifiers in '/var/log/journal/%m/system.journal': No such file or directory /usr/lib/tmpfiles.d/systemd.conf:32: Failed to replace specifiers in '/var/log/journal/%m': No such file or directory /usr/lib/tmpfiles.d/systemd.conf:33: Failed to replace specifiers in '/var/log/journal/%m/system.journal': No such file or directory To solve this problem we need to initialize the machine ID. Signed-off-by: Radostin Stoyanov --- scripts/build/Dockerfile.archlinux | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index d226244ee..ce2a38bd4 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -2,6 +2,9 @@ FROM docker.io/library/archlinux:latest ARG CC=gcc +# Initialize machine ID +RUN systemd-machine-id-setup + RUN pacman -Syu --noconfirm \ $CC \ bash \ From 12423abdb5849c9214a4987221856820aad97f9f Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 22 Mar 2023 15:08:23 +0800 Subject: [PATCH 167/775] mount: allow bindmounts for external fuse mounts Currently we only allow external fuse mount itself, let's allow bindmount for it too. Other mount code is ready for this change and will be able to bindmount it from corresponding external mount. 
Signed-off-by: Pavel Tikhomirov --- criu/filesystems.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/criu/filesystems.c b/criu/filesystems.c index 890d5d06d..093e1c492 100644 --- a/criu/filesystems.c +++ b/criu/filesystems.c @@ -547,7 +547,8 @@ static int fusectl_dump(struct mount_info *pm) } for (it = mntinfo; it; it = it->next) { - if (it->fstype->code == FSTYPE__FUSE && id == kdev_minor(it->s_dev) && !it->external) { + if (it->fstype->code == FSTYPE__FUSE && id == kdev_minor(it->s_dev) && + !mnt_is_external_bind(it)) { pr_err("%s is a fuse mount but not external\n", it->ns_mountpoint); goto out; } From a0158e6927a20a51f5ead8ecfebed9b82b1740c6 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 20 Mar 2023 11:33:37 +0800 Subject: [PATCH 168/775] zdtm: add MNTNS_ZDTM macro to fix initialization With this macro we can easily declare struct mntns_zdtm variables with all lists properly initialized. Let's use it in mount_complex_sharing as without it we can get a segfault on the error path when accessing uninitialized list pointers. 
Signed-off-by: Pavel Tikhomirov --- test/zdtm/lib/mountinfo.h | 8 ++++++++ test/zdtm/static/mount_complex_sharing.c | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/test/zdtm/lib/mountinfo.h b/test/zdtm/lib/mountinfo.h index b5a8f5bcd..6d90e2c10 100644 --- a/test/zdtm/lib/mountinfo.h +++ b/test/zdtm/lib/mountinfo.h @@ -24,6 +24,14 @@ struct mntns_zdtm { struct list_head sharing_groups_list; }; +#define MNTNS_ZDTM_INIT(name) \ + { \ + .mountinfo_list = LIST_HEAD_INIT(name.mountinfo_list), \ + .topology_list = LIST_HEAD_INIT(name.topology_list), \ + .sharing_groups_list = LIST_HEAD_INIT(name.sharing_groups_list), \ + } +#define MNTNS_ZDTM(name) struct mntns_zdtm name = MNTNS_ZDTM_INIT(name) + struct sharing_group { int shared_id; int master_id; diff --git a/test/zdtm/static/mount_complex_sharing.c b/test/zdtm/static/mount_complex_sharing.c index c6402d646..5f247a8e4 100644 --- a/test/zdtm/static/mount_complex_sharing.c +++ b/test/zdtm/static/mount_complex_sharing.c @@ -212,7 +212,8 @@ static int mount_loop(void) int main(int argc, char **argv) { - struct mntns_zdtm mntns_before, mntns_after; + MNTNS_ZDTM(mntns_before); + MNTNS_ZDTM(mntns_after); int ret = 1; test_init(argc, argv); From 66fd45d51d4d74453507848ee4783fb0bbce818a Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 17 Aug 2020 14:27:36 +0300 Subject: [PATCH 169/775] sk-unix: add some missed error printing In Virtuozzo tests we have seen uninformative errors: (26.575039) 151187 fdinfo 6: pos: 0 flags: 2/0 (26.575076) sockets: Searching for socket 0x346d1 family 1 (666.230281 ---------------------------------------- (666.230586 Error (criu/cr-dump.c:1850): Dump files (pid: 151187) failed with -1 So let's add some error messages to this stack. 
Signed-off-by: Pavel Tikhomirov --- criu/sk-unix.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/criu/sk-unix.c b/criu/sk-unix.c index 5c0f57523..66034df65 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -595,12 +595,14 @@ static int unix_resolve_name_old(int lfd, uint32_t id, struct unix_sk_desc *d, U else ns = lookup_ns_by_id(root_item->ids->mnt_ns_id, &mnt_ns_desc); if (!ns) { + pr_err("Failed to lookup ns by mnt id %d\n", ue->mnt_id); ret = -ENOENT; goto out; } mntns_root = mntns_get_root_fd(ns); if (mntns_root < 0) { + pr_err("Failed to lookup mntns root for ns %d\n", ns->id); ret = -ENOENT; goto out; } From bd0f209c2bdaa24c86544f1e71fe4ff41599b70a Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 16 Mar 2023 10:34:54 +0800 Subject: [PATCH 170/775] pstree: improve id intersection detection in prepare_pstree_for_shell_job First, let's move lookup_create_item-s to the end so that on pgid replacement we don't get a false-positive pstree_pid_by_virt check finding the item created by sid replacement. (note: we need those lookup_create_item-s for the sake of the free pid selection mechanism) Second, let's add checks for sid/pgid in images intersecting with current_sid/pgid, as this would also bring problems on restore. 
Signed-off-by: Pavel Tikhomirov --- criu/pstree.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/criu/pstree.c b/criu/pstree.c index 72c4a3502..8c44e7134 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -381,20 +381,26 @@ static int prepare_pstree_for_shell_job(pid_t pid) } for_each_pstree_item(pi) { + if (pi->sid == current_sid) { + pr_err("Current sid %d intersects with sid of (%d) in images\n", current_sid, vpid(pi)); + return -1; + } if (pi->sid == old_sid) pi->sid = current_sid; + if (pi->pgid == current_sid) { + pr_err("Current sid %d intersects with pgid of (%d) in images\n", current_sid, + vpid(pi)); + return -1; + } if (pi->pgid == old_sid) pi->pgid = current_sid; } - - if (lookup_create_item(current_sid) == NULL) - return -1; } /* root_item is a group leader */ if (root_item->pgid == vpid(root_item)) - return 0; + goto add_fake_session_leader; old_gid = root_item->pgid; if (old_gid != current_gid) { @@ -407,14 +413,21 @@ static int prepare_pstree_for_shell_job(pid_t pid) } for_each_pstree_item(pi) { + if (current_gid != current_sid && pi->pgid == current_gid) { + pr_err("Current gid %d intersects with pgid of (%d) in images\n", current_gid, + vpid(pi)); + return -1; + } if (pi->pgid == old_gid) pi->pgid = current_gid; } - - if (lookup_create_item(current_gid) == NULL) - return -1; } + if (old_gid != current_gid && !lookup_create_item(current_gid)) + return -1; +add_fake_session_leader: + if (old_sid != current_sid && !lookup_create_item(current_sid)) + return -1; return 0; } From 7f0f07599a680258092a7790fb3046856f486f2f Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 4 Dec 2022 16:27:09 +0000 Subject: [PATCH 171/775] crit: fix compatibility with Python 3.12 Python 3.12 includes a few breaking changes, such as the removal of the distutils module [1] and the deprecation of `setup.py install` in favour of pip install [2]. 
This patch updates the installation script for crit to reflect these changes by replacing the use of `setup.py install` with `pip install` and `distutils` with `setuptools`. In addition, a minimal pyproject.toml file has been added as it is required by the new version of pip [3]. It is worth noting that with this change we are switching from the egg packaging format to wheel [4] and add pip as a build dependency. [1] https://www.python.org/downloads/release/python-3120a2/ [2] https://github.com/pypa/setuptools/pull/2824 [3] https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/ [4] https://packaging.python.org/en/latest/discussions/wheel-vs-egg/ Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 4 +- .gitignore | 1 - Makefile | 3 +- crit/.gitignore | 2 + crit/pyproject.toml | 2 + crit/setup.py | 29 +++++++++++ lib/Makefile | 12 ++--- scripts/build/Dockerfile.alpine | 1 + scripts/build/Dockerfile.archlinux | 1 + scripts/ci/prepare-for-fedora-rawhide.sh | 2 + scripts/ci/run-ci-tests.sh | 2 +- scripts/ci/vagrant.sh | 2 +- scripts/crit-setup.py | 25 --------- scripts/uninstall_module.py | 65 ++++++++++++++++++++++++ 14 files changed, 114 insertions(+), 37 deletions(-) create mode 100644 crit/.gitignore create mode 100644 crit/pyproject.toml create mode 100644 crit/setup.py delete mode 100644 scripts/crit-setup.py create mode 100755 scripts/uninstall_module.py diff --git a/.cirrus.yml b/.cirrus.yml index 914ceb72c..bd4799fd0 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -36,7 +36,7 @@ task: ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-future python-protobuf python-junit_xml 
python-flake8 xmlto + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-future python-protobuf python-junit_xml python3-importlib-metadata python-flake8 xmlto systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed. # The Cirrus CI user runs as a service from selinux point of view and is @@ -108,7 +108,7 @@ task: yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm || : yum install -y dnf-plugins-core yum config-manager --set-enabled powertools - yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-future python3-protobuf python3-junit_xml xmlto + yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-future python3-protobuf python3-importlib-metadata python3-junit_xml xmlto alternatives --set python /usr/bin/python3 systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed diff --git a/.gitignore b/.gitignore index 23894d631..1ea828bbc 100644 --- a/.gitignore +++ b/.gitignore @@ -38,7 +38,6 @@ criu/pie/parasite-blob.h criu/protobuf-desc-gen.h lib/build/ lib/c/criu.pc -lib/.crit-setup.files compel/include/asm include/common/asm include/common/config.h diff --git a/Makefile b/Makefile index 7d0f2350a..8061a42c4 100644 --- a/Makefile +++ b/Makefile @@ -428,7 +428,8 @@ lint: flake8 --config=scripts/flake8.cfg lib/py/images/pb2dict.py flake8 --config=scripts/flake8.cfg lib/py/images/images.py flake8 --config=scripts/flake8.cfg 
scripts/criu-ns - flake8 --config=scripts/flake8.cfg scripts/crit-setup.py + flake8 --config=scripts/flake8.cfg crit/setup.py + flake8 --config=scripts/flake8.cfg scripts/uninstall_module.py flake8 --config=scripts/flake8.cfg coredump/ shellcheck --version shellcheck scripts/*.sh diff --git a/crit/.gitignore b/crit/.gitignore new file mode 100644 index 000000000..810661179 --- /dev/null +++ b/crit/.gitignore @@ -0,0 +1,2 @@ +crit.egg-info/ +build/ diff --git a/crit/pyproject.toml b/crit/pyproject.toml new file mode 100644 index 000000000..b1e1a4650 --- /dev/null +++ b/crit/pyproject.toml @@ -0,0 +1,2 @@ +[build-system] +requires = ["setuptools"] diff --git a/crit/setup.py b/crit/setup.py new file mode 100644 index 000000000..1aaa73a13 --- /dev/null +++ b/crit/setup.py @@ -0,0 +1,29 @@ +import os +from setuptools import setup, find_packages + + +def get_version(): + version = '0.0.1' + env = os.environ + if 'CRIU_VERSION_MAJOR' in env and 'CRIU_VERSION_MINOR' in env: + version = '{}.{}'.format( + env['CRIU_VERSION_MAJOR'], + env['CRIU_VERSION_MINOR'] + ) + if 'CRIU_VERSION_SUBLEVEL' in env and env['CRIU_VERSION_SUBLEVEL']: + version += '.' 
+ env['CRIU_VERSION_SUBLEVEL'] + return version + + +setup( + name='crit', + version=get_version(), + description='CRiu Image Tool', + author='CRIU team', + author_email='criu@openvz.org', + license='GPLv2', + url='https://github.com/checkpoint-restore/criu', + packages=find_packages('.'), + scripts=['crit'], + install_requires=[], +) diff --git a/lib/Makefile b/lib/Makefile index 575a7bad3..ff540fb75 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -2,10 +2,6 @@ CRIU_SO := libcriu.so CRIU_A := libcriu.a UAPI_HEADERS := lib/c/criu.h images/rpc.proto images/rpc.pb-c.h criu/include/version.h -# -# File to keep track of files installed by setup.py -CRIT_SETUP_FILES := lib/.crit-setup.files - all-y += lib-c lib-a lib-py # @@ -58,8 +54,10 @@ install: lib-c lib-a lib-py crit/crit lib/c/criu.pc.in $(Q) mkdir -p $(DESTDIR)$(LIBDIR)/pkgconfig $(Q) sed -e 's,@version@,$(CRIU_VERSION),' -e 's,@libdir@,$(LIBDIR),' -e 's,@includedir@,$(dir $(INCLUDEDIR)/criu/),' lib/c/criu.pc.in > lib/c/criu.pc $(Q) install -m 644 lib/c/criu.pc $(DESTDIR)$(LIBDIR)/pkgconfig +ifeq ($(PYTHON),python3) $(E) " INSTALL " crit - $(Q) $(PYTHON) scripts/crit-setup.py install --prefix=$(DESTDIR)$(PREFIX) --record $(CRIT_SETUP_FILES) + $(Q) $(PYTHON) -m pip install --upgrade --force-reinstall --prefix=$(DESTDIR)$(PREFIX) ./crit +endif .PHONY: install uninstall: @@ -71,6 +69,8 @@ uninstall: $(Q) $(RM) $(addprefix $(DESTDIR)$(INCLUDEDIR)/criu/,$(notdir $(UAPI_HEADERS))) $(E) " UNINSTALL" pkgconfig/criu.pc $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/pkgconfig/,criu.pc) +ifeq ($(PYTHON),python3) $(E) " UNINSTALL" crit - $(Q) while read -r file; do $(RM) "$$file"; done < $(CRIT_SETUP_FILES) + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit +endif .PHONY: uninstall diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index 19b08315f..af1858ab5 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -40,6 +40,7 @@ RUN apk add \ 
e2fsprogs \ py-yaml \ py3-flake8 \ + py3-importlib-metadata \ asciidoctor # The rpc test cases are running as user #1000, let's add the user diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index ce2a38bd4..f2bce1e5b 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -34,6 +34,7 @@ RUN pacman -Syu --noconfirm \ flake8 \ asciidoctor \ python-junit-xml \ + python-importlib-metadata \ diffutils COPY . /criu diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index f4d3155f9..7c62aaaa2 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -27,6 +27,8 @@ dnf install -y \ python3-future \ python3-protobuf \ python3-junit_xml \ + python3-pip \ + python3-importlib-metadata \ python-unversioned-command \ redhat-rpm-config \ sudo \ diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 5b9f6d929..229de97c1 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -6,7 +6,7 @@ CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor libnl-route-3-dev time flake8 libbsd-dev python3-yaml libperl-dev pkg-config python3-future python3-protobuf - python3-junit.xml) + python3-pip python3-importlib-metadata python3-junit.xml) X86_64_PKGS=(gcc-multilib) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index f0996b01d..5cc842442 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -38,7 +38,7 @@ setup() { ssh default sudo dnf upgrade -y ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ - protobuf-devel python3-flake8 python3-future python3-protobuf \ + protobuf-devel python3-flake8 python3-future python3-protobuf python3-importlib-metadata \ python3-junit_xml 
rubygem-asciidoctor iptables libselinux-devel libbpf-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd diff --git a/scripts/crit-setup.py b/scripts/crit-setup.py deleted file mode 100644 index 13df03e3b..000000000 --- a/scripts/crit-setup.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -from distutils.core import setup - -criu_version = "0.0.1" -env = os.environ - -if 'CRIU_VERSION_MAJOR' in env and 'CRIU_VERSION_MINOR' in env: - criu_version = '{}.{}'.format( - env['CRIU_VERSION_MAJOR'], - env['CRIU_VERSION_MINOR'] - ) - - if 'CRIU_VERSION_SUBLEVEL' in env and env['CRIU_VERSION_SUBLEVEL']: - criu_version += '.' + env['CRIU_VERSION_SUBLEVEL'] - -setup(name="crit", - version=criu_version, - description="CRiu Image Tool", - author="CRIU team", - author_email="criu@openvz.org", - license="GPLv2", - url="https://github.com/checkpoint-restore/criu", - package_dir={'pycriu': 'lib/py'}, - packages=["pycriu", "pycriu.images"], - scripts=["crit/crit"]) diff --git a/scripts/uninstall_module.py b/scripts/uninstall_module.py new file mode 100755 index 000000000..439fca18a --- /dev/null +++ b/scripts/uninstall_module.py @@ -0,0 +1,65 @@ +#!/usr/bin/python3 +""" +`pip uninstall` doesn't support `--prefix`. +https://github.com/pypa/pip/issues/11213 +""" +import argparse +import os +import shutil +import site +import subprocess +import sys + +import importlib_metadata + + +def add_site_dir(prefix: str): + """ + Add site directory with prefix to sys.path and update PYTHONPATH. + """ + # If prefix is used, we need to make sure that we + # do not uninstall other packages from the system paths. + sys.path = [] + site.PREFIXES = [prefix] + pkgs = site.getsitepackages() + for path in pkgs: + site.addsitedir(path) + if 'dist-packages' in path: + # Ubuntu / Debian might use both dist- and site- packages. 
+ site.addsitedir(path.replace('dist-packages', 'site-packages')) + os.environ['PYTHONPATH'] = os.pathsep.join(sys.path) + + +def uninstall_module(package_name: str, prefix=None): + """ + Enable support for '--prefix' with 'pip uninstall'. + """ + dist_info_path = None + if prefix: + add_site_dir(prefix) + try: + dist_info_path = str(importlib_metadata.distribution(package_name)._path) + except importlib_metadata.PackageNotFoundError: + print(f"Skipping {package_name} as it is not installed.") + sys.exit(0) + + command = [sys.executable, '-m', 'pip', 'uninstall', '-y', package_name] + try: + subprocess.check_call(command, env=os.environ) + if dist_info_path and os.path.isdir(dist_info_path): + # .dist-info files are not cleaned up when the package + # has been installed with --prefix. + # https://github.com/pypa/pip/issues/5573 + shutil.rmtree(dist_info_path) + if 'dist-packages' in dist_info_path: + shutil.rmtree(dist_info_path.replace('dist-packages', 'site-packages')) + except subprocess.CalledProcessError as err: + print(f'Error uninstalling package {package_name}: {err}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('module_name', help='The name of the module to uninstall') + parser.add_argument('--prefix', help='The prefix where the module was installed') + args = parser.parse_args() + uninstall_module(args.module_name, args.prefix) From 8e6fa9c3b9cd5575c5e907314a8dbf8afbd26a57 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 21 Jun 2018 13:09:18 +0300 Subject: [PATCH 172/775] net: Add net log prefix For better logging. 
Signed-off-by: Cyrill Gorcunov Signed-off-by: Pavel Tikhomirov --- criu/net.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/criu/net.c b/criu/net.c index 78a5d3daf..2793b18e6 100644 --- a/criu/net.c +++ b/criu/net.c @@ -51,6 +51,9 @@ #include "images/netdev.pb-c.h" #include "images/inventory.pb-c.h" +#undef LOG_PREFIX +#define LOG_PREFIX "net: " + #ifndef IFLA_NEW_IFINDEX #define IFLA_NEW_IFINDEX 49 #endif From ccc790d5404efceee387bde2ab76e18419a23516 Mon Sep 17 00:00:00 2001 From: Yuriy Vasiliev Date: Thu, 2 Dec 2021 11:32:01 +0100 Subject: [PATCH 173/775] zdtm/lib: fix cwd path freeing Fix cwd freeing on error path in get_cwd_check_perm and on non-error-path in unix_fill_sock_name. v2: use cleanup_free attribute in unix_fill_sock_name Signed-off-by: Yuriy Vasiliev Signed-off-by: Pavel Tikhomirov --- test/zdtm/lib/fs.c | 1 + test/zdtm/lib/unix.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/test/zdtm/lib/fs.c b/test/zdtm/lib/fs.c index 7b8be5f9f..bf8cd9cd3 100644 --- a/test/zdtm/lib/fs.c +++ b/test/zdtm/lib/fs.c @@ -108,6 +108,7 @@ int get_cwd_check_perm(char **result) "Bit 'x' should be set in all path components of " "this directory\n", cwd, getuid(), getgid(), errno, strerror(errno)); + free(cwd); return -1; } diff --git a/test/zdtm/lib/unix.c b/test/zdtm/lib/unix.c index 49773dedd..288f1df24 100644 --- a/test/zdtm/lib/unix.c +++ b/test/zdtm/lib/unix.c @@ -5,7 +5,7 @@ int unix_fill_sock_name(struct sockaddr_un *name, char *relFilename) { - char *cwd; + cleanup_free char *cwd = NULL; if (get_cwd_check_perm(&cwd)) { pr_err("failed to get current working directory with valid permissions.\n"); From d93409cf119c98fbc17266b5cc76647817664f1a Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 28 Mar 2023 11:59:49 +0800 Subject: [PATCH 174/775] sk-unix: remove bogus xfree from unix_resolve_name_old It is strange to free a pointer which is already in unix_sk_desc, either on error path or on skip as we leave freed pointer in desc and 
it can probably be used after free later and lead to some corruption. So I would prefer not to free it as we don't have full control over it here. Fixes: 6d785e6cd ("unix: resolve a socket file when a socket descriptor is available") Signed-off-by: Pavel Tikhomirov --- criu/sk-unix.c | 1 - 1 file changed, 1 deletion(-) diff --git a/criu/sk-unix.c b/criu/sk-unix.c index 66034df65..f1105cb75 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -646,7 +646,6 @@ postprone: return 0; out: - xfree(name); return ret; skip: ret = 1; From de39bd2bd10a6f1ee319ab74c9d805e4fd5308bd Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 28 Mar 2023 12:09:38 +0800 Subject: [PATCH 175/775] sk-unix: simplify error handling in unix_resolve_name_old As we now don't have any calls to free in this function we can replace all labels with explicit returns. While on it: Replace useless -errno and 1 returns with -1 as from the very first implementation of unix_resolve_name (it changed name to _old later) in [1] any non-zero return was treated as error. 
6d785e6cd ("unix: resolve a socket file when a socket descriptor is available") [1] Signed-off-by: Pavel Tikhomirov --- criu/sk-unix.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/criu/sk-unix.c b/criu/sk-unix.c index f1105cb75..4bcc95182 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -596,15 +596,13 @@ static int unix_resolve_name_old(int lfd, uint32_t id, struct unix_sk_desc *d, U ns = lookup_ns_by_id(root_item->ids->mnt_ns_id, &mnt_ns_desc); if (!ns) { pr_err("Failed to lookup ns by mnt id %d\n", ue->mnt_id); - ret = -ENOENT; - goto out; + return -1; } mntns_root = mntns_get_root_fd(ns); if (mntns_root < 0) { pr_err("Failed to lookup mntns root for ns %d\n", ns->id); - ret = -ENOENT; - goto out; + return -1; } if (name[0] != '/') { @@ -615,16 +613,15 @@ static int unix_resolve_name_old(int lfd, uint32_t id, struct unix_sk_desc *d, U ret = resolve_rel_name(id, d, p, &ue->name_dir); if (ret < 0) - goto out; - goto postprone; + return -1; + return 0; } snprintf(rpath, sizeof(rpath), ".%s", name); if (fstatat(mntns_root, rpath, &st, 0)) { if (errno != ENOENT) { - pr_warn("Can't stat socket %#" PRIx32 "(%s), skipping: %s (err %d)\n", id, rpath, - strerror(errno), errno); - goto skip; + pr_perror("Can't stat socket %#" PRIx32 "(%s)", id, rpath); + return -1; } pr_info("unix: Dropping path %s for unlinked sk %#x\n", name, id); @@ -642,14 +639,7 @@ static int unix_resolve_name_old(int lfd, uint32_t id, struct unix_sk_desc *d, U d->deleted = deleted; -postprone: return 0; - -out: - return ret; -skip: - ret = 1; - goto out; } static int unix_resolve_name(int lfd, uint32_t id, struct unix_sk_desc *d, UnixSkEntry *ue, const struct fd_parms *p) From 5c8cdceec2d2c1fcf1814bcafb3fe264403c8cf3 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 28 Mar 2023 12:36:45 +0800 Subject: [PATCH 176/775] sk-unix: rework unix_resolve_name - use exit_code instead of returning ret - replace -errno return with -1 - move fallback to if 
(!kdat.sk_unix_file) - fix readlinkat error checking (ret < 0 && ret >= PATH_MAX) by using read_fd_link helper Signed-off-by: Pavel Tikhomirov --- criu/sk-unix.c | 69 +++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 38 deletions(-) diff --git a/criu/sk-unix.c b/criu/sk-unix.c index 4bcc95182..841152643 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -645,78 +645,71 @@ static int unix_resolve_name_old(int lfd, uint32_t id, struct unix_sk_desc *d, U static int unix_resolve_name(int lfd, uint32_t id, struct unix_sk_desc *d, UnixSkEntry *ue, const struct fd_parms *p) { char *name = d->name; - char path[PATH_MAX], tmp[PATH_MAX]; + char path[PATH_MAX]; struct stat st; - int fd, proc_fd, mnt_id, ret; + int fd, ret; + int exit_code = -1; if (d->namelen == 0 || name[0] == '\0') return 0; - if (kdat.sk_unix_file && (root_ns_mask & CLONE_NEWNS)) { - if (get_mnt_id(lfd, &mnt_id)) + if (!kdat.sk_unix_file) { + pr_warn("Trying to resolve unix socket with obsolete method\n"); + if (unix_resolve_name_old(lfd, id, d, ue, p)) { + pr_err("Unable to resolve unix socket name with obsolete method. 
" + "Try a linux kernel newer than 4.10\n"); return -1; - ue->mnt_id = mnt_id; - ue->has_mnt_id = true; + } + return 0; } fd = ioctl(lfd, SIOCUNIXFILE); if (fd < 0) { - pr_warn("Unable to get a socket file descriptor with SIOCUNIXFILE ioctl: %s\n", strerror(errno)); - goto fallback; + pr_perror("Unable to get a socket file descriptor with SIOCUNIXFILE ioctl"); + return -1; } - ret = fstat(fd, &st); - if (ret) { + if (root_ns_mask & CLONE_NEWNS) { + struct fdinfo_common fdinfo = { .mnt_id = -1 }; + + if (parse_fdinfo(fd, FD_TYPES__UND, &fdinfo)) + goto out; + + ue->mnt_id = fdinfo.mnt_id; + ue->has_mnt_id = true; + } + + if (fstat(fd, &st)) { pr_perror("Unable to fstat socket fd"); - return -1; + goto out; } d->mode = st.st_mode; d->uid = st.st_uid; d->gid = st.st_gid; - proc_fd = get_service_fd(PROC_FD_OFF); - if (proc_fd < 0) { - pr_err("Unable to get service fd for proc\n"); - return -1; - } - - snprintf(tmp, sizeof(tmp), "self/fd/%d", fd); - ret = readlinkat(proc_fd, tmp, path, PATH_MAX); - if (ret < 0 && ret >= PATH_MAX) { - pr_perror("Unable to readlink %s", tmp); + ret = read_fd_link(fd, path, sizeof(path)); + if (ret < 0) goto out; - } - path[ret] = 0; d->deleted = strip_deleted(path, ret); if (name[0] != '/') { - ret = cut_path_ending(path, name); - if (ret) { - pr_err("Unable too resolve %s from %s\n", name, path); + if (cut_path_ending(path, name)) { + pr_err("Unable too cut %s from %s\n", name, path); goto out; } ue->name_dir = xstrdup(path); - if (!ue->name_dir) { - ret = -ENOMEM; + if (!ue->name_dir) goto out; - } pr_debug("Resolved socket relative name %s to %s/%s\n", name, ue->name_dir, name); } - ret = 0; + exit_code = 0; out: close(fd); - return ret; - -fallback: - pr_warn("Trying to resolve unix socket with obsolete method\n"); - ret = unix_resolve_name_old(lfd, id, d, ue, p); - if (ret < 0) - pr_err("Unable to resolve unix socket name with obsolete method. 
Try a linux kernel newer than 4.10\n"); - return ret; + return exit_code; } /* From 0c52399322429d430db14061879e860471c3ab15 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 12 Mar 2023 11:56:22 +0000 Subject: [PATCH 177/775] ci: cancel preceding workflows run This patch adds concurrency groups to the CI workflows to automatically cancel any in-progress workflows when a pull request has been updated. A `concurrency` group ensures that only a single job or workflow runs at a time. For example, when a pull request is updated with a force-push, the GitHub CI workflows currently in-progress will be automatically cancelled, and the CI would run only with the updated commits. https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#concurrency Signed-off-by: Radostin Stoyanov --- .github/workflows/alpine-test.yml | 5 +++++ .github/workflows/archlinux-test.yml | 5 +++++ .github/workflows/codeql.yml | 5 +++++ .github/workflows/compat-test.yml | 5 +++++ .github/workflows/cross-compile.yml | 5 +++++ .github/workflows/docker-test.yml | 5 +++++ .github/workflows/fedora-asan-test.yml | 5 +++++ .github/workflows/fedora-rawhide-test.yml | 5 +++++ .github/workflows/gcov-test.yml | 5 +++++ .github/workflows/java-test.yml | 5 +++++ .github/workflows/lint.yml | 5 +++++ .github/workflows/podman-test.yml | 5 +++++ .github/workflows/stream-test.yml | 5 +++++ .github/workflows/x86-64-clang-test.yml | 5 +++++ .github/workflows/x86-64-gcc-test.yml | 5 +++++ 15 files changed, 75 insertions(+) diff --git a/.github/workflows/alpine-test.yml b/.github/workflows/alpine-test.yml index 6fc546ff5..06f466c51 100644 --- a/.github/workflows/alpine-test.yml +++ b/.github/workflows/alpine-test.yml @@ -2,6 +2,11 @@ name: Alpine Test on: [push, pull_request] +# Cancel any preceding run on the pull request. 
+concurrency: + group: alpine-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: runs-on: ubuntu-20.04 diff --git a/.github/workflows/archlinux-test.yml b/.github/workflows/archlinux-test.yml index bb98623a8..328cc9d0f 100644 --- a/.github/workflows/archlinux-test.yml +++ b/.github/workflows/archlinux-test.yml @@ -2,6 +2,11 @@ name: Arch Linux Test on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: archlinux-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: runs-on: ubuntu-20.04 diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 2d1039a0e..518d9b8ae 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -8,6 +8,11 @@ on: schedule: - cron: "11 6 * * 3" +# Cancel any preceding run on the pull request. +concurrency: + group: codeql-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: analyze: name: Analyze diff --git a/.github/workflows/compat-test.yml b/.github/workflows/compat-test.yml index 5ae25fb73..79f8f0010 100644 --- a/.github/workflows/compat-test.yml +++ b/.github/workflows/compat-test.yml @@ -2,6 +2,11 @@ name: Compat Tests on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: compat-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: runs-on: ubuntu-20.04 diff --git a/.github/workflows/cross-compile.yml b/.github/workflows/cross-compile.yml index be8e7f09c..4da5d397c 100644 --- a/.github/workflows/cross-compile.yml +++ b/.github/workflows/cross-compile.yml @@ -2,6 +2,11 @@ name: Cross Compile Tests on: [push, pull_request] +# Cancel any preceding run on the pull request. 
+concurrency: + group: cross-compile-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: diff --git a/.github/workflows/docker-test.yml b/.github/workflows/docker-test.yml index 564691449..fabf399fd 100644 --- a/.github/workflows/docker-test.yml +++ b/.github/workflows/docker-test.yml @@ -2,6 +2,11 @@ name: Docker Test on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: docker-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: runs-on: ${{ matrix.os }} diff --git a/.github/workflows/fedora-asan-test.yml b/.github/workflows/fedora-asan-test.yml index 44b0f16d6..8b1bfcf32 100644 --- a/.github/workflows/fedora-asan-test.yml +++ b/.github/workflows/fedora-asan-test.yml @@ -2,6 +2,11 @@ name: Fedora ASAN Test on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: fedora-asan-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: runs-on: ubuntu-20.04 diff --git a/.github/workflows/fedora-rawhide-test.yml b/.github/workflows/fedora-rawhide-test.yml index b6d94d23e..5355aa192 100644 --- a/.github/workflows/fedora-rawhide-test.yml +++ b/.github/workflows/fedora-rawhide-test.yml @@ -2,6 +2,11 @@ name: Fedora Rawhide Test on: [push, pull_request] +# Cancel any preceding run on the pull request. 
+concurrency: + group: fedora-rawhide-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: runs-on: ubuntu-20.04 diff --git a/.github/workflows/gcov-test.yml b/.github/workflows/gcov-test.yml index f782c5b9d..fcab47837 100644 --- a/.github/workflows/gcov-test.yml +++ b/.github/workflows/gcov-test.yml @@ -2,6 +2,11 @@ name: Coverage Tests on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: gcov-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: runs-on: ubuntu-20.04 diff --git a/.github/workflows/java-test.yml b/.github/workflows/java-test.yml index 211953495..abed793bf 100644 --- a/.github/workflows/java-test.yml +++ b/.github/workflows/java-test.yml @@ -2,6 +2,11 @@ name: Java Test on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: java-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: runs-on: ubuntu-20.04 diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index a501af30e..e18f921f3 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -2,6 +2,11 @@ name: Run code linter on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: lint-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: runs-on: ubuntu-latest diff --git a/.github/workflows/podman-test.yml b/.github/workflows/podman-test.yml index 447cbf0b6..a7013a216 100644 --- a/.github/workflows/podman-test.yml +++ b/.github/workflows/podman-test.yml @@ -2,6 +2,11 @@ name: Podman Test on: [push, pull_request] +# Cancel any preceding run on the pull request. 
+concurrency: + group: podman-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: runs-on: ubuntu-20.04 diff --git a/.github/workflows/stream-test.yml b/.github/workflows/stream-test.yml index ecdd81e0a..0f5b307db 100644 --- a/.github/workflows/stream-test.yml +++ b/.github/workflows/stream-test.yml @@ -2,6 +2,11 @@ name: CRIU Image Streamer Test on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: stream-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: runs-on: ubuntu-20.04 diff --git a/.github/workflows/x86-64-clang-test.yml b/.github/workflows/x86-64-clang-test.yml index e6e84ef52..b3b50829a 100644 --- a/.github/workflows/x86-64-clang-test.yml +++ b/.github/workflows/x86-64-clang-test.yml @@ -2,6 +2,11 @@ name: X86_64 CLANG Test on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: clang-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: runs-on: ubuntu-20.04 diff --git a/.github/workflows/x86-64-gcc-test.yml b/.github/workflows/x86-64-gcc-test.yml index b8b81ef15..ec70b61fb 100644 --- a/.github/workflows/x86-64-gcc-test.yml +++ b/.github/workflows/x86-64-gcc-test.yml @@ -2,6 +2,11 @@ name: X86_64 GCC Test on: [push, pull_request] +# Cancel any preceding run on the pull request. 
+concurrency: + group: gcc-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: runs-on: ubuntu-20.04 From 9683097f27b2b93c3429dea11d39dc74b8a64ffb Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Wed, 29 Mar 2023 18:27:16 +0200 Subject: [PATCH 178/775] zdtm: don't ignore rseq_cs mismatch in rseq01 test Kernel shouldn't clean up rseq_cs inside a critical section. If rseq_cs has been cleaned up, it means there is a bug in migration. Signed-off-by: Michal Clapinski --- test/zdtm/transition/rseq01.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/test/zdtm/transition/rseq01.c b/test/zdtm/transition/rseq01.c index b6d470785..7247b69c1 100644 --- a/test/zdtm/transition/rseq01.c +++ b/test/zdtm/transition/rseq01.c @@ -119,7 +119,7 @@ static void check_thread(void) #define rseq_after_asm_goto() asm volatile("" : : : "memory") -static int rseq_addv(intptr_t *v, intptr_t count, int cpu) +static int rseq_addv(intptr_t *v, intptr_t count, int cpu, bool ignore_abort) { double a = 10000000000000000.0; double b = -1; @@ -193,6 +193,8 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu) abort: rseq_after_asm_goto(); test_msg("abort %lx %lx %f %f\n", rseq_cs1, rseq_cs2, a, b); + if (ignore_abort) + return 0; return -1; } @@ -202,6 +204,7 @@ int main(int argc, char *argv[]) int ret; intptr_t *cpu_data; long nr_cpus; + bool ignore_abort = true; rseq_ptr = &__rseq_abi; memset((void *)rseq_ptr, 0, sizeof(struct rseq)); @@ -225,6 +228,7 @@ int main(int argc, char *argv[]) * https://github.com/torvalds/linux/blob/ce522ba9/kernel/rseq.c#L192 */ #ifdef NORESTART + ignore_abort = false; rseq_ptr->flags = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE; #endif @@ -233,13 +237,7 @@ int main(int argc, char *argv[]) while (test_go()) { cpu = RSEQ_ACCESS_ONCE(rseq_ptr->cpu_id_start); - ret = 
rseq_addv(&cpu_data[cpu], 2, cpu); - -/* NORESTART is NOT set */ -#ifndef NORESTART - /* just ignore abort */ - ret = 0; -#endif + ret = rseq_addv(&cpu_data[cpu], 2, cpu, ignore_abort); if (ret) break; From 85e46c44d6d63d94995f42ba797a0561f4077db7 Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Tue, 4 Apr 2023 16:56:53 +0200 Subject: [PATCH 179/775] dump: extend parasite_thread_ctl lifetime to dump_task_thread Signed-off-by: Michal Clapinski --- criu/cr-dump.c | 1 + criu/parasite-syscall.c | 14 ++++---------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 1c1962e8f..b7edd294f 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -906,6 +906,7 @@ static int dump_task_thread(struct parasite_ctl *parasite_ctl, const struct pstr close_image(img); err: + compel_release_thread(tctl); pr_info("----------------------------------------\n"); return ret; } diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index d3541d996..35489634d 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -195,13 +195,13 @@ int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, struct parasit ret = compel_get_thread_regs(tctl, save_task_regs, core); if (ret) { pr_err("Can't obtain regs for thread %d\n", pid); - goto err_rth; + return -1; } ret = compel_arch_fetch_thread_area(tctl); if (ret) { pr_err("Can't obtain thread area of %d\n", pid); - goto err_rth; + return -1; } compel_arch_get_tls_thread(tctl, &args->tls); @@ -211,23 +211,17 @@ int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, struct parasit ret = compel_run_in_thread(tctl, PARASITE_CMD_DUMP_THREAD); if (ret) { pr_err("Can't init thread in parasite %d\n", pid); - goto err_rth; + return -1; } ret = alloc_groups_copy_creds(creds, pc); if (ret) { pr_err("Can't copy creds for thread %d\n", pid); - goto err_rth; + return -1; } - compel_release_thread(tctl); - tid->ns[0].virt = args->tid; return dump_thread_core(pid, core, args); 
- -err_rth: - compel_release_thread(tctl); - return -1; } int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *item) From 78c4e2c0f7cd179fee1be10f3a58ba182fb58179 Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Tue, 4 Apr 2023 17:04:58 +0200 Subject: [PATCH 180/775] cr-dump: move rseq functions before dump_task_thread Signed-off-by: Michal Clapinski --- criu/cr-dump.c | 126 ++++++++++++++++++++++++------------------------- 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index b7edd294f..83a44d157 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -875,6 +875,69 @@ static int collect_file_locks(void) return parse_file_locks(); } +static bool task_in_rseq(struct criu_rseq_cs *rseq_cs, uint64_t addr) +{ + return addr >= rseq_cs->start_ip && addr < rseq_cs->start_ip + rseq_cs->post_commit_offset; +} + +static int fixup_thread_rseq(struct pstree_item *item, int i) +{ + CoreEntry *core = item->core[i]; + struct criu_rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; + pid_t tid = item->threads[i].real; + + /* equivalent to (struct rseq)->rseq_cs is NULL */ + if (!rseq_cs->start_ip) + return 0; + + pr_debug( + "fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n", + tid, rseq_cs->start_ip, rseq_cs->abort_ip, rseq_cs->post_commit_offset, rseq_cs->flags, + rseq_cs->version, (unsigned long)TI_IP(core)); + + if (rseq_cs->version != 0) { + pr_err("unsupported RSEQ ABI version = %d\n", rseq_cs->version); + return -1; + } + + if (task_in_rseq(rseq_cs, TI_IP(core))) { + struct pid *tid = &item->threads[i]; + + /* + * We need to fixup task instruction pointer from + * the original one (which lays inside rseq critical section) + * to rseq abort handler address. But we need to look on rseq_cs->flags + * (please refer to struct rseq -> flags field description). + * Naive idea of flags support may be like... 
let's change instruction pointer (IP) + * to rseq_cs->abort_ip if !(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL). + * But unfortunately, it doesn't work properly, because the kernel does + * clean up of rseq_cs field in the struct rseq (modifies userspace memory). + * So, we need to preserve original value of (struct rseq)->rseq_cs field in the + * image and restore it's value before releasing threads (see restore_rseq_cs()). + * + * It's worth to mention that we need to fixup IP in CoreEntry + * (used when full dump/restore is performed) and also in + * the parasite regs storage (used if --leave-running option is used, + * or if dump error occurred and process execution is resumed). + */ + + if (!(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL)) { + pr_warn("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n", + tid->real); + + TI_IP(core) = rseq_cs->abort_ip; + + if (item->pid->real == tid->real) { + compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip); + } else { + compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip); + } + } + } + + return 0; +} + static int dump_task_thread(struct parasite_ctl *parasite_ctl, const struct pstree_item *item, int id) { struct parasite_thread_ctl *tctl = dmpi(item)->thread_ctls[id]; @@ -1184,69 +1247,6 @@ free_rseq: return -1; } -static bool task_in_rseq(struct criu_rseq_cs *rseq_cs, uint64_t addr) -{ - return addr >= rseq_cs->start_ip && addr < rseq_cs->start_ip + rseq_cs->post_commit_offset; -} - -static int fixup_thread_rseq(struct pstree_item *item, int i) -{ - CoreEntry *core = item->core[i]; - struct criu_rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; - pid_t tid = item->threads[i].real; - - /* equivalent to (struct rseq)->rseq_cs is NULL */ - if (!rseq_cs->start_ip) - return 0; - - pr_debug( - "fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n", - tid, rseq_cs->start_ip, 
rseq_cs->abort_ip, rseq_cs->post_commit_offset, rseq_cs->flags, - rseq_cs->version, (unsigned long)TI_IP(core)); - - if (rseq_cs->version != 0) { - pr_err("unsupported RSEQ ABI version = %d\n", rseq_cs->version); - return -1; - } - - if (task_in_rseq(rseq_cs, TI_IP(core))) { - struct pid *tid = &item->threads[i]; - - /* - * We need to fixup task instruction pointer from - * the original one (which lays inside rseq critical section) - * to rseq abort handler address. But we need to look on rseq_cs->flags - * (please refer to struct rseq -> flags field description). - * Naive idea of flags support may be like... let's change instruction pointer (IP) - * to rseq_cs->abort_ip if !(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL). - * But unfortunately, it doesn't work properly, because the kernel does - * clean up of rseq_cs field in the struct rseq (modifies userspace memory). - * So, we need to preserve original value of (struct rseq)->rseq_cs field in the - * image and restore it's value before releasing threads (see restore_rseq_cs()). - * - * It's worth to mention that we need to fixup IP in CoreEntry - * (used when full dump/restore is performed) and also in - * the parasite regs storage (used if --leave-running option is used, - * or if dump error occurred and process execution is resumed). - */ - - if (!(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL)) { - pr_warn("The %d task is in rseq critical section. 
IP will be set to rseq abort handler addr\n", - tid->real); - - TI_IP(core) = rseq_cs->abort_ip; - - if (item->pid->real == tid->real) { - compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip); - } else { - compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip); - } - } - } - - return 0; -} - static int fixup_task_rseq(pid_t pid, struct pstree_item *item) { int ret = 0; From f8da250bb35372155c42cde4a132aa309d4a07d3 Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Thu, 30 Mar 2023 19:14:04 +0200 Subject: [PATCH 181/775] cr-dump: properly apply rseq fixup for all threads Previously fixup was done before threads' registers were dumped so it didn't actually work. This commit splits rseq fixup into thread leader fixup and other threads fixup and applies them after the entities are seized. Signed-off-by: Michal Clapinski --- criu/cr-dump.c | 45 ++++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 83a44d157..90d763f49 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -880,12 +880,15 @@ static bool task_in_rseq(struct criu_rseq_cs *rseq_cs, uint64_t addr) return addr >= rseq_cs->start_ip && addr < rseq_cs->start_ip + rseq_cs->post_commit_offset; } -static int fixup_thread_rseq(struct pstree_item *item, int i) +static int fixup_thread_rseq(const struct pstree_item *item, int i) { CoreEntry *core = item->core[i]; struct criu_rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; pid_t tid = item->threads[i].real; + if (!kdat.has_ptrace_get_rseq_conf) + return 0; + /* equivalent to (struct rseq)->rseq_cs is NULL */ if (!rseq_cs->start_ip) return 0; @@ -961,6 +964,12 @@ static int dump_task_thread(struct parasite_ctl *parasite_ctl, const struct pstr core->thread_core->creds->lsm_profile = dmpi(item)->thread_lsms[id]->profile; core->thread_core->creds->lsm_sockcreate = dmpi(item)->thread_lsms[0]->sockcreate; + ret = fixup_thread_rseq(item, id); + if (ret) { 
+ pr_err("Can't fixup rseq for pid %d\n", pid); + goto err; + } + img = open_image(CR_FD_CORE, O_DUMP, tid->ns[0].virt); if (!img) goto err; @@ -1247,32 +1256,11 @@ free_rseq: return -1; } -static int fixup_task_rseq(pid_t pid, struct pstree_item *item) -{ - int ret = 0; - int i; - - if (!kdat.has_ptrace_get_rseq_conf) - return 0; - - for (i = 0; i < item->nr_threads; i++) { - if (fixup_thread_rseq(item, i)) { - ret = -1; - goto exit; - } - } - -exit: - xfree(dmpi(item)->thread_rseq_cs); - dmpi(item)->thread_rseq_cs = NULL; - return ret; -} - static struct proc_pid_stat pps_buf; static int dump_task_threads(struct parasite_ctl *parasite_ctl, const struct pstree_item *item) { - int i; + int i, ret = 0; for (i = 0; i < item->nr_threads; i++) { /* Leader is already dumped */ @@ -1280,11 +1268,14 @@ static int dump_task_threads(struct parasite_ctl *parasite_ctl, const struct pst item->threads[i].ns[0].virt = vpid(item); continue; } - if (dump_task_thread(parasite_ctl, item, i)) - return -1; + ret = dump_task_thread(parasite_ctl, item, i); + if (ret) + break; } - return 0; + xfree(dmpi(item)->thread_rseq_cs); + dmpi(item)->thread_rseq_cs = NULL; + return ret; } /* @@ -1608,7 +1599,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err; } - ret = fixup_task_rseq(pid, item); + ret = fixup_thread_rseq(item, 0); if (ret) { pr_err("Fixup rseq for %d failed %d\n", pid, ret); goto err; From 6c728df1dc71e80aee580256f1e56e4d3dcb042a Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Thu, 30 Mar 2023 17:47:38 +0200 Subject: [PATCH 182/775] zdtm: modify rseq01 to include a thread Testing only the thread group leader is not enough and can hide bugs. 
Signed-off-by: Michal Clapinski --- test/zdtm/transition/Makefile | 2 ++ test/zdtm/transition/rseq01.c | 57 +++++++++++++++++++++++++++++------ 2 files changed, 49 insertions(+), 10 deletions(-) diff --git a/test/zdtm/transition/Makefile b/test/zdtm/transition/Makefile index ab735bdd4..ddf2faaad 100644 --- a/test/zdtm/transition/Makefile +++ b/test/zdtm/transition/Makefile @@ -84,7 +84,9 @@ ptrace: LDFLAGS += -pthread fork2: CFLAGS += -D FORK2 thread-bomb.o: CFLAGS += -pthread thread-bomb: LDFLAGS += -pthread +rseq01: LDLIBS += -pthread rseq02: CFLAGS += -D NORESTART +rseq02: LDLIBS += -pthread %: %.sh cp $< $@ diff --git a/test/zdtm/transition/rseq01.c b/test/zdtm/transition/rseq01.c index 7247b69c1..0fbcc2dca 100644 --- a/test/zdtm/transition/rseq01.c +++ b/test/zdtm/transition/rseq01.c @@ -86,7 +86,7 @@ struct rseq { #endif /* EOF */ -static volatile struct rseq *rseq_ptr; +static __thread volatile struct rseq *rseq_ptr; static __thread volatile struct rseq __rseq_abi; static int sys_rseq(volatile struct rseq *rseq_abi, uint32_t rseq_len, int flags, uint32_t sig) @@ -119,7 +119,7 @@ static void check_thread(void) #define rseq_after_asm_goto() asm volatile("" : : : "memory") -static int rseq_addv(intptr_t *v, intptr_t count, int cpu, bool ignore_abort) +static int rseq_addv(intptr_t *v, intptr_t count, int cpu, bool ignore_abort, const char *id) { double a = 10000000000000000.0; double b = -1; @@ -177,7 +177,7 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu, bool ignore_abort) ); /* clang-format on */ rseq_after_asm_goto(); - test_msg("exit %lx %lx %f %f\n", rseq_cs1, rseq_cs2, a, b); + test_msg("exit %s, %lx %lx %f %f\n", id, rseq_cs1, rseq_cs2, a, b); if (rseq_cs1 != rseq_cs2) { /* * It means that we finished critical section @@ -192,19 +192,45 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu, bool ignore_abort) return 0; abort: rseq_after_asm_goto(); - test_msg("abort %lx %lx %f %f\n", rseq_cs1, rseq_cs2, a, b); + test_msg("abort %s, 
%lx %lx %f %f\n", id, rseq_cs1, rseq_cs2, a, b); if (ignore_abort) return 0; return -1; } +static task_waiter_t waiter; +static intptr_t *cpu_data; +bool ignore_abort = true; +int thread_ret; + +void *thread_routine(void *args) +{ + int cpu; + + rseq_ptr = &__rseq_abi; + memset((void *)rseq_ptr, 0, sizeof(struct rseq)); + register_thread(); + task_waiter_complete(&waiter, 1); + task_waiter_wait4(&waiter, 2); + + while (test_go()) { + cpu = RSEQ_ACCESS_ONCE(rseq_ptr->cpu_id_start); + thread_ret = rseq_addv(&cpu_data[cpu], 2, cpu, ignore_abort, "thread"); + + if (thread_ret) + break; + } + + check_thread(); + return NULL; +} + int main(int argc, char *argv[]) { int cpu = 0; int ret; - intptr_t *cpu_data; long nr_cpus; - bool ignore_abort = true; + pthread_t thread; rseq_ptr = &__rseq_abi; memset((void *)rseq_ptr, 0, sizeof(struct rseq)); @@ -233,21 +259,32 @@ int main(int argc, char *argv[]) RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE; #endif + task_waiter_init(&waiter); + if (pthread_create(&thread, NULL, thread_routine, NULL)) { + fail("pthread_create"); + exit(EXIT_FAILURE); + } + task_waiter_wait4(&waiter, 1); + test_daemon(); + task_waiter_complete(&waiter, 2); while (test_go()) { cpu = RSEQ_ACCESS_ONCE(rseq_ptr->cpu_id_start); - ret = rseq_addv(&cpu_data[cpu], 2, cpu, ignore_abort); + ret = rseq_addv(&cpu_data[cpu], 2, cpu, ignore_abort, "task"); if (ret) break; } - test_waitsig(); - check_thread(); - if (ret) + if (pthread_join(thread, NULL)) { + fail("pthread_join"); + exit(EXIT_FAILURE); + } + + if (ret || thread_ret) fail(); else pass(); From 45e4a6b27452f8f51ff12d4a8def9216ecb00200 Mon Sep 17 00:00:00 2001 From: hdzhoujie Date: Thu, 30 Mar 2023 16:33:20 +0800 Subject: [PATCH 183/775] netlink: fix netlink fd flags dump/restore failed During the restore process, netlink fd uses the flags in the NetlinkSkEntry structure to restore the file state, but during the dump process, the flags values is not saved to the structure. 
Signed-off-by: zhoujie Signed-off-by: hejingxian --- criu/sk-netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c index 754eed932..a219b69be 100644 --- a/criu/sk-netlink.c +++ b/criu/sk-netlink.c @@ -161,7 +161,7 @@ static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p) ne.protocol = val; } - + ne.flags = p->flags; ne.fown = (FownEntry *)&p->fown; ne.opts = &skopts; From 94ac9ee3cc9a53c5200ffc7c79ca7444e10cd09a Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 5 Apr 2023 13:58:18 +0800 Subject: [PATCH 184/775] proc_parse: fix while condition in parse_pid_status In parse_pid_status there are 13 places where we do done++, so when "done" is 13 it means that we have matched each of those 13 places and we are ready to stop. In next lines we are not going to find anything. So the right condition for the while loop is (done < 13). Signed-off-by: Pavel Tikhomirov --- criu/proc_parse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index bcb8256b4..5e96b5c96 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -1043,7 +1043,7 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) if (bfdopenr(&f)) return -1; - while (done < 14) { + while (done < 13) { str = breadline(&f); if (str == NULL) break; From 9b3496043d6df7839944e97cdf660c1633b6a4d4 Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Thu, 13 Apr 2023 16:51:02 +0200 Subject: [PATCH 185/775] log: fix timestamp logging when tv_sec>=100 Previously when tv_sec>=100, the line would look like this: (269.189615 Error [...] Now the last char is overwritten with ')'. 
Signed-off-by: Michal Clapinski --- criu/log.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/criu/log.c b/criu/log.c index 47419313b..89ae8f820 100644 --- a/criu/log.c +++ b/criu/log.c @@ -71,7 +71,8 @@ static void print_ts(void) gettimeofday(&t, NULL); timediff(&start, &t); - snprintf(buffer, TS_BUF_OFF, "(%02u.%06u)", (unsigned)t.tv_sec, (unsigned)t.tv_usec); + snprintf(buffer, TS_BUF_OFF, "(%02u.%06u", (unsigned)t.tv_sec, (unsigned)t.tv_usec); + buffer[TS_BUF_OFF - 2] = ')'; /* this will overwrite the last digit if tv_sec>=100 */ buffer[TS_BUF_OFF - 1] = ' '; /* kill the '\0' produced by snprintf */ } From b689bcc3548b41eb28c2f733b625751e1e189b16 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 13 Apr 2023 11:20:57 +0800 Subject: [PATCH 186/775] cr-check: remove excess kerndat_has_nspid from check_ns_pid We do kerndat_has_nspid in kerndat_init already and save result to kerndat cache, we don't need to recheck it each time. Signed-off-by: Pavel Tikhomirov --- criu/cr-check.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index e4e590c4d..a4166f76b 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1324,9 +1324,6 @@ static int check_pidfd_store(void) static int check_ns_pid(void) { - if (kerndat_has_nspid() < 0) - return -1; - if (!kdat.has_nspid) return -1; From 4c1a2ac41bb80843c927d2fde8f2ff4186f8d278 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 19 Apr 2023 08:00:45 -0700 Subject: [PATCH 187/775] criu: Version 3.18 (Silver Sandpiper) The highlight feature of this release is the ability to use CRIU for non-root users. Adrian Reber implemented the kernel part and created the initial version of CRIU changes. Then Younes Manton joined the effort and pushed it to the finish line. 
The full change log is here: https://criu.org/Download/criu/3.18 Signed-off-by: Andrei Vagin --- Makefile.versions | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.versions b/Makefile.versions index 73bc2d5fa..4c645cd6c 100644 --- a/Makefile.versions +++ b/Makefile.versions @@ -1,10 +1,10 @@ # # CRIU version. CRIU_VERSION_MAJOR := 3 -CRIU_VERSION_MINOR := 17 -CRIU_VERSION_SUBLEVEL := 1 +CRIU_VERSION_MINOR := 18 +CRIU_VERSION_SUBLEVEL := CRIU_VERSION_EXTRA := -CRIU_VERSION_NAME := Radiant Redstart +CRIU_VERSION_NAME := Silver Sandpiper CRIU_VERSION := $(CRIU_VERSION_MAJOR)$(if $(CRIU_VERSION_MINOR),.$(CRIU_VERSION_MINOR))$(if $(CRIU_VERSION_SUBLEVEL),.$(CRIU_VERSION_SUBLEVEL))$(if $(CRIU_VERSION_EXTRA),.$(CRIU_VERSION_EXTRA)) export CRIU_VERSION_MAJOR CRIU_VERSION_MINOR CRIU_VERSION_SUBLEVEL From 04cdbd6106a03fba3299eb99a155c11d800a3969 Mon Sep 17 00:00:00 2001 From: Suraj Shirvankar Date: Wed, 12 Apr 2023 13:38:06 +0000 Subject: [PATCH 188/775] sk-inet: Add IP TOS socket option The TOS (type of service) field in the IP header allows you to specify the priority of the socket data.
Signed-off-by: Suraj Shirvankar --- criu/sk-inet.c | 4 ++++ images/sk-inet.proto | 1 + 2 files changed, 5 insertions(+) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 4bd5abff1..24e92a852 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -416,9 +416,11 @@ static int dump_ip_opts(int sk, int family, int type, int proto, IpOptsEntry *io } else { ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); ret |= dump_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); + ret |= dump_opt(sk, SOL_IP, IP_TOS, &ioe->tos); } ioe->has_freebind = ioe->freebind; ioe->has_pktinfo = !!ioe->pktinfo; + ioe->has_tos = !!ioe->tos; return ret; } @@ -813,6 +815,8 @@ int restore_ip_opts(int sk, int family, int proto, IpOptsEntry *ioe) ret |= restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); if (ioe->has_pktinfo) ret |= restore_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); + if (ioe->has_tos) + ret |= restore_opt(sk, SOL_IP, IP_TOS, &ioe->tos); } if (ioe->raw) diff --git a/images/sk-inet.proto b/images/sk-inet.proto index ee1f0ae41..666326fa4 100644 --- a/images/sk-inet.proto +++ b/images/sk-inet.proto @@ -19,6 +19,7 @@ message ip_opts_entry { optional ip_opts_raw_entry raw = 4; optional bool pktinfo = 5; + optional uint32 tos = 6; } message inet_sk_entry { From 1c0f8787b2621ac8f7f530556aa1c6c97e04562a Mon Sep 17 00:00:00 2001 From: Suraj Shirvankar Date: Thu, 13 Apr 2023 22:27:11 +0200 Subject: [PATCH 189/775] zdtm: Add tests for ip tos restore Signed-off-by: Suraj Shirvankar --- test/zdtm/static/sock_ip_opts00.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/test/zdtm/static/sock_ip_opts00.c b/test/zdtm/static/sock_ip_opts00.c index 08970c0da..d890410d8 100644 --- a/test/zdtm/static/sock_ip_opts00.c +++ b/test/zdtm/static/sock_ip_opts00.c @@ -3,6 +3,7 @@ #include #include +#include #include #include "zdtmtst.h" @@ -19,11 +20,13 @@ const char *test_author = "Pavel Tikhomirov "; struct sk_opt { int level; int opt; + int val; }; struct sk_opt sk_opts_v4[] 
= { - { SOL_IP, IP_FREEBIND }, - { SOL_IP, IP_PKTINFO }, + { SOL_IP, IP_FREEBIND, IP_OPT_VAL }, + { SOL_IP, IP_PKTINFO, IP_OPT_VAL }, + { SOL_IP, IP_TOS, IPTOS_TOS(IPTOS_THROUGHPUT) }, }; #ifndef IPV6_FREEBIND @@ -31,8 +34,8 @@ struct sk_opt sk_opts_v4[] = { #endif struct sk_opt sk_opts_v6[] = { - { SOL_IPV6, IPV6_FREEBIND }, - { SOL_IPV6, IPV6_RECVPKTINFO }, + { SOL_IPV6, IPV6_FREEBIND, IP_OPT_VAL }, + { SOL_IPV6, IPV6_RECVPKTINFO, IP_OPT_VAL }, }; struct sk_conf { @@ -71,7 +74,7 @@ int main(int argc, char **argv) n_opts = sk_confs[i].domain == AF_INET ? ARRAY_SIZE(sk_opts_v4) : ARRAY_SIZE(sk_opts_v6); for (j = 0; j < n_opts; j++) { - val = IP_OPT_VAL; + val = opts[j].val; if (setsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, sizeof(int)) == -1) { pr_perror("setsockopt(%d, %d) failed", opts[j].level, opts[j].opt); goto close; @@ -93,7 +96,7 @@ int main(int argc, char **argv) goto close; } - if (val != IP_OPT_VAL) { + if (val != opts[j].val) { fail("Unexpected value socket(%d,%d,%d) opts(%d,%d)", sk_confs[i].domain, sk_confs[i].type, sk_confs[i].protocol, opts[j].level, opts[j].opt); goto close; From fa6af25e75353a417145d98352a4eaa2523e4854 Mon Sep 17 00:00:00 2001 From: hdzhoujie Date: Tue, 18 Apr 2023 21:03:53 +0800 Subject: [PATCH 190/775] dump: increase fcntl call failure judgment The pipe_size type is unsigned int, so when the fcntl call fails and returns -1, it will cause a negative rollover problem.
Signed-off-by: zhoujie --- criu/page-pipe.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/criu/page-pipe.c b/criu/page-pipe.c index 54dc3ccc4..aab6742be 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -99,6 +99,7 @@ static struct page_pipe_buf *ppb_alloc(struct page_pipe *pp, unsigned int ppb_fl { struct page_pipe_buf *prev = pp_prev_ppb(pp, ppb_flags); struct page_pipe_buf *ppb; + int ppb_size = 0; ppb = xmalloc(sizeof(*ppb)); if (!ppb) @@ -120,7 +121,13 @@ static struct page_pipe_buf *ppb_alloc(struct page_pipe *pp, unsigned int ppb_fl cnt_add(CNT_PAGE_PIPES, 1); ppb->pipe_off = 0; - ppb->pipe_size = fcntl(ppb->p[0], F_GETPIPE_SZ, 0) / PAGE_SIZE; + ppb_size = fcntl(ppb->p[0], F_GETPIPE_SZ, 0); + if (ppb_size < 0) { + xfree(ppb); + pr_perror("Can't get pipe size"); + return NULL; + } + ppb->pipe_size = ppb_size / PAGE_SIZE; pp->nr_pipes++; } From 727d796505df52c4a4922986cefeb22675def353 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 24 Apr 2023 09:28:19 +0200 Subject: [PATCH 191/775] compel: support XSAVE on newer Intel CPUs Newer Intel CPUs (Sapphire Rapids) have a much larger xsave area than before. Looking at older CPUs I see 2440 bytes. # cpuid -1 -l 0xd -s 0 ... bytes required by XSAVE/XRSTOR area = 0x00000988 (2440) On newer CPUs (Sapphire Rapids) it grows to 11008 bytes. # cpuid -1 -l 0xd -s 0 ... bytes required by XSAVE/XRSTOR area = 0x00002b00 (11008) This increases the xsave area from one page to four pages. Without this patch the fpu03 test fails; with this patch it works again.
Signed-off-by: Adrian Reber --- .../arch/x86/src/lib/include/uapi/asm/fpu.h | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h index bd3b0cbd5..8c83dd9ae 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h @@ -21,7 +21,28 @@ #define XSTATE_YMM 0x4 #define FXSAVE_SIZE 512 -#define XSAVE_SIZE 4096 +/* + * This used to be 4096 (one page). There is a comment below concerning + * this size: + * "One page should be enough for the whole xsave state ;-)" + * Which is kind of funny as it is no longer enough ;-) + * + * Older CPUs: + * # cpuid -1 -l 0xd -s 0 + * ... + * bytes required by XSAVE/XRSTOR area = 0x00000988 (2440) + * + * Newer CPUs (Sapphire Rapids): + * # cpuid -1 -l 0xd -s 0 + * ... + * bytes required by XSAVE/XRSTOR area = 0x00002b00 (11008) + * + * So one page is no longer enough... But: + * + * Four pages should be enough for the whole xsave state ;-) + */ + +#define XSAVE_SIZE 4*4096 #define XSAVE_HDR_SIZE 64 #define XSAVE_HDR_OFFSET FXSAVE_SIZE @@ -235,8 +256,11 @@ struct pkru_state { * * * One page should be enough for the whole xsave state ;-) + * + * Of course it was not ;-) Now using four pages... 
+ * */ -#define EXTENDED_STATE_AREA_SIZE (4096 - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct)) +#define EXTENDED_STATE_AREA_SIZE (XSAVE_SIZE - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct)) /* * cpu requires it to be 64 byte aligned From df7b897a22c456796599d3766f65aace818d58ef Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 24 Apr 2023 13:53:41 +0000 Subject: [PATCH 192/775] ci: fix new codespell errors Signed-off-by: Adrian Reber --- .codespellrc | 2 +- compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h | 2 +- compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h | 2 +- criu/include/image.h | 2 +- criu/mem.c | 2 +- criu/mount.c | 2 +- criu/namespaces.c | 2 +- criu/net.c | 2 +- criu/pie/restorer.c | 2 +- include/common/scm.h | 2 +- plugins/amdgpu/README.md | 2 +- plugins/amdgpu/amdgpu_plugin_topology.c | 2 +- test/exhaustive/unix.py | 2 +- test/others/app-emu/java/HelloWorld/run.sh | 2 +- test/others/app-emu/make/run.sh | 2 +- test/zdtm/static/child_opened_proc.c | 2 +- test/zdtm/static/maps00.c | 2 +- test/zdtm/static/mntns_root_bind.c | 2 +- test/zdtm/static/stopped.c | 2 +- test/zdtm/transition/ipc.c | 4 ++-- test/zdtm/transition/lazy-thp.c | 2 +- 21 files changed, 22 insertions(+), 22 deletions(-) diff --git a/.codespellrc b/.codespellrc index 765dacfab..dd31dd851 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,3 +1,3 @@ [codespell] skip = ./.git,./test/pki -ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng +ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h index f8ec55d6c..9152024fd 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h @@ -18,7 +18,7 @@ struct aux_context { struct _aarch64_ctx end; }; -// XXX: the idetifier rt_sigcontext is expected 
to be struct by the CRIU code +// XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code #define rt_sigcontext sigcontext #include diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h index 8cc94ba74..0c4ccb648 100644 --- a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h @@ -14,7 +14,7 @@ */ #include -// XXX: the idetifier rt_sigcontext is expected to be struct by the CRIU code +// XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code #define rt_sigcontext sigcontext #include diff --git a/criu/include/image.h b/criu/include/image.h index 5cb01bde2..9a275565f 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -41,7 +41,7 @@ * implementation and it is specific to every kernel version, * its contents should not be dumped ever * - vdso,vvar - * the vDSO area, it might reqire additional memory + * the vDSO area, it might require additional memory * contents modification especially when tasks are * migrating between different kernel versions * - heap diff --git a/criu/mem.c b/criu/mem.c index ab86a1f6d..9bf7cae97 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -161,7 +161,7 @@ static bool is_stack(struct pstree_item *item, unsigned long vaddr) * put the memory into the page-pipe's pipe. * * "Holes" in page-pipe are regions, that should be dumped, but - * the memory contents is present in the pagent image set. + * the memory contents is present in the parent image set. 
*/ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off, diff --git a/criu/mount.c b/criu/mount.c index db9db63b2..c26aaa58d 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -2823,7 +2823,7 @@ static LIST_HEAD(mnt_remap_list); static int remap_id; struct mnt_remap_entry { - struct mount_info *mi; /* child is remaped into the root yards */ + struct mount_info *mi; /* child is remapped into the root yards */ struct mount_info *parent; /* the origin parent for the child*/ struct list_head node; }; diff --git a/criu/namespaces.c b/criu/namespaces.c index b1b5303fa..b7c0ab400 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -1454,7 +1454,7 @@ int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)) * each other easily. Stream socket require manual * messages boundaries. * - * b) Make callers note the damon death by seeing the + * b) Make callers note the daemon death by seeing the * disconnected socket. In case of dgram socket * callers would just get stuck in receiving the * response. diff --git a/criu/net.c b/criu/net.c index 2793b18e6..84250598c 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3433,7 +3433,7 @@ struct ns_id *net_get_root_ns(void) /* * socket_diag doesn't report unbound and unconnected sockets, - * so we have to get their network namesapces explicitly + * so we have to get their network namespaces explicitly */ struct ns_id *get_socket_ns(int lfd) { diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 5e78e74d4..9873fdc11 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1068,7 +1068,7 @@ static int vma_remap(VmaEntry *vma_entry, int uffd) * |G|----tgt----| | * * 3. remap src to any other place. 
- * G prevents src from being remaped on tgt again + * G prevents src from being remapped on tgt again * | |-------------| -> |+++++src+++++| * |G|---tgt-----| | * diff --git a/include/common/scm.h b/include/common/scm.h index bcb198882..5b6f78a8b 100644 --- a/include/common/scm.h +++ b/include/common/scm.h @@ -11,7 +11,7 @@ * Because of kernel doing kmalloc for user data passed * in SCM messages, and there is kernel's SCM_MAX_FD as a limit * for descriptors passed at once we're trying to reduce - * the pressue on kernel memory manager and use predefined + * the pressure on kernel memory manager and use predefined * known to work well size of the message buffer. */ #define CR_SCM_MSG_SIZE (1024) diff --git a/plugins/amdgpu/README.md b/plugins/amdgpu/README.md index 6809ec8b9..1078eafe6 100644 --- a/plugins/amdgpu/README.md +++ b/plugins/amdgpu/README.md @@ -263,7 +263,7 @@ ROCm | Radeon Open Compute Platform Thunk | User-mode API interface to interact with amdgpu.ko KFD | AMD Kernel Fusion Driver Mesa | Open source OpenGL implementation -GTT | Graphis Translation Table, also used to denote kernel-managed system memory for GPU access +GTT | Graphics Translation Table, also used to denote kernel-managed system memory for GPU access VRAM | Video RAM BO | Buffer Object HMM | Heterogeneous Memory Management diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index 42689933e..6d004247b 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -1241,7 +1241,7 @@ static bool map_devices(struct tp_system *src_sys, struct tp_system *dest_sys, s return true; } else { /* We could not map remaining nodes in the list. Add dest node back - * to list and try to map next dest ndoe in list to current src + * to list and try to map next dest node in list to current src * node. 
*/ pr_debug("Nodes after [0x%04X -> 0x%04X] did not match, " diff --git a/test/exhaustive/unix.py b/test/exhaustive/unix.py index 5b4c972cb..6f72dd44b 100755 --- a/test/exhaustive/unix.py +++ b/test/exhaustive/unix.py @@ -462,7 +462,7 @@ fail_desc = { def chk_real_state(st): - # Before enything else -- check that we still have + # Before anything else -- check that we still have # all the sockets at hands for sk in st.sockets: if not sk.visible: diff --git a/test/others/app-emu/java/HelloWorld/run.sh b/test/others/app-emu/java/HelloWorld/run.sh index 0ed6afd14..e6dcbd9fc 100644 --- a/test/others/app-emu/java/HelloWorld/run.sh +++ b/test/others/app-emu/java/HelloWorld/run.sh @@ -18,7 +18,7 @@ setsid java HelloWorld & pid=${!} -echo Lanuched java application with pid $pid in background +echo Launched java application with pid $pid in background ${criu} dump -D dump -o dump.log -v4 --shell-job -t ${pid} || { echo "Dump failed" diff --git a/test/others/app-emu/make/run.sh b/test/others/app-emu/make/run.sh index 7cb44c770..d871b7d9c 100644 --- a/test/others/app-emu/make/run.sh +++ b/test/others/app-emu/make/run.sh @@ -28,7 +28,7 @@ setsid make -j4 & pid=${!} -echo Lanuched make in $pid background +echo Launched make in $pid background sleep 2 ${criu} dump --shell-job -D dump -o dump.log -v4 -t ${pid} || { diff --git a/test/zdtm/static/child_opened_proc.c b/test/zdtm/static/child_opened_proc.c index 2125cd264..cfe04fa4b 100644 --- a/test/zdtm/static/child_opened_proc.c +++ b/test/zdtm/static/child_opened_proc.c @@ -10,7 +10,7 @@ #include "zdtmtst.h" const char *test_doc = "Check that tree prior to files opening"; -const char *test_author = "Stanislav Kinsbursky Date: Mon, 24 Apr 2023 07:49:57 +0000 Subject: [PATCH 193/775] scripts: make newer versions of shellcheck happy Signed-off-by: Adrian Reber --- scripts/install-debian-pkgs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/install-debian-pkgs.sh b/scripts/install-debian-pkgs.sh index 
540c2c094..8be49c787 100755 --- a/scripts/install-debian-pkgs.sh +++ b/scripts/install-debian-pkgs.sh @@ -15,7 +15,7 @@ function print_help() function process() { sudo apt-get update - sudo apt-get install -yq "$( sed 's/\#.*$//' ${REQ_PKGS} )" + sudo apt-get install -yq "$( sed 's/\#.*$//' "${REQ_PKGS}" )" } if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then From cc607f810342ac27da03a3b64c7c872b33432f28 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 25 Apr 2023 12:40:12 +0800 Subject: [PATCH 194/775] criu-ns: make --pidfile option show pid in caller pidns Using the fact that we know criu_pid and criu is a parent of restored process we can create pidfile with pid on caller pidns level. We need to move mount namespace creation to child so that criu-ns can see caller pidns proc. Signed-off-by: Pavel Tikhomirov --- scripts/criu-ns | 48 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/scripts/criu-ns b/scripts/criu-ns index d51e7772c..0f83ca336 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -6,6 +6,7 @@ import sys import os import fcntl import termios +import time # constants for unshare CLONE_NEWNS = 0x00020000 @@ -110,8 +111,8 @@ def wrap_restore(): if '--restore-sibling' in restore_args: raise OSError(errno.EINVAL, "--restore-sibling is not supported") - # Unshare pid and mount namespaces - if _unshare(CLONE_NEWNS | CLONE_NEWPID) != 0: + # Unshare pid namespace + if _unshare(CLONE_NEWPID) != 0: _errno = ctypes.get_errno() raise OSError(_errno, errno.errorcode[_errno]) @@ -123,8 +124,32 @@ def wrap_restore(): restore_detached = True restore_args.remove('--restore-detached') + restore_pidfile = None + if '--pidfile' in restore_args: + try: + opt_index = restore_args.index('--pidfile') + restore_pidfile = restore_args[opt_index + 1] + del restore_args[opt_index:opt_index + 2] + except (ValueError, IndexError, FileNotFoundError): + raise OSError(errno.ENOENT, "--pidfile missing argument") + + 
if not restore_pidfile.startswith('/'): + for base_dir_opt in ['--work-dir', '-W', '--images-dir', '-D']: + if base_dir_opt in restore_args: + try: + opt_index = restore_args.index(base_dir_opt) + restore_pidfile = os.path.join(restore_args[opt_index + 1], restore_pidfile) + break + except (ValueError, IndexError, FileNotFoundError): + raise OSError(errno.ENOENT, base_dir_opt + " missing argument") + criu_pid = os.fork() if criu_pid == 0: + # Unshare mount namespace + if _unshare(CLONE_NEWNS) != 0: + _errno = ctypes.get_errno() + raise OSError(_errno, errno.errorcode[_errno]) + os.setsid() # Set stdin tty to be a controlling tty of our new session, this is # required by --shell-job option, as for it CRIU would try to set a @@ -139,6 +164,25 @@ def wrap_restore(): _mount_new_proc() run_criu(restore_args) + if restore_pidfile: + restored_pid = None + retry = 5 + + while not restored_pid and retry: + with open('/proc/%d/task/%d/children' % (criu_pid, criu_pid)) as f: + line = f.readline().strip() + if len(line): + restored_pid = line + break + retry -= 1 + time.sleep(1) + + if restored_pid: + with open(restore_pidfile, 'w+') as f: + f.write(restored_pid) + else: + print("Warn: Search of restored pid for --pidfile option timeouted") + if restore_detached: return 0 From b665dce3c71d2f85ca7741c60734d803c12665eb Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 17 May 2023 09:06:15 +0100 Subject: [PATCH 195/775] docs: rename amdgpu_plugin.txt to criu-amdgpu-plugin.txt By default, the file name 'amdgpu_plugin.txt' is used also as the name for the corresponding man page (`man amdgpu_plugin`). However, when this man page is installed system-wide it would be more appropriate to have a prefix 'criu-' (e.g., `man criu-amdgpu-plugin`). 
Signed-off-by: Radostin Stoyanov --- Documentation/Makefile | 2 +- Documentation/{amdgpu_plugin.txt => criu-amdgpu-plugin.txt} | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) rename Documentation/{amdgpu_plugin.txt => criu-amdgpu-plugin.txt} (94%) diff --git a/Documentation/Makefile b/Documentation/Makefile index 508551450..72bf0e862 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -16,7 +16,7 @@ ifeq ($(PYTHON),python3) SRC1 += criu-ns.txt endif SRC1 += compel.txt -SRC1 += amdgpu_plugin.txt +SRC1 += criu-amdgpu-plugin.txt SRC8 += criu.txt SRC := $(SRC1) $(SRC8) XMLS := $(patsubst %.txt,%.xml,$(SRC)) diff --git a/Documentation/amdgpu_plugin.txt b/Documentation/criu-amdgpu-plugin.txt similarity index 94% rename from Documentation/amdgpu_plugin.txt rename to Documentation/criu-amdgpu-plugin.txt index 0d490b429..48a8e2f6d 100644 --- a/Documentation/amdgpu_plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -3,7 +3,7 @@ ROCM Support(1) NAME ---- -amdgpu_plugin - A plugin extension to CRIU to support checkpoint/restore in +criu-amdgpu-plugin - A plugin extension to CRIU to support checkpoint/restore in userspace for AMD GPUs. @@ -22,7 +22,7 @@ Though *criu* is a great tool for checkpointing and restoring running applications, it has certain limitations such as it cannot handle applications that have device files open. In order to support *ROCm* based workloads with *criu* we need to augment criu's core functionality with a -plugin based extension mechanism. *amdgpu_plugin* provides the necessary support +plugin based extension mechanism. *criu-amdgpu-plugin* provides the necessary support to criu to allow Checkpoint / Restore with ROCm. From 36709536e5556b86fd880ea846af0c40245ccc73 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 11 May 2023 16:18:31 +0000 Subject: [PATCH 196/775] lib/c: add empty_ns interfaces to libcriu crun wants to set empty_ns and this interface is missing from the library. This adds it to libcriu. 
Signed-off-by: Adrian Reber --- lib/c/criu.c | 11 +++++++++++ lib/c/criu.h | 3 +++ 2 files changed, 14 insertions(+) diff --git a/lib/c/criu.c b/lib/c/criu.c index fc8159999..0095bcc9b 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -2030,3 +2030,14 @@ int criu_feature_check(struct criu_feature_check *features, size_t size) { return criu_local_feature_check(global_opts, features, size); } + +void criu_local_set_empty_ns(criu_opts *opts, int namespaces) +{ + opts->rpc->has_empty_ns = true; + opts->rpc->empty_ns = namespaces; +} + +void criu_set_empty_ns(int namespaces) +{ + criu_local_set_empty_ns(global_opts, namespaces); +} diff --git a/lib/c/criu.h b/lib/c/criu.h index 28a083d88..3b9cedfd0 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -322,6 +322,9 @@ struct criu_feature_check { int criu_feature_check(struct criu_feature_check *features, size_t size); int criu_local_feature_check(criu_opts *opts, struct criu_feature_check *features, size_t size); +void criu_local_set_empty_ns(criu_opts *opts, int namespaces); +void criu_set_empty_ns(int namespaces); + #ifdef __GNUG__ } #endif From 733f1655122bc8ae518f6c5b160f2cd346af8289 Mon Sep 17 00:00:00 2001 From: Dhanuka Warusadura Date: Tue, 4 Apr 2023 13:54:11 +0530 Subject: [PATCH 197/775] criu-ns: Add --criu-binary argument to run_criu() --criu-binary argument provides a way to supply the CRIU binary location to run_criu(). 
Related to: #1909 Signed-off-by: Dhanuka Warusadura --- scripts/criu-ns | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/scripts/criu-ns b/scripts/criu-ns index 0f83ca336..d4d867b66 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -81,8 +81,21 @@ def run_criu(args): Spawn CRIU binary """ print(sys.argv) - os.execlp('criu', *['criu'] + args) - raise OSError(errno.ENOENT, "No such command") + + if "--criu-binary" in args: + try: + opt_index = args.index("--criu-binary") + path = args[opt_index + 1] + del args[opt_index:opt_index + 2] + args.insert(0, "criu") + os.execv(path, args) + raise OSError(errno.ENOENT, "No such command") + except (ValueError, IndexError, FileNotFoundError): + raise OSError(errno.ENOENT, "--criu-binary missing argument") + else: + args.insert(0, "criu") + os.execvp("criu", args) + raise OSError(errno.ENOENT, "No such command") # pidns_holder creates a process that is reparented to the init. From e4b6fb2d1f2cc8e90c430e1055b7c91385e61c88 Mon Sep 17 00:00:00 2001 From: Dhanuka Warusadura Date: Mon, 17 Apr 2023 13:00:39 +0530 Subject: [PATCH 198/775] criu-ns: Add support for older Python version in CI These changes remove and update the changes introduced in 7177938e60b81752a44a8116b3e7e399c24c4fcb in favor of the Python version in CI. os.waitstatus_to_exitcode() function appeared in Python 3.9 Related to: #1909 Signed-off-by: Dhanuka Warusadura --- scripts/criu-ns | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/scripts/criu-ns b/scripts/criu-ns index d4d867b66..4c032aa14 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -71,7 +71,19 @@ def _wait_for_process_status(criu_pid): try: (pid, status) = os.wait() if pid == criu_pid: - return os.waitstatus_to_exitcode(status) + # The following code block is based on + # os.waitstatus_to_exitcode() introduced in Python 3.9 + # and we implement this for comparability with older + # versions of Python. 
+ if os.WIFSIGNALED(status): + return os.WTERMSIG(status) + elif os.WIFEXITED(status): + return os.WEXITSTATUS(status) + elif os.WIFSTOPPED(status): + return os.WSTOPSIG(status) + else: + raise Exception("CRIU was terminated by an " + "unidentified reason") except OSError: return -251 From 9c9e8ea3f2e04ea89f9d2d9418177fd6180ed557 Mon Sep 17 00:00:00 2001 From: Dhanuka Warusadura Date: Wed, 8 Mar 2023 18:19:17 +0530 Subject: [PATCH 199/775] criu-ns: Add tests for criu-ns script These changes add test implementations for criu-ns script. Fixes: #1909 Signed-off-by: Dhanuka Warusadura --- Makefile | 1 + scripts/ci/run-ci-tests.sh | 1 + test/Makefile | 2 +- test/others/criu-ns/Makefile | 3 + test/others/criu-ns/run.py | 258 +++++++++++++++++++++++++++++++++++ 5 files changed, 264 insertions(+), 1 deletion(-) create mode 100644 test/others/criu-ns/Makefile create mode 100755 test/others/criu-ns/run.py diff --git a/Makefile b/Makefile index 8061a42c4..8efdb760d 100644 --- a/Makefile +++ b/Makefile @@ -428,6 +428,7 @@ lint: flake8 --config=scripts/flake8.cfg lib/py/images/pb2dict.py flake8 --config=scripts/flake8.cfg lib/py/images/images.py flake8 --config=scripts/flake8.cfg scripts/criu-ns + flake8 --config=scripts/flake8.cfg test/others/criu-ns/run.py flake8 --config=scripts/flake8.cfg crit/setup.py flake8 --config=scripts/flake8.cfg scripts/uninstall_module.py flake8 --config=scripts/flake8.cfg coredump/ diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 229de97c1..b45183a84 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -260,6 +260,7 @@ if [ -n "$TRAVIS" ] || [ -n "$CIRCLECI" ]; then # Error (criu/tty.c:1014): tty: Don't have tty to inherit session from, aborting make -C test/others/shell-job/ run fi +make -C test/others/criu-ns/ run make -C test/others/skip-file-rwx-check/ run make -C test/others/rpc/ run diff --git a/test/Makefile b/test/Makefile index e8fcffe3f..5784b6a49 100644 --- a/test/Makefile +++ 
b/test/Makefile @@ -12,7 +12,7 @@ all: $(MAKE) zdtm-freezer .PHONY: all -TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job skip-file-rwx-check +TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job criu-ns skip-file-rwx-check other: for t in $(TESTS); do \ diff --git a/test/others/criu-ns/Makefile b/test/others/criu-ns/Makefile new file mode 100644 index 000000000..4d901a111 --- /dev/null +++ b/test/others/criu-ns/Makefile @@ -0,0 +1,3 @@ +run: + @make -C ../.. zdtm_ct + ../../zdtm_ct run.py diff --git a/test/others/criu-ns/run.py b/test/others/criu-ns/run.py new file mode 100755 index 000000000..6967b46b2 --- /dev/null +++ b/test/others/criu-ns/run.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python + +import fcntl +import os +import pathlib +import pty +import shutil +import subprocess +import sys +import termios +import time + + +CRIU_BIN = "../../../criu/criu" +CRIU_NS = "../../../scripts/criu-ns" +IMG_DIR = "dumpdir" +DUMP_LOG = "dump.log" +RESTORE_LOG = "restore.log" +PIDFILE = "pidfile" + + +def check_dumpdir(path=IMG_DIR): + if os.path.isdir(path): + shutil.rmtree(path) + os.mkdir(path, 0o755) + + +def set_blocking(fd, blocking): + """Implement os.set_blocking() for compatibility with Python + versions earlier than 3.5""" + flags = fcntl.fcntl(fd, fcntl.F_GETFL) + + if blocking: + flags &= ~os.O_NONBLOCK + else: + flags |= os.O_NONBLOCK + + fcntl.fcntl(fd, fcntl.F_SETFL, flags) + + +def run_task_with_own_pty(task): + fd_m, fd_s = pty.openpty() + + pid = os.fork() + if pid == 0: + os.close(fd_m) + os.setsid() + os.dup2(fd_s, 0) + os.dup2(fd_s, 1) + os.dup2(fd_s, 2) + fcntl.ioctl(fd_s, termios.TIOCSCTTY, 1) + os.close(fd_s) + task() + exit(0) + + os.close(fd_s) + fd_m = os.fdopen(fd_m, "rb") + set_blocking(fd_m.fileno(), False) + + while True: + try: + data = fd_m.read() + except IOError: + break + if data is not None: + 
print(data.decode("utf-8")) + + _, status = os.waitpid(pid, 0) + + try: + data = fd_m.read() + except IOError as err: + print(err) + + if data is not None: + print(data.decode("utf-8")) + fd_m.close() + + if status != 0: + print("task %s exited badly: %d" % (task.__name__, status)) + exit(1) + + return 0 + + +def create_pty(): + fd_m, fd_s = pty.openpty() + return (os.fdopen(fd_m, "wb"), os.fdopen(fd_s, "wb")) + + +def create_isolated_dumpee(): + pathlib.Path("running").touch() + fd_m, fd_s = create_pty() + pid = os.fork() + if pid == 0: + os.setsid() + os.dup2(fd_s.fileno(), 0) + os.dup2(fd_s.fileno(), 1) + os.dup2(fd_s.fileno(), 2) + fcntl.ioctl(fd_s.fileno(), termios.TIOCSCTTY, 1) + while True: + if not os.access("running", os.F_OK): + sys.exit(0) + time.sleep(1) + fd_m.close() + fd_s.close() + return pid + + +def criu_ns_dump(pid, shell_job=False): + cmd = [CRIU_NS, "dump", "-D", IMG_DIR, "-v4", "-t", str(pid), + "--log-file", DUMP_LOG, "--criu-binary", CRIU_BIN] + if shell_job: + cmd.append("--shell-job") + ret = subprocess.Popen(cmd).wait() + return ret + + +def criu_ns_restore(shell_job=False, restore_detached=False): + cmd = [CRIU_NS, "restore", "-D", IMG_DIR, "-v4", "--log-file", + RESTORE_LOG, "--criu-binary", CRIU_BIN] + if shell_job: + cmd.append("--shell-job") + if restore_detached: + cmd += ["--restore-detached", "--pidfile", PIDFILE] + ret = subprocess.Popen(cmd).wait() + return ret + + +def read_log_file(filename): + logfile_path = os.path.join(IMG_DIR, filename) + with open(logfile_path) as logfile: + print(logfile.read()) + + +def test_dump_and_restore_with_shell_job(): + print("Test criu-ns dump and restore with --shell-job option") + check_dumpdir() + pathlib.Path("running").touch() + pid = os.fork() + if pid == 0: + while True: + if not os.access("running", os.F_OK): + sys.exit(0) + time.sleep(1) + + ret = criu_ns_dump(pid, shell_job=True) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + os.unlink("running") + fd_m, fd_s = 
create_pty() + pid = os.fork() + if pid == 0: + os.setsid() + fd_m.close() + # since criu-ns takes control of the tty stdin + os.dup2(fd_s.fileno(), 0) + ret = criu_ns_restore(shell_job=True) + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + os._exit(0) + + fd_s.close() + os.waitpid(pid, 0) + + +def test_dump_and_restore_without_shell_job(restore_detached=False): + print("Test criu-ns dump and restore with an isolated process" + "(%d)" % restore_detached) + check_dumpdir() + pid = create_isolated_dumpee() + ret = criu_ns_dump(pid) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + if not restore_detached: + os.unlink("running") + + pid = os.fork() + if pid == 0: + os.setsid() + ret = criu_ns_restore(restore_detached=restore_detached) + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + os._exit(0) + + os.waitpid(pid, 0) + + +def test_dump_and_restore_in_pidns(): + if os.system("grep NSpid /proc/self/status"): + return + + print("Test criu-ns dump and restore in namespaces") + + def _dump(): + pid = create_isolated_dumpee() + ret = criu_ns_dump(pid) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + def _restore(): + ret = criu_ns_restore(restore_detached=True) + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + + def _get_restored_pid(): + restored_pid = 0 + pidfile_path = os.path.join(IMG_DIR, PIDFILE) + if not os.path.exists(pidfile_path): + raise FileNotFoundError("pidfile not found") + with open(pidfile_path, "r") as pidfile: + restored_pid = pidfile.read().strip() + return int(restored_pid) + + def _redump(): + global IMG_DIR + try: + restored_pid = _get_restored_pid() + except FileNotFoundError: + sys.exit(1) + IMG_DIR = "dumpdir2" + check_dumpdir(IMG_DIR) + ret = criu_ns_dump(restored_pid) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + def _re_restore(): + os.unlink("running") + ret = criu_ns_restore() + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + + check_dumpdir() + _dump() + 
_restore() + _redump() + _re_restore() + + +def main(): + test_dump_and_restore_with_shell_job() + test_dump_and_restore_without_shell_job() + test_dump_and_restore_without_shell_job(restore_detached=True) + test_dump_and_restore_in_pidns() + + +if __name__ == "__main__": + run_task_with_own_pty(main) From 9cd09f5860eefec864705107049692a25220740d Mon Sep 17 00:00:00 2001 From: Dhanuka Warusadura Date: Mon, 22 May 2023 14:51:13 +0530 Subject: [PATCH 200/775] criu-ns: Install Python pathlib module in CentOS 7 These changes fix the `ImportError: No module named pathlib` error when executing criu-ns tests located at criu/test/others/criu-ns Signed-off-by: Dhanuka Warusadura --- .cirrus.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index bd4799fd0..80f3296fc 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -134,11 +134,11 @@ task: memory: 8G setup_script: | - # EPEL is needed for python2-future, python2-junit_xml, python-flake8 and libbsd-devel. + # EPEL is needed for python2-future, python2-junit_xml, python-pathlib, python-flake8 and libbsd-devel. 
# Do not fail if latest epel repository definition is already installed yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm || : ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto - yum install -y findutils gcc git gnutls-devel iproute iptables libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make procps-ng protobuf-c-devel protobuf-devel protobuf-python python python-flake8 python-ipaddress python2-future python2-junit_xml python-yaml python-six sudo tar which e2fsprogs python2-pip rubygem-asciidoctor libselinux-devel + yum install -y findutils gcc git gnutls-devel iproute iptables libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make procps-ng protobuf-c-devel protobuf-devel protobuf-python python python-flake8 python-ipaddress python2-future python2-junit_xml python-yaml python-six python-pathlib sudo tar which e2fsprogs python2-pip rubygem-asciidoctor libselinux-devel # Even with selinux in permissive mode the selinux tests will be executed # The Cirrus CI user runs as a service from selinux point of view and is # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0) From fc08fa9077a87afad9f7c0f882c2743b96715705 Mon Sep 17 00:00:00 2001 From: Dhanuka Warusadura Date: Mon, 22 May 2023 15:06:14 +0530 Subject: [PATCH 201/775] criu-ns: Update shebang line to python CentOS 7 CI environment uses Python 2. To execute criu-ns script in CentOS 7 changing the current shebang line to python is required. 
This reverses the changes made in a15a63fce0ad4d1a9119771577fa7ef562bbfd6b Signed-off-by: Dhanuka Warusadura --- scripts/criu-ns | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/criu-ns b/scripts/criu-ns index 4c032aa14..3c77b8eb4 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python import ctypes import ctypes.util import errno From c6ac396aa3818e47aa82f6c36bea076f64d51c24 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 11 May 2023 22:59:28 +0800 Subject: [PATCH 202/775] timers: improve and fix posix timer id sequence checks This is a patch proposed by Thomas here: https://lore.kernel.org/all/87ilczc7d9.ffs@tglx/ It removes (created id > desired id) "sanity" check and adds proper checking that ids start at zero and increment by one each time when we create/delete a posix timer. First purpose of it is to fix infinite looping in create_posix_timers on old pre 3.11 kernels. Second purpose is to allow kernel interface of creating posix timers with desired id change from iterating with predictable next id to just setting next id directly. And at the same time removing predictable next id so that criu with this patch would not get into an infinite loop in create_posix_timers if this happens. Thanks a lot to Thomas!
Suggested-by: Thomas Gleixner Signed-off-by: Pavel Tikhomirov --- criu/pie/restorer.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 9873fdc11..1f08bc2a0 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1169,7 +1169,7 @@ static int timerfd_arm(struct task_restore_args *args) static int create_posix_timers(struct task_restore_args *args) { int ret, i; - kernel_timer_t next_id; + kernel_timer_t next_id = 0, timer_id; struct sigevent sev; for (i = 0; i < args->posix_timers_n; i++) { @@ -1183,25 +1183,26 @@ static int create_posix_timers(struct task_restore_args *args) sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr; while (1) { - ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &next_id); + ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); if (ret < 0) { pr_err("Can't create posix timer - %d\n", i); return ret; } - if (next_id == args->posix_timers[i].spt.it_id) - break; - - ret = sys_timer_delete(next_id); - if (ret < 0) { - pr_err("Can't remove temporaty posix timer 0x%x\n", next_id); - return ret; - } - - if ((long)next_id > args->posix_timers[i].spt.it_id) { + if (timer_id != next_id) { pr_err("Can't create timers, kernel don't give them consequently\n"); return -1; } + next_id++; + + if (timer_id == args->posix_timers[i].spt.it_id) + break; + + ret = sys_timer_delete(timer_id); + if (ret < 0) { + pr_err("Can't remove temporaty posix timer 0x%x\n", timer_id); + return ret; + } } } From 2ac15e3ad0cee63a2b49466b51de3f6219cf3d54 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 2 Apr 2023 15:45:05 +0100 Subject: [PATCH 203/775] action-scripts: Add pre-stream hook This hook allows to start image streamer process from an action script. 
Signed-off-by: Radostin Stoyanov --- criu/action-scripts.c | 1 + criu/img-streamer.c | 8 ++++++++ criu/include/action-scripts.h | 1 + 3 files changed, 10 insertions(+) diff --git a/criu/action-scripts.c b/criu/action-scripts.c index 1ce6d9c10..ec0563e16 100644 --- a/criu/action-scripts.c +++ b/criu/action-scripts.c @@ -18,6 +18,7 @@ #include "common/scm.h" static const char *action_names[ACT_MAX] = { + [ACT_PRE_STREAM] = "pre-stream", [ACT_PRE_DUMP] = "pre-dump", [ACT_POST_DUMP] = "post-dump", [ACT_PRE_RESTORE] = "pre-restore", diff --git a/criu/img-streamer.c b/criu/img-streamer.c index 7e36eae01..305e6fae5 100644 --- a/criu/img-streamer.c +++ b/criu/img-streamer.c @@ -12,6 +12,7 @@ #include "rst-malloc.h" #include "common/scm.h" #include "common/lock.h" +#include "action-scripts.h" /* * We use different path names for the dump and restore sockets because: @@ -49,10 +50,17 @@ static const char *socket_name_for_mode(int mode) int img_streamer_init(const char *image_dir, int mode) { struct sockaddr_un addr; + int pre_stream_ret; int sockfd; img_streamer_mode = mode; + pre_stream_ret = run_scripts(ACT_PRE_STREAM); + if (pre_stream_ret != 0) { + pr_err("Pre-stream script failed with %d!\n", pre_stream_ret); + return -1; + } + sockfd = socket(AF_UNIX, SOCK_STREAM, 0); if (sockfd < 0) { pr_perror("Unable to instantiate UNIX socket"); diff --git a/criu/include/action-scripts.h b/criu/include/action-scripts.h index c2e8850aa..793698c27 100644 --- a/criu/include/action-scripts.h +++ b/criu/include/action-scripts.h @@ -4,6 +4,7 @@ #include "asm/int.h" enum script_actions { + ACT_PRE_STREAM, ACT_PRE_DUMP, ACT_POST_DUMP, ACT_PRE_RESTORE, From a638043a7fb35307533f16b3ed1108773a37cd37 Mon Sep 17 00:00:00 2001 From: Valeriy Vdovin Date: Tue, 9 Feb 2021 16:55:48 +0300 Subject: [PATCH 204/775] cgroup/restore: split prepare_task_cgroup code into two separate functions This does cgroup namespace creation separately from joining task cgroups. 
This makes the code more logical, because creating cgroup namespace also involves joining cgroups but these cgroups can be different to task's cgroups as they are cgroup namespace roots (cgns_prefix), and mixing all of them together may lead to misunderstanding. Another positive thing is that we consolidate !item->parent checks in one place in restore_task_with_children. Signed-off-by: Valeriy Vdovin Signed-off-by: Pavel Tikhomirov --- criu/cgroup.c | 54 ++++++++++++++++++++++++++++++++----------- criu/cr-restore.c | 9 +++++++- criu/include/cgroup.h | 3 ++- 3 files changed, 50 insertions(+), 16 deletions(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index 8243ac6d3..bcb7b405a 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -1202,17 +1202,12 @@ static int prepare_cgns(CgSetEntry *se) return 0; } -static int move_in_cgroup(CgSetEntry *se, bool setup_cgns) +static int move_in_cgroup(CgSetEntry *se) { int i; pr_info("Move into %d\n", se->id); - if (setup_cgns && prepare_cgns(se) < 0) { - pr_err("failed preparing cgns\n"); - return -1; - } - for (i = 0; i < se->n_ctls; i++) { char aux[PATH_MAX]; int fd = -1, err, j, aux_off; @@ -1252,7 +1247,44 @@ static int move_in_cgroup(CgSetEntry *se, bool setup_cgns) return 0; } -int prepare_task_cgroup(struct pstree_item *me) +int prepare_cgroup_namespace(struct pstree_item *root_task) +{ + CgSetEntry *se; + + if (opts.manage_cgroups == CG_MODE_IGNORE) + return 0; + + if (root_task->parent) { + pr_err("Expecting root_task to restore cgroup namespace\n"); + return -1; + } + + /* + * If on dump all dumped tasks are in same cgset with criu we don't + * dump cgsets and thus cgroup namespaces and rely that on restore + * criu caller would prepare proper cgset/cgns for us. Also in case + * of --unprivileged we don't even have the root cgset here. 
+ */ + if (!rsti(root_task)->cg_set || rsti(root_task)->cg_set == root_cg_set) { + pr_info("Cgroup namespace inherited from parent\n"); + return 0; + } + + se = find_rst_set_by_id(rsti(root_task)->cg_set); + if (!se) { + pr_err("No set %d found\n", rsti(root_task)->cg_set); + return -1; + } + + if (prepare_cgns(se) < 0) { + pr_err("failed preparing cgns\n"); + return -1; + } + + return 0; +} + +int restore_task_cgroup(struct pstree_item *me) { struct pstree_item *parent = me->parent; CgSetEntry *se; @@ -1284,13 +1316,7 @@ int prepare_task_cgroup(struct pstree_item *me) return -1; } - /* Since don't support nesting of cgroup namespaces, let's only set up - * the cgns (if it exists) in the init task. In the future, we should - * just check that the cgns prefix string matches for all the entries - * in the cgset, and only unshare if that's true. - */ - - return move_in_cgroup(se, !me->parent); + return move_in_cgroup(se); } void fini_cgroup(void) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index f02e95f6d..2b99a775d 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1827,6 +1827,13 @@ static int restore_task_with_children(void *_arg) /* Wait prepare_userns */ if (restore_finish_ns_stage(CR_STATE_ROOT_TASK, CR_STATE_PREPARE_NAMESPACES) < 0) goto err; + + /* + * Since we don't support nesting of cgroup namespaces, let's + * only set up the cgns (if it exists) in the init task. + */ + if (prepare_cgroup_namespace(current) < 0) + goto err; } if (needs_prep_creds(current) && (prepare_userns_creds())) @@ -1838,7 +1845,7 @@ static int restore_task_with_children(void *_arg) * we will only move the root one there, others will * just have it inherited. 
*/ - if (prepare_task_cgroup(current) < 0) + if (restore_task_cgroup(current) < 0) goto err; /* Restore root task */ diff --git a/criu/include/cgroup.h b/criu/include/cgroup.h index 93f61539c..dc264032e 100644 --- a/criu/include/cgroup.h +++ b/criu/include/cgroup.h @@ -9,7 +9,8 @@ struct parasite_dump_cgroup_args; extern u32 root_cg_set; int dump_thread_cgroup(const struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args, int id); int dump_cgroups(void); -int prepare_task_cgroup(struct pstree_item *); +int restore_task_cgroup(struct pstree_item *); +int prepare_cgroup_namespace(struct pstree_item *); int prepare_cgroup(void); /* Restore things like cpu_limit in known cgroups. */ int prepare_cgroup_properties(void); From 0218b1e8f24312e2f23ed2822b30f5361aa93998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 29 May 2023 19:28:19 +0200 Subject: [PATCH 205/775] Fix dumping hugetlb-based memfd on kernels < 4.16. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 4.15-based kernels don't allow F_*SEAL for memfds created with MFD_HUGETLB. Since seals are not possible in this case, fake F_GETSEALS result as if it was queried for a non-sealing-enabled memfd. Signed-off-by: Michał Mirosław --- criu/memfd.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/criu/memfd.c b/criu/memfd.c index da2937703..6a43dece6 100644 --- a/criu/memfd.c +++ b/criu/memfd.c @@ -93,8 +93,17 @@ static int dump_memfd_inode(int fd, struct memfd_dump_inode *inode, const char * } mie.seals = fcntl(fd, F_GET_SEALS); - if (mie.seals == -1) - goto out; + if (mie.seals == -1) { + if (errno != EINVAL || ~mie.hugetlb_flag & MFD_HUGETLB) { + pr_perror("fcntl(F_GET_SEALS)"); + goto out; + } + /* Kernels before 4.16 don't allow MFD_HUGETLB | + * MFD_ALLOW_SEALING and return EINVAL for + * fcntl(MFD_HUGETLB-enabled fd). 
+ */ + mie.seals = F_SEAL_SEAL; + } if (pb_write_one(img_from_set(glob_imgset, CR_FD_MEMFD_INODE), &mie, PB_MEMFD_INODE)) goto out; From 9943dcde17b58cfe84b721785cb51cf9f1a769ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 31 May 2023 15:07:43 +0200 Subject: [PATCH 206/775] Fix mount(cgroup2) for older kernels. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linux 4.15 doesn't like empty string for cgroup2 mount options. Pass NULL then to satisfy the kernel check. Log the options for easier debugging. Signed-off-by: Michał Mirosław --- criu/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index bcb7b405a..0bf7b3818 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -639,8 +639,8 @@ static int open_cgroupfs(struct cg_ctl *cc) return -1; } - if (mount("none", prefix, fstype, 0, mopts) < 0) { - pr_perror("Unable to mount %s", mopts); + if (mount("none", prefix, fstype, 0, mopts[0] ? mopts : NULL) < 0) { + pr_perror("Unable to mount %s %s", fstype, mopts); rmdir(prefix); return -1; } From 21ce76263b46bddacaae8d8dafe356ab047097db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 2 Jun 2023 18:02:38 +0200 Subject: [PATCH 207/775] Restore THP_DISABLE prctl. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original commit added saving THP_DISABLED flag value, but missed restoring it. There is restoring code, but used only when --lazy_pages mode is enabled. Restore the prctl flag always. While at it, rename the `has_thp_enabled` -> `!thp_disabled` for consistency. 
Fixes: bbbd597b4124 (2017-06-28 "mem: add dump state of THP_DISABLED prctl") Signed-off-by: Michał Mirosław --- criu/cr-restore.c | 2 +- criu/include/restorer.h | 2 +- criu/include/rst_info.h | 2 -- criu/mem.c | 4 ---- criu/pie/restorer.c | 16 ++++++---------- 5 files changed, 8 insertions(+), 18 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 2b99a775d..bff41dc56 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2971,7 +2971,7 @@ static int prepare_mm(pid_t pid, struct task_restore_args *args) args->fd_exe_link = exe_fd; - args->has_thp_enabled = rsti(current)->has_thp_enabled; + args->thp_disabled = mm->has_thp_disabled && mm->thp_disabled; ret = 0; out: diff --git a/criu/include/restorer.h b/criu/include/restorer.h index bc0beb5cb..e232f5404 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -144,7 +144,7 @@ struct task_restore_args { struct timeval logstart; int uffd; - bool has_thp_enabled; + bool thp_disabled; /* threads restoration */ int nr_threads; /* number of threads */ diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index d0a3db6c5..704b42a72 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -73,8 +73,6 @@ struct rst_info { */ bool has_old_seccomp_filter; - bool has_thp_enabled; - struct rst_rseq *rseqe; void *breakpoint; diff --git a/criu/mem.c b/criu/mem.c index 9bf7cae97..417e0a21d 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -1217,8 +1217,6 @@ err_addr: static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) { - MmEntry *mm = rsti(t)->mm; - /* * There is no need to disable it if the page read doesn't * have parent. 
In this case VMA will be empty until @@ -1241,8 +1239,6 @@ static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) pr_perror("Cannot disable THP"); return -1; } - if (!(mm->has_thp_disabled && mm->thp_disabled)) - rsti(t)->has_thp_enabled = true; return 0; } diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 1f08bc2a0..0d1360c52 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1635,17 +1635,13 @@ long __export_restore_task(struct task_restore_args *args) goto core_restore_end; } - if (args->uffd > -1) { - /* re-enable THP if we disabled it previously */ - if (args->has_thp_enabled) { - int ret; - ret = sys_prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0); - if (ret) { - pr_err("Cannot re-enable THP: %d\n", ret); - goto core_restore_end; - } - } + ret = sys_prctl(PR_SET_THP_DISABLE, args->thp_disabled, 0, 0, 0); + if (ret) { + pr_err("Cannot restore THP_DISABLE=%d flag: %ld\n", args->thp_disabled, ret); + goto core_restore_end; + } + if (args->uffd > -1) { pr_debug("lazy-pages: closing uffd %d\n", args->uffd); /* * All userfaultfd configuration has finished at this point. From 2364c963c65c7f3640ef515dfb2046b78bc9a975 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 2 Jun 2023 17:22:06 +0200 Subject: [PATCH 208/775] Log if prctl(SET_THP_DISABLE) doesn't work as expected. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If prctl(SET_THP_DISABLE) is not used due to bad semantics, log it for easier debugging. 
Signed-off-by: Michał Mirosław --- criu/kerndat.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/criu/kerndat.c b/criu/kerndat.c index bc0c7ba05..d38e8898e 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1324,6 +1324,8 @@ int kerndat_has_thp_disable(void) parse_vmflags(str, &flags, &madv, &io_pf); kdat.has_thp_disable = !(madv & (1 << MADV_NOHUGEPAGE)); + if (!kdat.has_thp_disable) + pr_warn("prctl PR_SET_THP_DISABLE sets MADV_NOHUGEPAGE"); break; } } From a0c78a79020b04ddc1ad88528471892bcdd5b710 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 17 May 2023 21:51:59 +0200 Subject: [PATCH 209/775] zdtm: thp_disable: Output a single failure message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While at it, don't carry over stale errno to the fail() message. Signed-off-by: Michał Mirosław --- test/zdtm/static/thp_disable.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/zdtm/static/thp_disable.c b/test/zdtm/static/thp_disable.c index ab88120c2..58d6039f8 100644 --- a/test/zdtm/static/thp_disable.c +++ b/test/zdtm/static/thp_disable.c @@ -47,15 +47,14 @@ int main(int argc, char **argv) if (get_smaps_bits((unsigned long)area, &new_flags, &new_madv)) return -1; + errno = 0; if (orig_flags != new_flags) { - pr_err("Flags are changed %lx -> %lx\n", orig_flags, new_flags); - fail(); + fail("Flags changed %lx -> %lx\n", orig_flags, new_flags); return -1; } if (orig_madv != new_madv) { - pr_err("Madvs are changed %lx -> %lx\n", orig_madv, new_madv); - fail(); + fail("Madvs changed %lx -> %lx\n", orig_madv, new_madv); return -1; } From 01238d2706e4c2e7eca784baa306a309e3e5a823 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 2 Jun 2023 17:11:16 +0200 Subject: [PATCH 210/775] zdtm: thp_disable: Verify prctl(THP_DISABLE) migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 
Michał Mirosław --- test/zdtm/static/thp_disable.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/test/zdtm/static/thp_disable.c b/test/zdtm/static/thp_disable.c index 58d6039f8..e38508778 100644 --- a/test/zdtm/static/thp_disable.c +++ b/test/zdtm/static/thp_disable.c @@ -17,6 +17,7 @@ int main(int argc, char **argv) unsigned long orig_flags = 0, new_flags = 0; unsigned long orig_madv = 0, new_madv = 0; void *area; + int ret; test_init(argc, argv); @@ -35,9 +36,31 @@ int main(int argc, char **argv) return -1; } + ret = prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Getting THP-disabled flag failed"); + return -1; + } + if (ret != 1) { + errno = 0; + fail("prctl(GET_THP_DISABLE) returned unexpected value: %d != 1\n", ret); + return -1; + } + test_daemon(); test_waitsig(); + ret = prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Getting post-migration THP-disabled flag failed"); + return -1; + } + if (ret != 1) { + errno = 0; + fail("post-migration prctl(GET_THP_DISABLE) returned unexpected value: %d != 1\n", ret); + return -1; + } + if (prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0)) { pr_perror("Enabling THP failed"); return -1; From 93ad8d40de3283130ebad5bd96e188946c018279 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 2 Jun 2023 19:01:29 +0200 Subject: [PATCH 211/775] zdtm: thp_disable: Verify MADV_NOHUGEPAGE before migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a sanity check for THP_DISABLE. This discovered a broken commit in Google's kernel tree. 
Signed-off-by: Michał Mirosław --- test/zdtm/static/thp_disable.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/zdtm/static/thp_disable.c b/test/zdtm/static/thp_disable.c index e38508778..eabb45650 100644 --- a/test/zdtm/static/thp_disable.c +++ b/test/zdtm/static/thp_disable.c @@ -47,6 +47,21 @@ int main(int argc, char **argv) return -1; } + test_msg("Fetch pre-migration flags/adv\n"); + if (get_smaps_bits((unsigned long)area, &new_flags, &new_madv)) + return -1; + + errno = 0; + if (orig_flags != new_flags) { + fail("Flags changed %lx -> %lx\n", orig_flags, new_flags); + return -1; + } + + if (orig_madv != new_madv) { + fail("Madvs changed %lx -> %lx\n", orig_madv, new_madv); + return -1; + } + test_daemon(); test_waitsig(); From 4c1409b8f69f001aa5db0c0c229e9dad0ae56e3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 13 Oct 2021 06:32:43 +0200 Subject: [PATCH 212/775] Fill FPU init state if it's not provided by kernel. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apparently Skylake uses init-optimization when saving FPU state, and ptrace() returns XSTATE_BV[0] = 0 meaning FPU was not used by a task (in init state). Since CRIU restore uses sigreturn to restore registers, FPU state is always restored. Fill the state with default values on dump to make restore happy. Signed-off-by: Michał Mirosław --- compel/arch/x86/src/lib/infect.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index 01959b95b..88bdb4047 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -220,6 +220,16 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr #define get_signed_user_reg(pregs, name) \ ((user_regs_native(pregs)) ? 
(int64_t)((pregs)->native.name) : (int32_t)((pregs)->compat.name)) +static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) +{ + if (ptrace(PTRACE_GETFPREGS, pid, NULL, xsave)) { + pr_perror("Can't obtain FPU registers for %d", pid); + return -1; + } + + return 0; +} + static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) { struct iovec iov; @@ -232,14 +242,15 @@ static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) return -1; } - return 0; -} - -static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) -{ - if (ptrace(PTRACE_GETFPREGS, pid, NULL, xsave)) { - pr_perror("Can't obtain FPU registers for %d", pid); - return -1; + if ((xsave->xsave_hdr.xstate_bv & 3) != 3) { + // Due to init-optimisation [1] x87 FPU or SSE state may not be filled in. + // Since those are restored unconditionally, make sure the init values are + // filled by retrying with old PTRACE_GETFPREGS. + // + // [1] Intel® 64 and IA-32 Architectures Software Developer's + // Manual Volume 1: Basic Architecture + // Section 13.6: Processor tracking of XSAVE-managed state + return get_task_fpregs(pid, xsave); } return 0; From 4455444eebd004de2bd80d9fdb99811a1d72d90c Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Thu, 15 Jun 2023 15:13:58 -0700 Subject: [PATCH 213/775] compel/test: Return 0 in case of error in fdspy This commit revises the error handling in the fdspy test. Previously, a failure case could have been incorrectly reported as successful because of a specific check `pass != 0`, leading to potential false positives when `check_pipe_ends()` returned `-1` due to a read/write pipe error. To improve this, we've adjusted the error handling to return `0` in case of any error. As such, the final success condition remains unchanged. This approach will help accurately differentiate between successful and failed cases, ensuring the output "All OK" is printed for success, and "Something went WRONG" for any failure. 
Fixes: 5364ca3 ("compel/test: Fix warn_unused_result") Signed-off-by: Haorong Lu --- compel/test/fdspy/spy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compel/test/fdspy/spy.c b/compel/test/fdspy/spy.c index 7f20ea2a7..41de99e20 100644 --- a/compel/test/fdspy/spy.c +++ b/compel/test/fdspy/spy.c @@ -110,11 +110,11 @@ static int check_pipe_ends(int wfd, int rfd) printf("Check pipe ends are connected\n"); if (write(wfd, "1", 2) != 2) { fprintf(stderr, "write to pipe failed\n"); - return -1; + return 0; } if (read(rfd, aux, sizeof(aux)) != sizeof(aux)) { fprintf(stderr, "read from pipe failed\n"); - return -1; + return 0; } if (aux[0] != '1' || aux[1] != '\0') { fprintf(stderr, "Pipe connectivity lost\n"); From 0e88a91ff025c11f53caa53d75b76c9c6b00dfdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 16 Sep 2022 11:01:48 +0200 Subject: [PATCH 214/775] Allow passing --leave_stopped by RPC. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/cr-service.c | 3 +++ images/rpc.proto | 1 + 2 files changed, 4 insertions(+) diff --git a/criu/cr-service.c b/criu/cr-service.c index 314c309be..ed4f1edef 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -428,6 +428,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_leave_running && req->leave_running) opts.final_state = TASK_ALIVE; + if (req->has_leave_stopped && req->leave_stopped) + opts.final_state = TASK_STOPPED; + if (!req->has_pid) { req->has_pid = true; req->pid = ids.pid; diff --git a/images/rpc.proto b/images/rpc.proto index afd2c7b43..6451e9b73 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -140,6 +140,7 @@ message criu_opts { optional bool mntns_compat_mode = 65; optional bool skip_file_rwx_check = 66; optional bool unprivileged = 67; + optional bool leave_stopped = 69; /* optional bool check_mounts = 128; */ } From 
1e5ebec39dbe43fa2aec23feb724b332511be886 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20C=C5=82api=C5=84ski?= Date: Fri, 16 Sep 2022 11:06:28 +0200 Subject: [PATCH 215/775] Allow passing --display_stats via RPC. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Cłapiński Signed-off-by: Michał Mirosław --- criu/cr-service.c | 3 +++ images/rpc.proto | 1 + 2 files changed, 4 insertions(+) diff --git a/criu/cr-service.c b/criu/cr-service.c index ed4f1edef..9aa9d5bc8 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -723,6 +723,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->orphan_pts_master) opts.orphan_pts_master = true; + if (req->has_display_stats) + opts.display_stats = req->display_stats; + /* Evaluate additional configuration file a second time to overwrite * all RPC settings. */ if (req->config_file) { diff --git a/images/rpc.proto b/images/rpc.proto index 6451e9b73..cde162f1c 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -141,6 +141,7 @@ message criu_opts { optional bool skip_file_rwx_check = 66; optional bool unprivileged = 67; optional bool leave_stopped = 69; + optional bool display_stats = 70; /* optional bool check_mounts = 128; */ } From 4b764a9dcea83cc8ca96923cfc9a757e6b3020ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20C=C5=82api=C5=84ski?= Date: Fri, 16 Sep 2022 11:14:59 +0200 Subject: [PATCH 216/775] Allow passing --log_to_stderr via RPC. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Cłapiński Signed-off-by: Michał Mirosław --- criu/cr-service.c | 3 +++ images/rpc.proto | 1 + 2 files changed, 4 insertions(+) diff --git a/criu/cr-service.c b/criu/cr-service.c index 9aa9d5bc8..915ba3870 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -394,6 +394,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) } SET_CHAR_OPTS(output, req->log_file); + } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { + xfree(opts.output); + opts.output = NULL; } else if (!opts.output) { SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); } diff --git a/images/rpc.proto b/images/rpc.proto index cde162f1c..79623f9f6 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -142,6 +142,7 @@ message criu_opts { optional bool unprivileged = 67; optional bool leave_stopped = 69; optional bool display_stats = 70; + optional bool log_to_stderr = 71; /* optional bool check_mounts = 128; */ } From 516fade932f96a93fb29f5dae3f0e9c9b49ccf92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 21 Apr 2023 15:56:06 +0200 Subject: [PATCH 217/775] zdtm: Allow overriding /tmp. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use $TMPDIR for tests_root as the host's /tmp might not have enough features or space. 
Signed-off-by: Michał Mirosław --- test/zdtm.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index 33859f61e..2a657e44d 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -79,7 +79,8 @@ def clean_tests_root(): def make_tests_root(): global tests_root if not tests_root: - tests_root = (os.getpid(), tempfile.mkdtemp("", "criu-root-", "/tmp")) + tmpdir = os.environ.get("TMPDIR", "/tmp") + tests_root = (os.getpid(), tempfile.mkdtemp("", "criu-root-", tmpdir)) atexit.register(clean_tests_root) os.mkdir(os.path.join(tests_root[1], "root")) os.chmod(tests_root[1], 0o777) @@ -404,7 +405,7 @@ class zdtm_test: self.__flavor = flavor self.__freezer = freezer self._bins = [name] - self._env = {} + self._env = {'TMPDIR': os.environ.get('TMPDIR', '/tmp')} self._deps = desc.get('deps', []) self.auto_reap = True self.__timeout = int(self.__desc.get('timeout') or 30) @@ -828,7 +829,7 @@ class groups_test(zdtm_test): self._bins += self.__subs self._deps += get_test_desc('zdtm/lib/groups')['deps'] - self._env = {'ZDTM_TESTS': self.__real_name} + self._env['ZDTM_TESTS'] = self.__real_name def __get_start_cmd(self, name): tdir = os.path.dirname(name) From ed88e3241cd586ec545a7f78803e933b764eadcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Tue, 25 Apr 2023 21:30:20 +0200 Subject: [PATCH 218/775] zdtm: Add timeouts for test commands. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend ability to limit time taken to all CRIU invocations. 
Signed-off-by: Michał Mirosław --- test/zdtm.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index 2a657e44d..b8a0c5a3b 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -399,6 +399,7 @@ class zdtm_test: self.__name = name self.__desc = desc self.__freezer = None + self.__timeout = int(self.__desc.get('timeout') or 30) self.__rootless = rootless self.__make_action('cleanout') self.__pid = 0 @@ -408,7 +409,6 @@ class zdtm_test: self._env = {'TMPDIR': os.environ.get('TMPDIR', '/tmp')} self._deps = desc.get('deps', []) self.auto_reap = True - self.__timeout = int(self.__desc.get('timeout') or 30) def __make_action(self, act, env=None, root=None): sys.stdout.flush() # Not to let make's messages appear before ours @@ -430,7 +430,7 @@ class zdtm_test: preexec_fn=self.__freezer and self.__freezer.attach or None) if act == "pid": try_run_hook(self, ["--post-start"]) - if s.wait(): + if s.wait(timeout=self.__timeout): raise test_fail_exc(str(s_args)) if self.__freezer: @@ -839,7 +839,7 @@ class groups_test(zdtm_test): subprocess.check_call(s_args + [tname + '.cleanout']) s = subprocess.Popen(s_args + ['--dry-run', tname + '.pid'], stdout=subprocess.PIPE) - out, _ = s.communicate() + out, _ = s.communicate(timeout=self.__timeout) cmd = out.decode().splitlines()[-1].strip() return 'cd /' + tdir + ' && ' + cmd @@ -883,7 +883,8 @@ class criu_cli: fault=None, strace=[], preexec=None, - nowait=False): + nowait=False, + timeout=60): env = dict( os.environ, ASAN_OPTIONS="log_path=asan.log:disable_coredump=0:detect_leaks=0") @@ -899,7 +900,7 @@ class criu_cli: preexec_fn=preexec) if nowait: return cr - return cr.wait() + return cr.wait(timeout=timeout) class criu_rpc_process: @@ -982,7 +983,8 @@ class criu_rpc: fault=None, strace=[], preexec=None, - nowait=False): + nowait=False, + timeout=None): if fault: raise test_fail_exc('RPC and FAULT not supported') if strace: From 1fb5c410c8d7e6992fc1ae270195815d02cee609 Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 22 May 2023 20:20:18 +0200 Subject: [PATCH 219/775] zdtm: Allow --keep-going for single test. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't want test framework to change its behaviour on whether we run a single or multiple tests in a run. When we shard the test suite it can result in some shards having a single test to run and unexpectedly change the test output format. Signed-off-by: Michał Mirosław --- test/zdtm.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index b8a0c5a3b..c278fafff 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2344,11 +2344,6 @@ def run_tests(opts): return torun = list(torun) - if opts['keep_going'] and len(torun) < 2: - print( - "[WARNING] Option --keep-going is more useful when running multiple tests" - ) - opts['keep_going'] = False if opts['exclude']: excl = re.compile(".*(" + "|".join(opts['exclude']) + ")") From 6bc00fcb84dcb1fc55e745bac5c6939e31045efb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 29 May 2023 17:16:15 +0200 Subject: [PATCH 220/775] zdtm: Implement test sharding. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow to split test suite into predictable sets to parallelize runs on multiple machines or VMs. 
Signed-off-by: Michał Mirosław --- test/zdtm.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index c278fafff..1ef941b4e 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2231,9 +2231,21 @@ def all_tests(opts): continue files.append(fp) excl = list(map(lambda x: os.path.join(desc['dir'], x), desc['exclude'])) - tlist = list(filter( + tlist = list(sorted(filter( lambda x: not x.endswith('.checkskip') and not x.endswith('.hook') and - x not in excl, map(lambda x: x.strip(), files))) + x not in excl, map(lambda x: x.strip(), files)))) + + if opts.get('test_shard_count'): + if opts.get('test_shard_index') is None: + raise KeyError('--test_shard_count > 0 must come with --test_shard_index') + slice_idx = opts['test_shard_index'] + slices = opts['test_shard_count'] + if slice_idx >= slices: + raise IndexError('--test_shard_index not less than --test_shard_count ({} >= {})'.format(slice_idx, slices)) + slist = list(tlist[slice_idx::slices]) + print("We're shard #{} of {}. Running {} of {} tests.\n".format(slice_idx, slices, len(slist), len(tlist))) + tlist = slist + return tlist @@ -2765,6 +2777,10 @@ def get_cli_args(): rp.add_argument("--mntns-compat-mode", help="Use old compat mounts restore engine", action='store_true') + rp.add_argument("--test-shard-index", type=int, default=None, + help="Select tests for a shard (0-based)") + rp.add_argument("--test-shard-count", type=int, default=0, + help="Specify how many shards are being run (0=sharding disabled; must be the same for all shards)") lp = sp.add_parser("list", help="List tests") lp.set_defaults(action=list_tests) From d0ac547b3dca04ba7b3b1d5c36d60859a4dccb8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Thu, 18 May 2023 12:15:51 +0200 Subject: [PATCH 221/775] zdtm: sock_opts00: Improve error messages. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make it clear that the option numbers are indexes not the option identifiers ("names"). Also show the value change that prompted test failure. Signed-off-by: Michał Mirosław --- test/zdtm/static/sock_opts00.c | 51 +++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/test/zdtm/static/sock_opts00.c b/test/zdtm/static/sock_opts00.c index 5b4624f6d..fcf00ffed 100644 --- a/test/zdtm/static/sock_opts00.c +++ b/test/zdtm/static/sock_opts00.c @@ -12,21 +12,27 @@ const char *test_author = "Pavel Emelyanov "; #define TEST_PORT 59687 #define TEST_ADDR INADDR_ANY -#define NOPTS 8 - int main(int argc, char **argv) { - int sock, ret = 0, vname[NOPTS], val[NOPTS], rval, i; - socklen_t len = sizeof(int); + #define OPT(x) { x, #x } + static const struct { + int opt; + const char *name; + } vname[] = { + OPT(SO_PRIORITY), + OPT(SO_RCVLOWAT), + OPT(SO_MARK), + OPT(SO_PASSCRED), + OPT(SO_PASSSEC), + OPT(SO_DONTROUTE), + OPT(SO_NO_CHECK), + OPT(SO_OOBINLINE), + }; + static const int NOPTS = sizeof(vname) / sizeof(*vname); + #undef OPT - vname[0] = SO_PRIORITY; - vname[1] = SO_RCVLOWAT; - vname[2] = SO_MARK; - vname[3] = SO_PASSCRED; - vname[4] = SO_PASSSEC; - vname[5] = SO_DONTROUTE; - vname[6] = SO_NO_CHECK; - vname[7] = SO_OOBINLINE; + int sock, ret = 0, val[NOPTS], rval, i; + socklen_t len = sizeof(int); test_init(argc, argv); @@ -37,29 +43,29 @@ int main(int argc, char **argv) } for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i], &val[i], &len); + ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], &len); if (ret) { - pr_perror("can't get option %d", i); + pr_perror("can't get %s", vname[i].name); return 1; } val[i]++; - ret = setsockopt(sock, SOL_SOCKET, vname[i], &val[i], len); + ret = setsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], len); if (ret) { - pr_perror("can't set option %d", i); + pr_perror("can't set %s 
= %d", vname[i].name, val[i]); return 1; } - ret = getsockopt(sock, SOL_SOCKET, vname[i], &rval, &len); + ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { - pr_perror("can't get option %d 2", i); + pr_perror("can't re-get %s", vname[i].name); return 1; } if (rval != val[i]) { if (rval + 1 == val[i]) { - pr_perror("can't reset option %d want %d have %d", i, val[i], rval); + pr_perror("failed to set %s: want %d have %d", vname[i].name, val[i], rval); return 1; } @@ -72,14 +78,15 @@ int main(int argc, char **argv) test_waitsig(); for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i], &rval, &len); + ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { - pr_perror("can't get option %d again", i); + pr_perror("can't verify %s", vname[i].name); return 1; } if (val[i] != rval) { - fail("option %d changed", i); + errno = 0; + fail("%s changed: %d -> %d", vname[i].name, val[i], rval); return 1; } } From c97cc6a6ce9094ad78ccdaa3a174b5ff7ee07a11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 12 Sep 2022 16:17:43 +0200 Subject: [PATCH 222/775] Allow skipping iptables/nftables invocation. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make it possible to skip network lock to enable uses that break connections anyway to work without iptables/nftables being present. Signed-off-by: Michał Mirosław --- Documentation/criu.txt | 3 +++ criu/config.c | 2 ++ criu/cr-service.c | 3 +++ criu/include/cr_options.h | 1 + criu/net.c | 6 ++++++ criu/sk-tcp.c | 6 ++++++ images/rpc.proto | 1 + lib/c/criu.c | 2 +- lib/c/criu.h | 1 + 9 files changed, 24 insertions(+), 1 deletion(-) diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 0e7d19c4c..0c4cf8b61 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -457,6 +457,9 @@ The 'mode' may be one of the following: *nftables*::: Use nftables rules to drop the packets. 
+ *skip*::: Don't lock the network. If *--tcp-close* is not used, the network + must be locked externally to allow CRIU to dump TCP connections. + *restore* ~~~~~~~~~ Restores previously checkpointed processes. diff --git a/criu/config.c b/criu/config.c index 9f02ae992..1322a490a 100644 --- a/criu/config.c +++ b/criu/config.c @@ -1036,6 +1036,8 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, opts.network_lock_method = NETWORK_LOCK_IPTABLES; } else if (!strcmp("nftables", optarg)) { opts.network_lock_method = NETWORK_LOCK_NFTABLES; + } else if (!strcmp("skip", optarg) || !strcmp("none", optarg)) { + opts.network_lock_method = NETWORK_LOCK_SKIP; } else { pr_err("Invalid value for --network-lock: %s\n", optarg); return 1; diff --git a/criu/cr-service.c b/criu/cr-service.c index 915ba3870..fa7490370 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -526,6 +526,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) case CRIU_NETWORK_LOCK_METHOD__NFTABLES: opts.network_lock_method = NETWORK_LOCK_NFTABLES; break; + case CRIU_NETWORK_LOCK_METHOD__SKIP: + opts.network_lock_method = NETWORK_LOCK_SKIP; + break; default: goto err; } diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index c7e98c756..60cf9437e 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -67,6 +67,7 @@ struct cg_root_opt { enum NETWORK_LOCK_METHOD { NETWORK_LOCK_IPTABLES, NETWORK_LOCK_NFTABLES, + NETWORK_LOCK_SKIP, }; #define NETWORK_LOCK_DEFAULT NETWORK_LOCK_IPTABLES diff --git a/criu/net.c b/criu/net.c index 84250598c..4abfc182a 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3131,6 +3131,9 @@ int network_lock_internal(void) { int ret = 0, nsret; + if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; + if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) return -1; @@ -3193,6 +3196,9 @@ static int network_unlock_internal(void) { int ret = 0, nsret; + if (opts.network_lock_method == NETWORK_LOCK_SKIP) + 
return 0; + if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) return -1; diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c index 96d5d13bf..630a182a2 100644 --- a/criu/sk-tcp.c +++ b/criu/sk-tcp.c @@ -39,6 +39,8 @@ static int lock_connection(struct inet_sk_desc *sk) return iptables_lock_connection(sk); else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) return nftables_lock_connection(sk); + else if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; return -1; } @@ -50,6 +52,8 @@ static int unlock_connection(struct inet_sk_desc *sk) else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) /* All connections will be unlocked in network_unlock(void) */ return 0; + else if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; return -1; } @@ -483,6 +487,8 @@ static int unlock_connection_info(struct inet_sk_info *si) else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) /* All connections will be unlocked in network_unlock(void) */ return 0; + else if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; return -1; } diff --git a/images/rpc.proto b/images/rpc.proto index 79623f9f6..8748bdaff 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -52,6 +52,7 @@ enum criu_cg_mode { enum criu_network_lock_method { IPTABLES = 1; NFTABLES = 2; + SKIP = 3; }; enum criu_pre_dump_mode { diff --git a/lib/c/criu.c b/lib/c/criu.c index 0095bcc9b..7f766db85 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -1868,7 +1868,7 @@ void criu_set_pidfd_store_sk(int sk) int criu_local_set_network_lock(criu_opts *opts, enum criu_network_lock_method method) { opts->rpc->has_network_lock = true; - if (method == CRIU_NETWORK_LOCK_IPTABLES || method == CRIU_NETWORK_LOCK_NFTABLES) { + if (method == CRIU_NETWORK_LOCK_IPTABLES || method == CRIU_NETWORK_LOCK_NFTABLES || method == CRIU_NETWORK_LOCK_SKIP) { opts->rpc->network_lock = (CriuNetworkLockMethod)method; return 0; } diff --git a/lib/c/criu.h b/lib/c/criu.h index 3b9cedfd0..c1c607869 100644 --- 
a/lib/c/criu.h +++ b/lib/c/criu.h @@ -50,6 +50,7 @@ enum criu_cg_mode { enum criu_network_lock_method { CRIU_NETWORK_LOCK_IPTABLES = 1, CRIU_NETWORK_LOCK_NFTABLES = 2, + CRIU_NETWORK_LOCK_SKIP = 3, }; enum criu_pre_dump_mode { CRIU_PRE_DUMP_SPLICE = 1, CRIU_PRE_DUMP_READ = 2 }; From 304a309aed9fd28f5b55d9814c6e0e1febe02f04 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 18 Jun 2023 09:53:48 +0200 Subject: [PATCH 223/775] test/thp_disable: fix lint The fail() macro provides a new line character at the end of the message. This patch fixes the following lint check that currently fails in CI: $ git --no-pager grep -E '^\s*\<(pr_perror|fail)\>.*\\n"' test/zdtm/static/thp_disable.c: fail("prctl(GET_THP_DISABLE) returned unexpected value: %d != 1\n", ret); test/zdtm/static/thp_disable.c: fail("Flags changed %lx -> %lx\n", orig_flags, new_flags); test/zdtm/static/thp_disable.c: fail("Madvs changed %lx -> %lx\n", orig_madv, new_madv); test/zdtm/static/thp_disable.c: fail("post-migration prctl(GET_THP_DISABLE) returned unexpected value: %d != 1\n", ret); test/zdtm/static/thp_disable.c: fail("Flags changed %lx -> %lx\n", orig_flags, new_flags); test/zdtm/static/thp_disable.c: fail("Madvs changed %lx -> %lx\n", orig_madv, new_madv); Fixes: #2193 Signed-off-by: Radostin Stoyanov --- criu/kerndat.c | 2 +- test/zdtm/static/thp_disable.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index d38e8898e..4565e5307 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1325,7 +1325,7 @@ int kerndat_has_thp_disable(void) parse_vmflags(str, &flags, &madv, &io_pf); kdat.has_thp_disable = !(madv & (1 << MADV_NOHUGEPAGE)); if (!kdat.has_thp_disable) - pr_warn("prctl PR_SET_THP_DISABLE sets MADV_NOHUGEPAGE"); + pr_warn("prctl PR_SET_THP_DISABLE sets MADV_NOHUGEPAGE\n"); break; } } diff --git a/test/zdtm/static/thp_disable.c b/test/zdtm/static/thp_disable.c index eabb45650..55609f260 100644 --- 
a/test/zdtm/static/thp_disable.c +++ b/test/zdtm/static/thp_disable.c @@ -43,7 +43,7 @@ int main(int argc, char **argv) } if (ret != 1) { errno = 0; - fail("prctl(GET_THP_DISABLE) returned unexpected value: %d != 1\n", ret); + fail("prctl(GET_THP_DISABLE) returned unexpected value: %d != 1", ret); return -1; } @@ -53,12 +53,12 @@ int main(int argc, char **argv) errno = 0; if (orig_flags != new_flags) { - fail("Flags changed %lx -> %lx\n", orig_flags, new_flags); + fail("Flags changed %lx -> %lx", orig_flags, new_flags); return -1; } if (orig_madv != new_madv) { - fail("Madvs changed %lx -> %lx\n", orig_madv, new_madv); + fail("Madvs changed %lx -> %lx", orig_madv, new_madv); return -1; } @@ -72,7 +72,7 @@ int main(int argc, char **argv) } if (ret != 1) { errno = 0; - fail("post-migration prctl(GET_THP_DISABLE) returned unexpected value: %d != 1\n", ret); + fail("post-migration prctl(GET_THP_DISABLE) returned unexpected value: %d != 1", ret); return -1; } @@ -87,12 +87,12 @@ int main(int argc, char **argv) errno = 0; if (orig_flags != new_flags) { - fail("Flags changed %lx -> %lx\n", orig_flags, new_flags); + fail("Flags changed %lx -> %lx", orig_flags, new_flags); return -1; } if (orig_madv != new_madv) { - fail("Madvs changed %lx -> %lx\n", orig_madv, new_madv); + fail("Madvs changed %lx -> %lx", orig_madv, new_madv); return -1; } From ffa1e47fd81c13a315a095949100677505af4b1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 22:40:10 +0200 Subject: [PATCH 224/775] sockets: Increase the size of sockets hashmap to 16K. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During dump, CRIU stores the structs representing sockets in a statically sized hashmap of size 32. We have some (admittedly crazy) tasks that use tens of thousands of sockets, and seem to spend most of the dump time iterating over the linked lists of the map. 
16K is chosen arbitrarily, so that it reduces the lengths of the chains to few elements on average, while not introducing significant memory overhead. From: Radosław Burny Signed-off-by: Michał Mirosław --- criu/sockets.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/sockets.c b/criu/sockets.c index d17e0a986..560c76517 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -38,7 +38,7 @@ #define SOCK_DIAG_BY_FAMILY 20 #endif -#define SK_HASH_SIZE 32 +#define SK_HASH_SIZE (1 << 14) #ifndef SO_GET_FILTER #define SO_GET_FILTER SO_ATTACH_FILTER From eea0d6edeea67c830ac8edb212e1ad66b4757b8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 22:38:31 +0200 Subject: [PATCH 225/775] pipes: Plug pipe fd leak in "Unable to set a pipe size" error case. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From: Piotr Figiel Signed-off-by: Michał Mirosław --- criu/pipes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/pipes.c b/criu/pipes.c index 43ff06e3d..daada8830 100644 --- a/criu/pipes.c +++ b/criu/pipes.c @@ -434,7 +434,7 @@ int dump_one_pipe_data(struct pipe_data_dump *pd, int lfd, const struct fd_parms /* steal_pipe has to be able to fit all data from a target pipe */ if (fcntl(steal_pipe[1], F_SETPIPE_SZ, pipe_size) < 0) { pr_perror("Unable to set a pipe size"); - goto err; + goto err_close; } bytes = tee(lfd, steal_pipe[1], pipe_size, SPLICE_F_NONBLOCK); From e5c9bc4d087cdbed31fe27ef4b6adae5719e975b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 18:54:28 +0200 Subject: [PATCH 226/775] kerndat: Make socket feature probing work on IPv6-only host. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Try IPv6 if IPv4 sockets are not supported. 
Signed-off-by: Michał Mirosław --- criu/cr-check.c | 2 ++ criu/kerndat.c | 55 +++++++++++++++++++++++++++++++++++-------------- 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index a4166f76b..cb083b16c 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1086,6 +1086,8 @@ static int kerndat_tcp_repair_window(void) int sk, val = 1; sk = socket(AF_INET, SOCK_STREAM, 0); + if (sk < 0 && errno == EAFNOSUPPORT) + sk = socket(AF_INET6, SOCK_STREAM, 0); if (sk < 0) { pr_perror("Unable to create inet socket"); goto errn; diff --git a/criu/kerndat.c b/criu/kerndat.c index 4565e5307..b2e39cb40 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -12,7 +12,7 @@ #include #include #include -#include /* for sockaddr_in and inet_ntoa() */ +#include #include #include #include @@ -615,29 +615,52 @@ static int kerndat_iptables_has_xtlocks(void) return 0; } -int kerndat_tcp_repair(void) -{ - int sock, clnt = -1, yes = 1, exit_code = -1; - struct sockaddr_in addr; - socklen_t aux; +/* + * Unfortunately in C htonl() is not constexpr and cannot be used in a static + * initialization below. + */ +#define constant_htonl(x) \ + (__BYTE_ORDER == __BIG_ENDIAN ? 
(x) : \ + (((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \ + (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - inet_pton(AF_INET, "127.0.0.1", &(addr.sin_addr)); - addr.sin_port = 0; +static int kerndat_tcp_repair(void) +{ + static const struct sockaddr_in loopback_ip4 = { + .sin_family = AF_INET, + .sin_port = 0, + .sin_addr = { constant_htonl(INADDR_LOOPBACK) }, + }; + static const struct sockaddr_in6 loopback_ip6 = { + .sin6_family = AF_INET6, + .sin6_port = 0, + .sin6_addr = IN6ADDR_LOOPBACK_INIT, + }; + int sock, clnt = -1, yes = 1, exit_code = -1; + const struct sockaddr *addr; + struct sockaddr_storage listener_addr; + socklen_t addrlen; + + addr = (const struct sockaddr *)&loopback_ip4; + addrlen = sizeof(loopback_ip4); sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock < 0 && errno == EAFNOSUPPORT) { + addr = (const struct sockaddr *)&loopback_ip6; + addrlen = sizeof(loopback_ip6); + sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); + } if (sock < 0) { pr_perror("Unable to create a socket"); return -1; } - if (bind(sock, (struct sockaddr *)&addr, sizeof(addr))) { + if (bind(sock, addr, addrlen)) { pr_perror("Unable to bind a socket"); goto err; } - aux = sizeof(addr); - if (getsockname(sock, (struct sockaddr *)&addr, &aux)) { + addrlen = sizeof(listener_addr); + if (getsockname(sock, (struct sockaddr *)&listener_addr, &addrlen)) { pr_perror("Unable to get a socket name"); goto err; } @@ -647,13 +670,13 @@ int kerndat_tcp_repair(void) goto err; } - clnt = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + clnt = socket(addr->sa_family, SOCK_STREAM, IPPROTO_TCP); if (clnt < 0) { pr_perror("Unable to create a socket"); goto err; } - if (connect(clnt, (struct sockaddr *)&addr, sizeof(addr))) { + if (connect(clnt, (const struct sockaddr *)&listener_addr, addrlen)) { pr_perror("Unable to connect a socket"); goto err; } @@ -977,6 +1000,8 @@ int kerndat_sockopt_buf_lock(void) int 
sock; sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock < 0 && errno == EAFNOSUPPORT) + sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); if (sock < 0) { pr_perror("Unable to create a socket"); return -1; From 2bf10c8d279613b864cdd1844b0bba457bf2fcf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 18:23:16 +0200 Subject: [PATCH 227/775] restore: remove unused `secbits` field. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/include/restorer.h | 1 - 1 file changed, 1 deletion(-) diff --git a/criu/include/restorer.h b/criu/include/restorer.h index e232f5404..2475ee0bc 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -72,7 +72,6 @@ struct thread_creds_args { u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; - unsigned int secbits; char *lsm_profile; unsigned int *groups; char *lsm_sockcreate; From 1a1fa439c7d141e30fec81855521e97109d8d8b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 20 Jun 2022 20:36:31 +0200 Subject: [PATCH 228/775] build: Remove HAS_MEMFD test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test for HAS_MEMFD is empty and not used. Remove it.
Fixes: 5ee1ac1f28e6 ("criu: remove FEATURE_TEST_MEMFD") Change-Id: I43b8f0cfd50ce9bdf93dafb647377318df1deae8 Signed-off-by: Michał Mirosław --- Makefile.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.config b/Makefile.config index 270ec61c0..a13165aa7 100644 --- a/Makefile.config +++ b/Makefile.config @@ -78,7 +78,7 @@ export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ - SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW MEMFD_CREATE \ + SETPROCTITLE_INIT TCP_REPAIR_WINDOW MEMFD_CREATE \ OPENAT2 NO_LIBC_RSEQ_DEFS # $1 - config name From cbbe6c683d5181c572d1e9c2ed98c1fd41f3b80b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 20 Jun 2022 20:36:30 +0200 Subject: [PATCH 229/775] build: Debug system feature tests. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `make` without `-s` option will normally show the commands executed. In the case of detecting build environment features current makefile will cause detected features to be seen as 'echo #define' commands, but not detected ones will be silent. Change it so that all tried features can be seen (outside of make's silent mode) regardless of detection result. 
Signed-off-by: Michał Mirosław --- Makefile.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.config b/Makefile.config index a13165aa7..8f2b5208e 100644 --- a/Makefile.config +++ b/Makefile.config @@ -85,7 +85,8 @@ FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ define gen-feature-test ifeq ($$(call try-cc,$$(FEATURE_TEST_$(1)),$$(LIBS_FEATURES),$$(DEFINES)),true) $(Q) echo '#define CONFIG_HAS_$(1)' >> $$@ - $(Q) echo '' >> $$@ +else + $(Q) echo '// CONFIG_HAS_$(1) is not set' >> $$@ endif endef From 88f8fdda82a75d130a95a6691fc9141828a0025c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 20 Jun 2022 21:00:20 +0200 Subject: [PATCH 230/775] build: Fix LIBS vs LDFLAGS order. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit $LDFLAGS can contain `-Ldir`s that are required by '-lib's in $LIBS. Reverse the order so that `-L` options make effect. Signed-off-by: Michał Mirosław --- criu/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/Makefile b/criu/Makefile index 55bdb1b7a..c6050d582 100644 --- a/criu/Makefile +++ b/criu/Makefile @@ -85,7 +85,7 @@ $(obj)/%: pie $(obj)/criu: $(PROGRAM-BUILTINS) $(call msg-link, $@) - $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@ + $(Q) $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LIBS) $(WRAPFLAGS) $(GMONLDOPT) -rdynamic -o $@ UNIT-BUILTINS += $(obj)/util.o UNIT-BUILTINS += $(obj)/config.o @@ -102,7 +102,7 @@ $(obj)/unittest/built-in.o: .FORCE $(obj)/unittest/unittest: $(UNIT-BUILTINS) $(call msg-link, $@) - $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) -rdynamic -o $@ + $(Q) $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LIBS) $(WRAPFLAGS) -rdynamic -o $@ unittest: $(obj)/unittest/unittest $(Q) $(obj)/unittest/$@ From c5142104a2cba9684ea307901c9ce7153ae9b0e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 21 Apr 2023 15:51:41 
+0200 Subject: [PATCH 231/775] build: Use make-provided AR for building libzdtmtst. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make $(AR) used also for libzdtmtst build. Signed-off-by: Michał Mirosław --- test/zdtm/lib/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/lib/Makefile b/test/zdtm/lib/Makefile index 90bd28f9e..b574e1d3e 100644 --- a/test/zdtm/lib/Makefile +++ b/test/zdtm/lib/Makefile @@ -34,4 +34,4 @@ clean: clean-more $(LIB): $(LIBOBJ) $(E) " AR " $@ - $(Q)ar rcs $@ $^ + $(Q)$(AR) rcs $@ $^ From f043cb22af5558c339c085c21a869d46ffc35c39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 20 Jun 2022 20:36:28 +0200 Subject: [PATCH 232/775] build: Guard against libbsd's version of `__aligned`. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When trying to build CRIU with libbsd enabled the compilation fails due to duplicate definition of __aligned macro. Other such definitions are already wrapped with #ifndef make __aligned definition consistent and make it easier in the future to use the libbsd features if needed. Signed-off-by: Michał Mirosław --- include/common/compiler.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/common/compiler.h b/include/common/compiler.h index bd3de01df..1c9d3db8d 100644 --- a/include/common/compiler.h +++ b/include/common/compiler.h @@ -47,7 +47,9 @@ #define noinline __attribute__((noinline)) #endif +#ifndef __aligned #define __aligned(x) __attribute__((aligned(x))) +#endif /* * Macro to define stack alignment. From 3e0a8ffd6dbbacf3cca4b84e4046c619d7a23c28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 18:34:58 +0200 Subject: [PATCH 233/775] build: libnfnetlink: Remove nla_get_s32(). 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nla_get_s32() was added to libnl 3.2.7 in 2015. Remove CRIU's definition as it breaks build when statically linking the binary. From: Uros Prestor Signed-off-by: Michał Mirosław --- criu/libnetlink.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/criu/libnetlink.c b/criu/libnetlink.c index f0304b0db..c7a84a44d 100644 --- a/criu/libnetlink.c +++ b/criu/libnetlink.c @@ -214,8 +214,3 @@ int __wrap_nlmsg_parse(struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], in return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), policy); } - -int32_t nla_get_s32(const struct nlattr *nla) -{ - return *(const int32_t *)nla_data(nla); -} From 28d00563887de7fb4d9892bb980abc3450b01213 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 24 Jun 2023 11:39:21 +0100 Subject: [PATCH 234/775] action-scripts: allow shell scripts in rpc mode Container runtimes commonly use CRIU with RPC. However, this prevents the use of action-scripts set in a CRIU configuration file due to the explicit scripts mode introduced with the following commit: ac78f13bdfaee260dd4234f054bf4c5d2a373783 actions: Introduce explicit scripts mode This patch enables container checkpoint/restore with action-scripts specified via configuration file. 
Signed-off-by: Radostin Stoyanov --- criu/action-scripts.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/criu/action-scripts.c b/criu/action-scripts.c index ec0563e16..1c9a8f091 100644 --- a/criu/action-scripts.c +++ b/criu/action-scripts.c @@ -52,6 +52,9 @@ static int run_shell_scripts(const char *action) #define ENV_IMGDIR 0x1 #define ENV_ROOTPID 0x2 + if (list_empty(&scripts)) + return 0; + if (setenv("CRTOOLS_SCRIPT_ACTION", action, 1)) { pr_perror("Can't set CRTOOLS_SCRIPT_ACTION=%s", action); return -1; @@ -119,23 +122,24 @@ int run_scripts(enum script_actions act) pr_debug("Running %s scripts\n", action); - if (scripts_mode == SCRIPTS_NONE) + switch (scripts_mode) { + case SCRIPTS_NONE: return 0; - - if (scripts_mode == SCRIPTS_RPC) { + case SCRIPTS_RPC: ret = rpc_send_fd(act, -1); - goto out; - } - - if (scripts_mode == SCRIPTS_SHELL) { + if (ret) + break; + /* Enable scripts from config file in RPC mode (fallthrough) */ + case SCRIPTS_SHELL: ret = run_shell_scripts(action); - goto out; + break; + default: + BUG(); } - BUG(); -out: if (ret) pr_err("One of more action scripts failed\n"); + return ret; } @@ -143,8 +147,9 @@ int add_script(char *path) { struct script *script; - BUG_ON(scripts_mode == SCRIPTS_RPC); - scripts_mode = SCRIPTS_SHELL; + /* Set shell mode when a script is added but don't overwrite RPC mode */ + if (scripts_mode == SCRIPTS_NONE) + scripts_mode = SCRIPTS_SHELL; script = xmalloc(sizeof(struct script)); if (script == NULL) @@ -170,7 +175,6 @@ int add_rpc_notify(int sk) return -1; } - BUG_ON(scripts_mode == SCRIPTS_SHELL); scripts_mode = SCRIPTS_RPC; if (install_service_fd(RPC_SK_OFF, fd) < 0) From 82a0db036e3e039e5f659b6b8e56a10e4fcb4939 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 24 Jun 2023 11:58:47 +0100 Subject: [PATCH 235/775] docker/podman: test c/r with action-script Signed-off-by: Radostin Stoyanov --- scripts/ci/docker-test.sh | 3 +++ scripts/ci/podman-test.sh 
| 3 +++ 2 files changed, 6 insertions(+) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index beb7da6da..bd46d5dd3 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -28,6 +28,9 @@ CRIU_LOG='/criu.log' mkdir -p /etc/criu echo "log-file=$CRIU_LOG" > /etc/criu/runc.conf +# Test checkpoint/restore with action script +echo "action-script /usr/bin/true" | sudo tee /etc/criu/default.conf + export SKIP_CI_TEST=1 ./run-ci-tests.sh diff --git a/scripts/ci/podman-test.sh b/scripts/ci/podman-test.sh index 687acb8ff..72ad59a50 100755 --- a/scripts/ci/podman-test.sh +++ b/scripts/ci/podman-test.sh @@ -17,6 +17,9 @@ mkdir -p /etc/criu echo "manage-cgroups ignore" > /etc/criu/runc.conf sed -i 's/#runtime\s*=\s*.*/runtime = "runc"/' /usr/share/containers/containers.conf +# Test checkpoint/restore with action script +echo "action-script /usr/bin/true" | sudo tee /etc/criu/default.conf + podman info podman run --name cr -d docker.io/library/alpine /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done' From 439b522433f776e3664105270adc26472ab6db9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Thu, 31 Mar 2022 06:59:34 -0700 Subject: [PATCH 236/775] rpc: Support gathering external file list after freezing process tree. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New 'query-ext-files' action for `criu dump` is sent after freezing the process tree. This allows to defer gathering the external file list when the process tree is in a stable state and avoids race with the process creating and deleting files. 
Change-Id: Iae32149dc3992dea086f513ada52cf6863beaa1f Signed-off-by: Michał Mirosław --- Documentation/criu.txt | 5 ++++ criu/action-scripts.c | 15 ++++++++++++ criu/cr-dump.c | 3 +++ criu/cr-service.c | 43 +++++++++++++++++++++++++++++++++++ criu/include/action-scripts.h | 3 +++ 5 files changed, 69 insertions(+) diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 0c4cf8b61..606935790 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -155,6 +155,11 @@ not compatible with *--external* *dev*. notification message contains a file descriptor for the master pty + *query-ext-files*::: + called after the process tree is stopped and network is locked. + This hook is used only in the RPC mode. The notification reply + contains file ids to be added to external file list (may be empty). + *--unprivileged*:: This option tells *criu* to accept the limitations when running as non-root. Running as non-root requires *criu* at least to have diff --git a/criu/action-scripts.c b/criu/action-scripts.c index 1c9a8f091..6f7900186 100644 --- a/criu/action-scripts.c +++ b/criu/action-scripts.c @@ -31,6 +31,7 @@ static const char *action_names[ACT_MAX] = { [ACT_POST_RESUME] = "post-resume", [ACT_ORPHAN_PTS_MASTER] = "orphan-pts-master", [ACT_STATUS_READY] = "status-ready", + [ACT_QUERY_EXT_FILES] = "query-ext-files", }; struct script { @@ -115,6 +116,20 @@ int rpc_send_fd(enum script_actions act, int fd) return send_criu_rpc_script(act, (char *)action, rpc_sk, fd); } +int rpc_query_external_files(void) +{ + int rpc_sk; + + if (scripts_mode != SCRIPTS_RPC) + return 0; + + rpc_sk = get_service_fd(RPC_SK_OFF); + if (rpc_sk < 0) + return -1; + + return exec_rpc_query_external_files((char *)action_names[ACT_QUERY_EXT_FILES], rpc_sk); +} + int run_scripts(enum script_actions act) { int ret = 0; diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 90d763f49..340fb96ec 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2180,6 +2180,9 @@ int cr_dump_tasks(pid_t pid) 
if (network_lock()) goto err; + if (rpc_query_external_files()) + goto err; + if (collect_file_locks()) goto err; diff --git a/criu/cr-service.c b/criu/cr-service.c index fa7490370..f62245d5f 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -240,6 +240,49 @@ int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd) return 0; } +int exec_rpc_query_external_files(char *name, int sk) +{ + int i, ret; + CriuNotify cn = CRIU_NOTIFY__INIT; + CriuResp msg = CRIU_RESP__INIT; + CriuReq *req; + + cn.script = name; + + msg.type = CRIU_REQ_TYPE__NOTIFY; + msg.success = true; + msg.notify = &cn; + + ret = send_criu_msg_with_fd(sk, &msg, -1); + if (ret < 0) + return ret; + + ret = recv_criu_msg(sk, &req); + if (ret < 0) + return ret; + + if (req->type != CRIU_REQ_TYPE__NOTIFY || !req->notify_success) { + pr_err("RPC client reported script error\n"); + return -1; + } + + ret = 0; + if (req->opts) + for (i = 0; i < req->opts->n_external; i++) { + char *key = req->opts->external[i]; + pr_info("Adding external object: %s\n", key); + if (add_external(key)) { + pr_err("Failed to add external object: %s\n", key); + ret = -1; + } + } + else + pr_info("RPC NOTIFY %s: no `opts` returned.\n", name); + + criu_req__free_unpacked(req, NULL); + return ret; +} + static char images_dir[PATH_MAX]; static int setup_opts_from_req(int sk, CriuOpts *req) diff --git a/criu/include/action-scripts.h b/criu/include/action-scripts.h index 793698c27..6a331a32f 100644 --- a/criu/include/action-scripts.h +++ b/criu/include/action-scripts.h @@ -17,6 +17,7 @@ enum script_actions { ACT_PRE_RESUME, ACT_ORPHAN_PTS_MASTER, ACT_STATUS_READY, + ACT_QUERY_EXT_FILES, ACT_MAX }; @@ -25,6 +26,8 @@ extern int add_script(char *path); extern int add_rpc_notify(int sk); extern int run_scripts(enum script_actions); extern int rpc_send_fd(enum script_actions, int fd); +extern int rpc_query_external_files(void); +extern int exec_rpc_query_external_files(char *name, int sk); extern int 
send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd); #endif /* __CR_ACTION_SCRIPTS_H__ */ From 2aa9cb933394848935c8796669eb6c6788989f8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 17:38:33 +0200 Subject: [PATCH 237/775] rpc: Support setting images_dir by path. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Google's RPC client process is in a different pidns and has more privileges -- CRIU can't open its /proc/<pid>/fd/<fd>. For images_dir_fd to be useful here it would need to refer to a passed or CRIU's fd. From: Michał Cłapiński Change-Id: Icbfb5af6844b21939a15f6fbb5b02264c12341b1 Signed-off-by: Michał Mirosław --- criu/cr-service.c | 8 +++++++- images/rpc.proto | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index f62245d5f..61a04c5ff 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -382,8 +382,14 @@ static int setup_opts_from_req(int sk, CriuOpts *req) */ if (imgs_changed_by_rpc_conf) strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); - else + else if (req->images_dir_fd != -1) sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); + else if (req->images_dir) + strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); + else { + pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); + goto err; + } if (req->parent_img) SET_CHAR_OPTS(img_parent, req->parent_img); diff --git a/images/rpc.proto b/images/rpc.proto index 8748bdaff..1a4722a9c 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -61,7 +61,8 @@ enum criu_pre_dump_mode { }; message criu_opts { - required int32 images_dir_fd = 1; + required int32 images_dir_fd = 1 [default = -1]; + optional string images_dir = 68; /* used only if images_dir_fd == -1 */ optional int32 pid = 2; /* if not set on dump, will dump requesting process */ optional bool leave_running = 3; From
f2011e1c765600a0559b26c2f66b57641dd26afd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 17:31:57 +0200 Subject: [PATCH 238/775] util: Downgrade ignored errors to warnings. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the error is ignored it is not important enough - make it a warning instead. From: Mian Luo Change-Id: If2641c3d4e0a4d57fdf04e4570c49be55f526535 Signed-off-by: Michał Mirosław --- Makefile | 12 ++++++------ criu/include/log.h | 2 ++ criu/util.c | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 8efdb760d..201122a48 100644 --- a/Makefile +++ b/Makefile @@ -440,12 +440,12 @@ lint: shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh shellcheck -x test/others/config-file/*.sh codespell -S tags - # Do not append \n to pr_perror or fail - ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>.*\\n"' - # Do not use %m with pr_perror or fail - ! git --no-pager grep -E '^\s*\<(pr_(err|perror|warn|debug|info|msg)|fail)\>.*%m' - # Do not use errno with pr_perror or fail - ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>\(".*".*errno' + # Do not append \n to pr_perror, pr_pwarn or fail + ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>.*\\n"' + # Do not use %m with pr_* or fail + ! git --no-pager grep -E '^\s*\<(pr_(err|perror|warn|pwarn|debug|info|msg)|fail)\>.*%m' + # Do not use errno with pr_perror, pr_pwarn or fail + ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>\(".*".*errno' # End pr_(err|warn|msg|info|debug) with \n ! git --no-pager grep -En '^\s*\.*);$$' | grep -v '\\n' # No EOL whitespace for C files diff --git a/criu/include/log.h b/criu/include/log.h index 85e6dc2e7..cbed33007 100644 --- a/criu/include/log.h +++ b/criu/include/log.h @@ -60,6 +60,8 @@ void flush_early_log_buffer(int fd); #define pr_perror(fmt, ...) 
pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) +#define pr_pwarn(fmt, ...) pr_warn(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) + #endif /* CR_NOGLIBC */ #endif /* __CR_LOG_H__ */ diff --git a/criu/util.c b/criu/util.c index db96cf938..a4975b92f 100644 --- a/criu/util.c +++ b/criu/util.c @@ -1076,14 +1076,14 @@ void tcp_cork(int sk, bool on) { int val = on ? 1 : 0; if (setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val))) - pr_perror("Unable to restore TCP_CORK (%d)", val); + pr_pwarn("Unable to restore TCP_CORK (%d)", val); } void tcp_nodelay(int sk, bool on) { int val = on ? 1 : 0; if (setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val))) - pr_perror("Unable to restore TCP_NODELAY (%d)", val); + pr_pwarn("Unable to restore TCP_NODELAY (%d)", val); } static int get_sockaddr_in(struct sockaddr_storage *addr, char *host, unsigned short port) From bf8446ae5f7396860c6911abca75254d675e8a0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 21:39:05 +0200 Subject: [PATCH 239/775] kerndat: unexport kerndat_nsid() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kerndat_nsid() is not used outside kerndat.c. Make it static. 
Change-Id: I52e518ecb7c627cc1866e373411b2be3f71a2c9d Signed-off-by: Michał Mirosław --- criu/include/net.h | 1 - criu/kerndat.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/criu/include/net.h b/criu/include/net.h index 0da4cad13..5e8a84862 100644 --- a/criu/include/net.h +++ b/criu/include/net.h @@ -50,7 +50,6 @@ extern int kerndat_has_newifindex(void); extern int kerndat_link_nsid(void); extern int net_get_nsid(int rtsk, int fd, int *nsid); extern struct ns_id *net_get_root_ns(void); -extern int kerndat_nsid(void); extern void check_has_netns_ioc(int fd, bool *kdat_val, const char *name); extern int net_set_ext(struct ns_id *ns); extern struct ns_id *get_root_netns(void); diff --git a/criu/kerndat.c b/criu/kerndat.c index b2e39cb40..fbc5b99d0 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -703,7 +703,7 @@ err: return exit_code; } -int kerndat_nsid(void) +static int kerndat_nsid(void) { int nsid, sk; From eecc53d05a64e74e4d90b9f9dab7dec20844ef8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 21:33:16 +0200 Subject: [PATCH 240/775] kerndat: Don't fail on NETLINK/nsid support missing. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If not dumping netns nor connections, nsid support is not used. Don't fail the run as if the support is needed, the dumping process will fail later. 
Change-Id: I39a086756f6d520c73bb6b21eaf6d9fb49a18879 Signed-off-by: Michał Mirosław --- criu/kerndat.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index fbc5b99d0..597fe5d92 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -707,16 +707,18 @@ static int kerndat_nsid(void) { int nsid, sk; + kdat.has_nsid = false; + sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sk < 0) { - pr_perror("Unable to create a netlink socket"); - return -1; + pr_pwarn("Unable to create a netlink socket: NSID can't be used."); + return 0; } if (net_get_nsid(sk, getpid(), &nsid) < 0) { - pr_err("NSID is not supported\n"); + pr_warn("NSID is not supported\n"); close(sk); - return -1; + return 0; } kdat.has_nsid = true; From 9ad59f58ff2a3c8a3cfa2a218fba8f7d64c1d8e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 9 Dec 2022 15:24:32 +0100 Subject: [PATCH 241/775] util: Make CRIU run_id machine-level unique. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of relying on chance of CLOCK_MONOTONIC reading being unique, use pid namespace ID that combined with the process ID will make it unique on the machine level. If pidns is not enabled on a kernel we'll get ENOENT, but then CRIU's pid will already be unique. If there is some other error, log it but continue, as the socket clash (if it happens) will result in a failed run anyway. 
Fixes: 45e048d77a6a (2022-03-31 "criu: generate unique socket names") Fixes: 408a7d82d644 (2022-02-12 "util: add an unique ID of the current criu run") Change-Id: I111c006e1b5b1db8932232684c976a84f4256e49 Signed-off-by: Michał Mirosław --- criu/util.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/criu/util.c b/criu/util.c index a4975b92f..744ec6032 100644 --- a/criu/util.c +++ b/criu/util.c @@ -1880,11 +1880,16 @@ uint64_t criu_run_id; void util_init(void) { - struct timespec tp; + struct stat statbuf; + + criu_run_id = getpid(); + if (!stat("/proc/self/ns/pid", &statbuf)) + criu_run_id |= (uint64_t)statbuf.st_ino << 32; + else if (errno != ENOENT) + pr_perror("Can't stat /proc/self/ns/pid - CRIU run id might not be unique"); - clock_gettime(CLOCK_MONOTONIC, &tp); - criu_run_id = ((uint64_t)getpid() << 32) + tp.tv_sec + tp.tv_nsec; compel_run_id = criu_run_id; + pr_info("CRIU run id = %#" PRIx64 "\n", criu_run_id); } /* From 7bda5e656cd891109cee67715ddae15db7bea5c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 19 Jun 2023 11:29:22 +0200 Subject: [PATCH 242/775] zdtm: Update netns purpose comment in zdtm_ct. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the parasite socket clash now guaranteed not to happen, the comment becomes obsolete. netns is still needed though, so update the comment to point at the requirement.
Change-Id: I3cfb253cd5c53b91b955fcb001530b4aee5129f4 Signed-off-by: Michał Mirosław --- test/zdtm_ct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm_ct.c b/test/zdtm_ct.c index 5e849b904..44316893d 100644 --- a/test/zdtm_ct.c +++ b/test/zdtm_ct.c @@ -102,7 +102,7 @@ int main(int argc, char **argv) /* * pidns is used to avoid conflicts * mntns is used to mount /proc - * net is used to avoid conflicts of parasite sockets + * net is used to avoid conflicts between network tests */ if (!uid) if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC)) From 94c9e478727b5ce2571c7659939e7bfbe7af17f0 Mon Sep 17 00:00:00 2001 From: Abhishek Guleri Date: Thu, 6 Jul 2023 16:19:45 +0530 Subject: [PATCH 243/775] readme: refactor asciinema link for video playback Instead of opening the image directly, the commit refactors the asciinema image embedded link to redirect users to the corresponding video. Signed-off-by: Abhishek Guleri --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ff4aa1a23..11d1c490b 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ Pages worth starting with are: - Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/FAQ) ### Checkpoint and restore of simple loop process -[

](https://asciinema.org/a/232445) +

## Advanced features From 362d8fa5c22da1c930253cb6cf76b29bc37d4dd3 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 20 Jun 2023 07:51:26 +0100 Subject: [PATCH 244/775] ci: disable CentOS 7 test in Cirrus CI Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 32 ----------------------- scripts/build/Dockerfile.centos7 | 45 -------------------------------- 2 files changed, 77 deletions(-) delete mode 100644 scripts/build/Dockerfile.centos7 diff --git a/.cirrus.yml b/.cirrus.yml index 80f3296fc..8b8212d69 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -120,38 +120,6 @@ task: build_script: | make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" -task: - name: CentOS 7 based test - environment: - HOME: "/root" - CIRRUS_WORKING_DIR: "/tmp/criu" - - compute_engine_instance: - image_project: centos-cloud - image: family/centos-7 - platform: linux - cpu: 4 - memory: 8G - - setup_script: | - # EPEL is needed for python2-future, python2-junit_xml, python-pathlib, python-flake8 and libbsd-devel. 
- # Do not fail if latest epel repository definition is already installed - yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm || : - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto - yum install -y findutils gcc git gnutls-devel iproute iptables libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make procps-ng protobuf-c-devel protobuf-devel protobuf-python python python-flake8 python-ipaddress python2-future python2-junit_xml python-yaml python-six python-pathlib sudo tar which e2fsprogs python2-pip rubygem-asciidoctor libselinux-devel - # Even with selinux in permissive mode the selinux tests will be executed - # The Cirrus CI user runs as a service from selinux point of view and is - # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0) - # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode - setenforce 0 - # Enable user namespaces on CentOS 7 - echo 10000 > /proc/sys/user/max_user_namespaces - # Adapt sudoers to our needs - echo 'root ALL=(ALL:ALL) ALL' | EDITOR='tee -a' visudo - - build_script: | - make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_IGNORE_TAINT=1 ZDTM_OPTS="-x zdtm/static/socket-raw -x zdtm/static/child_subreaper_existing_child -x zdtm/static/fifo_upon_unix_socket01 -x zdtm/static/overmount_sock -x zdtm/static/tempfs_overmounted" - task: name: aarch64 build GCC (native) arm_container: diff --git a/scripts/build/Dockerfile.centos7 b/scripts/build/Dockerfile.centos7 deleted file mode 100644 index 21e70ff0e..000000000 --- a/scripts/build/Dockerfile.centos7 +++ /dev/null @@ -1,45 +0,0 @@ -FROM centos:7 - -ARG CC=gcc - -RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm -RUN yum install -y \ - findutils \ - gcc \ - git \ - gnutls-devel \ - iproute \ - iptables \ - libaio-devel \ - libasan \ - libcap-devel \ - libnet-devel \ - 
libnl3-devel \ - make \ - procps-ng \ - protobuf-c-devel \ - protobuf-devel \ - protobuf-python \ - python \ - python-flake8 \ - python-ipaddress \ - python2-future \ - python2-junit_xml \ - python-yaml \ - python-six \ - sudo \ - tar \ - which \ - e2fsprogs \ - python2-pip \ - rubygem-asciidoctor - -COPY . /criu -WORKDIR /criu - -RUN make mrproper && date && make -j $(nproc) CC="$CC" && date - -# The rpc test cases are running as user #1000, let's add the user -RUN adduser -u 1000 test - -RUN make -C test/zdtm -j $(nproc) From 05f0535de41870812dd64b9488359b655a9f809e Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 20 Jun 2023 07:58:33 +0100 Subject: [PATCH 245/775] ci: clean up CentOS 7 related tweaks We have disabled CentOS 7 tests in CI. This patch reverts the changes introduced in the following commit: 24bc083653f7d2b984653194e921b1ff32292b3b ci: disable some tests on CentOS 7 Signed-off-by: Radostin Stoyanov --- test/others/ns_ext/run.sh | 2 -- test/others/ns_ext/run_pidns.sh | 3 --- test/others/rpc/run.sh | 10 +--------- 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/test/others/ns_ext/run.sh b/test/others/ns_ext/run.sh index e416f95e5..4ebe3e280 100755 --- a/test/others/ns_ext/run.sh +++ b/test/others/ns_ext/run.sh @@ -4,8 +4,6 @@ set -x if [[ "$1" == "pid" ]]; then NS=pid - # CentOS 7 kernels do not have NSpid -> skip this test - grep NSpid /proc/self/status || exit 0 else NS=net fi diff --git a/test/others/ns_ext/run_pidns.sh b/test/others/ns_ext/run_pidns.sh index 08c5bff8e..db12106e0 100755 --- a/test/others/ns_ext/run_pidns.sh +++ b/test/others/ns_ext/run_pidns.sh @@ -2,9 +2,6 @@ set -e -# CentOS 7 kernels do not have NSpid -> skip this test -grep NSpid /proc/self/status || exit 0 - # This test creates a process in non-host pidns and then dumps it and restores # it into host pidns. 
We use pid >100000 in non-host pidns to make sure it does # not intersect with some host pid on restore but it is potentially racy so diff --git a/test/others/rpc/run.sh b/test/others/rpc/run.sh index 9be577587..afd4fb5e3 100755 --- a/test/others/rpc/run.sh +++ b/test/others/rpc/run.sh @@ -1,14 +1,6 @@ #!/bin/bash -set -ex - -if [ -e /etc/os-release ]; then - . /etc/os-release - if [ "$ID" == "centos" ] && [[ "$VERSION_ID" == "7"* ]];then - echo "Skipping tests on CentOS 7 because they do not work in CI" - exit 0 - fi -fi +set -e CRIU=./criu From 7825f4ebfadbd23dbf456d0eacce98154c076174 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 20 Jun 2023 08:34:23 +0100 Subject: [PATCH 246/775] coredump: drop python 2 support This patch reverts changes introduced for Python 2 compatibility in commits: 1c866db (Add new files for running criu-coredump via python 2 or 3) 3180d35 (Add support for python3 in criu-coredump). Signed-off-by: Radostin Stoyanov --- Makefile | 2 +- coredump/{coredump.py => coredump} | 1 + coredump/coredump-python2 | 6 ------ coredump/coredump-python3 | 6 ------ coredump/criu_coredump/coredump.py | 16 ++-------------- test/others/env.sh | 2 +- 6 files changed, 5 insertions(+), 28 deletions(-) rename coredump/{coredump.py => coredump} (98%) mode change 100644 => 100755 delete mode 100755 coredump/coredump-python2 delete mode 100755 coredump/coredump-python3 diff --git a/Makefile b/Makefile index 201122a48..1030a5a4e 100644 --- a/Makefile +++ b/Makefile @@ -431,7 +431,7 @@ lint: flake8 --config=scripts/flake8.cfg test/others/criu-ns/run.py flake8 --config=scripts/flake8.cfg crit/setup.py flake8 --config=scripts/flake8.cfg scripts/uninstall_module.py - flake8 --config=scripts/flake8.cfg coredump/ + flake8 --config=scripts/flake8.cfg coredump/ coredump/coredump shellcheck --version shellcheck scripts/*.sh shellcheck scripts/ci/*.sh scripts/ci/apt-install diff --git a/coredump/coredump.py b/coredump/coredump old mode 100644 new mode 100755 
similarity index 98% rename from coredump/coredump.py rename to coredump/coredump index 88a1b374c..f70d37c13 --- a/coredump/coredump.py +++ b/coredump/coredump @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import argparse import os import sys diff --git a/coredump/coredump-python2 b/coredump/coredump-python2 deleted file mode 100755 index 564c05ce9..000000000 --- a/coredump/coredump-python2 +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python2 - -import coredump - -if __name__ == '__main__': - coredump.main() diff --git a/coredump/coredump-python3 b/coredump/coredump-python3 deleted file mode 100755 index 3032dbadf..000000000 --- a/coredump/coredump-python3 +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python3 - -import coredump - -if __name__ == '__main__': - coredump.main() diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 8ee402676..0b8a02e0a 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -35,12 +35,6 @@ import ctypes from pycriu import images from . 
import elf - -try: - from itertools import ifilter as filter -except ImportError: - pass - # Some memory-related constants PAGESIZE = 4096 status = { @@ -318,10 +312,7 @@ class coredump_generator: # prpsinfo.pr_psargs has a limit of 80 characters which means it will # fail here if the cmdline is longer than 80 prpsinfo.pr_psargs = self._gen_cmdline(pid)[:80] - if (sys.version_info > (3, 0)): - prpsinfo.pr_fname = core["tc"]["comm"].encode() - else: - prpsinfo.pr_fname = core["tc"]["comm"] + prpsinfo.pr_fname = core["tc"]["comm"].encode() nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 @@ -581,10 +572,7 @@ class coredump_generator: setattr(data, "start" + str(i), info.start) setattr(data, "end" + str(i), info.end) setattr(data, "file_ofs" + str(i), info.file_ofs) - if (sys.version_info > (3, 0)): - setattr(data, "name" + str(i), info.name.encode()) - else: - setattr(data, "name" + str(i), info.name) + setattr(data, "name" + str(i), info.name.encode()) nhdr = elf.Elf64_Nhdr() diff --git a/test/others/env.sh b/test/others/env.sh index 45066f760..a76207360 100755 --- a/test/others/env.sh +++ b/test/others/env.sh @@ -13,5 +13,5 @@ fi #export PYTHON CRIT=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../crit/crit-"${PYTHON}") crit=$CRIT -CRIU_COREDUMP=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../coredump/coredump-"${PYTHON}") +CRIU_COREDUMP=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../coredump/coredump) criu_coredump=$CRIU_COREDUMP From f5d06571c58e2a052863226753018c1f5d42d693 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 20 Jun 2023 08:43:34 +0100 Subject: [PATCH 247/775] crit: drop python 2 support This patch reverts changes introduced with the following commits: 4feb07020dedbf845fc00268d8ca02f4645641cd crit: enable python2 or python3 based crit b78c4e071a42ebe34aac82fa0711df07ed375e2b test: fix crit test and extend it Signed-off-by: Radostin Stoyanov --- .gitignore | 1 - Makefile | 16 ++++------------ crit/Makefile | 13 ------------- crit/{crit-python3 => 
crit} | 2 +- crit/crit-python2 | 6 ------ lib/Makefile | 2 +- lib/py/cli.py | 1 + test/others/env.sh | 11 +---------- 8 files changed, 8 insertions(+), 44 deletions(-) delete mode 100644 crit/Makefile rename crit/{crit-python3 => crit} (79%) delete mode 100755 crit/crit-python2 diff --git a/.gitignore b/.gitignore index 1ea828bbc..2f2ab2029 100644 --- a/.gitignore +++ b/.gitignore @@ -25,7 +25,6 @@ images/google/protobuf/*.h .gitid criu/criu criu/unittest/unittest -crit/crit criu/arch/*/sys-exec-tbl*.c # x86 syscalls-table is not generated !criu/arch/x86/sys-exec-tbl.c diff --git a/Makefile b/Makefile index 1030a5a4e..020ef85fc 100644 --- a/Makefile +++ b/Makefile @@ -156,7 +156,7 @@ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS # Default target -all: criu lib crit +all: criu lib .PHONY: all # @@ -259,26 +259,19 @@ criu: $(criu-deps) $(Q) $(MAKE) $(build)=criu all .PHONY: criu -crit/Makefile: ; -crit/%: criu .FORCE - $(Q) $(MAKE) $(build)=crit $@ -crit: criu - $(Q) $(MAKE) $(build)=crit all -.PHONY: crit - unittest: $(criu-deps) $(Q) $(MAKE) $(build)=criu unittest .PHONY: unittest # -# Libraries next once crit it ready +# Libraries next once criu is ready # (we might generate headers and such # when building criu itself). 
lib/Makefile: ; -lib/%: crit .FORCE +lib/%: criu .FORCE $(Q) $(MAKE) $(build)=lib $@ -lib: crit +lib: criu $(Q) $(MAKE) $(build)=lib all .PHONY: lib @@ -290,7 +283,6 @@ clean mrproper: $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ $(Q) $(MAKE) $(build)=lib $@ - $(Q) $(MAKE) $(build)=crit $@ .PHONY: clean mrproper clean-amdgpu_plugin: diff --git a/crit/Makefile b/crit/Makefile deleted file mode 100644 index 988b481b6..000000000 --- a/crit/Makefile +++ /dev/null @@ -1,13 +0,0 @@ - -all-y += crit - -crit/crit: crit/crit-$(PYTHON) - $(Q) cp $^ $@ -crit: crit/crit -.PHONY: crit - -clean-crit: - $(Q) $(RM) crit/crit -.PHONY: clean-crit -clean: clean-crit -mrproper: clean diff --git a/crit/crit-python3 b/crit/crit similarity index 79% rename from crit/crit-python3 rename to crit/crit index 80467cba7..3b15ca654 100755 --- a/crit/crit-python3 +++ b/crit/crit @@ -3,4 +3,4 @@ from pycriu import cli if __name__ == '__main__': - cli.main() + cli.main() diff --git a/crit/crit-python2 b/crit/crit-python2 deleted file mode 100755 index b0b7d3c3a..000000000 --- a/crit/crit-python2 +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python2 - -from pycriu import cli - -if __name__ == '__main__': - cli.main() diff --git a/lib/Makefile b/lib/Makefile index ff540fb75..7ed73f9ab 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -41,7 +41,7 @@ clean: clean-lib cleanup-y += lib/c/$(CRIU_SO) lib/c/$(CRIU_A) lib/c/criu.pc mrproper: clean -install: lib-c lib-a lib-py crit/crit lib/c/criu.pc.in +install: lib-c lib-a lib-py lib/c/criu.pc.in $(E) " INSTALL " lib $(Q) mkdir -p $(DESTDIR)$(LIBDIR) $(Q) install -m 755 lib/c/$(CRIU_SO) $(DESTDIR)$(LIBDIR)/$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR).$(CRIU_SO_VERSION_MINOR) diff --git a/lib/py/cli.py b/lib/py/cli.py index 5419384c3..82079c7f4 100755 --- a/lib/py/cli.py +++ b/lib/py/cli.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 from __future__ import print_function import argparse import sys diff --git a/test/others/env.sh 
b/test/others/env.sh index a76207360..6d830fb58 100755 --- a/test/others/env.sh +++ b/test/others/env.sh @@ -2,16 +2,7 @@ CRIU=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../criu/criu) criu=$CRIU -if [ $(which python3) ]; then - PYTHON=python3 -elif [ $(which python2) ]; then - PYTHON=python2 -else - echo "FAIL: Neither python3 nor python2" - exit 1 -fi -#export PYTHON -CRIT=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../crit/crit-"${PYTHON}") +CRIT=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../crit/crit) crit=$CRIT CRIU_COREDUMP=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../coredump/coredump) criu_coredump=$CRIU_COREDUMP From 376f3d18004fdc72d101eeb6c80f73c3d908323f Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 18 Jun 2023 14:20:04 +0200 Subject: [PATCH 248/775] crit: add requirements.txt for pip>=20.1 When building with pip version 20.0.2 or older, the pip install command creates a temporary directory and copies all files from ./crit. This results in the following error message: ModuleNotFoundError: No module named 'pycriu' This error appears because the symlink 'pycriu' uses a relative path that becomes invalid '../lib/py/'. The '--no-build-isolation' option for pip install is needed to enable the use of pre-installed dependencies (e.g., protobuf) during build. The '--ignore-installed' option for pip is needed to avoid an error when crit is already installed. For example, crit is installed in the GitHub CI environment as part of the criu OBS package as a dependency for podman. Distributions such as Arch Linux have adopted an externally managed python installation in compliance with PEP 668 [1] that prevents pip from breaking the system by either installing packages to the system or locally in the home folder. The '--break-system-packages' [2] option allows pip to modify an externally managed Python installation. 
[1] https://peps.python.org/pep-0668/ [2] https://pip.pypa.io/en/stable/cli/pip_uninstall/ Signed-off-by: Radostin Stoyanov --- crit/pyproject.toml | 3 ++- crit/requirements.txt | 7 +++++++ lib/Makefile | 27 ++++++++++++++++++++++++--- 3 files changed, 33 insertions(+), 4 deletions(-) create mode 100644 crit/requirements.txt diff --git a/crit/pyproject.toml b/crit/pyproject.toml index b1e1a4650..019b0d848 100644 --- a/crit/pyproject.toml +++ b/crit/pyproject.toml @@ -1,2 +1,3 @@ [build-system] -requires = ["setuptools"] +# Minimum requirements for the build system to execute. +requires = ["setuptools", "wheel"] # PEP 508 specifications. diff --git a/crit/requirements.txt b/crit/requirements.txt new file mode 100644 index 000000000..c27e6d4f0 --- /dev/null +++ b/crit/requirements.txt @@ -0,0 +1,7 @@ +# We need pip version 20.1 or newer to correctly build with 'pycriu' symlink. +# - Building of local directories with pip 20.1 or newer is done in place, +# instead of a temporary location containing a copy of the directory tree. +# (https://github.com/pypa/pip/issues/7555) +pip>=20.1 +setuptools>=42.0.0 +wheel diff --git a/lib/Makefile b/lib/Makefile index 7ed73f9ab..32d238de4 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -4,6 +4,9 @@ UAPI_HEADERS := lib/c/criu.h images/rpc.proto images/rpc.pb-c.h criu/include/ve all-y += lib-c lib-a lib-py +PYTHON_EXTERNALLY_MANAGED := $(shell $(PYTHON) -c 'import os, sysconfig; print(int(os.path.isfile(os.path.join(sysconfig.get_path("stdlib"), "EXTERNALLY-MANAGED"))))') +PIP_BREAK_SYSTEM_PACKAGES := 0 + # # C language bindings. 
lib/c/Makefile: ; @@ -54,9 +57,19 @@ install: lib-c lib-a lib-py lib/c/criu.pc.in $(Q) mkdir -p $(DESTDIR)$(LIBDIR)/pkgconfig $(Q) sed -e 's,@version@,$(CRIU_VERSION),' -e 's,@libdir@,$(LIBDIR),' -e 's,@includedir@,$(dir $(INCLUDEDIR)/criu/),' lib/c/criu.pc.in > lib/c/criu.pc $(Q) install -m 644 lib/c/criu.pc $(DESTDIR)$(LIBDIR)/pkgconfig -ifeq ($(PYTHON),python3) +ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) +ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) + $(E) " SKIP INSTALL crit: Externally managed python environment (See PEP 668 for more information)" + $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make install" +else $(E) " INSTALL " crit - $(Q) $(PYTHON) -m pip install --upgrade --force-reinstall --prefix=$(DESTDIR)$(PREFIX) ./crit + $(Q) $(PYTHON) -m pip install -r ./crit/requirements.txt + $(Q) $(PYTHON) -m pip install --no-build-isolation --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit +endif +else + $(E) " INSTALL " crit + $(Q) $(PYTHON) -m pip install -r ./crit/requirements.txt + $(Q) $(PYTHON) -m pip install --no-build-isolation --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit endif .PHONY: install @@ -69,7 +82,15 @@ uninstall: $(Q) $(RM) $(addprefix $(DESTDIR)$(INCLUDEDIR)/criu/,$(notdir $(UAPI_HEADERS))) $(E) " UNINSTALL" pkgconfig/criu.pc $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/pkgconfig/,criu.pc) -ifeq ($(PYTHON),python3) +ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) +ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) + $(E) " SKIP UNINSTALL crit: Externally managed python environment (See PEP 668 for more information)" + $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make uninstall" +else + $(E) " UNINSTALL" crit + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit +endif +else $(E) " UNINSTALL" crit $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit endif From 642fd99bfdaf1dfb006f800f5e048e4dcf436dac Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 21 Jun 2023 23:48:32 
+0100 Subject: [PATCH 249/775] remove python-future dependency This commit removes the dependency on the __future__ module, which was used to enable Python 3 features in Python 2 code. With support for Python 2 being dropped, it is no longer necessary to maintain backward compatibility. Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 4 ++-- .lgtm.yml | 1 - contrib/debian/dev-packages.lst | 1 - criu/Makefile.packages | 2 -- lib/py/cli.py | 1 - scripts/build/Dockerfile.amd-rocm | 1 - scripts/build/Dockerfile.centos8 | 1 - scripts/build/Dockerfile.hotspot-ubuntu | 2 -- scripts/build/Dockerfile.linux32.tmpl | 3 +-- scripts/build/Dockerfile.openj9-ubuntu | 1 - scripts/build/Dockerfile.tmpl | 3 +-- scripts/ci/prepare-for-fedora-rawhide.sh | 1 - scripts/ci/run-ci-tests.sh | 4 ++-- scripts/ci/vagrant.sh | 2 +- soccr/test/tcp-test.py | 3 +-- test/zdtm.py | 8 +------- 16 files changed, 9 insertions(+), 29 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 8b8212d69..e559ec772 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -36,7 +36,7 @@ task: ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-future python-protobuf python-junit_xml python3-importlib-metadata python-flake8 xmlto + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata python-flake8 xmlto systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed. 
# The Cirrus CI user runs as a service from selinux point of view and is @@ -108,7 +108,7 @@ task: yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm || : yum install -y dnf-plugins-core yum config-manager --set-enabled powertools - yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-future python3-protobuf python3-importlib-metadata python3-junit_xml xmlto + yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-protobuf python3-importlib-metadata python3-junit_xml xmlto alternatives --set python /usr/bin/python3 systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed diff --git a/.lgtm.yml b/.lgtm.yml index a884a53ef..0dd49cda4 100644 --- a/.lgtm.yml +++ b/.lgtm.yml @@ -22,7 +22,6 @@ extraction: - "libbsd-dev" - "python3-yaml" - "libnl-route-3-dev" - - "python-future" - "gnutls-dev" configure: command: diff --git a/contrib/debian/dev-packages.lst b/contrib/debian/dev-packages.lst index c2d1509fa..ce45f1b7c 100644 --- a/contrib/debian/dev-packages.lst +++ b/contrib/debian/dev-packages.lst @@ -17,4 +17,3 @@ libcap-dev libaio-dev python3-yaml libnl-route-3-dev -python-future diff --git a/criu/Makefile.packages b/criu/Makefile.packages index 13c346f44..f436737fd 100644 --- a/criu/Makefile.packages +++ b/criu/Makefile.packages @@ -6,7 +6,6 @@ REQ-RPM-PKG-NAMES += protobuf-devel REQ-RPM-PKG-NAMES += protobuf-python REQ-RPM-PKG-NAMES += libnl3-devel REQ-RPM-PKG-NAMES += libcap-devel -REQ-RPM-PKG-NAMES += $(PYTHON)-future REQ-RPM-PKG-TEST-NAMES += libaio-devel @@ -15,7 +14,6 @@ REQ-DEB-PKG-NAMES += libprotobuf-c-dev REQ-DEB-PKG-NAMES 
+= protobuf-c-compiler REQ-DEB-PKG-NAMES += protobuf-compiler REQ-DEB-PKG-NAMES += $(PYTHON)-protobuf -REQ-DEB-PKG-NAMES += $(PYTHON)-future REQ-DEB-PKG-NAMES += libnl-3-dev REQ-DEB-PKG-NAMES += libcap-dev diff --git a/lib/py/cli.py b/lib/py/cli.py index 82079c7f4..221f7be0d 100755 --- a/lib/py/cli.py +++ b/lib/py/cli.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -from __future__ import print_function import argparse import sys import json diff --git a/scripts/build/Dockerfile.amd-rocm b/scripts/build/Dockerfile.amd-rocm index c0d181b03..c466a73d2 100644 --- a/scripts/build/Dockerfile.amd-rocm +++ b/scripts/build/Dockerfile.amd-rocm @@ -55,7 +55,6 @@ RUN apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-insta protobuf-compiler \ python-protobuf \ python3-minimal \ - python3-future \ python-ipaddress \ curl \ wget \ diff --git a/scripts/build/Dockerfile.centos8 b/scripts/build/Dockerfile.centos8 index 488f95d65..b06524674 100644 --- a/scripts/build/Dockerfile.centos8 +++ b/scripts/build/Dockerfile.centos8 @@ -28,7 +28,6 @@ RUN yum install -y --allowerasing \ python3-devel \ python3-flake8 \ python3-PyYAML \ - python3-future \ python3-protobuf \ python3-pip \ sudo \ diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 350102818..0318f650f 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -6,7 +6,6 @@ COPY scripts/ci/apt-install /bin/apt-install RUN apt-install protobuf-c-compiler \ libprotobuf-c-dev \ libaio-dev \ - python3-future \ libprotobuf-dev \ protobuf-compiler \ libcap-dev \ @@ -31,4 +30,3 @@ WORKDIR /criu RUN make mrproper && make -j $(nproc) CC="$CC" ENTRYPOINT mvn -q -f test/javaTests/pom.xml test - diff --git a/scripts/build/Dockerfile.linux32.tmpl b/scripts/build/Dockerfile.linux32.tmpl index a15038631..13e992642 100644 --- a/scripts/build/Dockerfile.linux32.tmpl +++ b/scripts/build/Dockerfile.linux32.tmpl @@ -21,8 +21,7 @@ RUN apt-install 
\ pkg-config \ protobuf-c-compiler \ protobuf-compiler \ - python3-minimal \ - python3-future + python3-minimal COPY . /criu WORKDIR /criu diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 23db14e8d..c2cf20a36 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -6,7 +6,6 @@ COPY scripts/ci/apt-install /bin/apt-install RUN apt-install protobuf-c-compiler \ libprotobuf-c-dev \ libaio-dev \ - python3-future \ libprotobuf-dev \ protobuf-compiler \ libcap-dev \ diff --git a/scripts/build/Dockerfile.tmpl b/scripts/build/Dockerfile.tmpl index e0e72372d..9f6b1d096 100644 --- a/scripts/build/Dockerfile.tmpl +++ b/scripts/build/Dockerfile.tmpl @@ -30,8 +30,7 @@ RUN apt-install \ python-is-python3 \ python3-minimal \ python3-protobuf \ - python3-yaml \ - python3-future + python3-yaml COPY . /criu WORKDIR /criu diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index 7c62aaaa2..1c8a46fbf 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -24,7 +24,6 @@ dnf install -y \ protobuf-devel \ python3-flake8 \ python3-PyYAML \ - python3-future \ python3-protobuf \ python3-junit_xml \ python3-pip \ diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index b45183a84..6d837fe06 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -5,8 +5,8 @@ CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor libnl-route-3-dev time flake8 libbsd-dev python3-yaml - libperl-dev pkg-config python3-future python3-protobuf - python3-pip python3-importlib-metadata python3-junit.xml) + libperl-dev pkg-config python3-protobuf python3-pip + python3-importlib-metadata python3-junit.xml) X86_64_PKGS=(gcc-multilib) diff --git a/scripts/ci/vagrant.sh 
b/scripts/ci/vagrant.sh index 5cc842442..ac4b5579d 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -38,7 +38,7 @@ setup() { ssh default sudo dnf upgrade -y ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ - protobuf-devel python3-flake8 python3-future python3-protobuf python3-importlib-metadata \ + protobuf-devel python3-flake8 python3-protobuf python3-importlib-metadata \ python3-junit_xml rubygem-asciidoctor iptables libselinux-devel libbpf-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd diff --git a/soccr/test/tcp-test.py b/soccr/test/tcp-test.py index ff3fe29dc..b48f532eb 100755 --- a/soccr/test/tcp-test.py +++ b/soccr/test/tcp-test.py @@ -1,6 +1,5 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 -from __future__ import print_function import sys, socket import hashlib diff --git a/test/zdtm.py b/test/zdtm.py index 1ef941b4e..b56f06ef1 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -1,10 +1,4 @@ -#!/usr/bin/env python -from __future__ import ( - absolute_import, - division, - print_function, - unicode_literals -) +#!/usr/bin/env python3 import argparse import atexit From ede018176c2bf34749270466f6a21c32ee834005 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 22 Jun 2023 00:06:53 +0100 Subject: [PATCH 250/775] make: remove checks for python 2 binary This commit removes the checks for the Python 2 binary in the makefile and makes sure that ZDTM tests always use python3. Since support for Python 2 has been dropped, these checks are no longer needed. 
Signed-off-by: Radostin Stoyanov --- Documentation/Makefile | 4 +--- criu/Makefile | 2 -- criu/Makefile.packages | 5 +---- scripts/build/Dockerfile.tmpl | 1 - scripts/ci/run-ci-tests.sh | 4 ---- scripts/criu-ns | 2 +- scripts/magic-gen.py | 2 +- scripts/nmk/scripts/tools.mk | 2 +- soccr/test/Makefile | 3 +-- soccr/test/run.py | 2 +- test/check_actions.py | 2 +- test/crit-recode.py | 2 +- test/exhaustive/pipe.py | 2 +- test/exhaustive/unix.py | 2 +- test/inhfd/memfd.py.checkskip | 2 +- test/others/criu-ns/run.py | 2 +- test/others/ext-tty/run.py | 2 +- test/others/mnt-ext-dev/run.sh | 4 ++-- test/others/mounts/mounts.sh | 2 +- test/others/rpc/Makefile | 2 +- test/others/rpc/config_file.py | 2 +- test/others/rpc/errno.py | 2 +- test/others/rpc/ps_test.py | 2 +- test/others/rpc/restore-loop.py | 2 +- test/others/rpc/test.py | 2 +- test/others/rpc/version.py | 2 +- test/others/shell-job/run.py | 2 +- test/zdtm/static/cgroup_yard.hook | 2 +- test/zdtm/static/file_locks06.checkskip | 2 +- test/zdtm/static/net_lock_socket_iptables.hook | 2 +- test/zdtm/static/netns_lock_iptables.hook | 4 ++-- test/zdtm/static/socket-tcp-fin-wait1.hook | 2 +- 32 files changed, 31 insertions(+), 44 deletions(-) diff --git a/Documentation/Makefile b/Documentation/Makefile index 72bf0e862..de0cc448d 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -12,11 +12,9 @@ endif FOOTER := footer.txt SRC1 += crit.txt -ifeq ($(PYTHON),python3) SRC1 += criu-ns.txt -endif SRC1 += compel.txt -SRC1 += criu-amdgpu-plugin.txt +SRC1 += criu-amdgpu-plugin.txt SRC8 += criu.txt SRC := $(SRC1) $(SRC8) XMLS := $(patsubst %.txt,%.xml,$(SRC)) diff --git a/criu/Makefile b/criu/Makefile index c6050d582..bafdd980b 100644 --- a/criu/Makefile +++ b/criu/Makefile @@ -145,10 +145,8 @@ install: $(obj)/criu $(Q) install -m 644 $(UAPI_HEADERS) $(DESTDIR)$(INCLUDEDIR)/criu/ $(Q) mkdir -p $(DESTDIR)$(LIBEXECDIR)/criu/scripts $(Q) install -m 755 scripts/systemd-autofs-restart.sh 
$(DESTDIR)$(LIBEXECDIR)/criu/scripts -ifeq ($(PYTHON),python3) $(E) " INSTALL " scripts/criu-ns $(Q) install -m 755 scripts/criu-ns $(DESTDIR)$(SBINDIR) -endif .PHONY: install uninstall: diff --git a/criu/Makefile.packages b/criu/Makefile.packages index f436737fd..7f6113c8f 100644 --- a/criu/Makefile.packages +++ b/criu/Makefile.packages @@ -20,13 +20,10 @@ REQ-DEB-PKG-NAMES += libcap-dev REQ-DEB-PKG-TEST-NAMES += $(PYTHON)-yaml REQ-DEB-PKG-TEST-NAMES += libaio-dev -ifeq ($(PYTHON),python3) REQ-DEB-PKG-TEST-NAMES += libaio-dev REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-PyYAML -else -REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-pyyaml -endif + export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet diff --git a/scripts/build/Dockerfile.tmpl b/scripts/build/Dockerfile.tmpl index 9f6b1d096..9b53a76aa 100644 --- a/scripts/build/Dockerfile.tmpl +++ b/scripts/build/Dockerfile.tmpl @@ -27,7 +27,6 @@ RUN apt-install \ pkg-config \ protobuf-c-compiler \ protobuf-compiler \ - python-is-python3 \ python3-minimal \ python3-protobuf \ python3-yaml diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 6d837fe06..79744c750 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -58,10 +58,6 @@ ci_prep () { scripts/ci/apt-install "${CI_PKGS[@]}" chmod a+x "$HOME" - - # zdtm uses an unversioned python binary to run the tests. 
- # let's point python to python3 - ln -sf /usr/bin/python3 /usr/bin/python } test_stream() { diff --git a/scripts/criu-ns b/scripts/criu-ns index 3c77b8eb4..4c032aa14 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import ctypes import ctypes.util import errno diff --git a/scripts/magic-gen.py b/scripts/magic-gen.py index 3b1f29fb5..38dff1424 100755 --- a/scripts/magic-gen.py +++ b/scripts/magic-gen.py @@ -1,4 +1,4 @@ -#!/bin/env python2 +#!/bin/env python3 import sys diff --git a/scripts/nmk/scripts/tools.mk b/scripts/nmk/scripts/tools.mk index 1681d4e90..724204a03 100644 --- a/scripts/nmk/scripts/tools.mk +++ b/scripts/nmk/scripts/tools.mk @@ -23,7 +23,7 @@ MAKE := make MKDIR := mkdir -p AWK := awk PERL := perl -FULL_PYTHON := $(shell which python3 2>/dev/null || which python2 2>/dev/null) +FULL_PYTHON := $(shell which python3 2>/dev/null) PYTHON ?= $(shell basename $(FULL_PYTHON)) FIND := find SH := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ diff --git a/soccr/test/Makefile b/soccr/test/Makefile index 458540045..499901b0c 100644 --- a/soccr/test/Makefile +++ b/soccr/test/Makefile @@ -21,7 +21,6 @@ tcp-conn-v6: tcp-conn-v6.c test: tcp-constructor tcp-conn tcp-conn-v6 unshare -n sh -c "ip link set up dev lo; ./tcp-conn" unshare -n sh -c "ip link set up dev lo; ./tcp-conn-v6" - python run.py ./$(RUN) + python3 run.py ./$(RUN) .PHONY: test - diff --git a/soccr/test/run.py b/soccr/test/run.py index 1ffe58a58..57c556e36 100644 --- a/soccr/test/run.py +++ b/soccr/test/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys, os import hashlib diff --git a/test/check_actions.py b/test/check_actions.py index 4973e3938..84d738dbb 100755 --- a/test/check_actions.py +++ b/test/check_actions.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import os diff --git a/test/crit-recode.py b/test/crit-recode.py index 4135681e1..f119271d8 100755 --- 
a/test/crit-recode.py +++ b/test/crit-recode.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import pycriu import sys import os diff --git a/test/exhaustive/pipe.py b/test/exhaustive/pipe.py index 7f1c53d34..afe20846a 100755 --- a/test/exhaustive/pipe.py +++ b/test/exhaustive/pipe.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import argparse import os diff --git a/test/exhaustive/unix.py b/test/exhaustive/unix.py index 6f72dd44b..689b1fb3a 100755 --- a/test/exhaustive/unix.py +++ b/test/exhaustive/unix.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import os diff --git a/test/inhfd/memfd.py.checkskip b/test/inhfd/memfd.py.checkskip index 252778969..27e2b7b15 100755 --- a/test/inhfd/memfd.py.checkskip +++ b/test/inhfd/memfd.py.checkskip @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import ctypes libc = ctypes.CDLL(None) diff --git a/test/others/criu-ns/run.py b/test/others/criu-ns/run.py index 6967b46b2..9d068476f 100755 --- a/test/others/criu-ns/run.py +++ b/test/others/criu-ns/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import fcntl import os diff --git a/test/others/ext-tty/run.py b/test/others/ext-tty/run.py index 8109033cb..2c268a2c8 100755 --- a/test/others/ext-tty/run.py +++ b/test/others/ext-tty/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import subprocess import os, sys, time, signal, pty diff --git a/test/others/mnt-ext-dev/run.sh b/test/others/mnt-ext-dev/run.sh index 5a1f44450..3f6163e08 100755 --- a/test/others/mnt-ext-dev/run.sh +++ b/test/others/mnt-ext-dev/run.sh @@ -2,7 +2,7 @@ set -e -x # construct root -python ../../zdtm.py run -t zdtm/static/env00 --iter 0 -f ns +python3 ../../zdtm.py run -t zdtm/static/env00 --iter 0 -f ns truncate -s 0 zdtm.loop truncate -s 50M zdtm.loop @@ -11,7 +11,7 @@ dev=`losetup --find --show zdtm.loop` mkdir -p ../../dev cp -ap $dev ../../dev export ZDTM_MNT_EXT_DEV=$dev -python 
../../zdtm.py run $EXTRA_OPTS -t zdtm/static/mnt_ext_dev || ret=$? +python3 ../../zdtm.py run $EXTRA_OPTS -t zdtm/static/mnt_ext_dev || ret=$? losetup -d $dev unlink zdtm.loop exit $ret diff --git a/test/others/mounts/mounts.sh b/test/others/mounts/mounts.sh index 51ea69540..bed156a50 100755 --- a/test/others/mounts/mounts.sh +++ b/test/others/mounts/mounts.sh @@ -20,7 +20,7 @@ for i in `awk '{ print $2 }' < /proc/self/mounts`; do umount -l $i done -python mounts.py +python3 mounts.py kill $INMNTNS_PID while :; do sleep 10 diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index fc64f0c97..69537bb0d 100644 --- a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -4,7 +4,7 @@ all: test-c rpc_pb2.py criu CFLAGS += -g -Werror -Wall -I. LDLIBS += -lprotobuf-c -PYTHON ?= python +PYTHON ?= python3 run: all @make -C .. loop diff --git a/test/others/rpc/config_file.py b/test/others/rpc/config_file.py index 90c80fcae..6cffe270d 100755 --- a/test/others/rpc/config_file.py +++ b/test/others/rpc/config_file.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import argparse import os diff --git a/test/others/rpc/errno.py b/test/others/rpc/errno.py index f84757efd..b600b6d1c 100755 --- a/test/others/rpc/errno.py +++ b/test/others/rpc/errno.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 # Test criu errno import socket, os, errno diff --git a/test/others/rpc/ps_test.py b/test/others/rpc/ps_test.py index b51357d42..daeda49bc 100755 --- a/test/others/rpc/ps_test.py +++ b/test/others/rpc/ps_test.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import socket, os, sys, errno import rpc_pb2 as rpc diff --git a/test/others/rpc/restore-loop.py b/test/others/rpc/restore-loop.py index 84a2ce56d..67110c2cf 100755 --- a/test/others/rpc/restore-loop.py +++ b/test/others/rpc/restore-loop.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import socket, os, sys import rpc_pb2 as rpc diff --git a/test/others/rpc/test.py 
b/test/others/rpc/test.py index 80f6338f4..ce8411bc6 100755 --- a/test/others/rpc/test.py +++ b/test/others/rpc/test.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import socket, os, sys import rpc_pb2 as rpc diff --git a/test/others/rpc/version.py b/test/others/rpc/version.py index 9d7fa745b..a18cd5b7b 100755 --- a/test/others/rpc/version.py +++ b/test/others/rpc/version.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import sys import rpc_pb2 as rpc diff --git a/test/others/shell-job/run.py b/test/others/shell-job/run.py index a59945d6a..969965f00 100755 --- a/test/others/shell-job/run.py +++ b/test/others/shell-job/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import os, pty, sys, subprocess import termios, fcntl, time diff --git a/test/zdtm/static/cgroup_yard.hook b/test/zdtm/static/cgroup_yard.hook index d06bc45fd..b70bd59e9 100755 --- a/test/zdtm/static/cgroup_yard.hook +++ b/test/zdtm/static/cgroup_yard.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import os diff --git a/test/zdtm/static/file_locks06.checkskip b/test/zdtm/static/file_locks06.checkskip index 06ab58521..c5039a2d2 100755 --- a/test/zdtm/static/file_locks06.checkskip +++ b/test/zdtm/static/file_locks06.checkskip @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import fcntl import tempfile import struct diff --git a/test/zdtm/static/net_lock_socket_iptables.hook b/test/zdtm/static/net_lock_socket_iptables.hook index 0ee147eb2..e9fcd7350 100755 --- a/test/zdtm/static/net_lock_socket_iptables.hook +++ b/test/zdtm/static/net_lock_socket_iptables.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import socket import time diff --git a/test/zdtm/static/netns_lock_iptables.hook b/test/zdtm/static/netns_lock_iptables.hook index e7daf8a65..b51d3c2cc 100755 --- a/test/zdtm/static/netns_lock_iptables.hook +++ b/test/zdtm/static/netns_lock_iptables.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python 
+#!/usr/bin/env python3 import subprocess import socket @@ -67,7 +67,7 @@ if sys.argv[1] == "--post-start": cln, addr = srv.accept() cln.sendall(str.encode("--post-restore")) cln.close() - + # Server will be closed when zdtm sends SIGKILL if sys.argv[1] == "--pre-dump": diff --git a/test/zdtm/static/socket-tcp-fin-wait1.hook b/test/zdtm/static/socket-tcp-fin-wait1.hook index 9504557da..9dcd08999 100755 --- a/test/zdtm/static/socket-tcp-fin-wait1.hook +++ b/test/zdtm/static/socket-tcp-fin-wait1.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys sys.path.append("../crit") From ee9983d4a949bc2385b712295285240ed61af5cf Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 22 Jun 2023 02:54:30 +0100 Subject: [PATCH 251/775] test/criu-ns: drop python 2 compatibility This patch is replacing the set_blocking() function with os.set_blocking(). This function was introduced for compatibility with Python 2 in commit 8094df8d (criu-ns: Add tests for criu-ns script). 
Signed-off-by: Radostin Stoyanov --- test/others/criu-ns/run.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/test/others/criu-ns/run.py b/test/others/criu-ns/run.py index 9d068476f..0a36438e8 100755 --- a/test/others/criu-ns/run.py +++ b/test/others/criu-ns/run.py @@ -25,19 +25,6 @@ def check_dumpdir(path=IMG_DIR): os.mkdir(path, 0o755) -def set_blocking(fd, blocking): - """Implement os.set_blocking() for compatibility with Python - versions earlier than 3.5""" - flags = fcntl.fcntl(fd, fcntl.F_GETFL) - - if blocking: - flags &= ~os.O_NONBLOCK - else: - flags |= os.O_NONBLOCK - - fcntl.fcntl(fd, fcntl.F_SETFL, flags) - - def run_task_with_own_pty(task): fd_m, fd_s = pty.openpty() @@ -55,7 +42,7 @@ def run_task_with_own_pty(task): os.close(fd_s) fd_m = os.fdopen(fd_m, "rb") - set_blocking(fd_m.fileno(), False) + os.set_blocking(fd_m.fileno(), False) while True: try: From d388b91de27119d79dd8cafb821ec884ef9eb52a Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 22 Jun 2023 03:11:20 +0100 Subject: [PATCH 252/775] test/others: drop setup_swrk() py2 compatibility This patch removes the code introduced for compatibility with Python 2 in commits: 4c1ee3e227045fc1dc07b10ac7a538a68299693b test/other: Resolve Py3 compatibility issues 6b615ca15277fc14b52a09b4eb18314b7c6cbe75 test/others: Reuse setup_swrk() Signed-off-by: Radostin Stoyanov --- test/others/rpc/setup_swrk.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/test/others/rpc/setup_swrk.py b/test/others/rpc/setup_swrk.py index c7f84f952..ffaa01de4 100644 --- a/test/others/rpc/setup_swrk.py +++ b/test/others/rpc/setup_swrk.py @@ -5,12 +5,6 @@ import subprocess def setup_swrk(): print('Connecting to CRIU in swrk mode.') s1, s2 = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET) - - kwargs = {} - if sys.version_info.major == 3: - kwargs["pass_fds"] = [s1.fileno()] - - swrk = subprocess.Popen(['./criu', "swrk", "%d" % s1.fileno()], **kwargs) + 
swrk = subprocess.Popen(['./criu', "swrk", "%d" % s1.fileno()], pass_fds=[s1.fileno()]) s1.close() return swrk, s2 - From 002f2372a9487fe03a6c4f3fe28df00b7033748f Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 22 Jun 2023 03:53:03 +0100 Subject: [PATCH 253/775] lib/py: drop python 2 compatibility This patch removes code introduced for compatibility with Python 2 in commits: bf80fee (lib: correctly handle stdin/stdout (Python 3)) b82f222 (lib: fix crit-recode fix for Python 2) Signed-off-by: Radostin Stoyanov --- lib/py/cli.py | 4 ---- lib/py/images/images.py | 11 ++--------- lib/py/images/pb2dict.py | 11 ++--------- 3 files changed, 4 insertions(+), 22 deletions(-) diff --git a/lib/py/cli.py b/lib/py/cli.py index 221f7be0d..594035d27 100755 --- a/lib/py/cli.py +++ b/lib/py/cli.py @@ -11,8 +11,6 @@ def inf(opts): if opts['in']: return open(opts['in'], 'rb') else: - if (sys.version_info < (3, 0)): - return sys.stdin if sys.stdin.isatty(): # If we are reading from a terminal (not a pipe) we want text input and not binary return sys.stdin @@ -28,8 +26,6 @@ def outf(opts, decode): mode = 'w+' return open(opts['out'], mode) else: - if (sys.version_info < (3, 0)): - return sys.stdout if decode: return sys.stdout return sys.stdout.buffer diff --git a/lib/py/images/images.py b/lib/py/images/images.py index a1d76e7cf..9db506e1e 100644 --- a/lib/py/images/images.py +++ b/lib/py/images/images.py @@ -42,7 +42,6 @@ import base64 import struct import os import array -import sys from . import magic from . 
import pb @@ -71,18 +70,12 @@ class MagicException(Exception): def decode_base64_data(data): """A helper function to decode base64 data.""" - if (sys.version_info > (3, 0)): - return base64.decodebytes(str.encode(data)) - else: - return base64.decodebytes(data) + return base64.decodebytes(str.encode(data)) def write_base64_data(f, data): """A helper function to write base64 encoded data to a file.""" - if (sys.version_info > (3, 0)): - f.write(base64.decodebytes(str.encode(data))) - else: - f.write(base64.decodebytes(data)) + f.write(base64.decodebytes(str.encode(data))) # Generic class to handle loading/dumping criu images entries from/to bin diff --git a/lib/py/images/pb2dict.py b/lib/py/images/pb2dict.py index 9d581c375..c7046429e 100644 --- a/lib/py/images/pb2dict.py +++ b/lib/py/images/pb2dict.py @@ -3,7 +3,6 @@ import collections import os import quopri import socket -import sys from ipaddress import IPv4Address, IPv6Address, ip_address from google.protobuf.descriptor import FieldDescriptor as FD @@ -247,17 +246,11 @@ def encode_dev(field, value): def encode_base64(value): - if (sys.version_info > (3, 0)): - return base64.encodebytes(value).decode() - else: - return base64.encodebytes(value) + return base64.encodebytes(value).decode() def decode_base64(value): - if (sys.version_info > (3, 0)): - return base64.decodebytes(str.encode(value)) - else: - return base64.decodebytes(value) + return base64.decodebytes(str.encode(value)) def encode_unix(value): From 056cac474e5d1e88d86285ffcb26de776d9c550e Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 22 Jun 2023 04:00:39 +0100 Subject: [PATCH 254/775] zdtm: drop python 2 compatibility This patch removes the code for Python 2 compatibility introduced with commit e65c7b5 (zdtm: Replace imp module with importlib). 
Signed-off-by: Radostin Stoyanov --- test/zdtm.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index b56f06ef1..c6e852dc1 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -635,14 +635,10 @@ class zdtm_test: def load_module_from_file(name, path): - if sys.version_info[0] == 3 and sys.version_info[1] >= 5: - import importlib.util - spec = importlib.util.spec_from_file_location(name, path) - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) - else: - import imp - mod = imp.load_source(name, path) + import importlib.util + spec = importlib.util.spec_from_file_location(name, path) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) return mod From a5fe99d2c5475e932fe13979ea3af1415cacadde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 31 May 2023 14:37:40 +0200 Subject: [PATCH 255/775] cgroup: Propagate error on cgroup mount failure. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes the error to mount cgroup hierarchy a bit less noisy: Error (criu/cgroup.c:623): cg: Unable to mount cgroup2 : Invalid argument' Instead of Error (criu/cgroup.c:623): cg: Unable to mount cgroup2 : Invalid argument' Error (criu/cgroup.c:715): cg: failed walking /proc/self/fd/-1/zdtmtst for empty cgroups: No such file or directory' Signed-off-by: Michał Mirosław --- criu/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/criu/cgroup.c b/criu/cgroup.c index 0bf7b3818..267a5b6b4 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -714,6 +714,8 @@ static int collect_cgroups(struct list_head *ctls) } } else { fd = open_cgroupfs(cc); + if (fd < 0) + return -1; } path_pref_len = snprintf(path, PATH_MAX, "/proc/self/fd/%d", fd); From 664598dc74842143a3e486b151123d8f1be2ca6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 9 Dec 2022 16:02:03 +0100 Subject: [PATCH 
256/775] files-reg: Debug "open file on overmounted mount" error. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Log the mount and file that were the cause of failing a dump. Signed-off-by: Michał Mirosław --- criu/files-reg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index ed8b9c889..512097716 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -1818,7 +1818,8 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) } if (!skip_for_shell_job && mnt_is_overmounted(mi)) { - pr_err("Open files on overmounted mounts are not supported yet\n"); + pr_err("Open files on overmounted mounts are not supported yet; mount=%d fd=%d path=%s\n", + p->mnt_id, p->fd, link->name + 1); return -1; } From 5a723937a2fb2a5428eaadceb69c3a71546efea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 17:31:33 +0200 Subject: [PATCH 257/775] compel: Log the status word with "Task is still running" errors. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- compel/src/lib/infect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 5aab7aa3e..022d4ebf3 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -589,7 +589,7 @@ static int parasite_trap(struct parasite_ctl *ctl, pid_t pid, user_regs_struct_t } if (!WIFSTOPPED(status)) { - pr_err("Task is still running (pid: %d)\n", pid); + pr_err("Task is still running (pid: %d, status: 0x%x)\n", pid, status); goto err; } @@ -1398,7 +1398,7 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) pr_debug("Daemon %d exited trapping\n", pid); if (!WIFSTOPPED(status)) { - pr_err("Task is still running (pid: %d)\n", pid); + pr_err("Task is still running (pid: %d, status: 0x%x)\n", pid, status); return -1; } From 1cb7916524bcdfff3f40ebb8e8d08322472f615e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 17:28:22 +0200 Subject: [PATCH 258/775] sk-unix: Log both peer names when failing on an external stream unix socket. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make debugging dump failures resulting in "sk unix: Can't dump half of stream unix connection" errors easier. Signed-off-by: Michał Mirosław --- criu/sk-unix.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/criu/sk-unix.c b/criu/sk-unix.c index 841152643..fd38ee7b1 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -878,7 +878,8 @@ static int __dump_external_socket(struct unix_sk_desc *sk, struct unix_sk_desc * if (peer->type != SOCK_DGRAM) { show_one_unix("Ext stream not supported", peer); - pr_err("Can't dump half of stream unix connection.\n"); + pr_err("Can't dump half of stream unix connection. 
name: %s; peer name: %s\n", + sk->name, peer->name); return -1; } From 4018b787784121c922a8b52fbd4fdc03f6dd1419 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 17 May 2023 20:04:20 +0200 Subject: [PATCH 259/775] soccr: Log offset when failed to restore socket's queued data. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- soccr/soccr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/soccr/soccr.c b/soccr/soccr.c index abea93703..6967835c7 100644 --- a/soccr/soccr.c +++ b/soccr/soccr.c @@ -816,7 +816,7 @@ static int __send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) continue; } - logerr("Can't restore %d queue data (%d), want (%d:%d:%d)", queue, ret, chunk, len, max_chunk); + logerr("Can't restore %d queue data (%d), want (%d-%d:%d:%d)", queue, ret, off, chunk, len, max_chunk); goto err; } off += ret; From 804c0ba820d5fbc38357da135f7489f285cd3fdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Thu, 18 May 2023 00:59:04 +0200 Subject: [PATCH 260/775] soccr: Log name of socket queue that failed to restore. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- soccr/soccr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/soccr/soccr.c b/soccr/soccr.c index 6967835c7..8e1ce1c63 100644 --- a/soccr/soccr.c +++ b/soccr/soccr.c @@ -781,7 +781,7 @@ int libsoccr_restore(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsi return 0; } -static int __send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) +static int __send_queue(struct libsoccr_sk *sk, const char *queue, char *buf, __u32 len) { int ret, err = -1, max_chunk; int off; @@ -816,7 +816,7 @@ static int __send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) continue; } - logerr("Can't restore %d queue data (%d), want (%d-%d:%d:%d)", queue, ret, off, chunk, len, max_chunk); + logerr("Can't restore %s queue data (%d), want (%d-%d:%d:%d)", queue, ret, off, chunk, len, max_chunk); goto err; } off += ret; @@ -837,7 +837,7 @@ static int send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) return -1; } - return __send_queue(sk, queue, buf, len); + return __send_queue(sk, queue == TCP_RECV_QUEUE ? "recv" : "send", buf, len); } static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size, int queue, @@ -876,7 +876,7 @@ static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_dat * they can be restored without any tricks. */ tcp_repair_off(sk->fd); - if (__send_queue(sk, TCP_SEND_QUEUE, buf + len, ulen)) + if (__send_queue(sk, "not-sent send", buf + len, ulen)) return -3; if (tcp_repair_on(sk->fd)) return -4; From 3e428a1de747ae50d773189d2d6e68a607579796 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 22 May 2023 14:49:47 +0200 Subject: [PATCH 261/775] log: Remove error logs for ignored or otherwise logged subprocess exits. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Errors in early restore.log for status=1 from a subprocess are confusing, esp. that they don't show what command failed. Since the result is either ignored or logged anyway, mark the calls as "can fail". Signed-off-by: Michał Mirosław --- criu/netfilter.c | 4 ++-- criu/util.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/criu/netfilter.c b/criu/netfilter.c index 2212fd9f2..9e78dc4b0 100644 --- a/criu/netfilter.c +++ b/criu/netfilter.c @@ -48,8 +48,8 @@ void preload_netfilter_modules(void) fd = -1; pr_perror("failed to open /dev/null, using log fd for net module preload"); } - cr_system(fd, fd, fd, iptable_cmd_ipv4, (char *[]){ iptable_cmd_ipv4, "-L", "-n", NULL }, 0); - cr_system(fd, fd, fd, iptable_cmd_ipv6, (char *[]){ iptable_cmd_ipv6, "-L", "-n", NULL }, 0); + cr_system(fd, fd, fd, iptable_cmd_ipv4, (char *[]){ iptable_cmd_ipv4, "-L", "-n", NULL }, CRS_CAN_FAIL); + cr_system(fd, fd, fd, iptable_cmd_ipv6, (char *[]){ iptable_cmd_ipv6, "-L", "-n", NULL }, CRS_CAN_FAIL); close_safe(&fd); } diff --git a/criu/util.c b/criu/util.c index 744ec6032..aa73083bd 100644 --- a/criu/util.c +++ b/criu/util.c @@ -1566,7 +1566,7 @@ static int is_iptables_nft(char *bin) goto err; } - ret = cr_system(-1, pfd[1], -1, cmd[0], cmd, 0); + ret = cr_system(-1, pfd[1], -1, cmd[0], cmd, CRS_CAN_FAIL); if (ret) { pr_err("%s -V failed\n", cmd[0]); goto err; From 4eb6cc319084b2339edf5adf3fdef432293a6cdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 22 May 2023 18:52:59 +0200 Subject: [PATCH 262/775] mount: Demote fsnotify logs for ignored failures. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make logs about inaccessible mounts warnings, as the failures are normally harmless (e.g. failure to read /dev/cgroup) and don't make the CRIU run fail. 
(If it happens that the fsnotify can't find a file, then to debug, full CRIU logs will be necessary anyway.) Signed-off-by: Michał Mirosław --- criu/mount.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/criu/mount.c b/criu/mount.c index c26aaa58d..afbd24281 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -1197,8 +1197,8 @@ int __check_mountpoint_fd(struct mount_info *pm, int mnt_fd, bool parse_mountinf dev == pm->s_dev_rt) return 0; - pr_err("The file system %#x %#x (%#x) %s %s is inaccessible\n", pm->s_dev, pm->s_dev_rt, dev, - pm->fstype->name, pm->ns_mountpoint); + pr_warn("The file system %#x %#x (%#x) %s %s is inaccessible\n", pm->s_dev, pm->s_dev_rt, dev, + pm->fstype->name, pm->ns_mountpoint); return -1; } @@ -1239,12 +1239,16 @@ int __open_mountpoint(struct mount_info *pm) int open_mount(unsigned int s_dev) { struct mount_info *m; + int mnt_fd; m = lookup_mnt_sdev(s_dev); if (!m) return -ENOENT; - return __open_mountpoint(m); + mnt_fd = __open_mountpoint(m); + if (mnt_fd < 0) + pr_err("Can't open mount %#x\n", s_dev); + return mnt_fd; } /* Bind-mount a mount point in a temporary place without children */ From 6b8107cd1b85956852703cc4bb006cdc7932edf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 17:26:56 +0200 Subject: [PATCH 263/775] irmap: Reduce error log severity to warning. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These errors originate from the filesystem scanning in irmap.c and are mostly benign. Nevertheless, if they do result in a failed irmap lookup, that failed lookup is more interesting from an application perspective. 
Signed-off-by: Michał Mirosław --- criu/irmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/irmap.c b/criu/irmap.c index 7b9d77bc1..2cdc66071 100644 --- a/criu/irmap.c +++ b/criu/irmap.c @@ -101,7 +101,7 @@ static int irmap_update_stat(struct irmap *i) pr_debug("Refresh stat for %s\n", i->path); if (fstatat(mntns_root, i->path + 1, &st, AT_SYMLINK_NOFOLLOW)) { - pr_perror("Can't stat %s", i->path); + pr_pwarn("Can't stat %s", i->path); return -1; } @@ -136,7 +136,7 @@ static int irmap_update_dir(struct irmap *t) pr_debug("Refilling %s dir\n", t->path); fd = openat(mntns_root, t->path + 1, O_RDONLY); if (fd < 0) { - pr_perror("Can't open %s", t->path); + pr_pwarn("Can't open %s", t->path); return -1; } From 8ee35bebb52d54dee06afef0cb27e7b3e1dd796f Mon Sep 17 00:00:00 2001 From: Yan Evzman Date: Fri, 7 Jul 2023 00:36:41 +0300 Subject: [PATCH 264/775] kerndat: bind ipv6-socket only if ipv6 is enabled Fixes: #2222 Fixes: f1c8d38 ("kerndat: check if setsockopt IPV6_FREEBIND is supported") Signed-off-by: Yan Evzman Signed-off-by: Andrei Vagin --- criu/kerndat.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/criu/kerndat.c b/criu/kerndat.c index 597fe5d92..c74201617 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1578,6 +1578,11 @@ static int kerndat_has_ipv6_freebind(void) { int sk, val; + if (!kdat.ipv6) { + kdat.has_ipv6_freebind = false; + return 0; + } + sk = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); if (sk == -1) { pr_perror("Unable to create a ipv6 dgram socket"); From b2d74fbfd43db4ef71c239c939237a32570c77b3 Mon Sep 17 00:00:00 2001 From: znley Date: Fri, 7 Jul 2023 15:29:59 +0800 Subject: [PATCH 265/775] zdtm: replace NR_fstat with NR_statx NR_fstat is a deprecated syscall, some modern architectures such as riscv and loongarch64 no longer support this syscall. It is usually replaced by NR_statx. NR_statx is supported since linux 4.10. 
Signed-off-by: znley --- test/zdtm/static/seccomp_filter_inheritance.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/static/seccomp_filter_inheritance.c b/test/zdtm/static/seccomp_filter_inheritance.c index 7a86cd85e..5afcb3f84 100644 --- a/test/zdtm/static/seccomp_filter_inheritance.c +++ b/test/zdtm/static/seccomp_filter_inheritance.c @@ -100,7 +100,7 @@ int main(int argc, char **argv) if (filter_syscall(__NR_ptrace) < 0) _exit(1); - if (filter_syscall(__NR_fstat) < 0) + if (filter_syscall(__NR_statx) < 0) _exit(1); zdtm_seccomp = 1; From 23313080aaaedf83dde3bc3724cde92de3ff17b6 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 7 Jul 2023 16:33:12 -0700 Subject: [PATCH 266/775] kerndat: don't leak a socket file descriptor kerndat_has_ipv6_freebind creates a socket but doesn't close it. Signed-off-by: Andrei Vagin --- criu/kerndat.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index c74201617..4b836b5f7 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1574,9 +1574,26 @@ static int kerndat_has_nftables_concat(void) #define IPV6_FREEBIND 78 #endif +static int __kerndat_has_ipv6_freebind(int sk) +{ + int val = 1; + + if (setsockopt(sk, SOL_IPV6, IPV6_FREEBIND, &val, sizeof(int)) == -1) { + if (errno == ENOPROTOOPT) { + kdat.has_ipv6_freebind = false; + return 0; + } + pr_perror("Unable to setsockopt ipv6_freebind"); + return -1; + } + + kdat.has_ipv6_freebind = true; + return 0; +} + static int kerndat_has_ipv6_freebind(void) { - int sk, val; + int sk, ret; if (!kdat.ipv6) { kdat.has_ipv6_freebind = false; @@ -1589,18 +1606,9 @@ static int kerndat_has_ipv6_freebind(void) return -1; } - val = 1; - if (setsockopt(sk, SOL_IPV6, IPV6_FREEBIND, &val, sizeof(int)) == -1) { - if (errno == ENOPROTOOPT) { - kdat.has_ipv6_freebind = false; - return 0; - } - pr_perror("Unable to setsockopt ipv6_freebind"); - return -1; - } - - kdat.has_ipv6_freebind = 
true; - return 0; + ret = __kerndat_has_ipv6_freebind(sk); + close(sk); + return ret; } /* From 8a24d4872ee00ee332834d04c2c8b9d7b437f7b2 Mon Sep 17 00:00:00 2001 From: Prajwal S N Date: Sat, 1 Jul 2023 13:15:36 +0530 Subject: [PATCH 267/775] ci: add workflow to ensure self-contained commits Signed-off-by: Prajwal S N --- .github/workflows/check-commits.yml | 30 +++++++++++++++++++++++++++++ scripts/ci/Makefile | 8 ++++++++ 2 files changed, 38 insertions(+) create mode 100644 .github/workflows/check-commits.yml diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml new file mode 100644 index 000000000..be2fbd285 --- /dev/null +++ b/.github/workflows/check-commits.yml @@ -0,0 +1,30 @@ +name: Verify self-contained commits + +on: pull_request + +# Cancel any preceding run on the pull request +concurrency: + group: commit-test-${{ github.event.pull_request.number }} + +jobs: + build: + runs-on: ubuntu-latest + # Check if pull request does not have label "not-selfcontained-ok" + if: "!contains(github.event.pull_request.labels.*.name, 'not-selfcontained-ok')" + steps: + - uses: actions/checkout@v3 + with: + # Needed to rebase against the base branch + fetch-depth: 0 + # Checkout pull request HEAD commit instead of merge commit + ref: ${{ github.event.pull_request.head.sha }} + - name: Install dependencies + run: sudo apt-get install -y libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev + - name: Configure git user details + run: | + git config --global user.email "checkpoint-restore@users.noreply.github.com" + git config --global user.name "checkpoint-restore" + - name: Configure base branch without switching current branch + run: git fetch origin ${{ github.base_ref }}:${{ github.base_ref }} + - name: Build each commit + run: git rebase ${{ github.base_ref }} -x "make -C scripts/ci check-commit" diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 
30dd9ebeb..5c4579103 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -102,5 +102,13 @@ vagrant-fedora-non-root: setup-vagrant .PHONY: setup-vagrant vagrant-fedora-no-vdso vagrant-fedora-rawhide vagrant-fedora-non-root +check-commit: + ($(MAKE) -j $$(nproc) -C ../.. && \ + echo "Commit $$(git rev-parse --short HEAD) built successfully") || \ + (echo "Build failed for $$(git rev-list -n 1 --pretty HEAD)" && \ + exit 1) + +.PHONY: check-commit + %: $(MAKE) -C ../build $@$(target-suffix) From b304106e6b66f16697c4f1b99e2abb86b4aee006 Mon Sep 17 00:00:00 2001 From: znley Date: Mon, 12 Jun 2023 11:23:38 +0800 Subject: [PATCH 268/775] include: add common header files for loongarch64 Signed-off-by: znley --- include/common/arch/loongarch64/asm/atomic.h | 62 +++++++++++++++++++ include/common/arch/loongarch64/asm/bitops.h | 24 +++++++ .../common/arch/loongarch64/asm/bitsperlong.h | 6 ++ include/common/arch/loongarch64/asm/linkage.h | 19 ++++++ include/common/arch/loongarch64/asm/page.h | 39 ++++++++++++ 5 files changed, 150 insertions(+) create mode 100644 include/common/arch/loongarch64/asm/atomic.h create mode 100644 include/common/arch/loongarch64/asm/bitops.h create mode 100644 include/common/arch/loongarch64/asm/bitsperlong.h create mode 100644 include/common/arch/loongarch64/asm/linkage.h create mode 100644 include/common/arch/loongarch64/asm/page.h diff --git a/include/common/arch/loongarch64/asm/atomic.h b/include/common/arch/loongarch64/asm/atomic.h new file mode 100644 index 000000000..901725439 --- /dev/null +++ b/include/common/arch/loongarch64/asm/atomic.h @@ -0,0 +1,62 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +#include +#include "common/compiler.h" + +typedef struct { + int counter; +} atomic_t; + +static inline int atomic_read(const atomic_t *v) +{ + return (*(volatile int *)&(v)->counter); +} + +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} + +static inline int __atomic_add(int i, atomic_t *v) +{ + int 
result; + asm volatile("amadd_db.w %1, %2, %0" : "+ZB"(v->counter), "=&r"(result) : "r"(i) : "memory"); + return result + i; +} + +static inline void atomic_add(int i, atomic_t *v) +{ + __atomic_add(i, v); +} + +static inline int atomic_add_return(int i, atomic_t *v) +{ + return __atomic_add(i, v); +} + +#define atomic_sub(i, v) atomic_add(-(int)i, v) +#define atomic_sub_return(i, v) atomic_add_return(-(int)i, v) +#define atomic_inc(v) atomic_add(1, v) +#define atomic_inc_return(v) atomic_add_return(1, v) +#define atomic_dec(v) atomic_sub(1, v) +#define atomic_dec_return(v) atomic_sub_return(1, v) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + int ret; + asm volatile("1: \n" + " ll.w %0, %1 \n" + " bne %0, %2, 2f \n" + " or $t0, %3, $zero \n" + " sc.w $t0, %1 \n" + " beqz $t0, 1b \n" + "2: \n" + " dbar 0 \n" + : "=&r"(ret), "+ZB"(ptr->counter) + : "r"(old), "r"(new) + : "t0", "memory"); + return ret; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/include/common/arch/loongarch64/asm/bitops.h b/include/common/arch/loongarch64/asm/bitops.h new file mode 100644 index 000000000..170e4f736 --- /dev/null +++ b/include/common/arch/loongarch64/asm/bitops.h @@ -0,0 +1,24 @@ +#ifndef _LINUX_BITOPS_H +#define _LINUX_BITOPS_H +#include "common/asm-generic/bitops.h" + +/** + * test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. 
+ */ + +#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) /* index of the long that holds bit nr */ +static inline int test_and_set_bit(unsigned long nr, volatile unsigned long *addr) +{ + unsigned long res, mask; + mask = BIT_MASK(nr); + asm volatile("amor_db.d %0, %2, %1" : "=&r"(res), "+ZB"(addr[BIT_WORD(nr)]) : "r"(mask) : "memory"); + return (res & mask) != 0; +} + +#endif diff --git a/include/common/arch/loongarch64/asm/bitsperlong.h b/include/common/arch/loongarch64/asm/bitsperlong.h new file mode 100644 index 000000000..13d06a384 --- /dev/null +++ b/include/common/arch/loongarch64/asm/bitsperlong.h @@ -0,0 +1,6 @@ +#ifndef __CR_BITSPERLONG_H__ +#define __CR_BITSPERLONG_H__ + +#define BITS_PER_LONG _LOONGARCH_SZLONG + +#endif /* __CR_BITSPERLONG_H__ */ diff --git a/include/common/arch/loongarch64/asm/linkage.h b/include/common/arch/loongarch64/asm/linkage.h new file mode 100644 index 000000000..448acc29f --- /dev/null +++ b/include/common/arch/loongarch64/asm/linkage.h @@ -0,0 +1,19 @@ +#ifndef __CR_LINKAGE_H__ +#define __CR_LINKAGE_H__ + +#define __ALIGN .align 2 +#define __ALIGN_STR ".align 2" + +#define GLOBAL(name) \ + .globl name; \ +name: + +#define ENTRY(name) \ + .globl name; \ + __ALIGN; \ + .type name, @function; \ +name: + +#define END(sym) .size sym, .- sym + +#endif /* __CR_LINKAGE_H__ */ diff --git a/include/common/arch/loongarch64/asm/page.h b/include/common/arch/loongarch64/asm/page.h new file mode 100644 index 000000000..25bdbc141 --- /dev/null +++ b/include/common/arch/loongarch64/asm/page.h @@ -0,0 +1,39 @@ +#ifndef __CR_ASM_PAGE_H__ +#define __CR_ASM_PAGE_H__ + +#define ARCH_HAS_LONG_PAGES + +#ifndef CR_NOGLIBC +#include <string.h> /* ffsl() */ +#include <unistd.h> /* _SC_PAGESIZE */ + +static unsigned __page_size; +static unsigned __page_shift; + +static inline unsigned page_size(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned page_shift(void) +{ + if (!__page_shift) + 
__page_shift = (ffsl(page_size()) - 1); + return __page_shift; +} + +#define PAGE_SIZE page_size() +#define PAGE_SHIFT page_shift() +#define PAGE_MASK (~(PAGE_SIZE - 1)) + +#define PAGE_PFN(addr) ((addr) / PAGE_SIZE) +#else /* CR_NOGLIBC */ + +extern unsigned page_size(void); +#define PAGE_SIZE page_size() + +#endif /* CR_NOGLIBC */ + +#endif /* __CR_ASM_PAGE_H__ */ From c9df09eeab3c0e34efd00a0c9f68668416b7f565 Mon Sep 17 00:00:00 2001 From: znley Date: Mon, 12 Jun 2023 09:35:40 +0000 Subject: [PATCH 269/775] compel: add loongarch64 support Signed-off-by: znley --- Makefile | 10 +- compel/Makefile | 2 +- .../plugins/include/asm/prologue.h | 35 +++ .../plugins/include/asm/syscall-types.h | 30 +++ .../loongarch64/plugins/include/features.h | 4 + .../loongarch64/plugins/std/parasite-head.S | 9 + .../plugins/std/syscalls/Makefile.syscalls | 117 ++++++++++ .../syscalls/syscall-common-loongarch-64.S | 44 ++++ .../plugins/std/syscalls/syscall_64.tbl | 121 +++++++++++ .../loongarch64/scripts/compel-pack.lds.S | 32 +++ compel/arch/loongarch64/src/lib/cpu.c | 41 ++++ .../loongarch64/src/lib/handle-elf-host.c | 22 ++ compel/arch/loongarch64/src/lib/handle-elf.c | 22 ++ .../loongarch64/src/lib/include/handle-elf.h | 8 + .../loongarch64/src/lib/include/syscall.h | 8 + .../src/lib/include/uapi/asm/breakpoints.h | 6 + .../src/lib/include/uapi/asm/cpu.h | 6 + .../src/lib/include/uapi/asm/fpu.h | 4 + .../src/lib/include/uapi/asm/infect-types.h | 67 ++++++ .../src/lib/include/uapi/asm/sigframe.h | 86 ++++++++ compel/arch/loongarch64/src/lib/infect.c | 204 ++++++++++++++++++ compel/src/main.c | 3 + scripts/nmk/scripts/include.mk | 3 +- 23 files changed, 881 insertions(+), 3 deletions(-) create mode 100644 compel/arch/loongarch64/plugins/include/asm/prologue.h create mode 100644 compel/arch/loongarch64/plugins/include/asm/syscall-types.h create mode 100644 compel/arch/loongarch64/plugins/include/features.h create mode 100644 compel/arch/loongarch64/plugins/std/parasite-head.S create 
mode 100644 compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls create mode 100644 compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S create mode 100644 compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl create mode 100644 compel/arch/loongarch64/scripts/compel-pack.lds.S create mode 100644 compel/arch/loongarch64/src/lib/cpu.c create mode 100644 compel/arch/loongarch64/src/lib/handle-elf-host.c create mode 100644 compel/arch/loongarch64/src/lib/handle-elf.c create mode 100644 compel/arch/loongarch64/src/lib/include/handle-elf.h create mode 100644 compel/arch/loongarch64/src/lib/include/syscall.h create mode 100644 compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h create mode 100644 compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h create mode 100644 compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h create mode 100644 compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h create mode 100644 compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h create mode 100644 compel/arch/loongarch64/src/lib/infect.c diff --git a/Makefile b/Makefile index 020ef85fc..2e8a866af 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ endif # # Supported Architectures -ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips,$(ARCH)),) +ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips loongarch64,$(ARCH)),) $(error "The architecture $(ARCH) isn't supported") endif @@ -80,6 +80,10 @@ ifeq ($(ARCH),mips) DEFINES := -DCONFIG_MIPS endif +ifeq ($(ARCH),loongarch64) + DEFINES := -DCONFIG_LOONGARCH64 +endif + # # CFLAGS_PIE: # @@ -122,6 +126,10 @@ ifeq ($(ARCH),mips) WARNINGS := -rdynamic endif +ifeq ($(ARCH),loongarch64) +WARNINGS := -Wno-implicit-function-declaration +endif + ifneq ($(GCOV),) LDFLAGS += -lgcov CFLAGS += $(CFLAGS-GCOV) diff --git a/compel/Makefile b/compel/Makefile index b79aee687..78ec4826a 100644 --- a/compel/Makefile +++ b/compel/Makefile @@ -33,7 +33,7 @@ lib-y += 
arch/$(ARCH)/src/lib/thread_area.o endif # handle_elf() has no support of ELF relocations on ARM (yet?) -ifneq ($(filter arm aarch64,$(ARCH)),) +ifneq ($(filter arm aarch64 loongarch64,$(ARCH)),) CFLAGS += -DNO_RELOCS HOSTCFLAGS += -DNO_RELOCS endif diff --git a/compel/arch/loongarch64/plugins/include/asm/prologue.h b/compel/arch/loongarch64/plugins/include/asm/prologue.h new file mode 100644 index 000000000..c19ce54d7 --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/asm/prologue.h @@ -0,0 +1,35 @@ +#ifndef __ASM_PROLOGUE_H__ +#define __ASM_PROLOGUE_H__ + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +#include + +#define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) + +typedef struct prologue_init_args { + struct sockaddr_un ctl_sock_addr; + unsigned int ctl_sock_addr_len; + + unsigned int arg_s; + void *arg_p; + + void *sigframe; +} prologue_init_args_t; + +#endif /* __ASSEMBLY__ */ + +/* + * Reserve enough space for sigframe. + * + * FIXME It is rather should be taken from sigframe header. 
+ */ +#define PROLOGUE_SGFRAME_SIZE 4096 + +#define PROLOGUE_INIT_ARGS_SIZE 1024 + +#endif /* __ASM_PROLOGUE_H__ */ diff --git a/compel/arch/loongarch64/plugins/include/asm/syscall-types.h b/compel/arch/loongarch64/plugins/include/asm/syscall-types.h new file mode 100644 index 000000000..b883bd8be --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/asm/syscall-types.h @@ -0,0 +1,30 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +#include +/* Types for sigaction, sigprocmask syscalls */ +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +/* refer to arch/loongarch/include/uapi/asm/signal.h */ +#define _KNSIG 64 +#define _NSIG_BPW BITS_PER_LONG +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +typedef struct { + uint64_t sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t; + +#define SA_RESTORER 0x04000000 + +#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ diff --git a/compel/arch/loongarch64/plugins/include/features.h b/compel/arch/loongarch64/plugins/include/features.h new file mode 100644 index 000000000..b4a3cded2 --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/features.h @@ -0,0 +1,4 @@ +#ifndef __COMPEL_ARCH_FEATURES_H +#define __COMPEL_ARCH_FEATURES_H + +#endif /* __COMPEL_ARCH_FEATURES_H */ diff --git a/compel/arch/loongarch64/plugins/std/parasite-head.S b/compel/arch/loongarch64/plugins/std/parasite-head.S new file mode 100644 index 000000000..3a960490e --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/parasite-head.S @@ -0,0 +1,9 @@ + +#include "common/asm/linkage.h" + + .section .head.text, "ax" +ENTRY(__export_parasite_head_start) + bl parasite_service; + break 0; +END(__export_parasite_head_start) + diff --git 
a/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls b/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 000000000..0d08f34e1 --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1,117 @@ +std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/syscalls-64.o +sys-proto-types := $(obj)/include/uapi/std/syscall-types.h +sys-proto-generic := $(obj)/include/uapi/std/syscall.h +sys-codes-generic := $(obj)/include/uapi/std/syscall-codes.h +sys-codes = $(obj)/include/uapi/std/syscall-codes-$(1).h +sys-proto = $(obj)/include/uapi/std/syscall-$(1).h +sys-def = $(PLUGIN_ARCH_DIR)/std/syscalls/syscall_$(1).tbl +sys-asm = $(PLUGIN_ARCH_DIR)/std/syscalls-$(1).S +sys-asm-common-name = std/syscalls/syscall-common-loongarch-$(1).S +sys-asm-common = $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) +sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h +sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl-$(1).c + +sys-bits := 64 + +AV := $$$$ + +define gen-rule-sys-codes +$(sys-codes): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) cat $$< | awk '/^__NR/{SYSN=$(AV)1; \ + sub("^__NR", "SYS", SYSN); \ + print "\n#ifndef ", $(AV)1; \ + print "#define", $(AV)1, $(AV)2; \ + print "#endif"; \ + print "\n#ifndef ", SYSN; \ + print "#define ", SYSN, $(AV)1; \ + print "#endif";}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_CODES_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-proto +$(sys-proto): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include ' >> $$@ +ifeq ($(1),32) + $(Q) echo '#include "asm/syscall32.h"' >> $$@ +endif + $(Q) 
cat $$< | awk '/^__NR/{print "extern long", $(AV)3, \ + substr($(AV)0, index($(AV)0,$(AV)4)), ";"}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_PROTO_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-asm +$(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include "$(sys-asm-common-name)"' >> $$@ + $(Q) cat $$< | awk '/^__NR/{print "SYSCALL(", $(AV)3, ",", $(AV)2, ")"}' >> $$@ +endef + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat $$< | awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ +endef + +$(sys-codes-generic): $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) cat $< | awk '/^__NR/{NR32=$$1; \ + sub("^__NR", "__NR32", NR32); \ + print "\n#ifndef ", NR32; \ + print "#define ", NR32, $$2; \ + print "#endif";}' >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ +mrproper-y += $(sys-codes-generic) + +$(sys-proto-generic): $(strip $(call map,sys-proto,$(sys-bits))) $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) echo "" >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ +mrproper-y += $(sys-proto-generic) + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat $$< | 
awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ +endef + +$(eval $(call map,gen-rule-sys-codes,$(sys-bits))) +$(eval $(call map,gen-rule-sys-proto,$(sys-bits))) +$(eval $(call map,gen-rule-sys-asm,$(sys-bits))) +$(eval $(call map,gen-rule-sys-exec-tbl,$(sys-bits))) + +$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h + $(call msg-gen, $@) + $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) + +std-headers-deps += $(call sys-codes,$(sys-bits)) +std-headers-deps += $(call sys-proto,$(sys-bits)) +std-headers-deps += $(call sys-asm,$(sys-bits)) +std-headers-deps += $(call sys-exec-tbl,$(sys-bits)) +std-headers-deps += $(sys-codes-generic) +std-headers-deps += $(sys-proto-generic) +std-headers-deps += $(sys-asm-types) +mrproper-y += $(std-headers-deps) diff --git a/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S b/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S new file mode 100644 index 000000000..fff894466 --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S @@ -0,0 +1,44 @@ +#include "common/asm/linkage.h" + +#define SYSCALL(name, opcode) \ +ENTRY(name); \ + addi.d $a7, $zero, opcode; \ + syscall 0; \ + jirl $r0, $r1, 0; \ +END(name) + +#ifndef AT_FDCWD +#define AT_FDCWD -100 +#endif + +#ifndef AT_REMOVEDIR +#define AT_REMOVEDIR 0x200 +#endif + +ENTRY(sys_open) + or $a3, $zero, $a2 + or $a2, $zero, $a1 + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_openat +END(sys_open) + +ENTRY(sys_mkdir) + or $a3, $zero, $a2 + or $a2, $zero, $a1 + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_mkdirat +END(sys_mkdir) + +ENTRY(sys_rmdir) + addi.d $a2, $zero, AT_REMOVEDIR + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_unlinkat +END(sys_rmdir) + +ENTRY(__cr_restore_rt) + addi.d $a7, $zero, __NR_rt_sigreturn + syscall 0 +END(__cr_restore_rt) diff --git 
a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl new file mode 100644 index 000000000..b37a22674 --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl @@ -0,0 +1,121 @@ +# +# System calls table, please make sure the table consists only of the syscalls +# really used somewhere in the project. +# Numbers follow the generic Linux syscall table (include/uapi/asm-generic/unistd.h), which LoongArch uses. +# +# __NR_name code name arguments +# ------------------------------------------------------------------------------------------------------------------------------------------------------------- +__NR_io_setup 0 sys_io_setup (unsigned nr_events, aio_context_t *ctx) +__NR_io_submit 2 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) +__NR_io_getevents 4 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +__NR_fcntl 25 sys_fcntl (int fd, int type, long arg) +__NR_ioctl 29 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_flock 32 sys_flock (int fd, unsigned long cmd) +__NR_mkdirat 34 sys_mkdirat (int dfd, const char *pathname, int flag) +__NR_unlinkat 35 sys_unlinkat (int dfd, const char *pathname, int flag) +__NR_umount2 39 sys_umount2 (char *name, int flags) +__NR_mount 40 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +__NR_fallocate 47 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) +__NR_close 57 sys_close (int fd) +__NR_openat 56 sys_openat (int dfd, const char *filename, int flags, int mode) +__NR_lseek 62 sys_lseek (int fd, unsigned long offset, unsigned long origin) +__NR_read 63 sys_read (int fd, void *buf, unsigned long count) +__NR_write 64 sys_write (int fd, const void *buf, unsigned long count) +__NR_pread64 67 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) +__NR_preadv 69 sys_preadv_raw (int fd, 
struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +__NR_ppoll 73 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_signalfd4 74 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +__NR_vmsplice 75 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +__NR_readlinkat 78 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) +__NR_timerfd_settime 86 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +__NR_capget 90 sys_capget (struct cap_header *h, struct cap_data *d) +__NR_capset 91 sys_capset (struct cap_header *h, struct cap_data *d) +__NR_personality 92 sys_personality (unsigned int personality) +__NR_exit 93 sys_exit (unsigned long error_code) +__NR_exit_group 94 sys_exit_group (int error_code) +__NR_waitid 95 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +__NR_set_tid_address 96 sys_set_tid_address (int *tid_addr) +__NR_futex 98 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_set_robust_list 99 sys_set_robust_list (struct robust_list_head *head, size_t len) +__NR_get_robust_list 100 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +__NR_nanosleep 101 sys_nanosleep (struct timespec *req, struct timespec *rem) +__NR_getitimer 102 sys_getitimer (int which, const struct itimerval *val) +__NR_setitimer 103 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) +__NR_sys_timer_create 107 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +__NR_sys_timer_gettime 108 sys_timer_gettime (int timer_id, const struct itimerspec *setting) +__NR_sys_timer_getoverrun 109 sys_timer_getoverrun (int timer_id) +__NR_sys_timer_settime 110 
sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +__NR_sys_timer_delete 111 sys_timer_delete (kernel_timer_t timer_id) +__NR_clock_gettime 113 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_sched_setscheduler 119 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) +__NR_restart_syscall 128 sys_restart_syscall (void) +__NR_kill 129 sys_kill (long pid, int sig) +__NR_sigaltstack 132 sys_sigaltstack (const void *uss, void *uoss) +__NR_rt_sigaction 134 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +__NR_rt_sigprocmask 135 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +__NR_rt_sigqueueinfo 138 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) +__NR_rt_sigreturn 139 sys_rt_sigreturn (void) +__NR_setpriority 140 sys_setpriority (int which, int who, int nice) +__NR_setresuid 147 sys_setresuid (int uid, int euid, int suid) +__NR_getresuid 148 sys_getresuid (int *uid, int *euid, int *suid) +__NR_setresgid 149 sys_setresgid (int gid, int egid, int sgid) +__NR_getresgid 150 sys_getresgid (int *gid, int *egid, int *sgid) +__NR_getpgid 155 sys_getpgid (pid_t pid) +__NR_setfsuid 151 sys_setfsuid (int fsuid) +__NR_setfsgid 152 sys_setfsgid (int fsgid) +__NR_getsid 156 sys_getsid (void) +__NR_getgroups 158 sys_getgroups (int gsize, unsigned int *groups) +__NR_setgroups 159 sys_setgroups (int gsize, unsigned int *groups) +__NR_setrlimit 164 sys_setrlimit (int resource, struct krlimit *rlim) +__NR_umask 166 sys_umask (int mask) +__NR_prctl 167 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +__NR_gettimeofday 169 sys_gettimeofday (struct timeval *tv, struct timezone *tz) +__NR_getpid 172 sys_getpid (void) +__NR_ptrace 177 sys_ptrace (long request, pid_t pid, void *addr, void *data) +__NR_gettid 178 sys_gettid 
(void) +__NR_shmat 196 sys_shmat (int shmid, void *shmaddr, int shmflag) +__NR_socket 198 sys_socket (int domain, int type, int protocol) +__NR_bind 200 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) +__NR_connect 203 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) +__NR_sendto 206 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +__NR_recvfrom 207 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +__NR_setsockopt 208 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +__NR_getsockopt 209 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) +__NR_shutdown 210 sys_shutdown (int sockfd, int how) +__NR_sendmsg 211 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) +__NR_recvmsg 212 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) +__NR_brk 214 sys_brk (void *addr) +__NR_munmap 215 sys_munmap (void *addr, unsigned long len) +__NR_mremap 216 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) +__NR_clone 220 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +__NR_mmap 222 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +__NR_mprotect 226 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) +__NR_mincore 232 sys_mincore (void *addr, unsigned long size, unsigned char *vec) +__NR_madvise 233 sys_madvise (unsigned long start, size_t len, int behavior) +__NR_rt_tgsigqueueinfo 240 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +__NR_wait4 260 sys_wait4 (int pid, int *status, int options, struct rusage *ru) +__NR_fanotify_init 262 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) 
+__NR_fanotify_mark 263 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +__NR_open_by_handle_at 265 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) +__NR_setns 268 sys_setns (int fd, int nstype) +__NR_kcmp 272 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +__NR_seccomp 277 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) +__NR_memfd_create 279 sys_memfd_create (const char *name, unsigned int flags) +__NR_userfaultfd 282 sys_userfaultfd (int flags) +__NR_rseq 293 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) +__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) +__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) +#__NR_dup2 ! sys_dup2 (int oldfd, int newfd) +#__NR_rmdir ! sys_rmdir (const char *name) +#__NR_unlink ! sys_unlink (char *pathname) +#__NR_cacheflush ! sys_cacheflush (char *addr, int nbytes, int cache) +#__NR_set_thread_area ! sys_set_thread_area (unsigned long *addr) +#__NR_mkdir ! sys_mkdir (const char *name, int mode) +#__NR_open ! 
sys_open (const char *filename, unsigned long flags, unsigned long mode) diff --git a/compel/arch/loongarch64/scripts/compel-pack.lds.S b/compel/arch/loongarch64/scripts/compel-pack.lds.S new file mode 100644 index 000000000..cfb7a2fb3 --- /dev/null +++ b/compel/arch/loongarch64/scripts/compel-pack.lds.S @@ -0,0 +1,32 @@ +OUTPUT_ARCH(loongarch) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .crblob 0x0 : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + . = ALIGN(32); + *(.data*) + . = ALIGN(32); + *(.rodata*) + . = ALIGN(32); + *(.bss*) + . = ALIGN(32); + *(.got*) + . = ALIGN(32); + *(.toc*) + . = ALIGN(32); + } =0x00000000, + + /DISCARD/ : { + *(.debug*) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + *(*) + } +} diff --git a/compel/arch/loongarch64/src/lib/cpu.c b/compel/arch/loongarch64/src/lib/cpu.c new file mode 100644 index 000000000..172b90e27 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/cpu.c @@ -0,0 +1,41 @@ +#include +#include + +#include "compel-cpu.h" +#include "common/bitops.h" +#include "common/compiler.h" +#include "log.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +static compel_cpuinfo_t rt_info; +static bool rt_info_done = false; + +void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ +} + +void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ +} + +int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ + return 0; +} + +int compel_cpuid(compel_cpuinfo_t *c) +{ + return 0; +} + +bool compel_cpu_has_feature(unsigned int feature) +{ + if (!rt_info_done) { + compel_cpuid(&rt_info); + rt_info_done = true; + } + + return compel_test_cpu_cap(&rt_info, feature); +} diff --git a/compel/arch/loongarch64/src/lib/handle-elf-host.c b/compel/arch/loongarch64/src/lib/handle-elf-host.c new file mode 100644 index 000000000..a605a5a45 --- /dev/null +++ 
b/compel/arch/loongarch64/src/lib/handle-elf-host.c @@ -0,0 +1,22 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +extern int __handle_elf(void *mem, size_t size); + +int handle_binary(void *mem, size_t size) +{ + if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) + return __handle_elf(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} diff --git a/compel/arch/loongarch64/src/lib/handle-elf.c b/compel/arch/loongarch64/src/lib/handle-elf.c new file mode 100644 index 000000000..a605a5a45 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/handle-elf.c @@ -0,0 +1,22 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +extern int __handle_elf(void *mem, size_t size); + +int handle_binary(void *mem, size_t size) +{ + if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) + return __handle_elf(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} diff --git a/compel/arch/loongarch64/src/lib/include/handle-elf.h b/compel/arch/loongarch64/src/lib/include/handle-elf.h new file mode 100644 index 000000000..b0a66ef87 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/handle-elf.h @@ -0,0 +1,8 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf64-types.h" + +#define arch_is_machine_supported(e_machine) (e_machine == EM_LOONGARCH) + +#endif /* COMPEL_HANDLE_ELF_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/syscall.h b/compel/arch/loongarch64/src/lib/include/syscall.h new file 
mode 100644 index 000000000..ac3e2799a --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/syscall.h @@ -0,0 +1,8 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ + +#ifndef SIGSTKFLT +#define SIGSTKFLT 16 +#endif + +#endif diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 000000000..21eb1309f --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,6 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT +extern int ptrace_set_breakpoint(pid_t pid, void *addr); +extern int ptrace_flush_breakpoints(pid_t pid); +#endif diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 000000000..e568df789 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_CPU_H__ +#define __CR_ASM_CPU_H__ + +typedef struct { +} compel_cpuinfo_t; +#endif /* __CR_ASM_CPU_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 000000000..7f476d541 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,4 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + +#endif /* __CR_ASM_FPU_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 000000000..0b047a5b0 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,67 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +/* + * From the Linux kernel header arch/loongarch/include/uapi/asm/ptrace.h + * + * A 
thread LoongArch CPU context + * + * struct user_fp_state { + * uint64_t fpr[32]; + * uint64_t fcc; + * uint32_t fcsr; + * }; + * + * struct user_pt_regs { + * unsigned long regs[32]; + * unsigned long csr_era; + * unsigned long csr_badv; + * unsigned long reserved[11]; + * }; + */ + +struct user_gp_regs { + uint64_t regs[32]; + uint64_t orig_a0; + uint64_t pc; + uint64_t csr_badv; + uint64_t reserved[10]; +} __attribute__((aligned(8))); + +struct user_fp_regs { + uint64_t regs[32]; + uint64_t fcc; + uint32_t fcsr; +}; + +typedef struct user_gp_regs user_regs_struct_t; +typedef struct user_fp_regs user_fpregs_struct_t; + +#define user_regs_native(regs) true + +#define __compel_arch_fetch_thread_area(tid, th) 0 +#define compel_arch_fetch_thread_area(tctl) 0 +#define compel_arch_get_tls_task(ctl, tls) +#define compel_arch_get_tls_thread(tctl, tls) + +#define REG_RES(r) ((uint64_t)(r).regs[4]) +#define REG_IP(r) ((uint64_t)(r).pc) +#define REG_SP(r) ((uint64_t)(r).regs[3]) +#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[11]) +#define SET_REG_IP(r, val) ((r).pc = (val)) + +#define GPR_NUM 32 +#define FPR_NUM 32 + +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 000000000..fcb545a1d --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,86 @@ +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include +#include +#include + +#include +#include + +#include + +#define rt_sigcontext sigcontext +/* sigcontext defined in usr/include/uapi/asm/sigcontext.h*/ +#include +typedef __u32 u32; + +typedef struct sigcontext_t { + __u64 pc; + __u64 regs[32]; + __u32 flags; + __u64 extcontext[0] __attribute__((__aligned__(16))); +} sigcontext_t; + +typedef struct context_info_t { + __u32 
magic; + __u32 size; + __u64 padding; +} context_info_t; + +#define FPU_CTX_MAGIC 0x46505501 +#define FPU_CTX_ALIGN 8 +typedef struct fpu_context_t { + __u64 regs[32]; + __u64 fcc; + __u64 fcsr; +} fpu_context_t; + +typedef struct ucontext { + unsigned long uc_flags; + struct ucontext *uc_link; + stack_t uc_stack; + sigset_t uc_sigmask; + __u8 __unused[1024 / 8 - sizeof(sigset_t)]; + sigcontext_t uc_mcontext; +} ucontext; + +/* Copy from the kernel source arch/loongarch/kernel/signal.c */ +struct rt_sigframe { + rt_siginfo_t rs_info; + ucontext rs_uc; +}; + +#define RT_SIGFRAME_UC(rt_sigframe) (&(rt_sigframe->rs_uc)) +#define RT_SIGFRAME_SIGMASK(rt_sigframe) ((k_rtsigset_t *)&RT_SIGFRAME_UC(rt_sigframe)->uc_sigmask) +#define RT_SIGFRAME_SIGCTX(rt_sigframe) (&(RT_SIGFRAME_UC(rt_sigframe)->uc_mcontext)) +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(RT_SIGFRAME_SIGCTX(rt_sigframe)->pc)) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) + +#define RT_SIGFRAME_FPU(rt_sigframe) \ + ({ \ + context_info_t *ctx = (context_info_t *)RT_SIGFRAME_SIGCTX(rt_sigframe)->extcontext; \ + ctx->magic = FPU_CTX_MAGIC; \ + ctx->size = sizeof(context_info_t) + sizeof(fpu_context_t); \ + (fpu_context_t *)((char *)ctx + sizeof(context_info_t)); \ + }) + +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 + +/* clang-format off */ +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ + asm volatile( \ + "addi.d $sp, %0, 0 \n" \ + "addi.d $a7, $zero, "__stringify(__NR_rt_sigreturn)" \n" \ + "syscall 0" \ + : \ + :"r"(new_sp) \ + : "$a7", "memory") +/* clang-format on */ + +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe); + +#define rt_sigframe_erase_sigset(sigframe) memset(RT_SIGFRAME_SIGMASK(sigframe), 0, sizeof(k_rtsigset_t)) +#define rt_sigframe_copy_sigset(sigframe, from) memcpy(RT_SIGFRAME_SIGMASK(sigframe), from, sizeof(k_rtsigset_t)) + +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ diff --git a/compel/arch/loongarch64/src/lib/infect.c 
b/compel/arch/loongarch64/src/lib/infect.c new file mode 100644 index 000000000..8e3c19aff --- /dev/null +++ b/compel/arch/loongarch64/src/lib/infect.c @@ -0,0 +1,204 @@ +#include +#include +#include +#include +#include + +#include +#include +#include "errno.h" +#include +#include +#include "common/err.h" +#include "common/page.h" +#include "asm/infect-types.h" +#include "ptrace.h" +#include "infect.h" +#include "infect-priv.h" +#include "log.h" +#include "common/bug.h" + +/* + * Injected syscall instruction + * loongarch64 is Little Endian + */ +const char code_syscall[] = { + 0x00, 0x00, 0x2b, 0x00, /* syscall */ + 0x00, 0x00, 0x2a, 0x00 /* break */ +}; + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + sigcontext_t *sc; + fpu_context_t *fpu; + + sc = RT_SIGFRAME_SIGCTX(sigframe); + memcpy(sc->regs, regs->regs, sizeof(regs->regs)); + sc->pc = regs->pc; + + fpu = RT_SIGFRAME_FPU(sigframe); + memcpy(fpu->regs, fpregs->regs, sizeof(fpregs->regs)); + fpu->fcc = fpregs->fcc; + fpu->fcsr = fpregs->fcsr; + return 0; +} + +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} + +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) +{ + user_fpregs_struct_t tmp, *fpregs = ext_regs ? 
ext_regs : &tmp; + struct iovec iov; + int ret; + + pr_info("Dumping GP/FPU registers for %d\n", pid); + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))) { + pr_perror("Failed to obtain CPU registers for %d", pid); + goto err; + } + + /* + * Refer to Linux kernel arch/loongarch/kernel/signal.c + */ + if (regs->regs[0]) { + switch (regs->regs[4]) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + regs->regs[4] = regs->orig_a0; + regs->pc -= 4; + break; + case -ERESTART_RESTARTBLOCK: + regs->regs[4] = regs->orig_a0; + regs->regs[11] = __NR_restart_syscall; + regs->pc -= 4; + break; + } + regs->regs[0] = 0; /* Don't deal with this again. */ + } + + iov.iov_base = fpregs; + iov.iov_len = sizeof(user_fpregs_struct_t); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { + pr_perror("Failed to obtain FPU registers for %d", pid); + goto err; + } + + ret = save(arg, regs, fpregs); +err: + return 0; +} + +int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + struct iovec iov; + + pr_info("Restoring GP/FPU registers for %d\n", pid); + + iov.iov_base = ext_regs; + iov.iov_len = sizeof(*ext_regs); + if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { + pr_perror("Failed to set FPU registers for %d", pid); + return -1; + } + return 0; +} + +/* + * Registers $4 ~ $11 represents arguments a0 ~ a7, especially a7 is + * used as syscall number. 
+ */ +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) +{ + int err; + user_regs_struct_t regs = ctl->orig.regs; + + regs.regs[11] = (unsigned long)nr; + regs.regs[4] = arg1; + regs.regs[5] = arg2; + regs.regs[6] = arg3; + regs.regs[7] = arg4; + regs.regs[8] = arg5; + regs.regs[9] = arg6; + err = compel_execute_syscall(ctl, &regs, code_syscall); + + *ret = regs.regs[4]; + + return err; +} + +void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + long map; + int err; + + err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset >> PAGE_SHIFT); + + if (err < 0 || IS_ERR_VALUE(map)) { + pr_err("remote mmap() failed: %s\n", strerror(-map)); + return NULL; + } + + return (void *)map; +} + +/* + * regs must be inited when calling this function from original context + */ +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->pc = new_ip; + if (stack) + regs->regs[4] = (unsigned long)stack; +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + return true; +} + +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + long ret; + int err; + + err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->rs_uc.uc_stack, 0, 0, 0, 0); + return err ? 
err : ret; +} + +/* + * TODO: add feature + */ +int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +/* + * Refer to Linux kernel arch/loongarch/include/asm/processor.h + */ +#define TASK_SIZE32 (1UL) << 31 +#define TASK_SIZE64_MIN (1UL) << 40 +#define TASK_SIZE64_MAX (1UL) << 48 + +unsigned long compel_task_size(void) +{ + unsigned long task_size; + for (task_size = TASK_SIZE64_MIN; task_size < TASK_SIZE64_MAX; task_size <<= 1) + if (munmap((void *)task_size, page_size())) + break; + return task_size; +} diff --git a/compel/src/main.c b/compel/src/main.c index ef05a46d0..bc16c0ab4 100644 --- a/compel/src/main.c +++ b/compel/src/main.c @@ -57,6 +57,9 @@ static const flags_t flags = { #elif defined CONFIG_MIPS .arch = "mips", .cflags = COMPEL_CFLAGS_PIE, +#elif defined CONFIG_LOONGARCH64 + .arch = "loongarch64", + .cflags = COMPEL_CFLAGS_PIE, #else #error "CONFIG_ not defined, or unsupported ARCH" #endif diff --git a/scripts/nmk/scripts/include.mk b/scripts/nmk/scripts/include.mk index c1c1e94af..55c5be307 100644 --- a/scripts/nmk/scripts/include.mk +++ b/scripts/nmk/scripts/include.mk @@ -20,7 +20,8 @@ ARCH ?= $(shell echo $(SUBARCH) | sed \ -e s/ppc64.*/ppc64/ \ -e s/mips.*/mips/ \ -e s/sh[234].*/sh/ \ - -e s/aarch64.*/aarch64/) + -e s/aarch64.*/aarch64/ \ + -e s/loongarch64.*/loongarch64/) export SUBARCH ARCH From ec6dc2d5c0de5b1459ba0b6a91b686ca1db2fdf9 Mon Sep 17 00:00:00 2001 From: znley Date: Mon, 12 Jun 2023 15:09:22 +0800 Subject: [PATCH 270/775] images: add loongarch64 core image Signed-off-by: znley --- images/Makefile | 1 + images/core-loongarch64.proto | 23 +++++++++++++++++++++++ images/core.proto | 3 +++ 3 files changed, 27 insertions(+) create mode 100755 images/core-loongarch64.proto diff --git a/images/Makefile b/images/Makefile index 004e22ec3..ca85b1a21 100644 --- a/images/Makefile +++ b/images/Makefile @@ -2,6 +2,7 @@ proto-obj-y += stats.o proto-obj-y += core.o 
proto-obj-y += core-x86.o proto-obj-y += core-mips.o +proto-obj-y += core-loongarch64.o proto-obj-y += core-arm.o proto-obj-y += core-aarch64.o proto-obj-y += core-ppc64.o diff --git a/images/core-loongarch64.proto b/images/core-loongarch64.proto new file mode 100755 index 000000000..8258f006e --- /dev/null +++ b/images/core-loongarch64.proto @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +import "opts.proto"; + +message user_loongarch64_gpregs_entry { + repeated uint64 regs = 1; + required uint64 pc = 2; +} + +message user_loongarch64_fpregs_entry { + repeated uint64 regs = 1; + required uint64 fcc = 2; + required uint32 fcsr = 3; +} + +message thread_info_loongarch64 { + required uint64 clear_tid_addr = 1[(criu).hex = true]; + required uint64 tls = 2; + required user_loongarch64_gpregs_entry gpregs = 3[(criu).hex = true]; + required user_loongarch64_fpregs_entry fpregs = 4[(criu).hex = true]; +} diff --git a/images/core.proto b/images/core.proto index eddd1dc55..1882fe8e4 100644 --- a/images/core.proto +++ b/images/core.proto @@ -8,6 +8,7 @@ import "core-aarch64.proto"; import "core-ppc64.proto"; import "core-s390.proto"; import "core-mips.proto"; +import "core-loongarch64.proto"; import "rlimit.proto"; import "timer.proto"; @@ -122,6 +123,7 @@ message core_entry { PPC64 = 4; S390 = 5; MIPS = 6; + LOONGARCH64 = 7; } required march mtype = 1; @@ -131,6 +133,7 @@ message core_entry { optional thread_info_ppc64 ti_ppc64 = 9; optional thread_info_s390 ti_s390 = 10; optional thread_info_mips ti_mips = 11; + optional thread_info_loongarch64 ti_loongarch64 = 12; optional task_core_entry tc = 3; optional task_kobj_ids_entry ids = 4; From ae0811475783561bf5ce0ea39b9e88e9c8135ebb Mon Sep 17 00:00:00 2001 From: znley Date: Mon, 12 Jun 2023 15:15:30 +0800 Subject: [PATCH 271/775] criu: add loongarch64 support to parasite and restorer Signed-off-by: znley --- criu/arch/loongarch64/Makefile | 14 +++ criu/arch/loongarch64/cpu.c | 31 +++++ 
criu/arch/loongarch64/crtools.c | 115 ++++++++++++++++++ criu/arch/loongarch64/include/asm/dump.h | 15 +++ criu/arch/loongarch64/include/asm/int.h | 6 + criu/arch/loongarch64/include/asm/kerndat.h | 7 ++ .../include/asm/parasite-syscall.h | 6 + criu/arch/loongarch64/include/asm/parasite.h | 11 ++ criu/arch/loongarch64/include/asm/restore.h | 33 +++++ criu/arch/loongarch64/include/asm/restorer.h | 97 +++++++++++++++ .../loongarch64/include/asm/thread_pointer.h | 27 ++++ criu/arch/loongarch64/include/asm/types.h | 39 ++++++ criu/arch/loongarch64/include/asm/vdso.h | 27 ++++ criu/arch/loongarch64/restorer.c | 14 +++ criu/arch/loongarch64/sigframe.c | 12 ++ criu/arch/loongarch64/vdso-pie.c | 48 ++++++++ 16 files changed, 502 insertions(+) create mode 100644 criu/arch/loongarch64/Makefile create mode 100644 criu/arch/loongarch64/cpu.c create mode 100644 criu/arch/loongarch64/crtools.c create mode 100644 criu/arch/loongarch64/include/asm/dump.h create mode 100644 criu/arch/loongarch64/include/asm/int.h create mode 100644 criu/arch/loongarch64/include/asm/kerndat.h create mode 100644 criu/arch/loongarch64/include/asm/parasite-syscall.h create mode 100644 criu/arch/loongarch64/include/asm/parasite.h create mode 100644 criu/arch/loongarch64/include/asm/restore.h create mode 100644 criu/arch/loongarch64/include/asm/restorer.h create mode 100644 criu/arch/loongarch64/include/asm/thread_pointer.h create mode 100644 criu/arch/loongarch64/include/asm/types.h create mode 100644 criu/arch/loongarch64/include/asm/vdso.h create mode 100644 criu/arch/loongarch64/restorer.c create mode 100644 criu/arch/loongarch64/sigframe.c create mode 100644 criu/arch/loongarch64/vdso-pie.c diff --git a/criu/arch/loongarch64/Makefile b/criu/arch/loongarch64/Makefile new file mode 100644 index 000000000..4bd99eb7e --- /dev/null +++ b/criu/arch/loongarch64/Makefile @@ -0,0 +1,14 @@ +builtin-name := crtools.built-in.o + +ccflags-y += -iquote $(obj)/include +ccflags-y += -iquote criu/include -iquote 
include +ccflags-y += $(COMPEL_UAPI_INCLUDES) + +asflags-y += -Wstrict-prototypes +asflags-y += -D__ASSEMBLY__ -nostdlib -fomit-frame-pointer +asflags-y += -iquote $(obj)/include +ldflags-y += -r -z noexecstack + +obj-y += cpu.o +obj-y += crtools.o +obj-y += sigframe.o diff --git a/criu/arch/loongarch64/cpu.c b/criu/arch/loongarch64/cpu.c new file mode 100644 index 000000000..5559c4288 --- /dev/null +++ b/criu/arch/loongarch64/cpu.c @@ -0,0 +1,31 @@ +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +int cpu_init(void) +{ + return 0; +} + +int cpu_dump_cpuinfo(void) +{ + return 0; +} + +int cpu_validate_cpuinfo(void) +{ + return 0; +} + +int cpuinfo_dump(void) +{ + if (cpu_init()) + return -1; + if (cpu_dump_cpuinfo()) + return -1; + return 0; +} + +int cpuinfo_check(void) +{ + return 0; +} diff --git a/criu/arch/loongarch64/crtools.c b/criu/arch/loongarch64/crtools.c new file mode 100644 index 000000000..eeb0731ca --- /dev/null +++ b/criu/arch/loongarch64/crtools.c @@ -0,0 +1,115 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "log.h" +#include "asm/restorer.h" +#include "asm/parasite-syscall.h" +#include +#include "asm/dump.h" +#include "cr_options.h" +#include "common/compiler.h" +#include "restorer.h" +#include "parasite-syscall.h" +#include "util.h" +#include "cpu.h" +#include +#include "kerndat.h" + +#include "protobuf.h" +#include "images/core.pb-c.h" +#include "images/creds.pb-c.h" + +#define assign_reg(dst, src, e) (dst)->e = (__typeof__(dst->e))(src)->e + +int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + int i; + CoreEntry *core = x; + UserLoongarch64GpregsEntry *gprs = core->ti_loongarch64->gpregs; + UserLoongarch64FpregsEntry *fprs = core->ti_loongarch64->fpregs; + for (i = 0; i < GPR_NUM; i++) + assign_reg(gprs, regs, regs[i]); + assign_reg(gprs, regs, pc); + + for (i = 0; i < FPR_NUM; i++) + assign_reg(fpregs, fpregs, regs[i]); + 
assign_reg(fprs, fpregs, fcc); + assign_reg(fprs, fpregs, fcsr); + return 0; +} + +int arch_alloc_thread_info(CoreEntry *core) +{ + ThreadInfoLoongarch64 *ti_loongarch64; + UserLoongarch64GpregsEntry *gpregs; + UserLoongarch64FpregsEntry *fpregs; + + ti_loongarch64 = xmalloc(sizeof(*ti_loongarch64)); + thread_info_loongarch64__init(ti_loongarch64); + core->ti_loongarch64 = ti_loongarch64; + + gpregs = xmalloc(sizeof(*gpregs)); + if (!gpregs) + goto err; + user_loongarch64_gpregs_entry__init(gpregs); + gpregs->n_regs = GPR_NUM; + gpregs->regs = xmalloc(GPR_NUM * sizeof(uint64_t)); + if (!gpregs->regs) + goto err; + ti_loongarch64->gpregs = gpregs; + + fpregs = xmalloc(sizeof(*fpregs)); + if (!fpregs) + goto err; + user_loongarch64_fpregs_entry__init(fpregs); + fpregs->n_regs = FPR_NUM; + fpregs->regs = xmalloc(FPR_NUM * sizeof(uint64_t)); + if (!fpregs->regs) + goto err; + ti_loongarch64->fpregs = fpregs; + + return 0; +err: + return -1; +} + +void arch_free_thread_info(CoreEntry *core) +{ + if (CORE_THREAD_ARCH_INFO(core)) { + if (CORE_THREAD_ARCH_INFO(core)->fpregs) { + xfree(CORE_THREAD_ARCH_INFO(core)->fpregs->regs); + xfree(CORE_THREAD_ARCH_INFO(core)->fpregs); + } + xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs); + xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); + xfree(CORE_THREAD_ARCH_INFO(core)); + CORE_THREAD_ARCH_INFO(core) = NULL; + } +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) +{ + fpu_context_t *fpu = RT_SIGFRAME_FPU(sigframe); + UserLoongarch64FpregsEntry *fpregs = core->ti_loongarch64->fpregs; + + memcpy(fpu->regs, fpregs->regs, sizeof(fpu->regs)); + fpu->fcc = fpregs->fcc; + fpu->fcsr = fpregs->fcsr; + return 0; +} + +int restore_gpregs(struct rt_sigframe *sigframe, UserRegsEntry *r) +{ + sigcontext_t *sc = RT_SIGFRAME_SIGCTX(sigframe); + memcpy(sc->regs, r->regs, sizeof(sc->regs)); + sc->pc = r->pc; + return 0; +} diff --git a/criu/arch/loongarch64/include/asm/dump.h b/criu/arch/loongarch64/include/asm/dump.h new file mode 
100644 index 000000000..04347155c --- /dev/null +++ b/criu/arch/loongarch64/include/asm/dump.h @@ -0,0 +1,15 @@ +#ifndef __CR_ASM_DUMP_H__ +#define __CR_ASM_DUMP_H__ + +extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int arch_alloc_thread_info(CoreEntry *core); +extern void arch_free_thread_info(CoreEntry *core); + +static inline void core_put_tls(CoreEntry *core, tls_t tls) +{ + core->ti_loongarch64->tls = tls; +} + +#define get_task_futex_robust_list_compat(pid, info) -1 + +#endif diff --git a/criu/arch/loongarch64/include/asm/int.h b/criu/arch/loongarch64/include/asm/int.h new file mode 100644 index 000000000..642804e9b --- /dev/null +++ b/criu/arch/loongarch64/include/asm/int.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_INT_H__ +#define __CR_ASM_INT_H__ + +#include "asm-generic/int.h" + +#endif /* __CR_ASM_INT_H__ */ diff --git a/criu/arch/loongarch64/include/asm/kerndat.h b/criu/arch/loongarch64/include/asm/kerndat.h new file mode 100644 index 000000000..bb70cf6cf --- /dev/null +++ b/criu/arch/loongarch64/include/asm/kerndat.h @@ -0,0 +1,7 @@ +#ifndef __CR_ASM_KERNDAT_H__ +#define __CR_ASM_KERNDAT_H__ + +#define kdat_compatible_cr() 0 +#define kdat_can_map_vdso() 0 + +#endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/loongarch64/include/asm/parasite-syscall.h b/criu/arch/loongarch64/include/asm/parasite-syscall.h new file mode 100644 index 000000000..6008c3792 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/parasite-syscall.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_PARASITE_SYSCALL_H__ +#define __CR_ASM_PARASITE_SYSCALL_H__ + +struct parasite_ctl; + +#endif diff --git a/criu/arch/loongarch64/include/asm/parasite.h b/criu/arch/loongarch64/include/asm/parasite.h new file mode 100644 index 000000000..b64cb3185 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/parasite.h @@ -0,0 +1,11 @@ +#ifndef __ASM_PARASITE_H__ +#define __ASM_PARASITE_H__ + +static inline void arch_get_tls(tls_t *ptls) +{ + tls_t tls; + asm 
volatile("or %0, $zero, $tp" : "=r"(tls)); + *ptls = tls; +} + +#endif diff --git a/criu/arch/loongarch64/include/asm/restore.h b/criu/arch/loongarch64/include/asm/restore.h new file mode 100644 index 000000000..d956231c8 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/restore.h @@ -0,0 +1,33 @@ +#ifndef __CR_ASM_RESTORE_H__ +#define __CR_ASM_RESTORE_H__ + +#include "asm/restorer.h" +#include "images/core.pb-c.h" + +/* clang-format off */ +#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, task_args) \ +({ \ + uint64_t save_sp; \ + asm volatile("or %0, $zero, $sp" : "=r"(save_sp) : :"memory"); \ + asm volatile( \ + "or $a0, $zero, %2 \n" \ + "or $sp, $zero, %0 \n" \ + "jirl $ra, %1, 0 \n" \ + : \ + : "r"(new_sp & ~15), \ + "r"(restore_task_exec_start), \ + "r"(task_args) \ + : "$a0", "memory"); \ + asm volatile("or $sp, $zero, %0" : : "r"(save_sp) : "memory"); \ +}) + +/* clang-format on */ + +static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) +{ + *ptls = pcore->ti_loongarch64->tls; +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); + +#endif diff --git a/criu/arch/loongarch64/include/asm/restorer.h b/criu/arch/loongarch64/include/asm/restorer.h new file mode 100644 index 000000000..7a0d35c5b --- /dev/null +++ b/criu/arch/loongarch64/include/asm/restorer.h @@ -0,0 +1,97 @@ +#ifndef __CR_ASM_RESTORER_H__ +#define __CR_ASM_RESTORER_H__ + +#include "asm/types.h" +#include +#include "images/core.pb-c.h" +#include +#include + +/* clang-format off */ +#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ + thread_args, clone_restore_fn) \ + asm volatile( \ + "clone_emul: \n" \ + "ld.d $a1, %2 \n" \ + "addi.d $a1, $a1, -16 \n" \ + "st.d %5, $a1, 0 \n" \ + "st.d %6, $a1, 8 \n" \ + "or $a0, $zero, %1 \n" \ + "or $a2, $zero, %3 \n" \ + "or $a3, $zero, %4 \n" \ + "ori $a7, $zero, "__stringify(__NR_clone)" \n" \ + "syscall 0 \n" \ + \ + "beqz $a0, thread_run \n" \ + \ + "or %0, $zero, $a0 \n" \ + "b clone_end \n" 
\ + \ + "thread_run: \n" \ + "ld.d $a1, $sp, 0 \n" \ + "ld.d $a0, $sp, 8 \n" \ + "jirl $ra, $a1, 0 \n" \ + \ + "clone_end: \n" \ + : "=r"(ret) \ + : "r"(clone_flags), \ + "ZB"(new_sp), \ + "r"(&parent_tid), \ + "r"(&thread_args[i].pid), \ + "r"(&clone_restore_fn), \ + "r"(&thread_args[i]) \ + : "$a0", "$a1", "$a2", "$a3", "$a7", "memory") + +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + "clone3_emul: \n" \ + "or $a0, $zero, %1 \n" \ + "or $a1, $zero, %2 \n" \ + "or $a2, $zero, %3 \n" \ + "or $a3, $zero, %4 \n" \ + "ori $a7, $zero, "__stringify(__NR_clone3)" \n" \ + "syscall 0 \n" \ + \ + "beqz $a0, clone3_thread_run \n" \ + \ + "or %0, $zero, $a0 \n" \ + "b clone3_end \n" \ + \ + "clone3_thread_run: \n" \ + "or $a0, $zero, $a3 \n" \ + "jirl $ra, $a2, 0 \n" \ + "clone3_end: \n" \ + : "=r"(ret) \ + : "r"(&clone_args), \ + "r"(size), \ + "r"(clone_restore_fn), \ + "r"(args) \ + : "$a0", "$a1", "$a2", "$a3", "$a7", "memory") +/* clang-format on */ + +static inline void restore_tls(tls_t *ptls) +{ + asm volatile("or $tp, $zero, %0" : : "r"(*ptls)); +} +static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) +{ + return -1; +} +static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) +{ + return -1; +} +static inline void *alloc_compat_syscall_stack(void) +{ + return NULL; +} +static inline void free_compat_syscall_stack(void *stack32) +{ +} +int restore_gpregs(struct rt_sigframe *f, UserLoongarch64GpregsEntry *r); +int restore_nonsigframe_gpregs(UserLoongarch64GpregsEntry *r); + +#define arch_map_vdso(map, compat) -1 + +#endif diff --git a/criu/arch/loongarch64/include/asm/thread_pointer.h b/criu/arch/loongarch64/include/asm/thread_pointer.h new file mode 100644 index 000000000..f7e07066a --- /dev/null +++ b/criu/arch/loongarch64/include/asm/thread_pointer.h @@ -0,0 +1,27 @@ +/* __thread_pointer definition. Generic version. + Copyright (C) 2021 Free Software Foundation, Inc. 
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _SYS_THREAD_POINTER_H +#define _SYS_THREAD_POINTER_H + +static inline void *__criu_thread_pointer(void) +{ + return __builtin_thread_pointer(); +} + +#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/loongarch64/include/asm/types.h b/criu/arch/loongarch64/include/asm/types.h new file mode 100644 index 000000000..72bca2022 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/types.h @@ -0,0 +1,39 @@ +#ifndef __CR_ASM_TYPES_H__ +#define __CR_ASM_TYPES_H__ + +#include +#include + +#include "page.h" +#include "bitops.h" +#include "asm/int.h" +#include "images/core.pb-c.h" + +#include + +#define core_is_compat(core) false + +#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__LOONGARCH64 + +#define CORE_THREAD_ARCH_INFO(core) core->ti_loongarch64 + +#define TI_SP(core) ((core)->ti_loongarch64->gpregs->regs[4]) + +#define TI_IP(core) ((core)->ti_loongarch64->gpregs->pc) + +typedef UserLoongarch64GpregsEntry UserRegsEntry; + +static inline uint64_t encode_pointer(void *p) +{ + return (uint64_t)p; +} +static inline void *decode_pointer(uint64_t v) +{ + return (void *)v; +} + +#define AT_VECTOR_SIZE 44 +typedef uint64_t auxv_t; +typedef uint64_t tls_t; + +#endif /* __CR_ASM_TYPES_H__ */ diff --git a/criu/arch/loongarch64/include/asm/vdso.h b/criu/arch/loongarch64/include/asm/vdso.h new 
file mode 100644 index 000000000..64631dee0 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/vdso.h @@ -0,0 +1,27 @@ +#ifndef __CR_ASM_VDSO_H__ +#define __CR_ASM_VDSO_H__ + +#include "asm/int.h" +#include "asm-generic/vdso.h" + +/* This definition is used in pie/util-vdso.c to initialize the vdso symbol + * name string table 'vdso_symbols' + */ + +/* + * This is a minimal amount of symbols + * we should support at the moment. + */ +#define VDSO_SYMBOL_MAX 5 +#define VDSO_SYMBOL_GTOD 3 + +#define ARCH_VDSO_SYMBOLS_LIST \ + const char *aarch_vdso_symbol1 = "__vdso_getcpu"; \ + const char *aarch_vdso_symbol2 = "__vdso_clock_getres"; \ + const char *aarch_vdso_symbol3 = "__vdso_clock_gettime"; \ + const char *aarch_vdso_symbol4 = "__vdso_gettimeofday"; \ + const char *aarch_vdso_symbol5 = "__vdso_rt_sigreturn"; + +#define ARCH_VDSO_SYMBOLS \ + aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4, aarch_vdso_symbol5 +#endif diff --git a/criu/arch/loongarch64/restorer.c b/criu/arch/loongarch64/restorer.c new file mode 100644 index 000000000..730318ac1 --- /dev/null +++ b/criu/arch/loongarch64/restorer.c @@ -0,0 +1,14 @@ +#include + +#include "restorer.h" +#include "asm/restorer.h" +#include + +#include +#include "log.h" +#include "cpu.h" + +int restore_nonsigframe_gpregs(UserLoongarch64GpregsEntry *r) +{ + return 0; +} diff --git a/criu/arch/loongarch64/sigframe.c b/criu/arch/loongarch64/sigframe.c new file mode 100644 index 000000000..18983ff13 --- /dev/null +++ b/criu/arch/loongarch64/sigframe.c @@ -0,0 +1,12 @@ +#include +#include + +#include "asm/sigframe.h" +#include "asm/types.h" + +#include "log.h" +#include +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} diff --git a/criu/arch/loongarch64/vdso-pie.c b/criu/arch/loongarch64/vdso-pie.c new file mode 100644 index 000000000..7a75d2741 --- /dev/null +++ b/criu/arch/loongarch64/vdso-pie.c @@ -0,0 +1,48 @@ +#include +#include 
"asm/types.h" + +#include +#include +#include "parasite-vdso.h" +#include "log.h" +#include "common/bug.h" + +#ifdef LOG_PREFIX +#undef LOG_PREFIX +#endif +#define LOG_PREFIX "vdso: " +static void insert_trampoline(uintptr_t from, uintptr_t to) +{ + struct { + uint32_t pcaddi; + uint32_t ldptr; + uint32_t jirl; + uint32_t guards; + uint64_t imm64; + } __packed jmp = { + .pcaddi = 0x18000095, /* pcaddi $x, 4 */ + .ldptr = 0x260002b5, /* ldptr.d $x, $x, 0 */ + .jirl = 0x4c0002a0, /* jirl $zero, $x, 0 */ + .guards = 0x002a0000, /* break 0 */ + .imm64 = to, + }; + memcpy((void *)from, &jmp, sizeof(jmp)); +} + +int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *sto, + struct vdso_symtable *sfrom, bool compat_vdso) +{ + unsigned int i; + unsigned long from, to; + for (i = 0; i < ARRAY_SIZE(sto->symbols); i++) { + if (vdso_symbol_empty(&sfrom->symbols[i])) + continue; + pr_debug("br: %lx/%lx -> %lx/%lx (index %d)\n", base_from, sfrom->symbols[i].offset, base_to, + sto->symbols[i].offset, i); + + from = base_from + sfrom->symbols[i].offset; + to = base_to + sto->symbols[i].offset; + insert_trampoline(from, to); + } + return 0; +} From 788e1e92ef092478f95d6c9f53c4417d8aa1da1e Mon Sep 17 00:00:00 2001 From: znley Date: Mon, 12 Jun 2023 15:26:35 +0800 Subject: [PATCH 272/775] zdtm: add loongarch64 support Signed-off-by: znley --- .../lib/arch/loongarch64/include/asm/atomic.h | 49 +++++++++++++++++++ test/zdtm/lib/test.c | 2 +- 2 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 test/zdtm/lib/arch/loongarch64/include/asm/atomic.h diff --git a/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h b/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h new file mode 100644 index 000000000..1803aaeb4 --- /dev/null +++ b/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h @@ -0,0 +1,49 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +typedef uint32_t atomic_t; + +#define atomic_get(v) (*(volatile int *)v) +#define 
atomic_set(v, i) (*(v) = (i)) + +static inline int __atomic_add(int i, atomic_t *v) +{ + int result; + asm volatile("amadd_db.w %1, %2, %0" : "+ZB"(*v), "=&r"(result) : "r"(i) : "memory"); + return result + i; +} + +static inline void atomic_add(int i, atomic_t *v) +{ + __atomic_add(i, v); +} + +static inline int atomic_add_return(int i, atomic_t *v) +{ + return __atomic_add(i, v); +} + +#define atomic_sub(i, v) atomic_add(-(int)i, v) +#define atomic_sub_return(i, v) atomic_add_return(-(int)i, v) +#define atomic_inc(v) atomic_add_return(1, v) +#define atomic_dec(v) atomic_sub_return(1, v) +#define atomic_dec_return(v) atomic_sub_return(1, v) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + int ret; + asm volatile("1: \n" + " ll.w %0, %1 \n" + " bne %0, %2, 2f \n" + " or $t0, %3, $zero \n" + " sc.w $t0, %1 \n" + " beqz $t0, 1b \n" + "2: \n" + " dbar 0 \n" + : "=&r"(ret), "+ZB"(*ptr) + : "r"(old), "r"(new) + : "t0", "memory"); + return ret; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/test/zdtm/lib/test.c b/test/zdtm/lib/test.c index 6291ea4a7..a5ba38b2d 100644 --- a/test/zdtm/lib/test.c +++ b/test/zdtm/lib/test.c @@ -406,7 +406,7 @@ pid_t sys_clone_unified(unsigned long flags, void *child_stack, void *parent_tid { #ifdef __x86_64__ return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, child_tid, newtls); -#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__) +#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__ || __loongarch64) return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, newtls, child_tid); #elif __s390x__ return (pid_t)syscall(__NR_clone, child_stack, flags, parent_tid, child_tid, newtls); From e25a243b28d3a4a4ab3e1158a78214984ef476b7 Mon Sep 17 00:00:00 2001 From: znley Date: Tue, 11 Jul 2023 15:20:00 +0800 Subject: [PATCH 273/775] ci: add workflow for loongarch64 Signed-off-by: znley --- .github/workflows/loongarch64-qemu-test.yml | 15 +++++ scripts/ci/Makefile | 5 
++ scripts/ci/loongarch64-qemu-test.sh | 69 +++++++++++++++++++++ 3 files changed, 89 insertions(+) create mode 100644 .github/workflows/loongarch64-qemu-test.yml create mode 100755 scripts/ci/loongarch64-qemu-test.sh diff --git a/.github/workflows/loongarch64-qemu-test.yml b/.github/workflows/loongarch64-qemu-test.yml new file mode 100644 index 000000000..ba22fa25f --- /dev/null +++ b/.github/workflows/loongarch64-qemu-test.yml @@ -0,0 +1,15 @@ +name: LoongArch64 Qemu Test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: loongarch64-qemu-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v2 + - run: sudo make -C scripts/ci loongarch64-qemu-test diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 5c4579103..ce844a17c 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -110,5 +110,10 @@ check-commit: .PHONY: check-commit +loongarch64-qemu-test: + ./loongarch64-qemu-test.sh + +.PHONY: loongarch64-qemu-test + %: $(MAKE) -C ../build $@$(target-suffix) diff --git a/scripts/ci/loongarch64-qemu-test.sh b/scripts/ci/loongarch64-qemu-test.sh new file mode 100755 index 000000000..52e587619 --- /dev/null +++ b/scripts/ci/loongarch64-qemu-test.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +set -o nounset +set -o errexit +set -x + +./apt-install \ + apt-transport-https \ + ca-certificates \ + curl \ + software-properties-common \ + sshpass \ + openssh-client + +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - + +add-apt-repository \ + "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ + $(lsb_release -cs) \ + stable test" + +./apt-install docker-ce + +# shellcheck source=/dev/null +. 
/etc/lsb-release + +# docker checkpoint and restore is an experimental feature +echo '{ "experimental": true }' > /etc/docker/daemon.json +service docker restart + +docker info + +# run a loongarch64 vm + +PORT='2222' +USER='root' +PASSWORD='loongarch64' +NAME='vm' + +docker run \ + -d \ + --net host \ + --name $NAME \ + merore/archlinux-loongarch64 + +run() { + if [ -z "$1" ]; then + echo "Command cannot be empty." + exit 1 + fi + sshpass -p $PASSWORD ssh -o StrictHostKeyChecking=no -p $PORT $USER@127.0.0.1 "$1" +} + +# wait vm to start +while (! run "uname -a") +do + echo "Wait vm to start..." + sleep 1 +done +echo "The loongarch64 vm is started!" + +# Tar criu and send to vm +tar -cf criu.tar ../../../criu +sshpass -p $PASSWORD scp -o StrictHostKeyChecking=no -P $PORT criu.tar $USER@127.0.0.1:/root + +# build and test +run 'cd /root; tar -xf criu.tar' +run 'cd /root/criu; make -j4' +run "cd /root/criu; ./test/zdtm.py run -t zdtm/static/maps02 -t zdtm/static/maps05 -t zdtm/static/maps06 -t zdtm/static/maps10 -t zdtm/static/maps_file_prot -t zdtm/static/memfd00 -t zdtm/transition/fork -t zdtm/transition/fork2 -t zdtm/transition/shmem -f h" From 21560270dd0a900d683bc01ce20e0c8f8050fba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 20:46:33 +0200 Subject: [PATCH 274/775] util: Implement fchown() and fchmod() wrappers. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add generic wrappers for fchown() and fchmod() that skip the calls if no changes are needed. This will allow to unify places where we can avoid errors when no-op requests are not permitted. 
Signed-off-by: Michał Mirosław --- criu/include/util.h | 4 +++ criu/util.c | 83 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/criu/include/util.h b/criu/include/util.h index 4b4dfda95..7e4a13a6a 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -263,6 +263,10 @@ bool is_path_prefix(const char *path, const char *prefix); FILE *fopenat(int dirfd, char *path, char *cflags); void split(char *str, char token, char ***out, int *n); +int cr_fchown(int fd, uid_t new_uid, gid_t new_gid); +int cr_fchperm(int fd, uid_t new_uid, gid_t new_gid, mode_t new_mode); +int cr_fchpermat(int dirfd, const char *path, uid_t new_uid, gid_t new_gid, mode_t new_mode, int flags); + int fd_has_data(int lfd); int make_yard(char *path); diff --git a/criu/util.c b/criu/util.c index aa73083bd..bca7ad88a 100644 --- a/criu/util.c +++ b/criu/util.c @@ -952,6 +952,89 @@ FILE *fopenat(int dirfd, char *path, char *cflags) return fdopen(tmp, cflags); } +int cr_fchown(int fd, uid_t new_uid, gid_t new_gid) +{ + struct stat st; + + if (!fchown(fd, new_uid, new_gid)) + return 0; + if (errno != EPERM) + return -1; + + if (fstat(fd, &st) < 0) { + pr_perror("fstat() after fchown() for fd %d", fd); + goto out_eperm; + } + pr_debug("fstat(%d): uid %u gid %u\n", fd, st.st_uid, st.st_gid); + + if (new_uid != st.st_uid || new_gid != st.st_gid) + goto out_eperm; + + return 0; +out_eperm: + errno = EPERM; + return -1; +} + +int cr_fchpermat(int dirfd, const char *path, uid_t new_uid, gid_t new_gid, mode_t new_mode, int flags) +{ + struct stat st; + int ret; + + if (fchownat(dirfd, path, new_uid, new_gid, flags) < 0 && errno != EPERM) { + int errno_cpy = errno; + pr_perror("Unable to change [%d]/%s ownership to (%d, %d)", + dirfd, path, new_uid, new_gid); + errno = errno_cpy; + return -1; + } + + if (fstatat(dirfd, path, &st, flags) < 0) { + int errno_cpy = errno; + pr_perror("Unable to stat [%d]/%s", dirfd, path); + errno = errno_cpy; + return -1; + } + + if 
(new_uid != st.st_uid || new_gid != st.st_gid) { + errno = EPERM; + pr_perror("Unable to change [%d]/%s ownership (%d, %d) to (%d, %d)", + dirfd, path, st.st_uid, st.st_gid, new_uid, new_gid); + errno = EPERM; + return -1; + } + + if (new_mode == st.st_mode) + return 0; + + if (S_ISLNK(st.st_mode)) { + /* + * We have no lchmod() function, and fchmod() will fail on + * O_PATH | O_NOFOLLOW fd. Yes, we have fchmodat() + * function and flag AT_SYMLINK_NOFOLLOW described in + * man 2 fchmodat, but it is not currently implemented. %) + */ + return 0; + } + + if (!*path && flags & AT_EMPTY_PATH) + ret = fchmod(dirfd, new_mode); + else + ret = fchmodat(dirfd, path, new_mode, flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)); + if (ret < 0) { + int errno_cpy = errno; + pr_perror("Unable to set perms %o on [%d]/%s", new_mode, dirfd, path); + errno = errno_cpy; + } + + return ret; +} + +int cr_fchperm(int fd, uid_t new_uid, gid_t new_gid, mode_t new_mode) +{ + return cr_fchpermat(fd, "", new_uid, new_gid, new_mode, AT_EMPTY_PATH); +} + void split(char *str, char token, char ***out, int *n) { int i; From f985d9f44b3029a384532c03a22839209550a185 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 20:51:03 +0200 Subject: [PATCH 275/775] sk-unix: Avoid restore_file_perms() EPERM error for no-op changes. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Note: This removes the difference in calling convention of restore_file_perms() returning -errno that was the only call that did this in the caller. 
From: Radosław Burny Signed-off-by: Michał Mirosław --- criu/sk-unix.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/criu/sk-unix.c b/criu/sk-unix.c index fd38ee7b1..70ca16be4 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -1431,32 +1431,22 @@ err_revert_and_exit: static int restore_file_perms(struct unix_sk_info *ui) { - if (ui->ue->file_perms) { - FilePermsEntry *perms = ui->ue->file_perms; - char fname[PATH_MAX]; + FilePermsEntry *perms = ui->ue->file_perms; + char fname[PATH_MAX]; - if (ui->ue->name.len >= sizeof(fname)) { - pr_err("The file name is too long\n"); - return -E2BIG; - } + if (!perms) + return 0; - memcpy(fname, ui->name, ui->ue->name.len); - fname[ui->ue->name.len] = '\0'; - - if (fchownat(AT_FDCWD, fname, perms->uid, perms->gid, 0) < 0) { - int errno_cpy = errno; - pr_perror("Unable to change file owner and group"); - return -errno_cpy; - } - - if (fchmodat(AT_FDCWD, fname, perms->mode, 0) < 0) { - int errno_cpy = errno; - pr_perror("Unable to change file mode bits"); - return -errno_cpy; - } + if (ui->ue->name.len >= sizeof(fname)) { + pr_err("The file name is too long\n"); + errno = -E2BIG; + return -1; } - return 0; + memcpy(fname, ui->name, ui->ue->name.len); + fname[ui->ue->name.len] = '\0'; + + return cr_fchpermat(AT_FDCWD, fname, perms->uid, perms->gid, perms->mode, 0); } static int keep_deleted(struct unix_sk_info *ui) From b074f92f99271c9a45878f1ed4288759068a1564 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 20:58:00 +0200 Subject: [PATCH 276/775] files-reg: Avoid EPERM in ghost_apply_metadata() for no-op changes. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/files-reg.c | 48 +++++++++++++----------------------------------- 1 file changed, 13 insertions(+), 35 deletions(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index 512097716..50dcbc438 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -407,46 +407,24 @@ static int mklnk_ghost(char *path, GhostFileEntry *gfe) static int ghost_apply_metadata(const char *path, GhostFileEntry *gfe) { struct timeval tv[2]; - int ret = -1; - if (S_ISLNK(gfe->mode)) { - if (lchown(path, gfe->uid, gfe->gid) < 0) { - pr_perror("Can't reset user/group on ghost %s", path); - goto err; - } + if (cr_fchpermat(AT_FDCWD, path, gfe->uid, gfe->gid, gfe->mode, AT_SYMLINK_NOFOLLOW) < 0) + return -1; - /* - * We have no lchmod() function, and fchmod() will fail on - * O_PATH | O_NOFOLLOW fd. Yes, we have fchmodat() - * function and flag AT_SYMLINK_NOFOLLOW described in - * man 2 fchmodat, but it is not currently implemented. 
%) - */ - } else { - if (chown(path, gfe->uid, gfe->gid) < 0) { - pr_perror("Can't reset user/group on ghost %s", path); - goto err; - } + if (!gfe->atim) + return 0; - if (chmod(path, gfe->mode)) { - pr_perror("Can't set perms %o on ghost %s", gfe->mode, path); - goto err; - } + tv[0].tv_sec = gfe->atim->tv_sec; + tv[0].tv_usec = gfe->atim->tv_usec; + tv[1].tv_sec = gfe->mtim->tv_sec; + tv[1].tv_usec = gfe->mtim->tv_usec; + + if (lutimes(path, tv)) { + pr_perror("Can't set access and modification times on ghost %s", path); + return -1; } - if (gfe->atim) { - tv[0].tv_sec = gfe->atim->tv_sec; - tv[0].tv_usec = gfe->atim->tv_usec; - tv[1].tv_sec = gfe->mtim->tv_sec; - tv[1].tv_usec = gfe->mtim->tv_usec; - if (lutimes(path, tv)) { - pr_perror("Can't set access and modification times on ghost %s", path); - goto err; - } - } - - ret = 0; -err: - return ret; + return 0; } static int create_ghost_dentry(char *path, GhostFileEntry *gfe, struct cr_img *img) From 96fa42b79dbbfa1b72a955c055121f15f053bc4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 21:01:29 +0200 Subject: [PATCH 277/775] cgroup: Replace restore_perms() with cr_fchperm(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/cgroup.c | 35 ++++------------------------------- 1 file changed, 4 insertions(+), 31 deletions(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index 267a5b6b4..67282f269 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -1337,34 +1337,6 @@ void fini_cgroup(void) cg_yard = NULL; } -static int restore_perms(int fd, const char *path, CgroupPerms *perms) -{ - struct stat sb; - - if (perms) { - if (fstat(fd, &sb) < 0) { - pr_perror("stat of property %s failed", path); - return -1; - } - - /* only chmod/chown if the perms are actually different: we aren't - * allowed to chmod some cgroup props (e.g. 
the read only ones), so we - * don't want to try if the perms already match. - */ - if (sb.st_mode != (mode_t)perms->mode && fchmod(fd, perms->mode) < 0) { - pr_perror("chmod of %s failed", path); - return -1; - } - - if ((sb.st_uid != perms->uid || sb.st_gid != perms->gid) && fchown(fd, perms->uid, perms->gid)) { - pr_perror("chown of %s failed", path); - return -1; - } - } - - return 0; -} - static int add_subtree_control_prop_prefix(char *input, char *output, char prefix) { char *current, *next; @@ -1462,7 +1434,7 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *pat return -1; } - if (restore_perms(fd, path, perms) < 0) + if (perms && cr_fchperm(fd, perms->uid, perms->gid, perms->mode) < 0) goto out; /* skip these two since restoring their values doesn't make sense */ @@ -1786,7 +1758,7 @@ static int restore_special_props(char *paux, size_t off, CgroupDirEntry *e) static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms) { - int fd, ret; + int fd, ret = 0; fd = openat(cg, path, O_DIRECTORY); if (fd < 0) { @@ -1794,7 +1766,8 @@ static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms) return -1; } - ret = restore_perms(fd, path, perms); + if (perms) + ret = cr_fchperm(fd, perms->uid, perms->gid, perms->mode); close(fd); return ret; } From 113957270bc1506435946e49023763019120c5f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 21:02:43 +0200 Subject: [PATCH 278/775] memfd: Avoid EPERM for no-op chown(). 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/memfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/memfd.c b/criu/memfd.c index 6a43dece6..1b4278a7d 100644 --- a/criu/memfd.c +++ b/criu/memfd.c @@ -279,7 +279,7 @@ static int memfd_open_inode_nocache(struct memfd_restore_inode *inode) if (restore_memfd_shmem_content(fd, mie->shmid, mie->size)) goto out; - if (fchown(fd, mie->uid, mie->gid)) { + if (cr_fchown(fd, mie->uid, mie->gid)) { pr_perror("Can't change uid %d gid %d of memfd:%s", (int)mie->uid, (int)mie->gid, mie->name); goto out; } From 99188cfbe33d091bccdc0160627bf35c2e126782 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 21:03:02 +0200 Subject: [PATCH 279/775] tty: Avoid EPERM for no-op chown(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/tty.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/criu/tty.c b/criu/tty.c index 199984ec0..9faf602f2 100644 --- a/criu/tty.c +++ b/criu/tty.c @@ -22,6 +22,7 @@ #include "rst-malloc.h" #include "log.h" #include "common/list.h" +#include "util.h" #include "util-pie.h" #include "proc_parse.h" #include "file-ids.h" @@ -867,7 +868,7 @@ static int restore_tty_params(int fd, struct tty_info *info) } if (info->tie->has_uid && info->tie->has_gid) { - if (fchown(fd, info->tie->uid, info->tie->gid)) { + if (cr_fchown(fd, info->tie->uid, info->tie->gid)) { pr_perror("Can't setup uid %d gid %d on %#x", (int)info->tie->uid, (int)info->tie->gid, info->tfe->id); return -1; From 4998b724ef9df3771803c65ace4f8a071d625906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Wed, 21 Jun 2023 22:42:09 +0200 Subject: [PATCH 280/775] restore: Avoid need for CAP_SETPCAP if not changing uids. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CRIU is run with the task's credentials on restore, don't set uids and gids. This avoids the need to modify the SECURE_NO_SETUID_FIXUP flag which requires CAP_SETPCAP. From: Andy Tucker Signed-off-by: Michał Mirosław --- criu/pie/restorer.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 0d1360c52..9d1facf8a 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -191,10 +191,8 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ int b, i, ret; struct cap_header hdr; struct cap_data data[_LINUX_CAPABILITY_U32S_3]; - - /* - * We're still root here and thus can do it without failures. - */ + int ruid, euid, suid, fsuid; + int rgid, egid, sgid, fsgid; /* * Setup supplementary group IDs early. @@ -207,6 +205,18 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ } } + /* + * Compare xids with current values. If all match then we can skip + * setting them (which requires extra capabilities). + */ + fsuid = sys_setfsuid(-1); + fsgid = sys_setfsgid(-1); + if (sys_getresuid(&ruid, &euid, &suid) == 0 && sys_getresgid(&rgid, &egid, &sgid) == 0 && ruid == ce->uid && + euid == ce->euid && suid == ce->suid && rgid == ce->gid && egid == ce->egid && sgid == ce->sgid && + fsuid == ce->fsuid && fsgid == ce->fsgid) { + goto skip_xids; + } + /* * First -- set the SECURE_NO_SETUID_FIXUP bit not to * lose caps bits when changing xids. @@ -250,12 +260,13 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ return -1; } +skip_xids: /* * Third -- restore securebits. We don't need them in any * special state any longer. 
*/ - if (!uid) { + if (sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0) != ce->secbits) { ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); if (ret) { pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); From 7ab02639f6540920f086e9c45a99e68e391a8904 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Wed, 21 Jun 2023 22:42:44 +0200 Subject: [PATCH 281/775] restore: Skip setgroups() when already correct. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Skip calling setgroups() when the list of auxiliary groups already has the values we want. This allows restoring into an unprivileged user namespace where setgroups() is disabled. From: Ambrose Feinstein Signed-off-by: Michał Mirosław --- criu/pie/restorer.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 9d1facf8a..a0f3eb90b 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -51,6 +51,11 @@ #include "shmem.h" #include "restorer.h" +/* + * sys_getgroups() buffer size. Not too much, to avoid stack overflow. + */ +#define MAX_GETGROUPS_CHECKED (512 / sizeof(unsigned int)) + #ifndef PR_SET_PDEATHSIG #define PR_SET_PDEATHSIG 1 #endif @@ -198,10 +203,19 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ * Setup supplementary group IDs early. */ if (args->groups) { - ret = sys_setgroups(ce->n_groups, args->groups); - if (ret) { - pr_err("Can't setup supplementary group IDs: %d\n", ret); - return -1; + /* + * We may be in an unprivileged user namespace where setgroups + * is disabled. If the current list of groups is already what + * we want, skip the call to setgroups. 
+ */ + unsigned int gids[MAX_GETGROUPS_CHECKED]; + int n = sys_getgroups(MAX_GETGROUPS_CHECKED, gids); + if (n != ce->n_groups || memcmp(gids, args->groups, n * sizeof(*gids))) { + ret = sys_setgroups(ce->n_groups, args->groups); + if (ret) { + pr_err("Can't setgroups([%zu gids]): %d\n", ce->n_groups, ret); + return -1; + } } } From 7df3f659572b428a5fac867c6abc64b08286de85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 21 Jun 2023 14:42:48 +0200 Subject: [PATCH 282/775] restore: Fix capability migration requirements between different kernels. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When restoring on a kernel that has different number of supported capabilities than checkpoint one, check that the extra caps are unset. There are two directions to consider: 1) dump.cap_last_cap > restore.cap_last_cap - restoring might reduce the processes' capabilities if restored kernel doesn't support checkpointed caps. Warn. 2) dump.cap_last_cap < restore.cap_last_cap - restoring will fill the extra caps with zeroes. No changes. Note: `last_cap` might change without affecting `n_words`. 
Signed-off-by: Michał Mirosław --- criu/cr-restore.c | 40 ++++++++++++++++++++++++---------------- criu/kerndat.c | 9 ++++++++- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index bff41dc56..9107a2322 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2940,12 +2940,6 @@ out: return ret; } -static inline int verify_cap_size(CredsEntry *ce) -{ - return ((ce->n_cap_inh == CR_CAP_SIZE) && (ce->n_cap_eff == CR_CAP_SIZE) && (ce->n_cap_prm == CR_CAP_SIZE) && - (ce->n_cap_bnd == CR_CAP_SIZE)); -} - static int prepare_mm(pid_t pid, struct task_restore_args *args) { int exe_fd, i, ret = -1; @@ -3360,17 +3354,31 @@ static bool groups_match(gid_t *groups, int n_groups) return ret; } +static void copy_caps(u32 *out_caps, u32 *in_caps, int n_words) +{ + int i, cap_end; + + for (i = kdat.last_cap + 1; i < 32 * n_words; ++i) { + if (~in_caps[i / 32] & (1 << (i % 32))) + continue; + + pr_warn("Dropping unsupported capability %d > %d)\n", i, kdat.last_cap); + /* extra caps will be cleared below */ + } + + n_words = min(n_words, (kdat.last_cap + 31) / 32); + cap_end = (kdat.last_cap & 31) + 1; + memcpy(out_caps, in_caps, sizeof(*out_caps) * n_words); + if ((cap_end & 31) && n_words) + out_caps[n_words - 1] &= (1 << cap_end) - 1; + memset(out_caps + n_words, 0, sizeof(*out_caps) * (CR_CAP_SIZE - n_words)); +} + static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos) { unsigned long this_pos; struct thread_creds_args *args; - if (!verify_cap_size(ce)) { - pr_err("Caps size mismatch %d %d %d %d\n", (int)ce->n_cap_inh, (int)ce->n_cap_eff, (int)ce->n_cap_prm, - (int)ce->n_cap_bnd); - return ERR_PTR(-EINVAL); - } - this_pos = rst_mem_align_cpos(RM_PRIVATE); args = rst_mem_alloc(sizeof(*args), RM_PRIVATE); @@ -3458,10 +3466,10 @@ static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo args->creds.groups = NULL; args->creds.lsm_profile = NULL; - memcpy(args->cap_inh, 
ce->cap_inh, sizeof(args->cap_inh)); - memcpy(args->cap_eff, ce->cap_eff, sizeof(args->cap_eff)); - memcpy(args->cap_prm, ce->cap_prm, sizeof(args->cap_prm)); - memcpy(args->cap_bnd, ce->cap_bnd, sizeof(args->cap_bnd)); + copy_caps(args->cap_inh, ce->cap_inh, ce->n_cap_inh); + copy_caps(args->cap_eff, ce->cap_eff, ce->n_cap_eff); + copy_caps(args->cap_prm, ce->cap_prm, ce->n_cap_prm); + copy_caps(args->cap_bnd, ce->cap_bnd, ce->n_cap_bnd); if (ce->n_groups && !groups_match(ce->groups, ce->n_groups)) { unsigned int *groups; diff --git a/criu/kerndat.c b/criu/kerndat.c index 4b836b5f7..bd1ccdc7d 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -465,8 +465,15 @@ static int get_last_cap(void) struct sysctl_req req[] = { { "kernel/cap_last_cap", &kdat.last_cap, CTL_U32 }, }; + int ret; - return sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0); + ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0); + if (ret || kdat.last_cap < 32 * CR_CAP_SIZE) + return ret; + + pr_err("Kernel reports more capabilities than this CRIU supports: %u > %u\n", + kdat.last_cap, 32 * CR_CAP_SIZE - 1); + return -1; } static bool kerndat_has_memfd_create(void) From a605cc9f36e0d3ac6fe3f7c9d5ce8090a466b067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 22 Jun 2023 18:20:24 +0200 Subject: [PATCH 283/775] prctl: Migrate prctl(NO_NEW_PRIVS) setting. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/include/parasite.h | 1 + criu/include/prctl.h | 6 ++++++ criu/parasite-syscall.c | 4 ++++ criu/pie/parasite.c | 1 + criu/pie/restorer.c | 8 ++++++++ images/creds.proto | 1 + 6 files changed, 21 insertions(+) diff --git a/criu/include/parasite.h b/criu/include/parasite.h index 787c927be..739fbf2c3 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -148,6 +148,7 @@ struct parasite_dump_creds { int uids[4]; int gids[4]; + int no_new_privs; unsigned int secbits; unsigned int ngroups; /* diff --git a/criu/include/prctl.h b/criu/include/prctl.h index c843f40a7..4c2a548b1 100644 --- a/criu/include/prctl.h +++ b/criu/include/prctl.h @@ -30,6 +30,12 @@ #ifndef PR_SET_DUMPABLE #define PR_SET_DUMPABLE 4 #endif +#ifndef PR_GET_NO_NEW_PRIVS +#define PR_GET_NO_NEW_PRIVS 39 +#endif +#ifndef PR_SET_NO_NEW_PRIVS +#define PR_SET_NO_NEW_PRIVS 38 +#endif #ifndef PR_SET_MM #define PR_SET_MM 35 diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index 35489634d..c08ed09b1 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -115,6 +115,10 @@ static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c memcpy(ce->cap_eff, c->cap_eff, sizeof(c->cap_eff[0]) * CR_CAP_SIZE); memcpy(ce->cap_bnd, c->cap_bnd, sizeof(c->cap_bnd[0]) * CR_CAP_SIZE); + if (c->no_new_privs > 0) { + ce->no_new_privs = c->no_new_privs; + ce->has_no_new_privs = true; + } ce->secbits = c->secbits; ce->n_groups = c->ngroups; diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index 2303f41c3..58ea35892 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -268,6 +268,7 @@ static int dump_creds(struct parasite_dump_creds *args) } } + args->no_new_privs = sys_prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); args->secbits = sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0); ret = sys_getgroups(0, NULL); diff --git a/criu/pie/restorer.c 
b/criu/pie/restorer.c index a0f3eb90b..c3662b30b 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -345,6 +345,14 @@ skip_xids: if (lsm_set_label(args->lsm_sockcreate, "sockcreate", procfd) < 0) return -1; + if (ce->has_no_new_privs && ce->no_new_privs) { + ret = sys_prctl(PR_SET_NO_NEW_PRIVS, ce->no_new_privs, 0, 0, 0); + if (ret) { + pr_err("Unable to set no_new_privs=%d: %d\n", ce->no_new_privs, ret); + return -1; + } + } + return 0; } diff --git a/images/creds.proto b/images/creds.proto index 6228f7fcb..220ed3858 100644 --- a/images/creds.proto +++ b/images/creds.proto @@ -24,4 +24,5 @@ message creds_entry { optional string lsm_profile = 15; optional string lsm_sockcreate = 16; optional bytes apparmor_data = 17; + optional uint32 no_new_privs = 18; } From fe4be19de432fb3ff8cd2a41f4ba788c3c4e6ea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Wed, 19 Jul 2023 18:57:09 +0200 Subject: [PATCH 284/775] prctl: test prctl(NO_NEW_PRIVS) setting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- test/zdtm/static/Makefile | 1 + test/zdtm/static/seccomp_no_new_privs.c | 42 +++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 test/zdtm/static/seccomp_no_new_privs.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 4b3d2e341..30429e425 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -215,6 +215,7 @@ TST_NOFILE := \ seccomp_filter_tsync \ seccomp_filter_threads \ seccomp_filter_inheritance \ + seccomp_no_new_privs \ different_creds \ vsx \ bridge \ diff --git a/test/zdtm/static/seccomp_no_new_privs.c b/test/zdtm/static/seccomp_no_new_privs.c new file mode 100644 index 000000000..95f9501ed --- /dev/null +++ b/test/zdtm/static/seccomp_no_new_privs.c @@ -0,0 +1,42 @@ +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that NO_NEW_PRIVS attribute is restored"; 
+const char *test_author = "Michał Mirosław "; + +int main(int argc, char **argv) +{ + int ret; + + test_init(argc, argv); + + ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Can't read NO_NEW_PRIVS attribute"); + return 1; + } + if (ret != 0) + fail("initial NO_NEW_PRIVS = %d != 0", ret); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + if (ret) { + pr_perror("Can't set NO_NEW_PRIVS attribute"); + return 1; + } + + test_daemon(); + test_waitsig(); + + ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Can't read NO_NEW_PRIVS attribute"); + return 1; + } + if (ret != 1) + fail("restored NO_NEW_PRIVS = %d != 1", ret); + + pass(); + return 0; +} From 25d0330809129222044147463172d6da0bec69fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Tue, 25 Jul 2023 17:54:26 +0200 Subject: [PATCH 285/775] restore: Skip dropping BSET capability if irrelevant. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit prctl(NO_NEW_PRIVS) when set prevents child processes gaining capabilities not in permitted set. In this case, inability to clear capability from BSET that is not in the permitted set is harmless. Signed-off-by: Michał Mirosław --- criu/pie/restorer.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index c3662b30b..d4f77bfde 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -301,10 +301,18 @@ skip_xids: /* already set */ continue; ret = sys_prctl(PR_CAPBSET_DROP, i + b * 32, 0, 0, 0); - if (ret) { + if (!ret) + continue; + if (!ce->has_no_new_privs || !ce->no_new_privs || args->cap_prm[b] & (1 << i)) { pr_err("Unable to drop capability %d: %d\n", i + b * 32, ret); return -1; } + /* + * If prctl(NO_NEW_PRIVS) is going to be set then it + * will prevent inheriting the capabilities not in + * the permitted set. 
+ */ + pr_warn("Unable to drop capability %d from bset: %d (but NO_NEW_PRIVS will drop it)\n", i + b * 32, ret); } } From ac1219f4eef81ed89560344f50f56ab61240519b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 22 May 2023 17:41:56 +0200 Subject: [PATCH 286/775] sk-inet: Extend 'TCP repair off' failure log. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include the file descriptor and error code in the debug message to make it more useful. Fixes: e7ba90955ce7 (2016-03-14 "cr-check: Inspect errno on syscall failures") Signed-off-by: Michał Mirosław --- criu/include/sk-inet.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h index 961d711ee..b3a70fb27 100644 --- a/criu/include/sk-inet.h +++ b/criu/include/sk-inet.h @@ -69,6 +69,7 @@ extern int inet_connect(int sk, struct inet_sk_info *); #ifdef CR_NOGLIBC #define setsockopt sys_setsockopt +#define pr_perror(fmt, ...) pr_err(fmt ": errno %d\n", ##__VA_ARGS__, -ret) #endif static inline void tcp_repair_off(int fd) { @@ -76,7 +77,7 @@ static inline void tcp_repair_off(int fd) ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux)); if (ret < 0) - pr_err("Failed to turn off repair mode on socket\n"); + pr_perror("Failed to turn off repair mode on socket %d", fd); } extern void tcp_locked_conn_add(struct inet_sk_info *); From dfa54109512ac36ea96f61c114a952e5668665a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 27 Jul 2023 21:21:41 +0200 Subject: [PATCH 287/775] memfd: dump and restore permissions. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit memfd is created by default with +x permissions set. This can be changed by a process using fchmod() and expected to prevent using this fd for exec(). Migrate the permissions. 
Signed-off-by: Michał Mirosław --- criu/memfd.c | 11 +++++++++-- images/memfd.proto | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/criu/memfd.c b/criu/memfd.c index 1b4278a7d..2158b6720 100644 --- a/criu/memfd.c +++ b/criu/memfd.c @@ -91,6 +91,8 @@ static int dump_memfd_inode(int fd, struct memfd_dump_inode *inode, const char * mie.has_hugetlb_flag = true; mie.hugetlb_flag = flag | MFD_HUGETLB; } + mie.mode = st->st_mode; + mie.has_mode = true; mie.seals = fcntl(fd, F_GET_SEALS); if (mie.seals == -1) { @@ -279,8 +281,13 @@ static int memfd_open_inode_nocache(struct memfd_restore_inode *inode) if (restore_memfd_shmem_content(fd, mie->shmid, mie->size)) goto out; - if (cr_fchown(fd, mie->uid, mie->gid)) { - pr_perror("Can't change uid %d gid %d of memfd:%s", (int)mie->uid, (int)mie->gid, mie->name); + if (mie->has_mode) + ret = cr_fchperm(fd, mie->uid, mie->gid, mie->mode); + else + ret = cr_fchown(fd, mie->uid, mie->gid); + if (ret) { + pr_perror("Can't set permissions { uid %d gid %d mode %#o } of memfd:%s", (int)mie->uid, + (int)mie->gid, mie->has_mode ? 
(int)mie->mode : -1, mie->name); goto out; } diff --git a/images/memfd.proto b/images/memfd.proto index 0e625416a..bb0be4a6f 100644 --- a/images/memfd.proto +++ b/images/memfd.proto @@ -22,4 +22,5 @@ message memfd_inode_entry { required uint32 seals = 6 [(criu).flags = "seals.flags"]; required uint64 inode_id = 7; optional uint32 hugetlb_flag = 8; + optional uint32 mode = 9; }; From 3628589b51c05d783fb37c05fae6d62e17347caa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 27 Jul 2023 21:30:26 +0200 Subject: [PATCH 288/775] zdtm/memfd00: test memfd file mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- test/zdtm/static/memfd00.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/test/zdtm/static/memfd00.c b/test/zdtm/static/memfd00.c index d037f6969..8d77ed06e 100644 --- a/test/zdtm/static/memfd00.c +++ b/test/zdtm/static/memfd00.c @@ -30,8 +30,10 @@ int main(int argc, char *argv[]) { int fd, fl_flags1, fl_flags2, fd_flags1, fd_flags2; struct statfs statfs1, statfs2; + struct stat stat; off_t pos1, pos2; char buf[5]; + int fmode1, fmode2; test_init(argc, argv); @@ -58,6 +60,13 @@ int main(int argc, char *argv[]) if (lseek(fd, pos1, SEEK_SET) < 0) err(1, "seek error"); + if (fchmod(fd, 0642)) + err(1, "Can't set permission bits"); + + if (fstat(fd, &stat) < 0) + err(1, "fstat() issue"); + fmode1 = stat.st_mode; + test_daemon(); test_waitsig(); @@ -85,6 +94,15 @@ int main(int argc, char *argv[]) return 1; } + if (fstat(fd, &stat) < 0) + err(1, "fstat() issue"); + fmode2 = stat.st_mode; + + if (fmode1 != fmode2) { + fail("stat.st_mode = %#o != %#o", fmode2, fmode1); + return 1; + } + pos2 = lseek(fd, 0, SEEK_CUR); if (pos1 != pos2) { fail("position differs"); From 6ed50ea49d1ed234178f4df86ca6227252350e6b Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 23:00:07 -0700 Subject: [PATCH 289/775] apparmor: fix incorrect usage 
of sizeof on char ptr In criu/apparmor.c: write_aa_policy(), the arg path is passed as a char pointer. The original code used sizeof(path) to get the size of it, which is incorrect as it always returns the size of the char pointer (typically 8 or 4), not the actual capacity of the char array. Given that this function is only invoked with path declared as `char path[PATH_MAX]`, replacing sizeof(path) with PATH_MAX should correctly represent the maximum size of it. Fixes: 8723e3f ("check: add a feature test for apparmor_stacking") Signed-off-by: Haorong Lu --- criu/apparmor.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/apparmor.c b/criu/apparmor.c index 9de54ce40..5b62759e2 100644 --- a/criu/apparmor.c +++ b/criu/apparmor.c @@ -551,8 +551,8 @@ static int write_aa_policy(AaNamespace *ns, char *path, int offset, char *rewrit goto fail; } - ret = snprintf(path + offset + my_offset, sizeof(path) - offset - my_offset, "/.replace"); - if (ret < 0 || ret >= sizeof(path) - offset - my_offset) { + ret = snprintf(path + offset + my_offset, PATH_MAX - offset - my_offset, "/.replace"); + if (ret < 0 || ret >= PATH_MAX - offset - my_offset) { pr_err("snprintf failed\n"); goto fail; } From 2a131167bbc671a274f3ddeaf3b153fba6400e49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Mon, 31 Jul 2023 20:49:05 +0200 Subject: [PATCH 290/775] page-xfer: Pull tcp_cork,nodelay(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move tcp_cork() and tcp_nodelay() to the only user: page-xfer.c. While at it, fix error messages (as they do not refer to restoring the sockopt values) and demote them as they are not fatal to the page transfer. 
Signed-off-by: Michał Mirosław --- criu/include/util.h | 2 -- criu/page-xfer.c | 15 +++++++++++++++ criu/util.c | 15 --------------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/criu/include/util.h b/criu/include/util.h index 7e4a13a6a..4334e69c2 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -278,8 +278,6 @@ static inline int sk_wait_data(int sk) } void fd_set_nonblocking(int fd, bool on); -void tcp_nodelay(int sk, bool on); -void tcp_cork(int sk, bool on); const char *ns_to_string(unsigned int ns); diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 782d4cafc..94f477414 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -157,6 +158,20 @@ static inline int send_psi(int sk, struct page_server_iov *pi) return send_psi_flags(sk, pi, 0); } +static void tcp_cork(int sk, bool on) +{ + int val = on ? 1 : 0; + if (setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val))) + pr_pwarn("Unable to set TCP_CORK=%d", val); +} + +static void tcp_nodelay(int sk, bool on) +{ + int val = on ? 1 : 0; + if (setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val))) + pr_pwarn("Unable to set TCP_NODELAY=%d", val); +} + /* page-server xfer */ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long len) { diff --git a/criu/util.c b/criu/util.c index bca7ad88a..993ab97bb 100644 --- a/criu/util.c +++ b/criu/util.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -1155,20 +1154,6 @@ const char *ns_to_string(unsigned int ns) } } -void tcp_cork(int sk, bool on) -{ - int val = on ? 1 : 0; - if (setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val))) - pr_pwarn("Unable to restore TCP_CORK (%d)", val); -} - -void tcp_nodelay(int sk, bool on) -{ - int val = on ? 
1 : 0; - if (setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val))) - pr_pwarn("Unable to restore TCP_NODELAY (%d)", val); -} - static int get_sockaddr_in(struct sockaddr_storage *addr, char *host, unsigned short port) { memset(addr, 0, sizeof(*addr)); From 69200bec7608d40de127fdc3cb0977ce552f83a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Tue, 1 Aug 2023 22:39:59 +0200 Subject: [PATCH 291/775] irmap: scan user-provided paths in order MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the scan use the order of paths that came from the user. Fixes: 4f2e4ab3be01 ("irmap: add --irmap-scan-path option"; 2015-09-16) Signed-off-by: Michał Mirosław --- criu/irmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/irmap.c b/criu/irmap.c index 2cdc66071..e12df5cb5 100644 --- a/criu/irmap.c +++ b/criu/irmap.c @@ -501,6 +501,6 @@ int irmap_scan_path_add(char *path) o->ir->path = path; o->ir->nr_kids = -1; - list_add(&o->node, &opts.irmap_scan_paths); + list_add_tail(&o->node, &opts.irmap_scan_paths); return 0; } From a4b49c46fe0f9d097840718e20efb54178162556 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 7 Aug 2023 19:28:07 +0100 Subject: [PATCH 292/775] amdgpu_plugin: remove duplicated log prefix The log prefix "amdgpu_plugin:" is defined with `LOG_PREFIX` in `amdgpu_plugin.c`. However, the prefix is also included in each log message. 
As a result it appears duplicated in the log messages: (00.044324) amdgpu_plugin: amdgpu_plugin: devices:1 bos:58 objects:148 priv_data:45696 (00.045376) amdgpu_plugin: amdgpu_plugin: Thread[0x5589] started (00.167172) amdgpu_plugin: amdgpu_plugin: img_path = amdgpu-kfd-62.img (00.083739) amdgpu_plugin: amdgpu_plugin : amdgpu_plugin_dump_file() called for fd = 235 Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 96 +++++++++++++++++----------------- 1 file changed, 47 insertions(+), 49 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 0a55e34a2..6397ecdb7 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -451,7 +451,7 @@ void getenv_bool(const char *var, bool *value) int amdgpu_plugin_init(int stage) { - pr_info("amdgpu_plugin: initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); + pr_info("initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); topology_init(&src_topology); topology_init(&dest_topology); @@ -481,7 +481,7 @@ int amdgpu_plugin_init(int stage) void amdgpu_plugin_fini(int stage, int ret) { - pr_info("amdgpu_plugin: finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); + pr_info("finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); if (stage == CR_PLUGIN_STAGE__RESTORE) sys_close_drm_render_devices(&dest_topology); @@ -513,7 +513,7 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) char img_path[128]; int ret = 0; - pr_debug("amdgpu_plugin: Enter %s\n", __func__); + pr_debug("Enter %s\n", __func__); ret = stat(AMDGPU_KFD_DEVICE, &st_kfd); if (ret == -1) { pr_perror("stat error for /dev/kfd"); @@ -539,7 +539,7 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) return 0; } - pr_perror("amdgpu_plugin: Can't handle the VMA mapping"); + pr_perror("Can't handle the VMA mapping"); return -ENOTSUP; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma) @@ -857,7 +857,7 @@ void 
*dump_bo_contents(void *_thread_data) void *buffer; char img_path[40]; - pr_info("amdgpu_plugin: Thread[0x%x] started\n", thread_data->gpu_id); + pr_info("Thread[0x%x] started\n", thread_data->gpu_id); ret = amdgpu_device_initialize(thread_data->drm_fd, &major, &minor, &h_dev); if (ret) { @@ -922,7 +922,7 @@ void *dump_bo_contents(void *_thread_data) } exit: - pr_info("amdgpu_plugin: Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); + pr_info("Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); if (bo_contents_fp) fclose(bo_contents_fp); @@ -951,7 +951,7 @@ void *restore_bo_contents(void *_thread_data) int num_bos = 0; int i, ret = 0; - pr_info("amdgpu_plugin: Thread[0x%x] started\n", thread_data->gpu_id); + pr_info("Thread[0x%x] started\n", thread_data->gpu_id); ret = amdgpu_device_initialize(thread_data->drm_fd, &major, &minor, &h_dev); if (ret) { @@ -989,8 +989,7 @@ void *restore_bo_contents(void *_thread_data) } if (total_bo_size != image_size) { - pr_err("amdgpu_plugin: %s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, - total_bo_size); + pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, total_bo_size); ret = -EINVAL; goto exit; @@ -1026,7 +1025,7 @@ void *restore_bo_contents(void *_thread_data) } exit: - pr_info("amdgpu_plugin: Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); + pr_info("Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); if (bo_contents_fp) fclose(bo_contents_fp); @@ -1054,9 +1053,9 @@ int check_hsakmt_shared_mem(uint64_t *shared_mem_size, uint32_t *shared_mem_magi /* First 4 bytes of shared file is the magic */ ret = read_file(HSAKMT_SHM_PATH, shared_mem_magic, sizeof(*shared_mem_magic)); if (ret) - pr_perror("amdgpu_plugin: Failed to read shared mem magic"); + pr_perror("Failed to read shared mem magic"); else - plugin_log_msg("amdgpu_plugin: Shared mem magic:0x%x\n", *shared_mem_magic); + 
plugin_log_msg("Shared mem magic:0x%x\n", *shared_mem_magic); return 0; } @@ -1071,7 +1070,7 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha return 0; if (!stat(HSAKMT_SHM_PATH, &st)) { - pr_debug("amdgpu_plugin: %s already exists\n", HSAKMT_SHM_PATH); + pr_debug("%s already exists\n", HSAKMT_SHM_PATH); } else { pr_info("Warning:%s was missing. Re-creating new file but we may lose perf counters\n", HSAKMT_SHM_PATH); @@ -1079,14 +1078,14 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha ret = ftruncate(fd, shared_mem_size); if (ret < 0) { - pr_err("amdgpu_plugin: Failed to truncate shared mem %s\n", HSAKMT_SHM); + pr_err("Failed to truncate shared mem %s\n", HSAKMT_SHM); close(fd); return -errno; } ret = write(fd, &shared_mem_magic, sizeof(shared_mem_magic)); if (ret != sizeof(shared_mem_magic)) { - pr_perror("amdgpu_plugin: Failed to restore shared mem magic"); + pr_perror("Failed to restore shared mem magic"); close(fd); return -errno; } @@ -1112,7 +1111,7 @@ static int unpause_process(int fd) ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args); if (ret) { - pr_perror("amdgpu_plugin: Failed to unpause process"); + pr_perror("Failed to unpause process"); goto exit; } @@ -1254,7 +1253,7 @@ bool kernel_supports_criu(int fd) } if (kmtIoctl(fd, AMDKFD_IOC_GET_VERSION, &args) == -1) { - pr_perror("amdgpu_plugin: Failed to call get version ioctl"); + pr_perror("Failed to call get version ioctl"); ret = false; goto exit; } @@ -1262,8 +1261,8 @@ bool kernel_supports_criu(int fd) pr_debug("Kernel IOCTL version:%d.%02d\n", args.major_version, args.minor_version); if (args.major_version != KFD_IOCTL_MAJOR_VERSION || args.minor_version < MIN_KFD_IOCTL_MINOR_VERSION) { - pr_err("amdgpu_plugin: CR not supported on current kernel (current:%02d.%02d min:%02d.%02d)\n", - args.major_version, args.minor_version, KFD_IOCTL_MAJOR_VERSION, MIN_KFD_IOCTL_MINOR_VERSION); + pr_err("CR not supported on current kernel 
(current:%02d.%02d min:%02d.%02d)\n", args.major_version, + args.minor_version, KFD_IOCTL_MAJOR_VERSION, MIN_KFD_IOCTL_MINOR_VERSION); ret = false; goto exit; } @@ -1286,13 +1285,13 @@ int amdgpu_plugin_dump_file(int fd, int id) size_t len; if (fstat(fd, &st) == -1) { - pr_perror("amdgpu_plugin: fstat error"); + pr_perror("fstat error"); return -1; } ret = stat(AMDGPU_KFD_DEVICE, &st_kfd); if (ret == -1) { - pr_perror("amdgpu_plugin: fstat error for /dev/kfd"); + pr_perror("fstat error for /dev/kfd"); return -1; } @@ -1317,12 +1316,11 @@ int amdgpu_plugin_dump_file(int fd, int id) CriuRenderNode rd = CRIU_RENDER_NODE__INIT; struct tp_node *tp_node; - pr_info("amdgpu_plugin: Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev), - fd, id); + pr_info("Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev), fd, id); tp_node = sys_get_node_by_render_minor(&src_topology, minor(st.st_rdev)); if (!tp_node) { - pr_err("amdgpu_plugin: Failed to find a device with minor number = %d\n", minor(st.st_rdev)); + pr_err("Failed to find a device with minor number = %d\n", minor(st.st_rdev)); return -ENODEV; } @@ -1350,7 +1348,7 @@ int amdgpu_plugin_dump_file(int fd, int id) return ret; } - pr_info("amdgpu_plugin: %s : %s() called for fd = %d\n", CR_PLUGIN_DESC.name, __func__, major(st.st_rdev)); + pr_info("%s() called for fd = %d\n", __func__, major(st.st_rdev)); /* KFD only allows ioctl calls from the same process that opened the KFD file descriptor. 
* The existing /dev/kfd file descriptor that is passed in is only allowed to do IOCTL calls with @@ -1362,13 +1360,13 @@ int amdgpu_plugin_dump_file(int fd, int id) args.op = KFD_CRIU_OP_PROCESS_INFO; if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { - pr_perror("amdgpu_plugin: Failed to call process info ioctl"); + pr_perror("Failed to call process info ioctl"); ret = -1; goto exit; } - pr_info("amdgpu_plugin: devices:%d bos:%d objects:%d priv_data:%lld\n", args.num_devices, args.num_bos, - args.num_objects, args.priv_data_size); + pr_info("devices:%d bos:%d objects:%d priv_data:%lld\n", args.num_devices, args.num_bos, args.num_objects, + args.priv_data_size); e = xmalloc(sizeof(*e)); if (!e) { @@ -1401,7 +1399,7 @@ int amdgpu_plugin_dump_file(int fd, int id) args.op = KFD_CRIU_OP_CHECKPOINT; ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args); if (ret) { - pr_perror("amdgpu_plugin: Failed to call dumper (process) ioctl"); + pr_perror("Failed to call dumper (process) ioctl"); goto exit; } @@ -1423,11 +1421,11 @@ int amdgpu_plugin_dump_file(int fd, int id) goto exit; snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id); - pr_info("amdgpu_plugin: img_path = %s\n", img_path); + pr_info("img_path = %s\n", img_path); len = criu_kfd__get_packed_size(e); - pr_info("amdgpu_plugin: Len = %ld\n", len); + pr_info("Len = %ld\n", len); buf = xmalloc(len); if (!buf) { @@ -1453,9 +1451,9 @@ exit: free_e(e); if (ret) - pr_err("amdgpu_plugin: Failed to dump (ret:%d)\n", ret); + pr_err("Failed to dump (ret:%d)\n", ret); else - pr_info("amdgpu_plugin: Dump successful\n"); + pr_info("Dump successful\n"); return ret; } @@ -1501,10 +1499,10 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e) device_bucket->drm_fd = node_get_drm_render_device(tp_node); if (device_bucket->drm_fd < 0) { - pr_perror("amdgpu_plugin: Can't pass NULL drm render fd to driver"); + pr_perror("Can't pass NULL drm render fd to driver"); goto exit; } else { - pr_info("amdgpu_plugin: passing 
drm render fd = %d to driver\n", device_bucket->drm_fd); + pr_info("passing drm render fd = %d to driver\n", device_bucket->drm_fd); } } @@ -1588,7 +1586,7 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf vma_md->new_pgoff = bo_bucket->restored_offset; vma_md->fd = node_get_drm_render_device(tp_node); - plugin_log_msg("amdgpu_plugin: adding vma_entry:addr:0x%lx old-off:0x%lx " + plugin_log_msg("adding vma_entry:addr:0x%lx old-off:0x%lx " "new_off:0x%lx new_minor:%d\n", vma_md->vma_entry, vma_md->old_pgoff, vma_md->new_pgoff, vma_md->new_minor); @@ -1669,7 +1667,7 @@ int amdgpu_plugin_restore_file(int id) size_t img_size; FILE *img_fp = NULL; - pr_info("amdgpu_plugin: Initialized kfd plugin restorer with ID = %d\n", id); + pr_info("Initialized kfd plugin restorer with ID = %d\n", id); snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id); @@ -1713,7 +1711,7 @@ int amdgpu_plugin_restore_file(int id) } fclose(img_fp); - pr_info("amdgpu_plugin: render node gpu_id = 0x%04x\n", rd->gpu_id); + pr_info("render node gpu_id = 0x%04x\n", rd->gpu_id); target_gpu_id = maps_get_dest_gpu(&restore_maps, rd->gpu_id); if (!target_gpu_id) { @@ -1727,11 +1725,11 @@ int amdgpu_plugin_restore_file(int id) goto fail; } - pr_info("amdgpu_plugin: render node destination gpu_id = 0x%04x\n", tp_node->gpu_id); + pr_info("render node destination gpu_id = 0x%04x\n", tp_node->gpu_id); fd = node_get_drm_render_device(tp_node); if (fd < 0) - pr_err("amdgpu_plugin: Failed to open render device (minor:%d)\n", tp_node->drm_render_minor); + pr_err("Failed to open render device (minor:%d)\n", tp_node->drm_render_minor); fail: criu_render_node__free_unpacked(rd, NULL); xfree(buf); @@ -1752,7 +1750,7 @@ int amdgpu_plugin_restore_file(int id) return -1; } - pr_info("amdgpu_plugin: Opened kfd, fd = %d\n", fd); + pr_info("Opened kfd, fd = %d\n", fd); if (!kernel_supports_criu(fd)) return -ENOTSUP; @@ -1780,7 +1778,7 @@ int amdgpu_plugin_restore_file(int id) return -1; } - 
plugin_log_msg("amdgpu_plugin: read image file data\n"); + plugin_log_msg("read image file data\n"); /* * Initialize fd_next to be 1 greater than the biggest file descriptor in use by the target restore process. @@ -1847,10 +1845,10 @@ exit: xfree(buf); if (ret) { - pr_err("amdgpu_plugin: Failed to restore (ret:%d)\n", ret); + pr_err("Failed to restore (ret:%d)\n", ret); fd = ret; } else { - pr_info("amdgpu_plugin: Restore successful (fd:%d)\n", fd); + pr_info("Restore successful (fd:%d)\n", fd); } return fd; @@ -1870,7 +1868,7 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const char *p_end; bool is_kfd = false, is_renderD = false; - plugin_log_msg("amdgpu_plugin: Enter %s\n", __func__); + plugin_log_msg("Enter %s\n", __func__); strncpy(path, in_path, sizeof(path)); @@ -1908,8 +1906,8 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const else *updated_fd = -1; - plugin_log_msg("amdgpu_plugin: old_pgoff=0x%lx new_pgoff=0x%lx fd=%d\n", vma_md->old_pgoff, - vma_md->new_pgoff, *updated_fd); + plugin_log_msg("old_pgoff=0x%lx new_pgoff=0x%lx fd=%d\n", vma_md->old_pgoff, vma_md->new_pgoff, + *updated_fd); return 1; } @@ -1924,7 +1922,7 @@ int amdgpu_plugin_resume_devices_late(int target_pid) struct kfd_ioctl_criu_args args = { 0 }; int fd, ret = 0; - pr_info("amdgpu_plugin: Inside %s for target pid = %d\n", __func__, target_pid); + pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); if (fd < 0) { @@ -1934,7 +1932,7 @@ int amdgpu_plugin_resume_devices_late(int target_pid) args.pid = target_pid; args.op = KFD_CRIU_OP_RESUME; - pr_info("amdgpu_plugin: Calling IOCTL to start notifiers and queues\n"); + pr_info("Calling IOCTL to start notifiers and queues\n"); if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { pr_perror("restore late ioctl failed"); ret = -1; From 9477354defe11add445102125973cdb1ccde5c84 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: 
Mon, 7 Aug 2023 15:45:37 -0700 Subject: [PATCH 293/775] scripts/apt: don't hide apt output It is required to investigate issues. Signed-off-by: Andrei Vagin --- scripts/ci/apt-install | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/apt-install b/scripts/ci/apt-install index 45aca13f4..676e0f794 100755 --- a/scripts/ci/apt-install +++ b/scripts/ci/apt-install @@ -15,7 +15,7 @@ while true; do if [ "${install_retry_counter}" -gt "${max_apt_retries}" ]; then exit 1 fi - apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-install-recommends "$@" && break + apt-get update -y && apt-get install -y --no-install-recommends "$@" && break # In case it is a network error let's wait a bit. echo "Retrying attempt ${install_retry_counter}" From d3b955e5786f07bc4e47676f07159893b919ec64 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 7 Aug 2023 16:00:39 -0700 Subject: [PATCH 294/775] ci/docker: install all required packages This change fixes the issue: ``` The following packages have unmet dependencies: docker-ce : Depends: containerd.io (>= 1.6.4) E: Unable to correct problems, you have held broken packages. ``` Signed-off-by: Andrei Vagin --- scripts/ci/docker-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index bd46d5dd3..22d326a37 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -15,7 +15,7 @@ add-apt-repository \ $(lsb_release -cs) \ stable test" -./apt-install docker-ce +./apt-install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin # shellcheck source=/dev/null . 
/etc/lsb-release From d1096e3b31c6cd5a0eb5b0f2813ef88f23db32d8 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 13 Aug 2023 12:21:35 +0100 Subject: [PATCH 295/775] lib/py: add VMA_AREA_MEMFD constant The VMA_AREA_MEMFD constant was introduced with commit 29a1a88bcebaf9d83591077d2bec424da82c0e71 memfd: add memory mapping support This patch extends the status map used in CRIT and coredump with the value of this constant to recognize it. Signed-off-by: Radostin Stoyanov --- coredump/criu_coredump/coredump.py | 1 + lib/py/images/pb2dict.py | 1 + 2 files changed, 2 insertions(+) diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 0b8a02e0a..20ec8e5dc 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -53,6 +53,7 @@ status = { "VMA_AREA_SOCKET": 1 << 11, "VMA_AREA_VVAR": 1 << 12, "VMA_AREA_AIORING": 1 << 13, + "VMA_AREA_MEMFD": 1 << 14, "VMA_AREA_UNSUPP": 1 << 31 } diff --git a/lib/py/images/pb2dict.py b/lib/py/images/pb2dict.py index c7046429e..fe41642d5 100644 --- a/lib/py/images/pb2dict.py +++ b/lib/py/images/pb2dict.py @@ -102,6 +102,7 @@ mmap_status_map = [ ('VMA_AREA_SOCKET', 1 << 11), ('VMA_AREA_VVAR', 1 << 12), ('VMA_AREA_AIORING', 1 << 13), + ('VMA_AREA_MEMFD', 1 << 14), ('VMA_UNSUPP', 1 << 31), ] From 5b790aa18162d7791edf8fa6eac65ed7da0a285b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 21 Aug 2023 08:10:37 -0700 Subject: [PATCH 296/775] loongarch64: reformat syscall_64.tbl for 8-wide tabs Signed-off-by: Andrei Vagin --- .../plugins/std/syscalls/syscall_64.tbl | 228 +++++++++--------- 1 file changed, 114 insertions(+), 114 deletions(-) diff --git a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl index b37a22674..a0ad0cef4 100644 --- a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl @@ -5,117 +5,117 @@ # # 
__NR_name code name arguments # ------------------------------------------------------------------------------------------------------------------------------------------------------------- -__NR_io_setup 0 sys_io_setup (unsigned nr_events, aio_context_t *ctx) -__NR_io_submit 2 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) -__NR_io_getevents 4 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) -__NR_fcntl 25 sys_fcntl (int fd, int type, long arg) -__NR_ioctl 29 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) -__NR_flock 32 sys_flock (int fd, unsigned long cmd) -__NR_mkdirat 34 sys_mkdirat (int dfd, const char *pathname, int flag) -__NR_unlinkat 35 sys_unlinkat (int dfd, const char *pathname, int flag) -__NR_umount2 39 sys_umount2 (char *name, int flags) -__NR_mount 40 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) -__NR_fallocate 47 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) -__NR_close 57 sys_close (int fd) -__NR_openat 56 sys_openat (int dfd, const char *filename, int flags, int mode) -__NR_lseek 62 sys_lseek (int fd, unsigned long offset, unsigned long origin) -__NR_read 63 sys_read (int fd, void *buf, unsigned long count) -__NR_write 64 sys_write (int fd, const void *buf, unsigned long count) -__NR_pread64 67 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) -__NR_preadv 69 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) -__NR_ppoll 73 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) -__NR_signalfd4 74 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) -__NR_vmsplice 75 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) -__NR_readlinkat 78 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) 
-__NR_timerfd_settime 86 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) -__NR_capget 90 sys_capget (struct cap_header *h, struct cap_data *d) -__NR_capset 91 sys_capset (struct cap_header *h, struct cap_data *d) -__NR_personality 92 sys_personality (unsigned int personality) -__NR_exit 93 sys_exit (unsigned long error_code) -__NR_exit_group 94 sys_exit_group (int error_code) -__NR_waitid 95 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) -__NR_set_tid_address 96 sys_set_tid_address (int *tid_addr) -__NR_futex 98 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) -__NR_set_robust_list 99 sys_set_robust_list (struct robust_list_head *head, size_t len) -__NR_get_robust_list 100 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) -__NR_nanosleep 101 sys_nanosleep (struct timespec *req, struct timespec *rem) -__NR_getitimer 102 sys_getitimer (int which, const struct itimerval *val) -__NR_setitimer 103 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) -__NR_sys_timer_create 107 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) -__NR_sys_timer_gettime 108 sys_timer_gettime (int timer_id, const struct itimerspec *setting) -__NR_sys_timer_getoverrun 109 sys_timer_getoverrun (int timer_id) -__NR_sys_timer_settime 110 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) -__NR_sys_timer_delete 111 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 113 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) -__NR_sched_setscheduler 119 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) -__NR_restart_syscall 128 sys_restart_syscall (void) -__NR_kill 129 sys_kill (long pid, int sig) -__NR_sigaltstack 
132 sys_sigaltstack (const void *uss, void *uoss) -__NR_rt_sigaction 134 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) -__NR_rt_sigprocmask 135 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) -__NR_rt_sigqueueinfo 138 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) -__NR_rt_sigreturn 139 sys_rt_sigreturn (void) -__NR_setpriority 140 sys_setpriority (int which, int who, int nice) -__NR_setresuid 147 sys_setresuid (int uid, int euid, int suid) -__NR_getresuid 148 sys_getresuid (int *uid, int *euid, int *suid) -__NR_setresgid 149 sys_setresgid (int gid, int egid, int sgid) -__NR_getresgid 150 sys_getresgid (int *gid, int *egid, int *sgid) -__NR_getpgid 155 sys_getpgid (pid_t pid) -__NR_setfsuid 151 sys_setfsuid (int fsuid) -__NR_setfsgid 152 sys_setfsgid (int fsgid) -__NR_getsid 156 sys_getsid (void) -__NR_getgroups 158 sys_getgroups (int gsize, unsigned int *groups) -__NR_setgroups 159 sys_setgroups (int gsize, unsigned int *groups) -__NR_setrlimit 164 sys_setrlimit (int resource, struct krlimit *rlim) -__NR_umask 166 sys_umask (int mask) -__NR_prctl 167 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) -__NR_gettimeofday 169 sys_gettimeofday (struct timeval *tv, struct timezone *tz) -__NR_getpid 172 sys_getpid (void) -__NR_ptrace 177 sys_ptrace (long request, pid_t pid, void *addr, void *data) -__NR_gettid 178 sys_gettid (void) -__NR_shmat 196 sys_shmat (int shmid, void *shmaddr, int shmflag) -__NR_socket 198 sys_socket (int domain, int type, int protocol) -__NR_bind 200 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) -__NR_connect 203 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) -__NR_sendto 206 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) -__NR_recvfrom 207 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, 
struct sockaddr *addr, int *addr_len) -__NR_setsockopt 208 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) -__NR_getsockopt 209 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) -__NR_shutdown 210 sys_shutdown (int sockfd, int how) -__NR_sendmsg 211 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) -__NR_recvmsg 212 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) -__NR_brk 214 sys_brk (void *addr) -__NR_munmap 215 sys_munmap (void *addr, unsigned long len) -__NR_mremap 216 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) -__NR_clone 220 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) -__NR_mmap 222 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) -__NR_mprotect 226 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) -__NR_mincore 232 sys_mincore (void *addr, unsigned long size, unsigned char *vec) -__NR_madvise 233 sys_madvise (unsigned long start, size_t len, int behavior) -__NR_rt_tgsigqueueinfo 240 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) -__NR_wait4 260 sys_wait4 (int pid, int *status, int options, struct rusage *ru) -__NR_fanotify_init 262 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) -__NR_fanotify_mark 263 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) -__NR_open_by_handle_at 265 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) -__NR_setns 268 sys_setns (int fd, int nstype) -__NR_kcmp 272 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) -__NR_seccomp 277 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) -__NR_memfd_create 279 sys_memfd_create (const 
char *name, unsigned int flags) -__NR_userfaultfd 282 sys_userfaultfd (int flags) -__NR_rseq 293 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) -__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) -__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) -__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) -__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) -__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) -__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) -__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) -__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) -__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) -#__NR_dup2 ! sys_dup2 (int oldfd, int newfd) -#__NR_rmdir ! sys_rmdir (const char *name) -#__NR_unlink ! sys_unlink (char *pathname) -#__NR_cacheflush ! sys_cacheflush (char *addr, int nbytes, int cache) -#__NR_set_thread_area ! sys_set_thread_area (unsigned long *addr) -#__NR_mkdir ! sys_mkdir (const char *name, int mode) -#__NR_open ! 
sys_open (const char *filename, unsigned long flags, unsigned long mode) +__NR_io_setup 0 sys_io_setup (unsigned nr_events, aio_context_t *ctx) +__NR_io_submit 2 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) +__NR_io_getevents 4 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +__NR_fcntl 25 sys_fcntl (int fd, int type, long arg) +__NR_ioctl 29 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_flock 32 sys_flock (int fd, unsigned long cmd) +__NR_mkdirat 34 sys_mkdirat (int dfd, const char *pathname, int flag) +__NR_unlinkat 35 sys_unlinkat (int dfd, const char *pathname, int flag) +__NR_umount2 39 sys_umount2 (char *name, int flags) +__NR_mount 40 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +__NR_fallocate 47 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) +__NR_close 57 sys_close (int fd) +__NR_openat 56 sys_openat (int dfd, const char *filename, int flags, int mode) +__NR_lseek 62 sys_lseek (int fd, unsigned long offset, unsigned long origin) +__NR_read 63 sys_read (int fd, void *buf, unsigned long count) +__NR_write 64 sys_write (int fd, const void *buf, unsigned long count) +__NR_pread64 67 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) +__NR_preadv 69 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +__NR_ppoll 73 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_signalfd4 74 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +__NR_vmsplice 75 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +__NR_readlinkat 78 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) +__NR_timerfd_settime 86 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +__NR_capget 90 sys_capget
(struct cap_header *h, struct cap_data *d) +__NR_capset 91 sys_capset (struct cap_header *h, struct cap_data *d) +__NR_personality 92 sys_personality (unsigned int personality) +__NR_exit 93 sys_exit (unsigned long error_code) +__NR_exit_group 94 sys_exit_group (int error_code) +__NR_waitid 95 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +__NR_set_tid_address 96 sys_set_tid_address (int *tid_addr) +__NR_futex 98 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_set_robust_list 99 sys_set_robust_list (struct robust_list_head *head, size_t len) +__NR_get_robust_list 100 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +__NR_nanosleep 101 sys_nanosleep (struct timespec *req, struct timespec *rem) +__NR_getitimer 102 sys_getitimer (int which, const struct itimerval *val) +__NR_setitimer 103 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) +__NR_sys_timer_create 107 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +__NR_sys_timer_gettime 108 sys_timer_gettime (int timer_id, const struct itimerspec *setting) +__NR_sys_timer_getoverrun 109 sys_timer_getoverrun (int timer_id) +__NR_sys_timer_settime 110 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +__NR_sys_timer_delete 111 sys_timer_delete (kernel_timer_t timer_id) +__NR_clock_gettime 113 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_sched_setscheduler 119 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) +__NR_restart_syscall 128 sys_restart_syscall (void) +__NR_kill 129 sys_kill (long pid, int sig) +__NR_sigaltstack 132 sys_sigaltstack (const void *uss, void *uoss) +__NR_rt_sigaction 134 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize)
+__NR_rt_sigprocmask 135 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +__NR_rt_sigqueueinfo 138 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) +__NR_rt_sigreturn 139 sys_rt_sigreturn (void) +__NR_setpriority 140 sys_setpriority (int which, int who, int nice) +__NR_setresuid 147 sys_setresuid (int uid, int euid, int suid) +__NR_getresuid 148 sys_getresuid (int *uid, int *euid, int *suid) +__NR_setresgid 149 sys_setresgid (int gid, int egid, int sgid) +__NR_getresgid 150 sys_getresgid (int *gid, int *egid, int *sgid) +__NR_getpgid 155 sys_getpgid (pid_t pid) +__NR_setfsuid 151 sys_setfsuid (int fsuid) +__NR_setfsgid 152 sys_setfsgid (int fsgid) +__NR_getsid 156 sys_getsid (void) +__NR_getgroups 158 sys_getgroups (int gsize, unsigned int *groups) +__NR_setgroups 159 sys_setgroups (int gsize, unsigned int *groups) +__NR_setrlimit 164 sys_setrlimit (int resource, struct krlimit *rlim) +__NR_umask 166 sys_umask (int mask) +__NR_prctl 167 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +__NR_gettimeofday 169 sys_gettimeofday (struct timeval *tv, struct timezone *tz) +__NR_getpid 172 sys_getpid (void) +__NR_ptrace 177 sys_ptrace (long request, pid_t pid, void *addr, void *data) +__NR_gettid 178 sys_gettid (void) +__NR_shmat 196 sys_shmat (int shmid, void *shmaddr, int shmflag) +__NR_socket 198 sys_socket (int domain, int type, int protocol) +__NR_bind 200 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) +__NR_connect 203 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) +__NR_sendto 206 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +__NR_recvfrom 207 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +__NR_setsockopt 208 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +__NR_getsockopt 209 sys_getsockopt (int sockfd, int level, int optname, const
void *optval, socklen_t *optlen) +__NR_shutdown 210 sys_shutdown (int sockfd, int how) +__NR_sendmsg 211 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) +__NR_recvmsg 212 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) +__NR_brk 214 sys_brk (void *addr) +__NR_munmap 215 sys_munmap (void *addr, unsigned long len) +__NR_mremap 216 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) +__NR_clone 220 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +__NR_mmap 222 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +__NR_mprotect 226 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) +__NR_mincore 232 sys_mincore (void *addr, unsigned long size, unsigned char *vec) +__NR_madvise 233 sys_madvise (unsigned long start, size_t len, int behavior) +__NR_rt_tgsigqueueinfo 240 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +__NR_wait4 260 sys_wait4 (int pid, int *status, int options, struct rusage *ru) +__NR_fanotify_init 262 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) +__NR_fanotify_mark 263 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +__NR_open_by_handle_at 265 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) +__NR_setns 268 sys_setns (int fd, int nstype) +__NR_kcmp 272 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +__NR_seccomp 277 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) +__NR_memfd_create 279 sys_memfd_create (const char *name, unsigned int flags) +__NR_userfaultfd 282 sys_userfaultfd (int flags) +__NR_rseq 293 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) +__NR_move_mount 429
sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) +__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) +#__NR_dup2 ! sys_dup2 (int oldfd, int newfd) +#__NR_rmdir ! sys_rmdir (const char *name) +#__NR_unlink ! sys_unlink (char *pathname) +#__NR_cacheflush ! sys_cacheflush (char *addr, int nbytes, int cache) +#__NR_set_thread_area ! sys_set_thread_area (unsigned long *addr) +#__NR_mkdir ! sys_mkdir (const char *name, int mode) +#__NR_open ! sys_open (const char *filename, unsigned long flags, unsigned long mode) From e07155e194c914400c5dc574820d726a4d8f9fbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Thu, 6 Oct 2022 17:52:46 +0200 Subject: [PATCH 297/775] dump+restore: Implement membarrier() registration c/r. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Note: Silently drops MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED as it's not currently detectable. This is still better than silently dropping all membarrier() registrations.
Signed-off-by: Michał Mirosław --- .../arch/arm/plugins/std/syscalls/syscall.def | 1 + .../plugins/std/syscalls/syscall_64.tbl | 1 + .../mips/plugins/std/syscalls/syscall_64.tbl | 1 + .../plugins/std/syscalls/syscall-ppc64.tbl | 1 + .../plugins/std/syscalls/syscall-s390.tbl | 1 + .../x86/plugins/std/syscalls/syscall_32.tbl | 1 + .../x86/plugins/std/syscalls/syscall_64.tbl | 1 + criu/cr-dump.c | 5 ++ criu/cr-restore.c | 3 ++ criu/include/parasite.h | 1 + criu/include/restorer.h | 1 + criu/pie/parasite.c | 50 +++++++++++++++++++ criu/pie/restorer.c | 27 ++++++++++ images/core.proto | 2 + 14 files changed, 96 insertions(+) diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index 8bcc3cc50..7489ee0c1 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -122,3 +122,4 @@ pidfd_open 434 434 (pid_t pid, unsigned int flags) openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size) pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) rseq 293 398 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +membarrier 283 389 (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl index a0ad0cef4..f844d898d 100644 --- a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl @@ -102,6 +102,7 @@ __NR_kcmp 2 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, u __NR_seccomp 2 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) __NR_memfd_create 2 sys_memfd_create (const char *name, unsigned int flags) __NR_userfaultfd 2 sys_userfaultfd (int flags) +__NR_membarrier 3 sys_membarrier (int cmd, unsigned int flags, int cpu_id) __NR_rseq 2 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) 
__NR_open_tree 4 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) __NR_move_mount 4 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) diff --git a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl index 505ec849d..9f50d5e8a 100644 --- a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl @@ -119,3 +119,4 @@ __NR_pidfd_open 5434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 5437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 5438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 5327 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 5318 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl index af40d7104..4c9b75cf1 100644 --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl @@ -118,3 +118,4 @@ __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 387 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 365 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl index 6a349e1cb..af7d550e2 100644 --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl @@ -118,3 +118,4 @@ __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 
sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 383 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 356 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl index a119a59b2..ab36a5cd6 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl @@ -106,3 +106,4 @@ __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 386 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 375 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl index 16dd86e79..57681b79a 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -117,3 +117,4 @@ __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 334 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 324 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 340fb96ec..ee5974acc 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -770,6 +770,11 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item core->tc->child_subreaper = misc->child_subreaper; core->tc->has_child_subreaper = 
true; + if (misc->membarrier_registration_mask) { + core->tc->membarrier_registration_mask = misc->membarrier_registration_mask; + core->tc->has_membarrier_registration_mask = true; + } + ret = get_task_personality(pid, &core->tc->personality); if (ret < 0) goto err; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9107a2322..270049721 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -863,6 +863,9 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc, struct task_restore_a if (tc->has_child_subreaper) args->child_subreaper = tc->child_subreaper; + if (tc->has_membarrier_registration_mask) + args->membarrier_registration_mask = tc->membarrier_registration_mask; + /* loginuid value is critical to restore */ if (kdat.luid == LUID_FULL && tc->has_loginuid && tc->loginuid != INVALID_UID) { ret = prepare_loginuid(tc->loginuid); diff --git a/criu/include/parasite.h b/criu/include/parasite.h index 739fbf2c3..5209b6da2 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -128,6 +128,7 @@ struct parasite_dump_misc { int dumpable; int thp_disabled; int child_subreaper; + int membarrier_registration_mask; }; /* diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 2475ee0bc..f398d8d8f 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -229,6 +229,7 @@ struct task_restore_args { #endif int lsm_type; int child_subreaper; + int membarrier_registration_mask; bool has_clone3_set_tid; /* diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index 58ea35892..c0604903b 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -211,6 +211,42 @@ out: return ret; } +/* + * Returns a membarrier() registration command (it is a bitmask) if the process + * was registered for specified (as a bit index) membarrier()-issuing command; + * returns zero otherwise. 
+ */ +static int get_membarrier_registration_mask(int cmd_bit) +{ + unsigned cmd = 1 << cmd_bit; + int ret; + + /* + * Issuing a barrier will be successful only if the process was registered + * for this type of membarrier. All errors are a sign that the type issued + * was not registered (EPERM) or not supported by kernel (EINVAL or ENOSYS). + */ + ret = sys_membarrier(cmd, 0, 0); + if (ret && ret != -EPERM && ret != -EINVAL && ret != -ENOSYS) { + pr_err("membarrier(1 << %d) returned %d\n", cmd_bit, ret); + return -1; + } + pr_debug("membarrier(1 << %d) returned %d\n", cmd_bit, ret); + /* + * For supported registrations, MEMBARRIER_CMD_REGISTER_xxx = MEMBARRIER_CMD_xxx << 1. + * See: enum membarrier_cmd in include/uapi/linux/membarrier.h in kernel sources. + */ + return ret ? 0 : cmd << 1; +} + +/* + * It would be better to check the following with BUILD_BUG_ON, but we might + * have an old linux/membarrier.h header without necessary enum values. + */ +#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED 3 +#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE 5 +#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ 7 + static int dump_misc(struct parasite_dump_misc *args) { int ret; @@ -225,6 +261,20 @@ static int dump_misc(struct parasite_dump_misc *args) args->dumpable = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0); args->thp_disabled = sys_prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); + args->membarrier_registration_mask = 0; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED); + if (ret < 0) + return -1; + args->membarrier_registration_mask |= ret; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE); + if (ret < 0) + return -1; + args->membarrier_registration_mask |= ret; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ); + if (ret < 0) + return -1; + args->membarrier_registration_mask |= ret; + ret = sys_prctl(PR_GET_CHILD_SUBREAPER, (unsigned long)&args->child_subreaper, 0, 0, 0); if (ret) 
pr_err("PR_GET_CHILD_SUBREAPER failed (%d)\n", ret); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index d4f77bfde..bbee0f6fb 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1537,6 +1537,30 @@ int cleanup_current_inotify_events(struct task_restore_args *task_args) return 0; } +/* + * Restore membarrier() registrations. + */ +static int restore_membarrier_registrations(int mask) +{ + unsigned long bitmap[1] = { mask }; + int i, err, ret = 0; + + if (!mask) + return 0; + + pr_info("Restoring membarrier() registrations %x\n", mask); + + for_each_bit(i, bitmap) { + err = sys_membarrier(1 << i, 0, 0); + if (!err) + continue; + pr_err("Can't restore membarrier(1 << %d) registration: %d\n", i, err); + ret = -1; + } + + return ret; +} + /* * The main routine to restore task via sigreturn. * This one is very special, we never return there @@ -2023,6 +2047,9 @@ long __export_restore_task(struct task_restore_args *args) goto core_restore_end; } + if (restore_membarrier_registrations(args->membarrier_registration_mask) < 0) + goto core_restore_end; + pr_info("%ld: Restored\n", sys_getpid()); restore_finish_stage(task_entries_local, CR_STATE_RESTORE); diff --git a/images/core.proto b/images/core.proto index 1882fe8e4..5b07b5c44 100644 --- a/images/core.proto +++ b/images/core.proto @@ -64,6 +64,8 @@ message task_core_entry { optional uint64 blk_sigset_extended = 20[(criu).hex = true]; optional uint32 stop_signo = 21; + + optional uint32 membarrier_registration_mask = 22 [(criu).hex = true]; } message task_kobj_ids_entry { From 2547ac8ac18cc902ff7e85b271ddf66e0efeb4a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 19 Jun 2023 12:00:51 +0200 Subject: [PATCH 298/775] zdtm: membarrier: test migration of membarrier() registration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- test/zdtm/static/Makefile | 1 + test/zdtm/static/membarrier.c | 116 
++++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+) create mode 100644 test/zdtm/static/membarrier.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 30429e425..cd53932db 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -62,6 +62,7 @@ TST_NOFILE := \ pthread_timers \ pthread_timers_h \ rseq00 \ + membarrier \ vdso00 \ vdso01 \ vdso02 \ diff --git a/test/zdtm/static/membarrier.c b/test/zdtm/static/membarrier.c new file mode 100644 index 000000000..a04b36035 --- /dev/null +++ b/test/zdtm/static/membarrier.c @@ -0,0 +1,116 @@ +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Test membarrier() migration"; +const char *test_author = "Michał Mirosław "; + +/* + * Define membarrier() CMDs to avoid depending on exact kernel header version. + * FIXME: use MEMBARRIER_CMD_GET_REGISTRATIONS if supported by kernel. + */ +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED (1 << 3) +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED (1 << 4) +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE (1 << 5) +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE (1 << 6) +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ (1 << 7) +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ (1 << 8) + +static int membarrier(int cmd, unsigned int flags, int cpu_id) +{ + return syscall(__NR_membarrier, cmd, flags, cpu_id); +} + +static const struct { + const char *name_suffix; + int register_cmd; + int execute_cmd; +} membarrier_cmds[] = { + { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, + { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, + { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, +}; +static const int n_membarrier_cmds = sizeof(membarrier_cmds) / sizeof(*membarrier_cmds); + +static int register_membarriers(void) +{ + int barriers_supported, 
barriers_registered; + bool all_ok = true; + + barriers_supported = membarrier(MEMBARRIER_CMD_QUERY, 0, 0); + if (barriers_supported < 0) { + fail("membarrier() not supported by running kernel"); + return -1; + } + + barriers_registered = 0; + for (int i = 0; i < n_membarrier_cmds; ++i) { + if (~barriers_supported & membarrier_cmds[i].register_cmd) + continue; + + barriers_registered |= membarrier_cmds[i].execute_cmd; + + if (membarrier(membarrier_cmds[i].register_cmd, 0, 0) < 0) { + pr_perror("membarrier(REGISTER_PRIVATE_EXPEDITED%s)", membarrier_cmds[i].name_suffix); + all_ok = false; + } + } + + if (!all_ok) { + fail("can't register membarrier()s - tried %#x, kernel %#x", + barriers_registered, barriers_supported); + return -1; + } + + if (!barriers_registered) { + fail("no known membarrier() cmds are supported by the kernel"); + return -1; + } + + return barriers_registered; +} + +static bool check_membarriers(int barriers_registered) +{ + bool all_ok = true; + + for (int i = 0; i < n_membarrier_cmds; ++i) { + if (~barriers_registered & membarrier_cmds[i].execute_cmd) + continue; + if (membarrier(membarrier_cmds[i].execute_cmd, 0, 0) < 0) { + pr_perror("membarrier(PRIVATE_EXPEDITED%s)", membarrier_cmds[i].name_suffix); + all_ok = false; + } + } + + if (!all_ok) + fail("membarrier() check failed"); + + return all_ok; +} + +int main(int argc, char **argv) +{ + int barriers_registered; + + test_init(argc, argv); + + barriers_registered = register_membarriers(); + if (barriers_registered < 0) + return 1; + + test_msg("Pre-migration membarriers check\n"); + if (!check_membarriers(barriers_registered)) + return 1; + + test_daemon(); + test_waitsig(); + + test_msg("Post-migration membarriers check\n"); + if (!check_membarriers(barriers_registered)) + return 1; + + pass(); + return 0; +} From 620e8c0a61ae00d90c032018b67ed65c3057251f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Wed, 9 Aug 2023 14:42:22 +0200 Subject: [PATCH 299/775] 
Put a cap on the size of single preadv in restore operation. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While each preadv() is followed by a fallocate() that removes the data range from image files on tmpfs, temporarily (between preadv() and fallocate()) the same data is in two places; this increases the memory overhead of restore operation by the size of a single preadv. Uncapped preadv() would read up to 2 GiB of data, thus we limit that to a smaller block size (128 MiB). Based-on-work-by: Paweł Stradomski Signed-off-by: Michał Mirosław --- criu/pie/restorer.c | 47 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index bbee0f6fb..0de2423a1 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -56,6 +56,12 @@ */ #define MAX_GETGROUPS_CHECKED (512 / sizeof(unsigned int)) +/* + * Memory overhead limit for reading VMA when auto_dedup is enabled. + * An arbitrarily chosen trade-off point between speed and memory usage. + */ +#define AUTO_DEDUP_OVERHEAD_BYTES (128 << 20) + #ifndef PR_SET_PDEATHSIG #define PR_SET_PDEATHSIG 1 #endif @@ -1477,6 +1483,40 @@ static int fd_poll(int inotify_fd) return sys_ppoll(&pfd, 1, &tmo, NULL, sizeof(sigset_t)); } +/* + * Call preadv() but limit size of the read. Zero `max_to_read` skips the limit. 
+ */ +static ssize_t preadv_limited(int fd, struct iovec *iovs, int nr, off_t offs, size_t max_to_read) +{ + size_t saved_last_iov_len = 0; + ssize_t ret; + + if (max_to_read) { + for (int i = 0; i < nr; ++i) { + if (iovs[i].iov_len <= max_to_read) { + max_to_read -= iovs[i].iov_len; + continue; + } + + if (!max_to_read) { + nr = i; + break; + } + + saved_last_iov_len = iovs[i].iov_len; + iovs[i].iov_len = max_to_read; + nr = i + 1; + break; + } + } + + ret = sys_preadv(fd, iovs, nr, offs); + if (saved_last_iov_len) + iovs[nr - 1].iov_len = saved_last_iov_len; + + return ret; +} + /* * In the worst case buf size should be: * sizeof(struct inotify_event) * 2 + PATH_MAX @@ -1748,7 +1788,12 @@ long __export_restore_task(struct task_restore_args *args) while (nr) { pr_debug("Preadv %lx:%d... (%d iovs)\n", (unsigned long)iovs->iov_base, (int)iovs->iov_len, nr); - r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off); + /* + * If we're requested to punch holes in the file after reading we do + * it to save memory. Limit the reads then to an arbitrary block size. + */ + r = preadv_limited(args->vma_ios_fd, iovs, nr, rio->off, + args->auto_dedup ? AUTO_DEDUP_OVERHEAD_BYTES : 0); if (r < 0) { pr_err("Can't read pages data (%d)\n", (int)r); goto core_restore_end; From 6ea60d6ef74604f28dd696adc11104c817250964 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 19 Aug 2023 21:56:25 -0700 Subject: [PATCH 300/775] github: auto-remove `changes requested` and `awaiting reply` labels Labels are removed when new comments are posted. 
Signed-off-by: Andrei Vagin --- .github/workflows/manage-labels.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .github/workflows/manage-labels.yml diff --git a/.github/workflows/manage-labels.yml b/.github/workflows/manage-labels.yml new file mode 100644 index 000000000..a2bcd8860 --- /dev/null +++ b/.github/workflows/manage-labels.yml @@ -0,0 +1,14 @@ +name: Remove labels +on: [issue_comment, pull_request_review_comment] +jobs: + remove-labels-on-comments: + name: Remove labels on comments + if: github.event_name == 'issue_comment' + runs-on: ubuntu-latest + steps: + - uses: mondeja/remove-labels-gh-action@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + labels: | + changes requested + awaiting reply From 8c17535f3f857233913b34b217419f57f5548a0b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 24 Aug 2023 13:52:21 -0700 Subject: [PATCH 301/775] loongarch64: fix syscall_64.tbl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 288d6a61e29d change broke all the syscall numbers. 
Reported-by: Michał Mirosław Fixes: (288d6a61e29d "loongarch64: reformat syscall_64.tbl for 8-wide tabs") Signed-off-by: Andrei Vagin --- .../plugins/std/syscalls/syscall_64.tbl | 210 +++++++++--------- 1 file changed, 105 insertions(+), 105 deletions(-) diff --git a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl index f844d898d..aa6ffb44d 100644 --- a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl @@ -8,111 +8,111 @@ __NR_io_setup 0 sys_io_setup (unsigned nr_events, aio_context_t *ctx) __NR_io_submit 2 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) __NR_io_getevents 4 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) -__NR_fcntl 2 sys_fcntl (int fd, int type, long arg) -__NR_ioctl 2 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) -__NR_flock 3 sys_flock (int fd, unsigned long cmd) -__NR_mkdirat 3 sys_mkdirat (int dfd, const char *pathname, int flag) -__NR_unlinkat 3 sys_unlinkat (int dfd, const char *pathname, int flag) -__NR_umount2 3 sys_umount2 (char *name, int flags) -__NR_mount 4 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) -__NR_fallocate 4 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) -__NR_close 5 sys_close (int fd) -__NR_openat 5 sys_openat (int dfd, const char *filename, int flags, int mode) -__NR_lseek 6 sys_lseek (int fd, unsigned long offset, unsigned long origin) -__NR_read 6 sys_read (int fd, void *buf, unsigned long count) -__NR_write 6 sys_write (int fd, const void *buf, unsigned long count) -__NR_pread64 6 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) -__NR_preadv 6 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) -__NR_ppoll 7 sys_ppoll (struct pollfd *fds, unsigned int nfds, 
const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) -__NR_signalfd4 7 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) -__NR_vmsplice 7 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) -__NR_readlinkat 7 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) -__NR_timerfd_settime 8 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) -__NR_capget 9 sys_capget (struct cap_header *h, struct cap_data *d) -__NR_capset 9 sys_capset (struct cap_header *h, struct cap_data *d) -__NR_personality 9 sys_personality (unsigned int personality) -__NR_exit 9 sys_exit (unsigned long error_code) -__NR_exit_group 9 sys_exit_group (int error_code) -__NR_waitid 9 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) -__NR_set_tid_address 9 sys_set_tid_address (int *tid_addr) -__NR_futex 9 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) -__NR_set_robust_list 9 sys_set_robust_list (struct robust_list_head *head, size_t len) -__NR_get_robust_list 1 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) -__NR_nanosleep 1 sys_nanosleep (struct timespec *req, struct timespec *rem) -__NR_getitimer 1 sys_getitimer (int which, const struct itimerval *val) -__NR_setitimer 1 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) -__NR_sys_timer_create 1 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) -__NR_sys_timer_gettime 1 sys_timer_gettime (int timer_id, const struct itimerspec *setting) -__NR_sys_timer_getoverrun 1 sys_timer_getoverrun (int timer_id) -__NR_sys_timer_settime 1 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) -__NR_sys_timer_delete 1 sys_timer_delete 
(kernel_timer_t timer_id) -__NR_clock_gettime 1 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) -__NR_sched_setscheduler 1 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) -__NR_restart_syscall 1 sys_restart_syscall (void) -__NR_kill 1 sys_kill (long pid, int sig) -__NR_sigaltstack 1 sys_sigaltstack (const void *uss, void *uoss) -__NR_rt_sigaction 1 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) -__NR_rt_sigprocmask 1 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) -__NR_rt_sigqueueinfo 1 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) -__NR_rt_sigreturn 1 sys_rt_sigreturn (void) -__NR_setpriority 1 sys_setpriority (int which, int who, int nice) -__NR_setresuid 1 sys_setresuid (int uid, int euid, int suid) -__NR_getresuid 1 sys_getresuid (int *uid, int *euid, int *suid) -__NR_setresgid 1 sys_setresgid (int gid, int egid, int sgid) -__NR_getresgid 1 sys_getresgid (int *gid, int *egid, int *sgid) -__NR_getpgid 1 sys_getpgid (pid_t pid) -__NR_setfsuid 1 sys_setfsuid (int fsuid) -__NR_setfsgid 1 sys_setfsgid (int fsgid) -__NR_getsid 1 sys_getsid (void) -__NR_getgroups 1 sys_getgroups (int gsize, unsigned int *groups) -__NR_setgroups 1 sys_setgroups (int gsize, unsigned int *groups) -__NR_setrlimit 1 sys_setrlimit (int resource, struct krlimit *rlim) -__NR_umask 1 sys_umask (int mask) -__NR_prctl 1 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) -__NR_gettimeofday 1 sys_gettimeofday (struct timeval *tv, struct timezone *tz) -__NR_getpid 1 sys_getpid (void) -__NR_ptrace 1 sys_ptrace (long request, pid_t pid, void *addr, void *data) -__NR_gettid 1 sys_gettid (void) -__NR_shmat 1 sys_shmat (int shmid, void *shmaddr, int shmflag) -__NR_socket 1 sys_socket (int domain, int type, int protocol) -__NR_bind 2 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) -__NR_connect 
2 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) -__NR_sendto 2 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) -__NR_recvfrom 2 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) -__NR_setsockopt 2 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) -__NR_getsockopt 2 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) -__NR_shutdown 2 sys_shutdown (int sockfd, int how) -__NR_sendmsg 2 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) -__NR_recvmsg 2 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) -__NR_brk 2 sys_brk (void *addr) -__NR_munmap 2 sys_munmap (void *addr, unsigned long len) -__NR_mremap 2 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) -__NR_clone 2 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) -__NR_mmap 2 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) -__NR_mprotect 2 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) -__NR_mincore 2 sys_mincore (void *addr, unsigned long size, unsigned char *vec) -__NR_madvise 2 sys_madvise (unsigned long start, size_t len, int behavior) -__NR_rt_tgsigqueueinfo 2 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) -__NR_wait4 2 sys_wait4 (int pid, int *status, int options, struct rusage *ru) -__NR_fanotify_init 2 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) -__NR_fanotify_mark 2 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) -__NR_open_by_handle_at 2 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) -__NR_setns 2 sys_setns (int fd, int nstype) 
-__NR_kcmp 2 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) -__NR_seccomp 2 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) -__NR_memfd_create 2 sys_memfd_create (const char *name, unsigned int flags) -__NR_userfaultfd 2 sys_userfaultfd (int flags) -__NR_membarrier 3 sys_membarrier (int cmd, unsigned int flags, int cpu_id) -__NR_rseq 2 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) -__NR_open_tree 4 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) -__NR_move_mount 4 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) -__NR_fsopen 4 sys_fsopen (char *fsname, unsigned int flags) -__NR_fsconfig 4 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) -__NR_fsmount 4 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) -__NR_pidfd_open 4 sys_pidfd_open (pid_t pid, unsigned int flags) -__NR_clone3 4 sys_clone3 (struct clone_args *uargs, size_t size) -__NR_openat2 4 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) -__NR_pidfd_getfd 4 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) +__NR_fcntl 25 sys_fcntl (int fd, int type, long arg) +__NR_ioctl 29 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_flock 32 sys_flock (int fd, unsigned long cmd) +__NR_mkdirat 34 sys_mkdirat (int dfd, const char *pathname, int flag) +__NR_unlinkat 35 sys_unlinkat (int dfd, const char *pathname, int flag) +__NR_umount2 39 sys_umount2 (char *name, int flags) +__NR_mount 40 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +__NR_fallocate 47 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) +__NR_close 57 sys_close (int fd) +__NR_openat 56 sys_openat (int dfd, const char *filename, int flags, int mode) +__NR_lseek 62 sys_lseek (int fd, unsigned long offset, unsigned long origin) +__NR_read 63 
sys_read (int fd, void *buf, unsigned long count) +__NR_write 64 sys_write (int fd, const void *buf, unsigned long count) +__NR_pread64 67 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) +__NR_preadv 69 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +__NR_ppoll 73 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_signalfd4 74 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +__NR_vmsplice 75 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +__NR_readlinkat 78 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) +__NR_timerfd_settime 86 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +__NR_capget 90 sys_capget (struct cap_header *h, struct cap_data *d) +__NR_capset 91 sys_capset (struct cap_header *h, struct cap_data *d) +__NR_personality 92 sys_personality (unsigned int personality) +__NR_exit 93 sys_exit (unsigned long error_code) +__NR_exit_group 94 sys_exit_group (int error_code) +__NR_waitid 95 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +__NR_set_tid_address 96 sys_set_tid_address (int *tid_addr) +__NR_futex 98 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_set_robust_list 99 sys_set_robust_list (struct robust_list_head *head, size_t len) +__NR_get_robust_list 100 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +__NR_nanosleep 101 sys_nanosleep (struct timespec *req, struct timespec *rem) +__NR_getitimer 102 sys_getitimer (int which, const struct itimerval *val) +__NR_setitimer 103 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) +__NR_sys_timer_create 107 sys_timer_create (clockid_t which_clock, struct sigevent 
*timer_event_spec, kernel_timer_t *created_timer_id) +__NR_sys_timer_gettime 108 sys_timer_gettime (int timer_id, const struct itimerspec *setting) +__NR_sys_timer_getoverrun 109 sys_timer_getoverrun (int timer_id) +__NR_sys_timer_settime 110 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +__NR_sys_timer_delete 111 sys_timer_delete (kernel_timer_t timer_id) +__NR_clock_gettime 113 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_sched_setscheduler 119 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) +__NR_restart_syscall 128 sys_restart_syscall (void) +__NR_kill 129 sys_kill (long pid, int sig) +__NR_sigaltstack 132 sys_sigaltstack (const void *uss, void *uoss) +__NR_rt_sigaction 134 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +__NR_rt_sigprocmask 135 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +__NR_rt_sigqueueinfo 138 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) +__NR_rt_sigreturn 139 sys_rt_sigreturn (void) +__NR_setpriority 140 sys_setpriority (int which, int who, int nice) +__NR_setresuid 147 sys_setresuid (int uid, int euid, int suid) +__NR_getresuid 148 sys_getresuid (int *uid, int *euid, int *suid) +__NR_setresgid 149 sys_setresgid (int gid, int egid, int sgid) +__NR_getresgid 150 sys_getresgid (int *gid, int *egid, int *sgid) +__NR_getpgid 155 sys_getpgid (pid_t pid) +__NR_setfsuid 151 sys_setfsuid (int fsuid) +__NR_setfsgid 152 sys_setfsgid (int fsgid) +__NR_getsid 156 sys_getsid (void) +__NR_getgroups 158 sys_getgroups (int gsize, unsigned int *groups) +__NR_setgroups 159 sys_setgroups (int gsize, unsigned int *groups) +__NR_setrlimit 164 sys_setrlimit (int resource, struct krlimit *rlim) +__NR_umask 166 sys_umask (int mask) +__NR_prctl 167 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, 
unsigned long arg5) +__NR_gettimeofday 169 sys_gettimeofday (struct timeval *tv, struct timezone *tz) +__NR_getpid 172 sys_getpid (void) +__NR_ptrace 177 sys_ptrace (long request, pid_t pid, void *addr, void *data) +__NR_gettid 178 sys_gettid (void) +__NR_shmat 196 sys_shmat (int shmid, void *shmaddr, int shmflag) +__NR_socket 198 sys_socket (int domain, int type, int protocol) +__NR_bind 200 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) +__NR_connect 203 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) +__NR_sendto 206 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +__NR_recvfrom 207 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +__NR_setsockopt 208 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +__NR_getsockopt 209 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) +__NR_shutdown 210 sys_shutdown (int sockfd, int how) +__NR_sendmsg 211 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) +__NR_recvmsg 212 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) +__NR_brk 214 sys_brk (void *addr) +__NR_munmap 215 sys_munmap (void *addr, unsigned long len) +__NR_mremap 216 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) +__NR_clone 220 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +__NR_mmap 222 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +__NR_mprotect 226 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) +__NR_mincore 232 sys_mincore (void *addr, unsigned long size, unsigned char *vec) +__NR_madvise 233 sys_madvise (unsigned long start, size_t len, int behavior) +__NR_rt_tgsigqueueinfo 240 
sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +__NR_wait4 260 sys_wait4 (int pid, int *status, int options, struct rusage *ru) +__NR_fanotify_init 262 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) +__NR_fanotify_mark 263 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +__NR_open_by_handle_at 265 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) +__NR_setns 268 sys_setns (int fd, int nstype) +__NR_kcmp 272 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +__NR_seccomp 277 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) +__NR_memfd_create 279 sys_memfd_create (const char *name, unsigned int flags) +__NR_userfaultfd 282 sys_userfaultfd (int flags) +__NR_membarrier 283 sys_membarrier (int cmd, unsigned int flags, int cpu_id) +__NR_rseq 293 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) +__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) +__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) #__NR_dup2 ! sys_dup2 (int oldfd, int newfd) #__NR_rmdir ! sys_rmdir (const char *name) #__NR_unlink ! 
sys_unlink (char *pathname) From 0085f992cbabd39dd74a1eeb0bedc1e0b1b86170 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Thu, 24 Aug 2023 21:07:51 +0200 Subject: [PATCH 302/775] memfd: don't set fd attributes not needed for vma mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is only one user of memfd_open() outside of memfd.c: open_filemap(). It is restoring a file-backed mapping and doesn't need nor expect to update F_SETOWN nor the fd's position. Check the inherited_fd() handling in the callers to simplify the code. Signed-off-by: Michał Mirosław --- criu/files-reg.c | 3 ++- criu/memfd.c | 46 +++++++++++++++++++++++----------------------- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index 50dcbc438..cf0c84b52 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -2508,7 +2508,8 @@ static int open_filemap(int pid, struct vma_area *vma) */ ret = dup(plugin_fd); } else if (vma->e->status & VMA_AREA_MEMFD) { - ret = memfd_open(vma->vmfd, &flags); + if (!inherited_fd(vma->vmfd, &ret)) + ret = memfd_open(vma->vmfd, &flags); } else { ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); } diff --git a/criu/memfd.c b/criu/memfd.c index 2158b6720..5fe0aeae3 100644 --- a/criu/memfd.c +++ b/criu/memfd.c @@ -331,14 +331,11 @@ int memfd_open(struct file_desc *d, u32 *fdflags) mfi = container_of(d, struct memfd_info, d); mfe = mfi->mfe; - if (inherited_fd(d, &fd)) - return fd; - pr_info("Restoring memfd id=%d\n", mfe->id); fd = memfd_open_inode(mfi->inode); if (fd < 0) - goto err; + return -1; /* Reopen the fd with original permissions */ flags = fdflags ? *fdflags : mfe->flags; @@ -348,40 +345,43 @@ int memfd_open(struct file_desc *d, u32 *fdflags) * important though. 
*/ _fd = __open_proc(PROC_SELF, 0, flags, "fd/%d", fd); - if (_fd < 0) { + if (_fd < 0) pr_perror("Can't reopen memfd id=%d", mfe->id); - goto err; - } + close(fd); - fd = _fd; + return _fd; +} + +static int memfd_open_fe_fd(struct file_desc *d, int *new_fd) +{ + MemfdFileEntry *mfe; + int fd; + + if (inherited_fd(d, new_fd)) + return 0; + + fd = memfd_open(d, NULL); + if (fd < 0) + return -1; + + mfe = container_of(d, struct memfd_info, d)->mfe; if (restore_fown(fd, mfe->fown) < 0) goto err; if (lseek(fd, mfe->pos, SEEK_SET) < 0) { - pr_perror("Can't restore file position of memfd id=%d", mfe->id); + pr_perror("Can't restore file position of %d for memfd id=%d", fd, mfe->id); goto err; } - return fd; + *new_fd = fd; + return 0; err: - if (fd >= 0) - close(fd); + close(fd); return -1; } -static int memfd_open_fe_fd(struct file_desc *fd, int *new_fd) -{ - int tmp; - - tmp = memfd_open(fd, NULL); - if (tmp < 0) - return -1; - *new_fd = tmp; - return 0; -} - static char *memfd_d_name(struct file_desc *d, char *buf, size_t s) { MemfdInodeEntry *mie = NULL; From 86ad52bc2def0a5ea715f1a92fbd4dc2d5ff41d1 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 24 Aug 2023 15:22:32 -0700 Subject: [PATCH 303/775] ci/loongarch64: compile tests before running zdtm.py Otherwise tests fail by timeout. 
Signed-off-by: Andrei Vagin --- scripts/ci/loongarch64-qemu-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/loongarch64-qemu-test.sh b/scripts/ci/loongarch64-qemu-test.sh index 52e587619..d5646468e 100755 --- a/scripts/ci/loongarch64-qemu-test.sh +++ b/scripts/ci/loongarch64-qemu-test.sh @@ -65,5 +65,5 @@ sshpass -p $PASSWORD scp -o StrictHostKeyChecking=no -P $PORT criu.tar $USER@127 # build and test run 'cd /root; tar -xf criu.tar' -run 'cd /root/criu; make -j4' +run 'cd /root/criu; make -j4 && make -j4 -C test/zdtm' run "cd /root/criu; ./test/zdtm.py run -t zdtm/static/maps02 -t zdtm/static/maps05 -t zdtm/static/maps06 -t zdtm/static/maps10 -t zdtm/static/maps_file_prot -t zdtm/static/memfd00 -t zdtm/transition/fork -t zdtm/transition/fork2 -t zdtm/transition/shmem -f h" From b56a9cef32bda8e4f332e94bfbdcc3b2c315a118 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 24 Aug 2023 20:21:28 +0200 Subject: [PATCH 304/775] kerndat: Make pagemap check more robust against swapped out pages. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix test of whether the kernel exposes page frame numbers to cope with the possibility that the top of the stack is swapped out, which was happening in about 1 out of 3 million runs. This led to a later failure when trying to read the PFN of the zero page, after which criu would exit with no error message. 
Original-From: Ambrose Feinstein Signed-off-by: Michał Mirosław --- criu/kerndat.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index bd1ccdc7d..55a1e2bcd 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -55,10 +55,11 @@ #include "util-caps.h" struct kerndat_s kdat = {}; +volatile int dummy_var; static int check_pagemap(void) { - int ret, fd; + int ret, fd, retry; u64 pfn = 0; fd = __open_proc(PROC_SELF, EPERM, O_RDONLY, "pagemap"); @@ -72,11 +73,24 @@ static int check_pagemap(void) return -1; } - /* Get the PFN of some present page. Stack is here, so try it :) */ - ret = pread(fd, &pfn, sizeof(pfn), (((unsigned long)&ret) / page_size()) * sizeof(pfn)); - if (ret != sizeof(pfn)) { - pr_perror("Can't read pagemap"); - return -1; + retry = 3; + while (retry--) { + ++dummy_var; + /* Get the PFN of a page likely to be present. */ + ret = pread(fd, &pfn, sizeof(pfn), PAGE_PFN((uintptr_t)&dummy_var) * sizeof(pfn)); + if (ret != sizeof(pfn)) { + pr_perror("Can't read pagemap"); + return -1; + } + /* The page can be swapped out by the time the read occurs, + * in which case the rest of the bits are a swap type + offset + * (which could be zero even if not hidden). + * Retry if this happens. */ + if (pfn & PME_PRESENT) + break; + pr_warn("got non-present PFN %#lx for the dummy data page; %s\n", (unsigned long)pfn, + retry ? 
"retrying" : "giving up"); + pfn = 0; } close(fd); From f7d7dc9c08c256c6e7b7fde8a02fd9db35d0b4e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Tue, 22 Aug 2023 17:30:44 +0200 Subject: [PATCH 305/775] compel/infect: include the relevant pid in "no-breakpoints restore" debug message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- compel/src/lib/infect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 022d4ebf3..b9a913fa1 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -1577,7 +1577,7 @@ int compel_stop_pie(pid_t pid, void *addr, bool no_bp) int ret; if (no_bp) { - pr_debug("Force no-breakpoints restore\n"); + pr_debug("Force no-breakpoints restore of %d\n", pid); ret = 0; } else ret = ptrace_set_breakpoint(pid, addr); From ea05b06ac255e999f03e2858ed2c7119bfb153b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 25 Aug 2023 13:36:21 +0200 Subject: [PATCH 306/775] proc_parse: remove trivial goto from vma_get_mapfile_user() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/proc_parse.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 5e96b5c96..bd6c3dcbc 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -338,7 +338,7 @@ static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct fd = open(fname, O_RDONLY); if (fd < 0) { pr_perror("Can't open mapped [%s]", fname); - goto returnerr; + return -1; } if (vma_stat(vma, fd)) { @@ -379,7 +379,6 @@ errmsg: pr_err("Failed to resolve mapping %lx filename\n", (unsigned long)vma->e->start); closefd: close(fd); -returnerr: return -1; } From 1800018bc1140c7910568a642c62de26d2bcf6a1 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov 
Date: Thu, 23 Mar 2023 20:21:04 +0000 Subject: [PATCH 307/775] test/other: add test for action-script This commit is introducing a test for the action-script functionality of CRIU to verify that pre-dump, post-dump, pre-restore, pre-resume, post-restore, post-resume hooks are executed during dump/restore. Signed-off-by: Radostin Stoyanov --- Makefile | 1 + scripts/ci/run-ci-tests.sh | 3 ++ test/others/action-script/.gitignore | 1 + test/others/action-script/Makefile | 5 ++ test/others/action-script/action-script.sh | 2 + test/others/action-script/run.sh | 60 ++++++++++++++++++++++ 6 files changed, 72 insertions(+) create mode 100644 test/others/action-script/.gitignore create mode 100644 test/others/action-script/Makefile create mode 100755 test/others/action-script/action-script.sh create mode 100755 test/others/action-script/run.sh diff --git a/Makefile b/Makefile index 2e8a866af..85407e299 100644 --- a/Makefile +++ b/Makefile @@ -439,6 +439,7 @@ lint: shellcheck -x test/others/libcriu/*.sh shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh shellcheck -x test/others/config-file/*.sh + shellcheck -x test/others/action-script/*.sh codespell -S tags # Do not append \n to pr_perror, pr_pwarn or fail ! 
git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>.*\\n"' diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 79744c750..47749e7fa 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -316,6 +316,9 @@ make -C test/others/ns_ext run # config file parser and parameter testing make -C test/others/config-file run +# action script testing +make -C test/others/action-script run + # Skip all further tests when running with GCOV=1 # The one test which currently cannot handle GCOV testing is compel/test # Probably because the GCOV Makefile infrastructure does not exist in compel diff --git a/test/others/action-script/.gitignore b/test/others/action-script/.gitignore new file mode 100644 index 000000000..c0b6a2490 --- /dev/null +++ b/test/others/action-script/.gitignore @@ -0,0 +1 @@ +img-dir-* diff --git a/test/others/action-script/Makefile b/test/others/action-script/Makefile new file mode 100644 index 000000000..f1ce191db --- /dev/null +++ b/test/others/action-script/Makefile @@ -0,0 +1,5 @@ +run: + @make -C .. 
loop + ./run.sh + +.PHONY: run diff --git a/test/others/action-script/action-script.sh b/test/others/action-script/action-script.sh new file mode 100755 index 000000000..aba8292c0 --- /dev/null +++ b/test/others/action-script/action-script.sh @@ -0,0 +1,2 @@ +#!/bin/bash +touch action-hook-"$CRTOOLS_SCRIPT_ACTION" diff --git a/test/others/action-script/run.sh b/test/others/action-script/run.sh new file mode 100755 index 000000000..a82fccf35 --- /dev/null +++ b/test/others/action-script/run.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +set -ebm + +# shellcheck source=test/others/env.sh +source ../env.sh || exit 1 + +SELFDIR="$(dirname "$(readlink -f "$0")")" +SCRIPT="$SELFDIR/action-script.sh" +IMGDIR="$SELFDIR/img-dir-$$" + +rm -rf "$IMGDIR" +mkdir "$IMGDIR" + +trap "cleanup" QUIT TERM INT HUP EXIT + +# shellcheck disable=SC2317 +# https://github.com/koalaman/shellcheck/issues/2660 +function cleanup() +{ + if [[ -n "$PID" ]]; then + kill -9 "$PID" + fi +} + +PID=$(../loop) +if ! $CRIU dump -v4 -o dump.log -t "$PID" -D "$IMGDIR" --action-script "$SCRIPT"; then + echo "Failed to checkpoint process $PID" + cat dump.log + kill -9 "$PID" + exit 1 +fi + +if ! $CRIU restore -v4 -o restore.log -D "$IMGDIR" -d --pidfile test.pidfile --action-script "$SCRIPT"; then + echo "CRIU restore failed" + echo FAIL + exit 1 +fi + +PID=$(cat "$IMGDIR"/test.pidfile) + +found_missing_file=false +hooks=("pre-dump" "post-dump" "pre-restore" "pre-resume" "post-restore" "post-resume") + +for hook in "${hooks[@]}" +do + if [ ! -e "$IMGDIR/action-hook-$hook" ]; then + echo "ERROR: action-hook-$hook does not exist" + found_missing_file=true + fi +done + +if [ "$found_missing_file" = true ]; then + exit 1 +fi + +echo PASS + +rm -rf "$IMGDIR" +exit 0 From badf8060c6f9b1e8212df9bfcd2a0c2ecf3fc299 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Thu, 24 Aug 2023 21:20:01 +0200 Subject: [PATCH 308/775] proc_parse: Log smaps entry while dumping VMA. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Help debugging problems with restoring custom VMAs. From: Michał Cłapiński Signed-off-by: Michał Mirosław --- criu/proc_parse.c | 1 + 1 file changed, 1 insertion(+) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index bd6c3dcbc..d113a21b8 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -841,6 +841,7 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t du goto err; } + pr_debug("Handling VMA with the following smaps entry: %s\n", str); if (handle_vma(pid, vma_area, str + path_off, map_files_dir, &vfi, &prev_vfi, &vm_file_fd)) goto err; From d5284313f581dec646b271da44709c62c058224c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 24 Aug 2023 20:54:20 +0200 Subject: [PATCH 309/775] kerndat: Make errors from clone3() check more precise. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- criu/kerndat.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index 55a1e2bcd..e9777867d 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1417,17 +1417,20 @@ static bool kerndat_has_clone3_set_tid(void) */ pid = syscall(__NR_clone3, &args, sizeof(args)); - if (pid == -1 && (errno == ENOSYS || errno == E2BIG)) { - kdat.has_clone3_set_tid = false; - return 0; - } - if (pid == -1 && errno == EINVAL) { - kdat.has_clone3_set_tid = true; - } else { - pr_perror("Unexpected error from clone3"); + if (pid != -1) { + pr_err("Unexpected success: clone3() returned %d\n", pid); return -1; } + if (errno == ENOSYS || errno == E2BIG) + return 0; + + if (errno != EINVAL) { + pr_pwarn("Unexpected error from clone3"); + return 0; + } + + kdat.has_clone3_set_tid = true; return 0; } From 48056543709b17163f24711902d6a8753fe2d7f2 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 28 Aug 2023 14:16:54 +0200 Subject: [PATCH 310/775] kerndat: check_pagemap: close(fd) on error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plug a fd leak when returning error from check_pagemap(). (Cosmetic, as the process will exit soon anyway.) Signed-off-by: Michał Mirosław --- criu/kerndat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/criu/kerndat.c b/criu/kerndat.c index e9777867d..37b265d8d 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -80,6 +80,7 @@ static int check_pagemap(void) ret = pread(fd, &pfn, sizeof(pfn), PAGE_PFN((uintptr_t)&dummy_var) * sizeof(pfn)); if (ret != sizeof(pfn)) { pr_perror("Can't read pagemap"); + close(fd); return -1; } /* The page can be swapped out by the time the read occurs, From c9b2633ca3061062379fc6f5cfbb7ff5b70b102b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 27 Jul 2023 21:56:17 +0200 Subject: [PATCH 311/775] memfd: return original memfd fd for execveat() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If there is only a single RW opened fd for a memfd, it can be used to pass it to execveat() with AT_EMPTY_PATH to have its contents executed. This currently works only for the original fd from memfd_create(). For now we ignore processes that reopen the memfd's rw and expect a particular executability trait of it. (Note: for security purposes recent kernels have SEAL_EXEC to make memfds non-executable.) 
Signed-off-by: Michał Mirosław --- criu/memfd.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/criu/memfd.c b/criu/memfd.c index 5fe0aeae3..a770c66a1 100644 --- a/criu/memfd.c +++ b/criu/memfd.c @@ -46,6 +46,7 @@ struct memfd_restore_inode { int fdstore_id; unsigned int pending_seals; MemfdInodeEntry *mie; + bool was_opened_rw; }; static LIST_HEAD(memfd_inodes); @@ -233,6 +234,7 @@ static int collect_one_memfd_inode(void *o, ProtobufCMessage *base, struct cr_im mutex_init(&inode->lock); inode->fdstore_id = -1; inode->pending_seals = 0; + inode->was_opened_rw = false; list_add_tail(&inode->list, &memfd_inodes); @@ -339,6 +341,24 @@ int memfd_open(struct file_desc *d, u32 *fdflags) /* Reopen the fd with original permissions */ flags = fdflags ? *fdflags : mfe->flags; + + if (!mfi->inode->was_opened_rw && (flags & O_ACCMODE) == O_RDWR) { + /* + * If there is only a single RW-opened fd for a memfd, it can + * be used to pass it to execveat() with AT_EMPTY_PATH to have + * its contents executed. This currently works only for the + * original fd from memfd_create() so return the original fd + * once -- in case the caller expects to be the sole opener + * and does execveat() from this memfd. + */ + if (!fcntl(fd, F_SETFL, flags)) { + mfi->inode->was_opened_rw = true; + return fd; + } + + pr_pwarn("Can't change fd flags to %#o for memfd id=%d", flags, mfe->id); + } + /* * Ideally we should call compat version open() to not force the * O_LARGEFILE file flag with regular open(). 
It doesn't seem that @@ -347,6 +367,8 @@ int memfd_open(struct file_desc *d, u32 *fdflags) _fd = __open_proc(PROC_SELF, 0, flags, "fd/%d", fd); if (_fd < 0) pr_perror("Can't reopen memfd id=%d", mfe->id); + else if ((flags & O_ACCMODE) == O_RDWR) + pr_warn("execveat(fd=%d, ..., AT_EMPTY_PATH) might fail after restore; memfd id=%d\n", _fd, mfe->id); close(fd); return _fd; From b7a8bb08892248b7fb1df194543bd215bf928f7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=C2=A0Miros=C5=82aw?= Date: Thu, 27 Jul 2023 17:33:42 +0200 Subject: [PATCH 312/775] zdtm: test execveat(memfd) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Michał Mirosław --- test/zdtm/static/Makefile | 1 + test/zdtm/static/memfd04.c | 105 ++++++++++++++++++++++++++++++++++ test/zdtm/static/memfd04.desc | 1 + 3 files changed, 107 insertions(+) create mode 100644 test/zdtm/static/memfd04.c create mode 100644 test/zdtm/static/memfd04.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index cd53932db..b7fb79643 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -258,6 +258,7 @@ TST_NOFILE := \ memfd02 \ memfd02-hugetlb \ memfd03 \ + memfd04 \ shmemfd \ shmemfd-priv \ time \ diff --git a/test/zdtm/static/memfd04.c b/test/zdtm/static/memfd04.c new file mode 100644 index 000000000..aae7864c1 --- /dev/null +++ b/test/zdtm/static/memfd04.c @@ -0,0 +1,105 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "exec(memfd)"; +const char *test_author = "Michał Mirosław "; + +static int _memfd_create(const char *name, unsigned int flags) +{ + return syscall(SYS_memfd_create, name, flags); +} + +static int _execveat(int dirfd, const char *pathname, const char *const argv[], const char *const envp[], int flags) +{ + return syscall(SYS_execveat, dirfd, pathname, argv, envp, flags); +} + +static const char *const script_argv[] = { 
"true", NULL }; +static const char *const script_env[] = { NULL }; + +static bool test_exec_fd(int fd) +{ + int err, pid, status; + + err = fcntl(fd, F_GETFD); + if (err < 0) { + fail("fcntl(F_GETFD)"); + return false; + } + if (err) { + errno = 0; + fail("F_GETFD for the memfd returned %d but expected 0", err); + return false; + } + + pid = fork(); + if (!pid) { + _execveat(fd, "", script_argv, script_env, AT_EMPTY_PATH); + err = errno; + pr_perror("execveat()"); + _exit(err); + } + + if (pid < 0) { + fail("fork()"); + return false; + } + + while (waitpid(pid, &status, 0) != pid) { + if (errno == EINTR) + continue; + fail("waitpid(child=%d)", pid); + return false; + } + + if (status != 0) { + pr_err("child exited with status=%d\n", status); + return false; + } + + return true; +} + +static const char script[] = "#!/bin/true"; +static const size_t script_len = sizeof(script) - 1; + +int main(int argc, char *argv[]) +{ + int fd; + + test_init(argc, argv); + + fd = _memfd_create("somename", 0); + if (fd < 0) { + fail("memfd_create()"); + return 1; + } + + if (write(fd, script, script_len) != script_len) { + fail("write(memfd)"); + return 1; + } + + if (!test_exec_fd(fd)) + return 1; + + test_msg("execveat(memfd) succeeded before C/R.\n"); + + test_daemon(); + test_waitsig(); + + if (!test_exec_fd(fd)) + return 1; + + pass(); + + return 0; +} diff --git a/test/zdtm/static/memfd04.desc b/test/zdtm/static/memfd04.desc new file mode 100644 index 000000000..bbf136d14 --- /dev/null +++ b/test/zdtm/static/memfd04.desc @@ -0,0 +1 @@ +{'deps': ['/bin/true']} From 38bf7f42e5b733951dd63980db8296e65a045897 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 29 Aug 2023 11:09:59 +0800 Subject: [PATCH 313/775] CONTRIBUTING.md: don't mention ctags Ctags is mentioned in the beginning of the "Edit the source code" which is really confusing: Do you need ctags to edit CRIU code? - No. It is just one helpful tool to browse the code, and we do not want to enforce it. 
So, what is it doing in contribution guide? People who really need it should be able to find it in Makefile or just write oneliner of their own to collect tags... Signed-off-by: Pavel Tikhomirov --- CONTRIBUTING.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 87da08b34..3cd74128e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -46,12 +46,6 @@ This should create the `./criu/criu` executable. ## Edit the source code -If you use ctags, you can generate the ctags file by running - -``` - make tags -``` - When you change the source code, please keep in mind the following code conventions: * we prefer tabs and indentations to be 8 characters width From 08f286ed96a94e8e751891b3393bda547dadfa39 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 29 Aug 2023 12:10:59 +0800 Subject: [PATCH 314/775] CONTRIBUTING.md: improve coding-style related sections This is to highlight that code readability is the real goal of all the coding-style rules. We should not do coding-style just for coding-style, e.g. when clang-format suggests crazy formatting we should not follow it if we feel it is bad. Signed-off-by: Pavel Tikhomirov --- CONTRIBUTING.md | 46 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3cd74128e..a70506bfb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -48,11 +48,16 @@ This should create the `./criu/criu` executable. When you change the source code, please keep in mind the following code conventions: +* code is written to be read, so the code readability is the most important thing you need to have in mind when preparing patches * we prefer tabs and indentations to be 8 characters width -* CRIU mostly follows [Linux kernel coding style](https://www.kernel.org/doc/Documentation/process/coding-style.rst), but we are less strict than the kernel community. 
+* we prefer line length of 80 characters or less, more is allowed if it helps with code readability +* CRIU mostly follows [Linux kernel coding style](https://www.kernel.org/doc/Documentation/process/coding-style.rst), but we are less strict than the kernel community -Other conventions can be learned from the source code itself. In short, make sure your new code -looks similar to what is already there. +Other conventions can be learned from the source code itself. In short, make sure your new code looks similar to what is already there. + +## Automatic tools to fix coding-style + +Important: These tools are there to advise you, but should not be considered as a "source of truth", as tools also make nasty mistakes from time to time which can completely break code readability. The following command can be used to automatically run a code linter for Python files (flake8), Shell scripts (shellcheck), text spelling (codespell), and a number of CRIU-specific checks (usage of print macros and EOL whitespace for C files). @@ -84,6 +89,41 @@ to check the last *N* commits for formatting errors, without applying the change Note that for pull requests, the "Run code linter" workflow runs these checks for all commits. If a clang-format error is detected we need to review the suggested changes and decide if they should be fixed before merging. 
+Here are some bad examples of clang-format-ing: + +* if clang-format tries to force 120 characters and breaks readability - it is wrong: + +``` +@@ -58,8 +59,7 @@ static int register_membarriers(void) + } + + if (!all_ok) { +- fail("can't register membarrier()s - tried %#x, kernel %#x", +- barriers_registered, barriers_supported); ++ fail("can't register membarrier()s - tried %#x, kernel %#x", barriers_registered, barriers_supported); + return -1; + } +``` + +* if clang-format breaks your beautiful readability friendly alignment in structures, comments or defines - it is wrong: + +``` +--- a/test/zdtm/static/membarrier.c ++++ b/test/zdtm/static/membarrier.c +@@ -27,9 +27,10 @@ static const struct { + int register_cmd; + int execute_cmd; + } membarrier_cmds[] = { +- { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, +- { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, +- { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, ++ { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, ++ { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, ++ MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, ++ { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, + }; +``` + ## Test your changes CRIU comes with an extensive test suite. To check whether your changes introduce any regressions, run From d2a0d1fa642c6eba199f9e872860f669918eb034 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 24 Aug 2023 13:34:05 +0800 Subject: [PATCH 315/775] lint: don't fail workflow on indent fail There are multiple cases where good human readable code block is converted to an unreadable mess by clang-format, so we don't want to rely on clang-format completely. 
Also there is no way, as far as I can see, to make clang-format only fix what we want it to fix without breaking something. So let's just display hints inline where clang-format is unhappy. When a reviewer sees such a warning it's a good sign that something is broken in coding-style around this warning. We add a special script which parses the diff generated by indent and generates a warning for each hunk. Signed-off-by: Pavel Tikhomirov --- .github/workflows/lint.yml | 18 ++++++++--------- Makefile | 1 + scripts/github-indent-warnings.py | 33 +++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 9 deletions(-) create mode 100755 scripts/github-indent-warnings.py diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index e18f921f3..f52bce812 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -26,15 +26,15 @@ jobs: run: make lint - name: Run make indent - run: > + continue-on-error: true + run: | if [ -z "${{github.base_ref}}" ]; then - git fetch --deepen=1 && - if ! make indent OPTS=--diff; then - exit 1 - fi + git fetch --deepen=1 + make indent else - git fetch origin ${{github.base_ref}} && - if !
make indent OPTS=--diff BASE=origin/${{github.base_ref}}; then - exit 1 - fi + git fetch origin ${{github.base_ref}} + make indent BASE=origin/${{github.base_ref}} fi + - name: Raise in-line make indent warnings + run: | + git diff | ./scripts/github-indent-warnings.py diff --git a/Makefile b/Makefile index 85407e299..4b3329473 100644 --- a/Makefile +++ b/Makefile @@ -432,6 +432,7 @@ lint: flake8 --config=scripts/flake8.cfg crit/setup.py flake8 --config=scripts/flake8.cfg scripts/uninstall_module.py flake8 --config=scripts/flake8.cfg coredump/ coredump/coredump + flake8 --config=scripts/flake8.cfg scripts/github-indent-warnings.py shellcheck --version shellcheck scripts/*.sh shellcheck scripts/ci/*.sh scripts/ci/apt-install diff --git a/scripts/github-indent-warnings.py b/scripts/github-indent-warnings.py new file mode 100755 index 000000000..04f82d6c1 --- /dev/null +++ b/scripts/github-indent-warnings.py @@ -0,0 +1,33 @@ +#!/usr/bin/python3 +import sys +import re + +re_file = r'^diff --git a/(\S\S*)\s.*$' +re_line = r'^@@ -(\d\d*)\D.*@@.*$' + +if __name__ == '__main__': + if len(sys.argv) != 1 and len(sys.argv) != 2: + print(f'usage: {sys.argv[0]} ') + print(f'usage: | {sys.argv[0]}') + exit(1) + + input_file = sys.stdin.fileno() + if len(sys.argv) == 2: + input_file = sys.argv[1] + + with open(input_file, 'r') as fi: + file_name = None + line_number = None + for line in fi: + file_matches = re.findall(re_file, line) + if len(file_matches) == 1: + file_name = file_matches[0] + continue + + if file_name is None: + continue + + line_matches = re.findall(re_line, line) + if len(line_matches) == 1: + line_number = int(line_matches[0]) + 3 + print(f'::warning file={file_name},line={line_number}::clang-format: Possible coding style problem (https://github.com/checkpoint-restore/criu/blob/criu-dev/CONTRIBUTING.md#automatic-tools-to-fix-coding-style)') From 862cb5c1cbb68923343f28aae2d76d33f338ec1e Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 20 May 2023 
08:45:50 +0100 Subject: [PATCH 316/775] vagrant: update to version 2.3.7 This patch also updated the download URL format from https://releases.hashicorp.com/vagrant/2.3.7/vagrant_2.3.7_x86_64.deb to https://releases.hashicorp.com/vagrant/2.3.7/vagrant_2.3.7-1_amd64.deb Signed-off-by: Radostin Stoyanov --- scripts/ci/vagrant.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index ac4b5579d..e26b5d786 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -6,7 +6,7 @@ set -e set -x -VAGRANT_VERSION=2.2.19 +VAGRANT_VERSION=2.3.7 FEDORA_VERSION=37 FEDORA_BOX_VERSION=37.20221105.0 @@ -19,7 +19,7 @@ setup() { # Tar up the git checkout to have vagrant rsync it to the VM tar cf criu.tar ../../../criu # Cirrus has problems with the following certificate. - wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}_"$(uname -m)".deb -O /tmp/vagrant.deb && \ + wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu \ From 32241b00de26b06a0d3ad6ce67c34564807593cc Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 20 May 2023 08:47:04 +0100 Subject: [PATCH 317/775] vagrant: run tests with fedora 38 Signed-off-by: Radostin Stoyanov --- scripts/ci/vagrant.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index e26b5d786..328903f38 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -7,8 +7,8 @@ set -e set -x VAGRANT_VERSION=2.3.7 -FEDORA_VERSION=37 -FEDORA_BOX_VERSION=37.20221105.0 +FEDORA_VERSION=38 +FEDORA_BOX_VERSION=38.20230413.1 setup() { if [ -n "$TRAVIS" ]; then From 2ba8727822df306feaa520e432a605a621c4d516 Mon Sep 17 00:00:00 2001 
From: Michal Clapinski Date: Fri, 8 Sep 2023 23:34:53 +0200 Subject: [PATCH 318/775] dump: use MEMBARRIER_CMD_GET_REGISTRATIONS when available MEMBARRIER_CMD_GET_REGISTRATIONS can tell us whether or not the process used MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED unlike the old probing method. Falls back to the old method when MEMBARRIER_CMD_GET_REGISTRATIONS is unavailable. Signed-off-by: Michal Clapinski --- criu/include/kerndat.h | 1 + criu/include/parasite.h | 2 ++ criu/kerndat.c | 23 +++++++++++++++++++++ criu/parasite-syscall.c | 1 + criu/pie/parasite.c | 46 +++++++++++++++++++++++++++++------------ 5 files changed, 60 insertions(+), 13 deletions(-) diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 0b2f715f3..f5d409acb 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -85,6 +85,7 @@ struct kerndat_s { bool has_ptrace_get_rseq_conf; struct __ptrace_rseq_configuration libc_rseq_conf; bool has_ipv6_freebind; + bool has_membarrier_get_registrations; }; extern struct kerndat_s kdat; diff --git a/criu/include/parasite.h b/criu/include/parasite.h index 5209b6da2..1244220f6 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -118,6 +118,8 @@ static inline int posix_timers_dump_size(int timer_n) */ struct parasite_dump_misc { + bool has_membarrier_get_registrations; /* this is sent from criu to parasite. 
*/ + unsigned long brk; u32 pid; diff --git a/criu/kerndat.c b/criu/kerndat.c index 37b265d8d..fef5a46c1 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -17,6 +17,7 @@ #include #include #include +#include #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) #include @@ -1636,6 +1637,24 @@ static int kerndat_has_ipv6_freebind(void) return ret; } +#define MEMBARRIER_CMDBIT_GET_REGISTRATIONS 9 + +static int kerndat_has_membarrier_get_registrations(void) +{ + int ret = syscall(__NR_membarrier, 1 << MEMBARRIER_CMDBIT_GET_REGISTRATIONS, 0); + if (ret < 0) { + if (errno != EINVAL) { + return ret; + } + + kdat.has_membarrier_get_registrations = false; + } else { + kdat.has_membarrier_get_registrations = true; + } + + return 0; +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. There are cases that we cannot determine the @@ -1879,6 +1898,10 @@ int kerndat_init(void) pr_err("kerndat_has_ipv6_freebind failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_has_membarrier_get_registrations()) { + pr_err("kerndat_has_membarrier_get_registrations failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index c08ed09b1..295e404ec 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -433,6 +433,7 @@ int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_mis struct parasite_dump_misc *ma; ma = compel_parasite_args(ctl, struct parasite_dump_misc); + ma->has_membarrier_get_registrations = kdat.has_membarrier_get_registrations; if (compel_rpc_call_sync(PARASITE_CMD_DUMP_MISC, ctl) < 0) return -1; diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index c0604903b..e151ed656 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -246,6 +246,27 @@ static int get_membarrier_registration_mask(int cmd_bit) #define 
MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED 3 #define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE 5 #define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ 7 +#define MEMBARRIER_CMDBIT_GET_REGISTRATIONS 9 + +static int dump_membarrier_compat(int *membarrier_registration_mask) +{ + int ret; + + *membarrier_registration_mask = 0; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED); + if (ret < 0) + return -1; + *membarrier_registration_mask |= ret; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE); + if (ret < 0) + return -1; + *membarrier_registration_mask |= ret; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ); + if (ret < 0) + return -1; + *membarrier_registration_mask |= ret; + return 0; +} static int dump_misc(struct parasite_dump_misc *args) { @@ -261,19 +282,18 @@ static int dump_misc(struct parasite_dump_misc *args) args->dumpable = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0); args->thp_disabled = sys_prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); - args->membarrier_registration_mask = 0; - ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED); - if (ret < 0) - return -1; - args->membarrier_registration_mask |= ret; - ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE); - if (ret < 0) - return -1; - args->membarrier_registration_mask |= ret; - ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ); - if (ret < 0) - return -1; - args->membarrier_registration_mask |= ret; + if (args->has_membarrier_get_registrations) { + ret = sys_membarrier(1 << MEMBARRIER_CMDBIT_GET_REGISTRATIONS, 0, 0); + if (ret < 0) { + pr_err("membarrier(1 << %d) returned %d\n", MEMBARRIER_CMDBIT_GET_REGISTRATIONS, ret); + return -1; + } + args->membarrier_registration_mask = ret; + } else { + ret = dump_membarrier_compat(&args->membarrier_registration_mask); + if (ret) + return ret; + } ret = sys_prctl(PR_GET_CHILD_SUBREAPER, (unsigned 
long)&args->child_subreaper, 0, 0, 0); if (ret) From 8caf460b9c03ca00a6838695be9d996480337a9f Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Wed, 13 Sep 2023 18:42:58 +0200 Subject: [PATCH 319/775] zdtm: test MEMBARRIER_CMD_GLOBAL_EXPEDITED migration Check membarrier registration both ways: 1. By issuing membarrier commands and checking if they succeed. 2. By issuing MEMBARRIER_CMD_GET_REGISTRATIONS. The first way is needed for older kernels. The second way is needed to test MEMBARRIER_CMD_GLOBAL_EXPEDITED. Signed-off-by: Michal Clapinski --- test/zdtm/static/membarrier.c | 51 ++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/test/zdtm/static/membarrier.c b/test/zdtm/static/membarrier.c index a04b36035..85d705ba7 100644 --- a/test/zdtm/static/membarrier.c +++ b/test/zdtm/static/membarrier.c @@ -8,14 +8,16 @@ const char *test_author = "Michał Mirosław "; /* * Define membarrier() CMDs to avoid depending on exact kernel header version. - * FIXME: use MEMBARRIER_CMD_GET_REGISTRATIONS if supported by kernel. 
*/ +#define MEMBARRIER_CMD_GLOBAL_EXPEDITED (1 << 1) +#define MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED (1 << 2) #define MEMBARRIER_CMD_PRIVATE_EXPEDITED (1 << 3) #define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED (1 << 4) #define MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE (1 << 5) #define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE (1 << 6) #define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ (1 << 7) #define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ (1 << 8) +#define MEMBARRIER_CMD_GET_REGISTRATIONS (1 << 9) static int membarrier(int cmd, unsigned int flags, int cpu_id) { @@ -27,9 +29,14 @@ static const struct { int register_cmd; int execute_cmd; } membarrier_cmds[] = { - { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, - { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, - { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, + { "GLOBAL_EXPEDITED", MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, + MEMBARRIER_CMD_GLOBAL_EXPEDITED }, + { "PRIVATE_EXPEDITED", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, + MEMBARRIER_CMD_PRIVATE_EXPEDITED }, + { "PRIVATE_EXPEDITED_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, + MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, + { "PRIVATE_EXPEDITED_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, + MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, }; static const int n_membarrier_cmds = sizeof(membarrier_cmds) / sizeof(*membarrier_cmds); @@ -49,10 +56,10 @@ static int register_membarriers(void) if (~barriers_supported & membarrier_cmds[i].register_cmd) continue; - barriers_registered |= membarrier_cmds[i].execute_cmd; + barriers_registered |= membarrier_cmds[i].register_cmd; if (membarrier(membarrier_cmds[i].register_cmd, 0, 0) < 0) { - pr_perror("membarrier(REGISTER_PRIVATE_EXPEDITED%s)", membarrier_cmds[i].name_suffix); + pr_perror("membarrier(REGISTER_%s)", 
membarrier_cmds[i].name_suffix); all_ok = false; } } @@ -71,15 +78,15 @@ static int register_membarriers(void) return barriers_registered; } -static bool check_membarriers(int barriers_registered) +static bool check_membarriers_compat(int barriers_registered) { bool all_ok = true; for (int i = 0; i < n_membarrier_cmds; ++i) { - if (~barriers_registered & membarrier_cmds[i].execute_cmd) + if (~barriers_registered & membarrier_cmds[i].register_cmd) continue; if (membarrier(membarrier_cmds[i].execute_cmd, 0, 0) < 0) { - pr_perror("membarrier(PRIVATE_EXPEDITED%s)", membarrier_cmds[i].name_suffix); + pr_perror("membarrier(%s)", membarrier_cmds[i].name_suffix); all_ok = false; } } @@ -90,6 +97,32 @@ static bool check_membarriers(int barriers_registered) return all_ok; } +static bool check_membarriers_get_registrations(int barriers_registered) +{ + int ret = membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS, 0, 0); + if (ret < 0) { + if (errno == EINVAL) { + test_msg("membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS) not supported by running kernel"); + return true; + } + fail("membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS)"); + return false; + } + if (ret != barriers_registered) { + fail("MEMBARRIER_CMD_GET_REGISTRATIONS check failed, expected: %d, got: %d", + barriers_registered, ret); + return false; + } + + return true; +} + +static bool check_membarriers(int barriers_registered) +{ + return check_membarriers_compat(barriers_registered) && + check_membarriers_get_registrations(barriers_registered); +} + int main(int argc, char **argv) { int barriers_registered; From d06c9b5cdaa0f4b0f5824fdd307855acb81c7ab0 Mon Sep 17 00:00:00 2001 From: David Francis Date: Tue, 25 Apr 2023 09:39:28 -0400 Subject: [PATCH 320/775] criu/plugin: Add environment variable to cap size of buffers. The amdgpu plugin would create a memory buffer at the size of the largest VRAM bo (buffer object). On some systems, VRAM size exceeds RAM size, so the largest bo might be larger than the available memory. 
Add an environment variable KFD_MAX_BUFFER_SIZE, which caps the size of this buffer. By default, it is set to 0, and has no effect. When active, any bo larger than its value will be saved to/restored from file in multiple passes. Signed-off-by: David Francis --- Documentation/criu-amdgpu-plugin.txt | 9 + plugins/amdgpu/amdgpu_plugin.c | 320 ++++++++++++++++----------- 2 files changed, 196 insertions(+), 133 deletions(-) diff --git a/Documentation/criu-amdgpu-plugin.txt b/Documentation/criu-amdgpu-plugin.txt index 48a8e2f6d..35321a915 100644 --- a/Documentation/criu-amdgpu-plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -97,6 +97,15 @@ executing criu command. E.g: KFD_CAPABILITY_CHECK=1 +*KFD_MAX_BUFFER_SIZE*:: + On some systems, VRAM sizes may exceed RAM sizes, and so buffers for dumping + and restoring VRAM may be unable to fit. Set to a nonzero value (in bytes) + to set a limit on the plugin's memory usage. + Default:0 (Disabled) + + E.g: + KFD_MAX_BUFFER_SIZE="2G" + AUTHOR ------ diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 6397ecdb7..6a79f8b19 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -107,6 +107,8 @@ extern bool kfd_vram_size_check; extern bool kfd_numa_check; extern bool kfd_capability_check; +size_t kfd_max_buffer_size; + /**************************************************************************************************/ int write_fp(FILE *fp, const void *buf, const size_t buf_len) @@ -449,6 +451,48 @@ void getenv_bool(const char *var, bool *value) pr_info("param: %s:%s\n", var, *value ? 
"Y" : "N"); } +void getenv_size_t(const char *var, size_t *value) +{ + char *value_str = getenv(var); + char *endp = value_str; + int sh = 0; + size_t size; + + pr_info("Value str: %s\n", value_str); + + if (value_str) { + size = (size_t)strtoul(value_str, &endp, 0); + if (errno || value_str == endp) { + pr_err("Ignoring invalid value for %s=%s, expecting a positive integer\n", var, value_str); + return; + } + switch (*endp) { + case 'k': + case 'K': + sh = 10; + break; + case 'M': + sh = 20; + break; + case 'G': + sh = 30; + break; + case '\0': + sh = 0; + break; + default: + pr_err("Ignoring invalid size suffix for %s=%s, expecting 'K'/k', 'M', or 'G'\n", var, value_str); + return; + } + if (SIZE_MAX >> sh < size) { + pr_err("Ignoring invalid value for %s=%s, exceeds SIZE_MAX\n", var, value_str); + return; + } + *value = size << sh; + } + pr_info("param: %s:0x%lx\n", var, *value); +} + int amdgpu_plugin_init(int stage) { pr_info("initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); @@ -476,6 +520,9 @@ int amdgpu_plugin_init(int stage) getenv_bool("KFD_NUMA_CHECK", &kfd_numa_check); getenv_bool("KFD_CAPABILITY_CHECK", &kfd_capability_check); } + kfd_max_buffer_size = 0; + getenv_size_t("KFD_MAX_BUFFER_SIZE", &kfd_max_buffer_size); + return 0; } @@ -607,16 +654,14 @@ void free_and_unmap(uint64_t size, amdgpu_bo_handle h_bo, amdgpu_va_handle h_va, amdgpu_bo_free(h_bo); } -int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, amdgpu_device_handle h_dev, - uint64_t max_copy_size, enum sdma_op_type type) +int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp, void *buffer, size_t buffer_size, + amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) { - uint64_t size, gpu_addr_src, gpu_addr_dest, gpu_addr_ib; - uint64_t gpu_addr_src_orig, gpu_addr_dest_orig; - amdgpu_va_handle h_va_src, h_va_dest, h_va_ib; - amdgpu_bo_handle h_bo_src, h_bo_dest, h_bo_ib; + uint64_t size, src_bo_size, dst_bo_size, 
buffer_bo_size, bytes_remain, buffer_space_remain; + uint64_t gpu_addr_src, gpu_addr_dst, gpu_addr_ib, copy_src, copy_dst, copy_size; + amdgpu_va_handle h_va_src, h_va_dst, h_va_ib; + amdgpu_bo_handle h_bo_src, h_bo_dst, h_bo_ib; struct amdgpu_bo_import_result res = { 0 }; - uint64_t copy_size, bytes_remain, j = 0; - uint64_t n_packets; struct amdgpu_cs_ib_info ib_info; amdgpu_bo_list_handle h_bo_list; struct amdgpu_cs_request cs_req; @@ -625,102 +670,100 @@ int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, am uint32_t expired; amdgpu_context_handle h_ctx; uint32_t *ib = NULL; - int err, shared_fd; + int j, err, shared_fd, packets_per_buffer; - shared_fd = bo_buckets[i].dmabuf_fd; - size = bo_buckets[i].size; + shared_fd = bo_bucket.dmabuf_fd; + size = bo_bucket.size; + buffer_bo_size = min(size, buffer_size); + packets_per_buffer = ((buffer_bo_size - 1) / max_copy_size) + 1; + src_bo_size = (type == SDMA_OP_VRAM_WRITE) ? buffer_bo_size : size; + dst_bo_size = (type == SDMA_OP_VRAM_READ) ? 
buffer_bo_size : size; plugin_log_msg("Enter %s\n", __func__); /* prepare src buffer */ switch (type) { case SDMA_OP_VRAM_WRITE: - err = amdgpu_create_bo_from_user_mem(h_dev, userptr, size, &h_bo_src); + err = amdgpu_create_bo_from_user_mem(h_dev, buffer, src_bo_size, &h_bo_src); if (err) { pr_perror("failed to create userptr for sdma"); return -EFAULT; } - break; - case SDMA_OP_VRAM_READ: err = amdgpu_bo_import(h_dev, amdgpu_bo_handle_type_dma_buf_fd, shared_fd, &res); if (err) { pr_perror("failed to import dmabuf handle from libdrm"); return -EFAULT; } - h_bo_src = res.buf_handle; break; - default: pr_perror("Invalid sdma operation"); return -EINVAL; } - err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, size, 0x1000, 0, &gpu_addr_src, &h_va_src, 0); + err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, src_bo_size, 0x1000, 0, &gpu_addr_src, + &h_va_src, 0); if (err) { pr_perror("failed to alloc VA for src bo"); goto err_src_va; } - err = amdgpu_bo_va_op(h_bo_src, 0, size, gpu_addr_src, 0, AMDGPU_VA_OP_MAP); + err = amdgpu_bo_va_op(h_bo_src, 0, src_bo_size, gpu_addr_src, 0, AMDGPU_VA_OP_MAP); if (err) { pr_perror("failed to GPU map the src BO"); goto err_src_bo_map; } - plugin_log_msg("Source BO: GPU VA: %lx, size: %lx\n", gpu_addr_src, size); + plugin_log_msg("Source BO: GPU VA: %lx, size: %lx\n", gpu_addr_src, src_bo_size); + /* prepare dest buffer */ switch (type) { case SDMA_OP_VRAM_WRITE: err = amdgpu_bo_import(h_dev, amdgpu_bo_handle_type_dma_buf_fd, shared_fd, &res); if (err) { pr_perror("failed to import dmabuf handle from libdrm"); - goto err_dest_bo_prep; + goto err_dst_bo_prep; } - - h_bo_dest = res.buf_handle; + h_bo_dst = res.buf_handle; break; - case SDMA_OP_VRAM_READ: - err = amdgpu_create_bo_from_user_mem(h_dev, userptr, size, &h_bo_dest); + err = amdgpu_create_bo_from_user_mem(h_dev, buffer, dst_bo_size, &h_bo_dst); if (err) { pr_perror("failed to create userptr for sdma"); - goto err_dest_bo_prep; + goto err_dst_bo_prep; 
} break; - default: pr_perror("Invalid sdma operation"); - goto err_dest_bo_prep; + goto err_dst_bo_prep; } - err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, size, 0x1000, 0, &gpu_addr_dest, &h_va_dest, 0); + err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, dst_bo_size, 0x1000, 0, &gpu_addr_dst, + &h_va_dst, 0); if (err) { pr_perror("failed to alloc VA for dest bo"); - goto err_dest_va; + goto err_dst_va; } - err = amdgpu_bo_va_op(h_bo_dest, 0, size, gpu_addr_dest, 0, AMDGPU_VA_OP_MAP); + err = amdgpu_bo_va_op(h_bo_dst, 0, dst_bo_size, gpu_addr_dst, 0, AMDGPU_VA_OP_MAP); if (err) { pr_perror("failed to GPU map the dest BO"); - goto err_dest_bo_map; + goto err_dst_bo_map; } - plugin_log_msg("Dest BO: GPU VA: %lx, size: %lx\n", gpu_addr_dest, size); + plugin_log_msg("Dest BO: GPU VA: %lx, size: %lx\n", gpu_addr_dst, dst_bo_size); - n_packets = (size + max_copy_size) / max_copy_size; /* prepare ring buffer/indirect buffer for command submission * each copy packet is 7 dwords so we need to alloc 28x size for ib */ - err = alloc_and_map(h_dev, n_packets * 28, AMDGPU_GEM_DOMAIN_GTT, &h_bo_ib, &h_va_ib, &gpu_addr_ib, + err = alloc_and_map(h_dev, packets_per_buffer * 28, AMDGPU_GEM_DOMAIN_GTT, &h_bo_ib, &h_va_ib, &gpu_addr_ib, (void **)&ib); if (err) { pr_perror("failed to allocate and map ib/rb"); goto err_ib_gpu_alloc; } - - plugin_log_msg("Indirect BO: GPU VA: %lx, size: %lx\n", gpu_addr_ib, n_packets * 28); + plugin_log_msg("Indirect BO: GPU VA: %lx, size: %lx\n", gpu_addr_ib, packets_per_buffer * 28); resources[0] = h_bo_src; - resources[1] = h_bo_dest; + resources[1] = h_bo_dst; resources[2] = h_bo_ib; err = amdgpu_bo_list_create(h_dev, 3, resources, NULL, &h_bo_list); if (err) { @@ -728,103 +771,122 @@ int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, am goto err_bo_list; } - memset(&cs_req, 0, sizeof(cs_req)); - memset(&fence, 0, sizeof(fence)); - memset(ib, 0, n_packets * 28); - - plugin_log_msg("setting 
up sdma packets for command submission\n"); bytes_remain = size; - gpu_addr_src_orig = gpu_addr_src; - gpu_addr_dest_orig = gpu_addr_dest; + if (type == SDMA_OP_VRAM_WRITE) + copy_dst = gpu_addr_dst; + else + copy_src = gpu_addr_src; + while (bytes_remain > 0) { - copy_size = min(bytes_remain, max_copy_size); + memset(&cs_req, 0, sizeof(cs_req)); + memset(&fence, 0, sizeof(fence)); + memset(ib, 0, packets_per_buffer * 28); - ib[j++] = SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0); - ib[j++] = copy_size; - ib[j++] = 0; - ib[j++] = 0xffffffff & gpu_addr_src; - ib[j++] = (0xffffffff00000000 & gpu_addr_src) >> 32; - ib[j++] = 0xffffffff & gpu_addr_dest; - ib[j++] = (0xffffffff00000000 & gpu_addr_dest) >> 32; + if (type == SDMA_OP_VRAM_WRITE) { + err = read_fp(storage_fp, buffer, min(bytes_remain, buffer_bo_size)); + if (err) { + pr_perror("failed to read from storage"); + goto err_bo_list; + } + } - gpu_addr_src += copy_size; - gpu_addr_dest += copy_size; - bytes_remain -= copy_size; - } + buffer_space_remain = buffer_bo_size; + if (type == SDMA_OP_VRAM_WRITE) + copy_src = gpu_addr_src; + else + copy_dst = gpu_addr_dst; + j = 0; - gpu_addr_src = gpu_addr_src_orig; - gpu_addr_dest = gpu_addr_dest_orig; - plugin_log_msg("pad the IB to align on 8 dw boundary\n"); - /* pad the IB to the required number of dw with SDMA_NOP */ - while (j & 7) - ib[j++] = SDMA_NOP; + while (bytes_remain > 0 && buffer_space_remain > 0) { + copy_size = min(min(bytes_remain, max_copy_size), buffer_space_remain); - ib_info.ib_mc_address = gpu_addr_ib; - ib_info.size = j; + ib[j++] = SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0); + ib[j++] = copy_size; + ib[j++] = 0; + ib[j++] = 0xffffffff & copy_src; + ib[j++] = (0xffffffff00000000 & copy_src) >> 32; + ib[j++] = 0xffffffff & copy_dst; + ib[j++] = (0xffffffff00000000 & copy_dst) >> 32; - cs_req.ip_type = AMDGPU_HW_IP_DMA; - /* possible future optimization: may use other rings, info available in - * 
amdgpu_query_hw_ip_info() - */ - cs_req.ring = 0; - cs_req.number_of_ibs = 1; - cs_req.ibs = &ib_info; - cs_req.resources = h_bo_list; - cs_req.fence_info.handle = NULL; + copy_src += copy_size; + copy_dst += copy_size; + bytes_remain -= copy_size; + buffer_space_remain -= copy_size; + } + /* pad the IB to the required number of dw with SDMA_NOP */ + while (j & 7) + ib[j++] = SDMA_NOP; - plugin_log_msg("create the context\n"); - err = amdgpu_cs_ctx_create(h_dev, &h_ctx); - if (err) { - pr_perror("failed to create context for SDMA command submission"); - goto err_ctx; - } + ib_info.ib_mc_address = gpu_addr_ib; + ib_info.size = j; - plugin_log_msg("initiate sdma command submission\n"); - err = amdgpu_cs_submit(h_ctx, 0, &cs_req, 1); - if (err) { - pr_perror("failed to submit command for SDMA IB"); - goto err_cs_submit_ib; - } + cs_req.ip_type = AMDGPU_HW_IP_DMA; + /* possible future optimization: may use other rings, info available in + * amdgpu_query_hw_ip_info() + */ + cs_req.ring = 0; + cs_req.number_of_ibs = 1; + cs_req.ibs = &ib_info; + cs_req.resources = h_bo_list; + cs_req.fence_info.handle = NULL; - fence.context = h_ctx; - fence.ip_type = AMDGPU_HW_IP_DMA; - fence.ip_instance = 0; - fence.ring = 0; - fence.fence = cs_req.seq_no; - err = amdgpu_cs_query_fence_status(&fence, AMDGPU_TIMEOUT_INFINITE, 0, &expired); - if (err) { - pr_perror("failed to query fence status"); - goto err_cs_submit_ib; - } + err = amdgpu_cs_ctx_create(h_dev, &h_ctx); + if (err) { + pr_perror("failed to create context for SDMA command submission"); + goto err_ctx; + } + err = amdgpu_cs_submit(h_ctx, 0, &cs_req, 1); + if (err) { + pr_perror("failed to submit command for SDMA IB"); + goto err_cs_submit_ib; + } - if (!expired) { - pr_err("IB execution did not complete\n"); - err = -EBUSY; - goto err_cs_submit_ib; - } + fence.context = h_ctx; + fence.ip_type = AMDGPU_HW_IP_DMA; + fence.ip_instance = 0; + fence.ring = 0; + fence.fence = cs_req.seq_no; + err = 
amdgpu_cs_query_fence_status(&fence, AMDGPU_TIMEOUT_INFINITE, 0, &expired); + if (err) { + pr_perror("failed to query fence status"); + goto err_cs_submit_ib; + } + if (!expired) { + pr_err("IB execution did not complete\n"); + err = -EBUSY; + goto err_cs_submit_ib; + } - plugin_log_msg("done querying fence status\n"); + if (type == SDMA_OP_VRAM_READ) { + err = write_fp(storage_fp, buffer, buffer_bo_size - buffer_space_remain); + if (err) { + pr_perror("failed to write out to storage"); + goto err_cs_submit_ib; + } + } err_cs_submit_ib: - amdgpu_cs_ctx_free(h_ctx); + amdgpu_cs_ctx_free(h_ctx); + if (err) + break; + } err_ctx: amdgpu_bo_list_destroy(h_bo_list); err_bo_list: - free_and_unmap(n_packets * 28, h_bo_ib, h_va_ib, gpu_addr_ib, ib); + free_and_unmap(packets_per_buffer * 28, h_bo_ib, h_va_ib, gpu_addr_ib, ib); err_ib_gpu_alloc: - err = amdgpu_bo_va_op(h_bo_dest, 0, size, gpu_addr_dest, 0, AMDGPU_VA_OP_UNMAP); + err = amdgpu_bo_va_op(h_bo_dst, 0, size, gpu_addr_dst, 0, AMDGPU_VA_OP_UNMAP); if (err) - pr_perror("failed to GPU unmap the dest BO %lx, size = %lx", gpu_addr_dest, size); -err_dest_bo_map: - err = amdgpu_va_range_free(h_va_dest); + pr_perror("failed to GPU unmap the dest BO %lx, size = %lx", gpu_addr_dst, size); +err_dst_bo_map: + err = amdgpu_va_range_free(h_va_dst); if (err) pr_perror("dest range free failed"); -err_dest_va: - err = amdgpu_bo_free(h_bo_dest); +err_dst_va: + err = amdgpu_bo_free(h_bo_dst); if (err) pr_perror("dest bo free failed"); - -err_dest_bo_prep: +err_dst_bo_prep: err = amdgpu_bo_va_op(h_bo_src, 0, size, gpu_addr_src, 0, AMDGPU_VA_OP_UNMAP); if (err) pr_perror("failed to GPU unmap the src BO %lx, size = %lx", gpu_addr_src, size); @@ -836,7 +898,6 @@ err_src_va: err = amdgpu_bo_free(h_bo_src); if (err) pr_perror("src bo free failed"); - plugin_log_msg("Leaving sdma_copy_bo, err = %d\n", err); return err; } @@ -845,10 +906,9 @@ void *dump_bo_contents(void *_thread_data) { struct thread_data *thread_data = (struct thread_data 
*)_thread_data; struct kfd_criu_bo_bucket *bo_buckets = thread_data->bo_buckets; - BoEntry **bo_info = thread_data->bo_entries; struct amdgpu_gpu_info gpu_info = { 0 }; amdgpu_device_handle h_dev; - size_t max_bo_size = 0, image_size = 0; + size_t max_bo_size = 0, image_size = 0, buffer_size; uint64_t max_copy_size; uint32_t major, minor; int num_bos = 0; @@ -884,10 +944,11 @@ void *dump_bo_contents(void *_thread_data) } } - /* Allocate buffer to fit biggest BO */ - posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), max_bo_size); + buffer_size = kfd_max_buffer_size > 0 ? min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); if (!buffer) { - pr_perror("Failed to alloc aligned memory"); + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); ret = -ENOMEM; goto exit; } @@ -910,15 +971,12 @@ void *dump_bo_contents(void *_thread_data) num_bos++; /* perform sDMA based vram copy */ - ret = sdma_copy_bo(bo_buckets, buffer, i, h_dev, max_copy_size, SDMA_OP_VRAM_READ); + ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_READ); if (ret) { pr_err("Failed to drain the BO using sDMA: bo_buckets[%d]\n", i); break; } - plugin_log_msg("** Successfully drained the BO using sDMA: bo_buckets[%d] **\n", i); - ret = write_fp(bo_contents_fp, buffer, bo_info[i]->size); - if (ret) - break; } exit: @@ -939,8 +997,7 @@ void *restore_bo_contents(void *_thread_data) { struct thread_data *thread_data = (struct thread_data *)_thread_data; struct kfd_criu_bo_bucket *bo_buckets = thread_data->bo_buckets; - size_t image_size = 0, total_bo_size = 0, max_bo_size = 0; - BoEntry **bo_info = thread_data->bo_entries; + size_t image_size = 0, total_bo_size = 0, max_bo_size = 0, buffer_size; struct amdgpu_gpu_info gpu_info = { 0 }; amdgpu_device_handle h_dev; uint64_t max_copy_size; @@ -977,7 +1034,6 @@ void *restore_bo_contents(void *_thread_data) goto 
exit; } - /* Allocate buffer to fit biggest BO */ for (i = 0; i < thread_data->num_of_bos; i++) { if (bo_buckets[i].gpu_id == thread_data->gpu_id && (bo_buckets[i].alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))) { @@ -995,10 +1051,11 @@ void *restore_bo_contents(void *_thread_data) goto exit; } - /* Allocate buffer to fit biggest BO */ - posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), max_bo_size); + buffer_size = kfd_max_buffer_size > 0 ? min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); if (!buffer) { - pr_perror("Failed to alloc aligned memory"); + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); ret = -ENOMEM; goto exit; } @@ -1012,11 +1069,8 @@ void *restore_bo_contents(void *_thread_data) num_bos++; - ret = read_fp(bo_contents_fp, buffer, bo_info[i]->size); - if (ret) - goto exit; - - ret = sdma_copy_bo(bo_buckets, buffer, i, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); + ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_WRITE); if (ret) { pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); break; From 59fcfa80d8d393fd19988ccc42f9c7738f787ce7 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Thu, 31 Aug 2023 14:37:49 -0400 Subject: [PATCH 321/775] compel: Add support for ppc64le scv syscalls Power ISA 3.0 added a new syscall instruction. Kernel 5.9 added corresponding support. Add CRIU support to recognize the new instruction and kernel ABI changes to properly dump and restore threads executing in syscalls. Without this change threads executing in syscalls using the scv instruction will not be restored to re-execute the syscall, they will be restored to execute the following instruction and will return unexpected error codes (ERESTARTSYS, etc) to user code. 
Signed-off-by: Younes Manton --- compel/arch/ppc64/src/lib/infect.c | 68 +++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c index db999ce37..1603ac92e 100644 --- a/compel/arch/ppc64/src/lib/infect.c +++ b/compel/arch/ppc64/src/lib/infect.c @@ -11,6 +11,7 @@ #include "log.h" #include "common/bug.h" #include "common/page.h" +#include "common/err.h" #include "infect.h" #include "infect-priv.h" @@ -303,34 +304,59 @@ out_free: return -1; /* still failing the checkpoint */ } -static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) -{ - pr_info("Dumping GP/FPU registers for %d\n", pid); +/* + * This is inspired by kernel function check_syscall_restart in + * arch/powerpc/kernel/signal.c + */ - /* - * This is inspired by kernel function check_syscall_restart in - * arch/powerpc/kernel/signal.c - */ #ifndef TRAP #define TRAP(r) ((r).trap & ~0xF) #endif - if (TRAP(*regs) == 0x0C00 && regs->ccr & 0x10000000) { - /* Restart the system call */ - switch (regs->gpr[3]) { - case ERESTARTNOHAND: - case ERESTARTSYS: - case ERESTARTNOINTR: - regs->gpr[3] = regs->orig_gpr3; - regs->nip -= 4; - break; - case ERESTART_RESTARTBLOCK: - pr_warn("Will restore %d with interrupted system call\n", pid); - regs->gpr[3] = EINTR; - break; - } +static bool trap_is_scv(user_regs_struct_t *regs) +{ + return TRAP(*regs) == 0x3000; +} + +static bool trap_is_syscall(user_regs_struct_t *regs) +{ + return trap_is_scv(regs) || TRAP(*regs) == 0x0C00; +} + +static void handle_syscall(pid_t pid, user_regs_struct_t *regs) +{ + unsigned long ret = regs->gpr[3]; + + if (trap_is_scv(regs)) { + if (!IS_ERR_VALUE(ret)) + return; + ret = -ret; + } else if (!(regs->ccr & 0x10000000)) { + return; } + /* Restart or interrupt the system call */ + switch (ret) { + case ERESTARTNOHAND: + case ERESTARTSYS: + case ERESTARTNOINTR: + regs->gpr[3] = regs->orig_gpr3; + 
regs->nip -= 4; + break; + case ERESTART_RESTARTBLOCK: + pr_warn("Will restore %d with interrupted system call\n", pid); + regs->gpr[3] = trap_is_scv(regs) ? -EINTR : EINTR; + break; + } +} + +static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + pr_info("Dumping GP/FPU registers for %d\n", pid); + + if (trap_is_syscall(regs)) + handle_syscall(pid, regs); + /* Resetting trap since we are now coming from user space. */ regs->trap = 0; From 8adefc90d2d9baffc114ff6601e953d071d7c970 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 25 Apr 2023 16:20:41 +0100 Subject: [PATCH 322/775] lib/pycriu: generate version.py The version of CRIU is specified in the Makefile.versions file. This patch generates '__version__' value for the pycriu module. This value can be used by crit to implement `--version`. Signed-off-by: Radostin Stoyanov --- lib/py/.gitignore | 1 + lib/py/Makefile | 7 +++++-- lib/py/__init__.py | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/py/.gitignore b/lib/py/.gitignore index d3090fca3..fba7e3864 100644 --- a/lib/py/.gitignore +++ b/lib/py/.gitignore @@ -1,2 +1,3 @@ *_pb2.py *.pyc +version.py diff --git a/lib/py/Makefile b/lib/py/Makefile index 691b6bdd3..5ce9bc8f7 100644 --- a/lib/py/Makefile +++ b/lib/py/Makefile @@ -1,4 +1,4 @@ -all-y += libpy-images rpc_pb2.py +all-y += libpy-images rpc_pb2.py version.py $(obj)/images/Makefile: ; $(obj)/images/%: .FORCE @@ -11,7 +11,10 @@ libpy-images: rpc_pb2.py: $(Q) protoc -I=images/ --python_out=$(obj) images/$(@:_pb2.py=.proto) -cleanup-y += $(addprefix $(obj)/,rpc_pb2.py *.pyc) +version.py: + $(Q) echo "__version__ = '${CRIU_VERSION}'" > $(obj)/$@ + +cleanup-y += $(addprefix $(obj)/,rpc_pb2.py *.pyc version.py) clean-lib-py: $(Q) $(MAKE) $(build)=$(obj)/images clean diff --git a/lib/py/__init__.py b/lib/py/__init__.py index 96b3e9526..44f66ffa4 100644 --- a/lib/py/__init__.py +++ b/lib/py/__init__.py @@ -1,3 +1,4 @@ from . 
import rpc_pb2 as rpc from . import images from .criu import * +from .version import __version__ From 61d9cf6f906a5da9b44bff684675d7a41f45be47 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 25 Apr 2023 16:38:44 +0100 Subject: [PATCH 323/775] crit/setup.py: use __version__ from pycriu Signed-off-by: Radostin Stoyanov --- crit/setup.py | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/crit/setup.py b/crit/setup.py index 1aaa73a13..2f584678f 100644 --- a/crit/setup.py +++ b/crit/setup.py @@ -1,23 +1,9 @@ -import os from setuptools import setup, find_packages - - -def get_version(): - version = '0.0.1' - env = os.environ - if 'CRIU_VERSION_MAJOR' in env and 'CRIU_VERSION_MINOR' in env: - version = '{}.{}'.format( - env['CRIU_VERSION_MAJOR'], - env['CRIU_VERSION_MINOR'] - ) - if 'CRIU_VERSION_SUBLEVEL' in env and env['CRIU_VERSION_SUBLEVEL']: - version += '.' + env['CRIU_VERSION_SUBLEVEL'] - return version - +import pycriu setup( name='crit', - version=get_version(), + version=pycriu.__version__, description='CRiu Image Tool', author='CRIU team', author_email='criu@openvz.org', From 7a2910f89775a68acf3961151fc85e55da5fb500 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 25 Apr 2023 16:41:24 +0100 Subject: [PATCH 324/775] py/cli: add --version option This patch implements the '--version' for the crit tool. 
$ crit --version 3.17 Signed-off-by: Radostin Stoyanov --- lib/py/cli.py | 1 + test/others/crit/test.sh | 2 ++ 2 files changed, 3 insertions(+) diff --git a/lib/py/cli.py b/lib/py/cli.py index 594035d27..a3a0870f8 100755 --- a/lib/py/cli.py +++ b/lib/py/cli.py @@ -364,6 +364,7 @@ def main(): desc = 'CRiu Image Tool' parser = argparse.ArgumentParser( description=desc, formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--version', action='version', version=pycriu.__version__) subparsers = parser.add_subparsers( help='Use crit CMD --help for command-specific help') diff --git a/test/others/crit/test.sh b/test/others/crit/test.sh index 105aac72b..2698bbd3c 100755 --- a/test/others/crit/test.sh +++ b/test/others/crit/test.sh @@ -101,6 +101,8 @@ function run_test2 { ${CRIT} x ./ rss || exit 1 } +${CRIT} --version + gen_imgs run_test1 run_test2 From 5e544dc4490b6b3ecb04df8d7902061d41c6caca Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 23 Sep 2023 08:50:05 -0700 Subject: [PATCH 325/775] ci: stop testing ubuntu overlayfs They break it with each kernel rebase. More details are here: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1857257 Last time, it was fixed a few months ago and it has been broken again in 5.15.0-1046-azure. Let's bind-mount the CRIU directory into a test container to make it independent of a container file system. 
Signed-off-by: Andrei Vagin --- scripts/ci/Makefile | 35 +++++------------------------------ scripts/ci/asan.sh | 3 +++ 2 files changed, 8 insertions(+), 30 deletions(-) diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index ce844a17c..1caa1e423 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -20,14 +20,6 @@ export CONTAINER_RUNTIME alpine: ZDTM_OPTS=-x zdtm/static/binfmt_misc -x zdtm/static/sched_policy00 -define DOCKER_JSON -{ - "storage-driver": "devicemapper" -} -endef - -export DOCKER_JSON - ifeq ($(GITHUB_ACTIONS),true) # GitHub Actions does not give us a real TTY and errors out with # 'the input device is not a TTY' if using '-t' @@ -47,34 +39,20 @@ else endif ifeq ($(CONTAINER_RUNTIME),podman) - # Just as Docker needs to use devicemapper Podman needs vfs - # as graphdriver as overlayfs does not support all test cases - STORAGE_DRIVER := vfs # Podman limits the number of processes in a container using cgroups. # Disable it as it breaks the thread-bomb test CONTAINER_OPTS += --pids-limit=0 endif -export STORAGE_DRIVER - -restart-docker: - if [ "$$UNAME" = "x86_64" ] && [ "$$CONTAINER_RUNTIME" = "docker" ]; then \ - echo "$$DOCKER_JSON" > /etc/docker/daemon.json; \ - cat /etc/docker/daemon.json; \ - systemctl status docker; \ - systemctl restart docker; \ - systemctl status docker; \ - fi - export ZDTM_OPTS -$(TARGETS): restart-docker +$(TARGETS): $(MAKE) -C ../build $@$(target-suffix) - $(CONTAINER_RUNTIME) run --env-file docker.env $(if $(ZDTM_OPTS),-e ZDTM_OPTS) $(CONTAINER_OPTS) criu-$@ scripts/ci/run-ci-tests.sh + $(CONTAINER_RUNTIME) run --env-file docker.env -v `pwd`/../../:/criu $(if $(ZDTM_OPTS),-e ZDTM_OPTS) $(CONTAINER_OPTS) criu-$@ scripts/ci/run-ci-tests.sh -fedora-asan: restart-docker +fedora-asan: $(MAKE) -C ../build $@$(target-suffix) - $(CONTAINER_RUNTIME) run $(CONTAINER_OPTS) criu-$@ ./scripts/ci/asan.sh $(ZDTM_OPTS) + $(CONTAINER_RUNTIME) run $(CONTAINER_OPTS) -v `pwd`/../../:/criu criu-$@ ./scripts/ci/asan.sh 
$(ZDTM_OPTS) docker-test: ./docker-test.sh @@ -82,10 +60,7 @@ docker-test: podman-test: ./podman-test.sh -# overlayfs behaves differently on Ubuntu and breaks CRIU -# https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1857257 -# Switch to devicemapper -java-test: restart-docker +java-test: ./java-test.sh setup-vagrant: diff --git a/scripts/ci/asan.sh b/scripts/ci/asan.sh index deeeca0b9..8b72fa5f1 100755 --- a/scripts/ci/asan.sh +++ b/scripts/ci/asan.sh @@ -4,6 +4,9 @@ set -x cat /proc/self/mountinfo +time make ASAN=1 -j 4 V=1 +time make -j4 -C test/zdtm V=1 + chmod 0777 test chmod 0777 test/zdtm/transition/ chmod 0777 test/zdtm/static From 785d97a61943fe425b6f82226f55c7d0114605c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Mon, 29 May 2023 17:22:16 +0200 Subject: [PATCH 326/775] zdtm: If ignoring kernel taint, also ignore taint changes. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At least in Google's VM environment, the kernel taints are unrelated to CRIU runs. Don't fail tests if taints change, if kernel taints are ignored. Signed-off-by: Michał Mirosław --- test/zdtm.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index c6e852dc1..bc14e3f73 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2003,12 +2003,20 @@ class Launcher: file=self.__file_report) print(u"# ", file=self.__file_report) print(u"1.." 
+ str(nr_tests), file=self.__file_report) - with open("/proc/sys/kernel/tainted") as taintfd: - self.__taint = taintfd.read() + self.__taint = self.__read_kernel_tainted() if int(self.__taint, 0) != 0: - print("The kernel is tainted: %r" % self.__taint) - if not opts["ignore_taint"] and os.getenv("ZDTM_IGNORE_TAINT") != '1': - raise Exception("The kernel is tainted: %r" % self.__taint) + self.__report_kernel_taint("The kernel is tainted: %r" % self.__taint) + + @staticmethod + def __read_kernel_tainted(): + with open("/proc/sys/kernel/tainted") as taintfd: + return taintfd.read().strip() + + @staticmethod + def __report_kernel_taint(msg): + print(msg) + if not opts["ignore_taint"] and os.getenv("ZDTM_IGNORE_TAINT") != "1": + raise Exception(msg) def __show_progress(self, msg): perc = int(self.__nr * 16 / self.__total) @@ -2034,11 +2042,12 @@ class Launcher: if len(self.__subs) >= self.__max: self.wait() - with open("/proc/sys/kernel/tainted") as taintfd: - taint = taintfd.read() + taint = self.__read_kernel_tainted() if self.__taint != taint: - raise Exception("The kernel is tainted: %r (%r)" % - (taint, self.__taint)) + prev_taint = self.__taint + self.__taint = taint + self.__report_kernel_taint( + "The kernel is tainted: %r (was %r)" % (taint, prev_taint)) ''' The option --link-remap allows criu to hardlink open files back to the From 131e464a0545fe241805b84aa8db5be2aaa06941 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Tue, 30 May 2023 20:00:10 +0200 Subject: [PATCH 327/775] zdtm: cgroup04: Improve error messages. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the errno values reported by cgroup04 always correct and showing relevant parameters. Constify constant strings, while at it. 
Signed-off-by: Michał Mirosław --- test/zdtm/static/cgroup04.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/test/zdtm/static/cgroup04.c b/test/zdtm/static/cgroup04.c index 8c40ffd6b..f586a0628 100644 --- a/test/zdtm/static/cgroup04.c +++ b/test/zdtm/static/cgroup04.c @@ -17,25 +17,25 @@ const char *test_author = "Tycho Andersen "; char *dirname; TEST_OPTION(dirname, string, "cgroup directory name", 1); -static const char *cgname = "zdtmtst"; +static const char *const cgname = "zdtmtst"; int mount_and_add(const char *controller, const char *path, const char *prop, const char *value) { char aux[1024], paux[1024], subdir[1024]; if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { - pr_perror("Can't make dir"); + pr_perror("Can't make dir %s", dirname); return -1; } sprintf(subdir, "%s/%s", dirname, controller); if (mkdir(subdir, 0700) < 0) { - pr_perror("Can't make dir"); + pr_perror("Can't make dir %s", subdir); return -1; } if (mount("none", subdir, "cgroup", 0, controller)) { - pr_perror("Can't mount cgroups"); + pr_perror("Can't mount cgroup controller %s at %s", controller, subdir); goto err_rd; } @@ -52,7 +52,8 @@ int mount_and_add(const char *controller, const char *path, const char *prop, co goto err_rs; ssprintf(paux, "%s/%s/special_prop_check", subdir, path); - mkdir(paux, 0600); + if (mkdir(paux, 0600) < 0) + pr_perror("Can't make dir %s", paux); return 0; err_rs: @@ -74,11 +75,11 @@ bool checkval(char *path, char *val) } n = read(fd, buf, sizeof(buf) - 1); + if (n < 0) + pr_perror("read %s", path); close(fd); - if (n < 0) { - pr_perror("read"); + if (n < 0) return false; - } buf[n] = 0; if (strcmp(val, buf)) { @@ -95,7 +96,7 @@ int main(int argc, char **argv) char buf[1024], path[PATH_MAX]; struct stat sb; - char *dev_allow[] = { + const char *const dev_allow[] = { "c *:* m", "b *:* m", "c 1:3 rwm", "c 1:5 rwm", "c 1:7 rwm", "c 5:0 rwm", "c 5:2 rwm", "c 1:8 rwm", "c 1:9 rwm", "c 136:* rwm", "c 10:229 rwm", }; @@ 
-126,12 +127,14 @@ int main(int argc, char **argv) sprintf(path, "%s/devices/%s/devices.list", dirname, cgname); if (!checkval(path, buf)) { + errno = 0; fail(); goto out; } sprintf(path, "%s/memory/%s/memory.limit_in_bytes", dirname, cgname); if (!checkval(path, "268435456\n")) { + errno = 0; fail(); goto out; } @@ -143,6 +146,7 @@ int main(int argc, char **argv) } if (!S_ISDIR(sb.st_mode)) { + errno = 0; fail("special_prop_check not a directory?"); goto out; } From c29c5a1df0c0ad185bc2cfc692455d1d1a674e07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 31 May 2023 10:20:09 +0200 Subject: [PATCH 328/775] zdtm: cgroup04: Improve skip check's robustness. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cgroup04 test needs full control over mem and devices cgroup hierarchies. Make the test's .checkskip script better at detecting if the cgroups are available for use. Signed-off-by: Michał Mirosław --- test/zdtm/static/cgroup04.checkskip | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/test/zdtm/static/cgroup04.checkskip b/test/zdtm/static/cgroup04.checkskip index 205f8fc53..1ccbada4d 100755 --- a/test/zdtm/static/cgroup04.checkskip +++ b/test/zdtm/static/cgroup04.checkskip @@ -1,3 +1,20 @@ #!/bin/bash +set -e -! test -f /sys/fs/cgroup/cgroup.controllers +test ! -f /sys/fs/cgroup/cgroup.controllers + +for ctl in devices memory; do + # Check that the controller is available. + + grep -q "^${ctl}\\s" /proc/cgroups + + # Check that the controller is not co-mounted with any other. + + # /proc/self/cgroup may have: + # "1:devices:/sys" + if ! 
grep -q "^[0-9]*:${ctl}:" /proc/self/cgroup; then + # but not eg: + # "1:devices,job:/sys" + grep -qE "^[0-9]*:([^:]*,)?${ctl}(,[^:]*)?:" /proc/self/cgroup && exit 1 + fi +done From f6e820bedb53b661b6d5350d9a15b2ce9a5a69d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 21 Apr 2023 15:57:32 +0200 Subject: [PATCH 329/775] zdtm: Treat ESRCH from kill() as success. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a failure to clean up after a failed test, where CRIU didn't start properly. ``` ===================== Run zdtm/transition/socket-tcp in h ====================== Start test ./socket-tcp --pidfile=socket-tcp.pid --outfile=socket-tcp.out Traceback (most recent call last): File ".../zdtm_py.py", line 1906, in do_run_test cr(cr_api, t, opts) File ".../zdtm_py.py", line 1584, in cr cr_api.dump("dump") File ".../zdtm_py.py", line 1386, in dump self.__dump_process = self.__criu_act(action, File ".../zdtm_py.py", line 1224, in __criu_act raise test_fail_exc("CRIU %s" % action) test_fail_exc: CRIU dump During handling of the above exception, another exception occurred: Traceback (most recent call last): File "", line 182, in run_filename_from_loader_as_main File "", line 34, in _run_code_in_main File ".../zdtm_py.py", line 2790, in fork_zdtm() File ".../zdtm_py.py", line 2782, in fork_zdtm do_run_test(tinfo[0], tinfo[1], tinfo[2], tinfo[3]) File ".../zdtm_py.py", line 1922, in do_run_test t.kill() File ".../zdtm_py.py", line 509, in kill os.kill(int(self.__pid), sig) ProcessLookupError: [Errno 3] No such process ``` Signed-off-by: Michał Mirosław --- test/zdtm.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index bc14e3f73..810873575 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -507,8 +507,15 @@ class zdtm_test: self.__freezer.thaw() if self.__pid: print("Send the %d signal to %s" % (sig, self.__pid)) - 
os.kill(int(self.__pid), sig) - self.gone(sig == signal.SIGKILL) + try: + os.kill(int(self.__pid), sig) + except ProcessLookupError: + if sig != signal.SIGKILL: + raise + print("The process %s doesn't exist" % self.__pid) + self.gone(True) + else: + self.gone(sig == signal.SIGKILL) self.__flavor.fini() From ba48ceb57553183fc3435db4dbeb1fd52306b0c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 31 May 2023 15:31:10 +0200 Subject: [PATCH 330/775] zdtm: socket_udp_shutdown: Make the test fail instead of timing out. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When -- after restore -- sockets can't communicate, the test times out while waiting on recvfrom(). Since the communication is local, send() works instantaneously - so mark sockets with SOCK_NONBLOCK and report failure if the message is not received immediately. Signed-off-by: Michał Mirosław --- test/zdtm/static/socket_udp_shutdown.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/zdtm/static/socket_udp_shutdown.c b/test/zdtm/static/socket_udp_shutdown.c index 91dc8f30a..a7658b9dd 100644 --- a/test/zdtm/static/socket_udp_shutdown.c +++ b/test/zdtm/static/socket_udp_shutdown.c @@ -28,8 +28,8 @@ int main(int argc, char **argv) test_init(argc, argv); - sk1 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); - sk2 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + sk1 = socket(PF_INET, SOCK_DGRAM | SOCK_NONBLOCK, IPPROTO_UDP); + sk2 = socket(PF_INET, SOCK_DGRAM | SOCK_NONBLOCK, IPPROTO_UDP); if (sk1 < 0 || sk2 < 0) { pr_perror("Can't create socket"); exit(1); From 00f8a56b6e8c888f7e0883456e326a9c7ef79fe9 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 26 Sep 2023 17:00:36 -0700 Subject: [PATCH 331/775] zdtm: check userns once All test logs are flooded with the "userns is supported" messages... 
Signed-off-by: Andrei Vagin --- test/zdtm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/zdtm.py b/test/zdtm.py index 810873575..7a7cdfd3b 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2404,6 +2404,7 @@ def run_tests(opts): "Specify --criu-image-streamer-dir or modify PATH to provide an alternate location") .format(streamer_dir)) + usernsIsSupported = criu.check("userns") launcher = Launcher(opts, len(torun)) try: for t in torun: @@ -2473,7 +2474,7 @@ def run_tests(opts): run_flavs = set(test_flavs) & set(opts_flavs) else: run_flavs = set([test_flavs.pop()]) - if not criu.check("userns"): + if not usernsIsSupported: run_flavs -= set(['uns']) if opts['user']: # FIXME -- probably uns will make sense From 28adebefb759306959df41acbe9cb74bfebd03ea Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Tue, 20 Jun 2023 13:23:24 +0400 Subject: [PATCH 332/775] Return page size as unsigned long Currently page_size() returns an unsigned int value that, after "bitwise not", is promoted to an unsigned long value, e.g. in uffd.c handle_page_fault. Since the value is unsigned, the promotion is done with zero MSBs, which results in the loss of the MSB pagefault address bits. So make page_size() return unsigned long to avoid such a situation. 
Signed-off-by: Vladislav Khmelevsky --- compel/plugins/std/infect.c | 2 +- criu/pie/restorer.c | 2 +- include/common/arch/aarch64/asm/page.h | 4 ++-- include/common/arch/loongarch64/asm/page.h | 4 ++-- include/common/arch/mips/asm/page.h | 4 ++-- include/common/arch/ppc64/asm/page.h | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/compel/plugins/std/infect.c b/compel/plugins/std/infect.c index abecc140f..60b21d313 100644 --- a/compel/plugins/std/infect.c +++ b/compel/plugins/std/infect.c @@ -27,7 +27,7 @@ static struct rt_sigframe *sigframe; */ static unsigned __page_size; -unsigned __attribute((weak)) page_size(void) +unsigned long __attribute((weak)) page_size(void) { return __page_size; } diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 0de2423a1..ba6f290dc 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -104,7 +104,7 @@ bool fault_injected(enum faults f) * Hint: compel on aarch64 shall learn relocs for that. */ static unsigned __page_size; -unsigned page_size(void) +unsigned long page_size(void) { return __page_size; } diff --git a/include/common/arch/aarch64/asm/page.h b/include/common/arch/aarch64/asm/page.h index 90670d126..4555debbd 100644 --- a/include/common/arch/aarch64/asm/page.h +++ b/include/common/arch/aarch64/asm/page.h @@ -10,7 +10,7 @@ extern unsigned __page_size; extern unsigned __page_shift; -static inline unsigned page_size(void) +static inline unsigned long page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -37,7 +37,7 @@ static inline unsigned page_shift(void) #else /* CR_NOGLIBC */ -extern unsigned page_size(void); +extern unsigned long page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/arch/loongarch64/asm/page.h b/include/common/arch/loongarch64/asm/page.h index 25bdbc141..4fcdb64dc 100644 --- a/include/common/arch/loongarch64/asm/page.h +++ b/include/common/arch/loongarch64/asm/page.h @@ -10,7 +10,7 @@ static unsigned 
__page_size; static unsigned __page_shift; -static inline unsigned page_size(void) +static inline unsigned long page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -31,7 +31,7 @@ static inline unsigned page_shift(void) #define PAGE_PFN(addr) ((addr) / PAGE_SIZE) #else /* CR_NOGLIBC */ -extern unsigned page_size(void); +extern unsigned long page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/arch/mips/asm/page.h b/include/common/arch/mips/asm/page.h index 25bdbc141..4fcdb64dc 100644 --- a/include/common/arch/mips/asm/page.h +++ b/include/common/arch/mips/asm/page.h @@ -10,7 +10,7 @@ static unsigned __page_size; static unsigned __page_shift; -static inline unsigned page_size(void) +static inline unsigned long page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -31,7 +31,7 @@ static inline unsigned page_shift(void) #define PAGE_PFN(addr) ((addr) / PAGE_SIZE) #else /* CR_NOGLIBC */ -extern unsigned page_size(void); +extern unsigned long page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/arch/ppc64/asm/page.h b/include/common/arch/ppc64/asm/page.h index a1ff6718a..2b0c0e504 100644 --- a/include/common/arch/ppc64/asm/page.h +++ b/include/common/arch/ppc64/asm/page.h @@ -10,7 +10,7 @@ extern unsigned __page_size; extern unsigned __page_shift; -static inline unsigned page_size(void) +static inline unsigned long page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -37,7 +37,7 @@ static inline unsigned page_shift(void) #else /* CR_NOGLIBC */ -extern unsigned page_size(void); +extern unsigned long page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ From 9b4e8292cd8b3247260144098096e7955b3806ef Mon Sep 17 00:00:00 2001 From: Vladislav Khmelevsky Date: Tue, 20 Jun 2023 13:34:27 +0400 Subject: [PATCH 333/775] vma: Add !VVAR condition to vma_entry_can_be_lazy Currently most of the times we don't 
have problems with VVAR segment and lazy restore because when VDSO is parked there is an munmap call that calls UFFDIO_UNREGISTER on the destination address. But we don't want to enable userfaultfd for VDSO and VVAR at the first place. Signed-off-by: Vladislav Khmelevsky --- criu/include/vma.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/include/vma.h b/criu/include/vma.h index 106c56af2..4b663ee50 100644 --- a/criu/include/vma.h +++ b/criu/include/vma.h @@ -122,8 +122,8 @@ static inline struct vma_area *vma_next(struct vma_area *vma) static inline bool vma_entry_can_be_lazy(VmaEntry *e) { return ((e->flags & MAP_ANONYMOUS) && (e->flags & MAP_PRIVATE) && !(e->flags & MAP_LOCKED) && - !(vma_entry_is(e, VMA_AREA_VDSO)) && !(vma_entry_is(e, VMA_AREA_VSYSCALL)) && - !(e->flags & MAP_HUGETLB)); + !(vma_entry_is(e, VMA_AREA_VDSO)) && !(vma_entry_is(e, VMA_AREA_VVAR)) && + !(vma_entry_is(e, VMA_AREA_VSYSCALL)) && !(e->flags & MAP_HUGETLB)); } #endif /* __CR_VMA_H__ */ From 66f39adf12a41a7af130f5c092debfc9740b2b1f Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Sat, 30 Sep 2023 00:26:09 +0200 Subject: [PATCH 334/775] criu: change the comment about magic numbers Signed-off-by: Michal Clapinski --- criu/include/magic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/include/magic.h b/criu/include/magic.h index 22d7218e4..0e8c37234 100644 --- a/criu/include/magic.h +++ b/criu/include/magic.h @@ -29,7 +29,7 @@ /* * The magic-s below correspond to coordinates - * of various Russian towns in the NNNNEEEE form. + * of various towns in the NNNNEEEE form. */ #define INVENTORY_MAGIC 0x58313116 /* Veliky Novgorod */ From a68975c06d7877a7e6751a8c70010d2cda4c9c29 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 28 Sep 2023 21:38:45 -0700 Subject: [PATCH 335/775] plugins: the UPDATE_VMA_MAP callback returns fd with the full control It means CRIU has to close it when it is not needed. 
It looks more logically correct and matches the behaviour of the RESTORE_EXT_FILE callback. Signed-off-by: Andrei Vagin --- criu/files-reg.c | 2 +- plugins/amdgpu/amdgpu_plugin.c | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index cf0c84b52..c80da1d8c 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -2506,7 +2506,7 @@ static int open_filemap(int pid, struct vma_area *vma) * using dup because dup returns a reference to the same struct file inside kernel, but we * cannot open a new FD. */ - ret = dup(plugin_fd); + ret = plugin_fd; } else if (vma->e->status & VMA_AREA_MEMFD) { if (!inherited_fd(vma->vmfd, &ret)) ret = memfd_open(vma->vmfd, &flags); diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 6a79f8b19..9dae8861c 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1955,10 +1955,15 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const if (addr == vma_md->vma_entry && old_offset == vma_md->old_pgoff) { *new_offset = vma_md->new_pgoff; - if (is_renderD) - *updated_fd = vma_md->fd; - else - *updated_fd = -1; + *updated_fd = -1; + if (is_renderD) { + int fd = dup(vma_md->fd); + if (fd == -1) { + pr_perror("unable to duplicate the render fd"); + return -1; + } + *updated_fd = fd; + } plugin_log_msg("old_pgoff=0x%lx new_pgoff=0x%lx fd=%d\n", vma_md->old_pgoff, vma_md->new_pgoff, *updated_fd); From 940a05c0ba54faffc1f340f792f2685f3560d0fa Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 28 Sep 2023 21:47:50 -0700 Subject: [PATCH 336/775] amdgpu: don't leak fd on an error path in open_img_file Signed-off-by: Andrei Vagin --- plugins/amdgpu/amdgpu_plugin.c | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 9dae8861c..e22168d93 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -165,6 +165,7 @@ 
FILE *open_img_file(char *path, bool write, size_t *size) fp = fdopen(fd, write ? "w" : "r"); if (!fp) { pr_perror("%s: Failed get pointer for %s", path, write ? "write" : "read"); + close(fd); return NULL; } From aa38a59899f39abde844895fc36d6043de84948b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 28 Sep 2023 21:49:39 -0700 Subject: [PATCH 337/775] amdgpu: print an error if the dup syscall fails Signed-off-by: Andrei Vagin --- plugins/amdgpu/amdgpu_plugin.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index e22168d93..2ebc5e178 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1796,7 +1796,12 @@ int amdgpu_plugin_restore_file(int id) * copy of the fd. CRIU core owns the duplicated returned fd, and amdgpu_plugin owns the fd stored in * tp_node. */ - return dup(fd); + fd = dup(fd); + if (fd == -1) { + pr_perror("unable to duplicate the render fd"); + return -1; + } + return fd; } fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); From ba168ab78cb4536a7ed3813e929cbc874a7d998d Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 30 Sep 2023 02:56:52 +0100 Subject: [PATCH 338/775] ci: enable build with amdgpu plugin This patch adds the `libdrm-dev` package to the list of CRIU dependencies installed in CI to build CRIU with amdgpu plugin. 
Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 4 ++-- plugins/amdgpu/kfd_ioctl.h | 2 +- scripts/build/Dockerfile.alpine | 1 + scripts/build/Dockerfile.archlinux | 1 + scripts/ci/prepare-for-fedora-rawhide.sh | 1 + scripts/ci/run-ci-tests.sh | 7 ++++++- 6 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index e559ec772..6a586d58b 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -36,7 +36,7 @@ task: ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata python-flake8 xmlto + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata python-flake8 xmlto libdrm-devel systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed. 
# The Cirrus CI user runs as a service from selinux point of view and is @@ -108,7 +108,7 @@ task: yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm || : yum install -y dnf-plugins-core yum config-manager --set-enabled powertools - yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-protobuf python3-importlib-metadata python3-junit_xml xmlto + yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-protobuf python3-importlib-metadata python3-junit_xml xmlto libdrm-devel alternatives --set python /usr/bin/python3 systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed diff --git a/plugins/amdgpu/kfd_ioctl.h b/plugins/amdgpu/kfd_ioctl.h index b88fe20cf..e1ebb75a3 100644 --- a/plugins/amdgpu/kfd_ioctl.h +++ b/plugins/amdgpu/kfd_ioctl.h @@ -23,7 +23,7 @@ #ifndef KFD_IOCTL_H_INCLUDED #define KFD_IOCTL_H_INCLUDED -#include +#include #include /* diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index af1858ab5..cb746757a 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -23,6 +23,7 @@ RUN apk update && apk add \ python3 \ sudo \ libcap-utils \ + libdrm-dev \ util-linux COPY . /criu diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index f2bce1e5b..b9968e876 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -35,6 +35,7 @@ RUN pacman -Syu --noconfirm \ asciidoctor \ python-junit-xml \ python-importlib-metadata \ + libdrm \ diffutils COPY . 
/criu diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index 1c8a46fbf..e31814a95 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -35,6 +35,7 @@ dnf install -y \ which \ e2fsprogs \ rubygem-asciidoctor \ + libdrm-devel \ kmod # /tmp is no longer 755 in the rawhide container image and breaks CI - fix it diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 47749e7fa..1aae555f7 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -6,7 +6,7 @@ CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor libnl-route-3-dev time flake8 libbsd-dev python3-yaml libperl-dev pkg-config python3-protobuf python3-pip - python3-importlib-metadata python3-junit.xml) + python3-importlib-metadata python3-junit.xml libdrm-dev) X86_64_PKGS=(gcc-multilib) @@ -326,3 +326,8 @@ make -C test/others/action-script run # compel testing make -C compel/test + +# amdgpu_plugin testing +make amdgpu_plugin +make -C plugins/amdgpu/ test_topology_remap +./plugins/amdgpu/test_topology_remap From 28e854d662e203e62c4f75140e7130b9df7863b8 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 1 Oct 2023 08:58:59 +0100 Subject: [PATCH 339/775] amdgpu: fix clang warnings amdgpu_plugin.c:930:6: error: variable 'buffer' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized] if (ret) { ^~~ amdgpu_plugin.c:988:8: note: uninitialized use occurs here xfree(buffer); Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 2ebc5e178..32ff8f936 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -915,7 +915,7 @@ void *dump_bo_contents(void *_thread_data) int num_bos = 0; int i, ret 
= 0; FILE *bo_contents_fp = NULL; - void *buffer; + void *buffer = NULL; char img_path[40]; pr_info("Thread[0x%x] started\n", thread_data->gpu_id); @@ -1004,7 +1004,7 @@ void *restore_bo_contents(void *_thread_data) uint64_t max_copy_size; uint32_t major, minor; FILE *bo_contents_fp = NULL; - void *buffer; + void *buffer = NULL; char img_path[40]; int num_bos = 0; int i, ret = 0; From e86e8dac0d91eca31e219226b43fee0d600b8c89 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 22 Sep 2023 22:40:27 +0000 Subject: [PATCH 340/775] memfd: don't reopen file descriptors for memory mappings One memfd can be shared by a few restored files. Only one of these files is restored with a file created with memfd_open. Others are restored by reopening memfd files via /proc/self/fd/. It seems unnecessary for restoring memfd memory mappings. We can always use the original file. Signed-off-by: Andrei Vagin --- criu/files-reg.c | 2 +- criu/include/memfd.h | 4 +++- criu/memfd.c | 9 ++++++--- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index c80da1d8c..9fbab0d42 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -2509,7 +2509,7 @@ static int open_filemap(int pid, struct vma_area *vma) ret = plugin_fd; } else if (vma->e->status & VMA_AREA_MEMFD) { if (!inherited_fd(vma->vmfd, &ret)) - ret = memfd_open(vma->vmfd, &flags); + ret = memfd_open(vma->vmfd, &flags, true); } else { ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); } diff --git a/criu/include/memfd.h b/criu/include/memfd.h index 1b1dc79bb..78d810019 100644 --- a/criu/include/memfd.h +++ b/criu/include/memfd.h @@ -1,7 +1,9 @@ #ifndef __CR_MEMFD_H__ #define __CR_MEMFD_H__ +#include #include + #include "int.h" #include "common/config.h" @@ -12,7 +14,7 @@ extern int is_memfd(dev_t dev); extern int dump_one_memfd_cond(int lfd, u32 *id, struct fd_parms *parms); extern const struct fdtype_ops memfd_dump_ops; -extern int memfd_open(struct file_desc *d, u32 *fdflags); 
+extern int memfd_open(struct file_desc *d, u32 *fdflags, bool filemap); extern struct collect_image_info memfd_cinfo; extern struct file_desc *collect_memfd(u32 id); extern int apply_memfd_seals(void); diff --git a/criu/memfd.c b/criu/memfd.c index a770c66a1..9d9f0621f 100644 --- a/criu/memfd.c +++ b/criu/memfd.c @@ -323,7 +323,7 @@ static int memfd_open_inode(struct memfd_restore_inode *inode) return fd; } -int memfd_open(struct file_desc *d, u32 *fdflags) +int memfd_open(struct file_desc *d, u32 *fdflags, bool filemap) { struct memfd_info *mfi; MemfdFileEntry *mfe; @@ -342,6 +342,9 @@ int memfd_open(struct file_desc *d, u32 *fdflags) /* Reopen the fd with original permissions */ flags = fdflags ? *fdflags : mfe->flags; + if (filemap && (flags & O_ACCMODE) == O_RDWR) + return fd; + if (!mfi->inode->was_opened_rw && (flags & O_ACCMODE) == O_RDWR) { /* * If there is only a single RW-opened fd for a memfd, it can @@ -367,7 +370,7 @@ int memfd_open(struct file_desc *d, u32 *fdflags) _fd = __open_proc(PROC_SELF, 0, flags, "fd/%d", fd); if (_fd < 0) pr_perror("Can't reopen memfd id=%d", mfe->id); - else if ((flags & O_ACCMODE) == O_RDWR) + else if (!filemap && (flags & O_ACCMODE) == O_RDWR) pr_warn("execveat(fd=%d, ..., AT_EMPTY_PATH) might fail after restore; memfd id=%d\n", _fd, mfe->id); close(fd); @@ -382,7 +385,7 @@ static int memfd_open_fe_fd(struct file_desc *d, int *new_fd) if (inherited_fd(d, new_fd)) return 0; - fd = memfd_open(d, NULL); + fd = memfd_open(d, NULL, false); if (fd < 0) return -1; From 81a30c32062b5b854343d47044a6f726d3c7203e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 28 Sep 2023 18:07:13 +0000 Subject: [PATCH 341/775] zdtm/memfd04: check execveat on memfd that has memory mappings Signed-off-by: Andrei Vagin --- test/zdtm/static/Makefile | 2 ++ test/zdtm/static/memfd04.c | 33 ++++++++++++++++++++++++++++++--- test/zdtm/static/memfd05.c | 1 + test/zdtm/static/memfd05.desc | 1 + 4 files changed, 34 insertions(+), 3 deletions(-) create 
mode 120000 test/zdtm/static/memfd05.c create mode 120000 test/zdtm/static/memfd05.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index b7fb79643..4c7ca72fd 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -259,6 +259,7 @@ TST_NOFILE := \ memfd02-hugetlb \ memfd03 \ memfd04 \ + memfd05 \ shmemfd \ shmemfd-priv \ time \ @@ -656,6 +657,7 @@ socket-tcp6-unconn: CFLAGS += -D ZDTM_IPV6 socket-tcp4v6-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK -D ZDTM_IPV4V6 socket-tcp4v6-closing: CFLAGS += -D ZDTM_IPV4V6 memfd02-hugetlb: CFLAGS += -D ZDTM_HUGETLB +memfd05: CFLAGS += -D ZDTM_MEMFD05 sockets00-seqpacket: CFLAGS += -D ZDTM_UNIX_SEQPACKET sockets01-seqpacket: CFLAGS += -D ZDTM_UNIX_SEQPACKET diff --git a/test/zdtm/static/memfd04.c b/test/zdtm/static/memfd04.c index aae7864c1..215e949d1 100644 --- a/test/zdtm/static/memfd04.c +++ b/test/zdtm/static/memfd04.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -73,20 +74,46 @@ static const size_t script_len = sizeof(script) - 1; int main(int argc, char *argv[]) { +#ifdef MEMFD05 + char path[PATH_MAX]; + char *addr_p, *addr_s; + int rofd; +#endif int fd; test_init(argc, argv); fd = _memfd_create("somename", 0); if (fd < 0) { - fail("memfd_create()"); + pr_perror("memfd_create()"); + return 1; + } + if (ftruncate(fd, script_len) == -1) { + pr_perror("ftruncate"); return 1; } - if (write(fd, script, script_len) != script_len) { - fail("write(memfd)"); + pr_perror("write(memfd)"); return 1; } +#ifdef MEMFD05 + snprintf(path, PATH_MAX - 1, "/proc/self/fd/%d", fd); + rofd = open(path, O_RDONLY); + if (rofd < 0) { + pr_perror("unable to open read-only memfd"); + return 1; + } + addr_p = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE, rofd, 0); + if (addr_p == MAP_FAILED) { + pr_perror("mmap"); + return 1; + } + addr_s = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); + if (addr_s == MAP_FAILED) { + pr_perror("mmap"); + 
return 1; + } +#endif if (!test_exec_fd(fd)) return 1; diff --git a/test/zdtm/static/memfd05.c b/test/zdtm/static/memfd05.c new file mode 120000 index 000000000..6caa9556f --- /dev/null +++ b/test/zdtm/static/memfd05.c @@ -0,0 +1 @@ +memfd04.c \ No newline at end of file diff --git a/test/zdtm/static/memfd05.desc b/test/zdtm/static/memfd05.desc new file mode 120000 index 000000000..1b4963572 --- /dev/null +++ b/test/zdtm/static/memfd05.desc @@ -0,0 +1 @@ +memfd04.desc \ No newline at end of file From 5d6c8bc58494a26e2057d2ae481b30a79b87c022 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 30 Aug 2023 11:57:40 +0800 Subject: [PATCH 342/775] clang-format: disable column limit constraint The "ColumnLimit: 120" is not only allowing lines to be longer than 80 characters but it also forces line wrapping at 120 characters. If total expression length is more than 120 characters, clang-format will try to wrap it as close to 120 as it can, it would not even allow to wrap at 80 characters if we really want it. But as we all know 80 characters is Linux kernel coding style default and as far as our coding style is based on it it is really strange to prohibit wrapping lines at 80 characters... 
Signed-off-by: Pavel Tikhomirov --- .clang-format | 2 +- scripts/fetch-clang-format.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.clang-format b/.clang-format index 475638015..fb40bc613 100644 --- a/.clang-format +++ b/.clang-format @@ -53,7 +53,7 @@ BreakConstructorInitializersBeforeComma: false BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0 BreakAfterJavaFieldAnnotations: false BreakStringLiterals: false -ColumnLimit: 120 +ColumnLimit: 0 CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false # Unknown to clang-format-4.0 ConstructorInitializerAllOnOneLineOrOnePerLine: false diff --git a/scripts/fetch-clang-format.sh b/scripts/fetch-clang-format.sh index b80175f05..5b6037d61 100755 --- a/scripts/fetch-clang-format.sh +++ b/scripts/fetch-clang-format.sh @@ -8,7 +8,7 @@ URL="https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/.c curl -s "${URL}" | sed -e " s,^\( *\)#\([A-Z]\),\1\2,g; s,ControlStatements,ControlStatementsExceptForEachMacros,g; - s,ColumnLimit: 80,ColumnLimit: 120,g; + s,ColumnLimit: 80,ColumnLimit: 0,g; s,Intended for clang-format >= 4,Intended for clang-format >= 11,g; s,ForEachMacros:,ForEachMacros:\n - 'for_each_bit',g; s,ForEachMacros:,ForEachMacros:\n - 'for_each_pstree_item',g; From c9fdd355f6e4695e24bdd6eb42e42cf831f09219 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 14 Feb 2022 11:47:30 +0000 Subject: [PATCH 343/775] pie: Mark __export_*() functions as externally_visible GCC's lto source: > To avoid this problem the compiler must assume that it sees the > whole program when doing link-time optimization. Strictly > speaking, the whole program is rarely visible even at link-time. > Standard system libraries are usually linked dynamically or not > provided with the link-time information. 
In GCC, the whole > program option (@option{-fwhole-program}) asserts that every > function and variable defined in the current compilation > unit is static, except for function @code{main} (note: at > link time, the current unit is the union of all objects compiled > with LTO). Since some functions and variables need to > be referenced externally, for example by another DSO or from an > assembler file, GCC also provides the function and variable > attribute @code{externally_visible} which can be used to disable > the effect of @option{-fwhole-program} on a specific symbol. As far as I read gcc's source, ipa_comdats() will avoid placing symbols that are either already in a user-defined section or have externally_visible attribute into new optimized gcc sections. Signed-off-by: Dmitry Safonov Signed-off-by: Andrei Vagin --- criu/pie/restorer.c | 6 +++--- include/common/compiler.h | 11 +++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index ba6f290dc..02971657e 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -735,7 +735,7 @@ static int recv_cg_set_restore_ack(int sk) * Threads restoration via sigreturn. Note it's locked * routine and calls for unlock at the end. */ -long __export_restore_thread(struct thread_restore_args *args) +__visible long __export_restore_thread(struct thread_restore_args *args) { struct rt_sigframe *rt_sigframe; k_rtsigset_t to_block; @@ -1276,7 +1276,7 @@ unsigned long vdso_rt_size = 0; void *bootstrap_start = NULL; unsigned int bootstrap_len = 0; -void __export_unmap(void) +__visible void __export_unmap(void) { sys_munmap(bootstrap_start, bootstrap_len - vdso_rt_size); } @@ -1608,7 +1608,7 @@ static int restore_membarrier_registrations(int mask) * and jump execution to some predefined ip read from * core file. 
*/ -long __export_restore_task(struct task_restore_args *args) +__visible long __export_restore_task(struct task_restore_args *args) { long ret = -1; int i; diff --git a/include/common/compiler.h b/include/common/compiler.h index 1c9d3db8d..1347b6236 100644 --- a/include/common/compiler.h +++ b/include/common/compiler.h @@ -30,6 +30,17 @@ #define __always_unused __attribute__((unused)) #define __must_check __attribute__((__warn_unused_result__)) +#ifndef __has_attribute +#define __has_attribute(x) 0 +#endif + +/* Not supported by clang */ +#if __has_attribute(__externally_visible__) +#define __visible __attribute__((__externally_visible__)) +#else +#define __visible +#endif + #define __section(S) __attribute__((__section__(#S))) #ifndef __always_inline From 4c9d23d33de0f6ed77978b145daa1a55d8a261fc Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 28 Sep 2023 21:26:41 +0000 Subject: [PATCH 344/775] util: allow to run criu under strace fork_and_ptrace_attach has to fork a child with CLONE_UNTRACED, so that strace doesn't trace it. 
Signed-off-by: Andrei Vagin --- criu/util.c | 50 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/criu/util.c b/criu/util.c index 993ab97bb..95ba0feda 100644 --- a/criu/util.c +++ b/criu/util.c @@ -661,40 +661,54 @@ out: return ret; } +struct child_args { + int *sk_pair; + int (*child_setup)(void); +}; + +static int child_func(void *_args) +{ + struct child_args *args = _args; + int sk, *sk_pair = args->sk_pair; + char c = 0; + + sk = sk_pair[1]; + close(sk_pair[0]); + + if (args->child_setup && args->child_setup() != 0) + exit(1); + + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + exit(1); + } + + while (1) + sleep(1000); + exit(1); +} + pid_t fork_and_ptrace_attach(int (*child_setup)(void)) { pid_t pid; int sk_pair[2], sk; char c = 0; + struct child_args cargs = { + .sk_pair = sk_pair, + .child_setup = child_setup, + }; if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { pr_perror("socketpair"); return -1; } - pid = fork(); + pid = clone_noasan(child_func, CLONE_UNTRACED | SIGCHLD, &cargs); if (pid < 0) { pr_perror("fork"); return -1; } - if (pid == 0) { - sk = sk_pair[1]; - close(sk_pair[0]); - - if (child_setup && child_setup() != 0) - exit(1); - - if (write(sk, &c, 1) != 1) { - pr_perror("write"); - exit(1); - } - - while (1) - sleep(1000); - exit(1); - } - sk = sk_pair[0]; close(sk_pair[1]); From 5e8d7dc94b365e07a4ba538a171e8ccadb1e0036 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 5 Oct 2023 15:16:36 -0700 Subject: [PATCH 345/775] tun: don't parse buffers that have not been filled with data read_ns_sys_file() can return an error, but we are trying to parse a buffer before checking a return code. CID 417395 (#3 of 3): String not null terminated (STRING_NULL) 2. string_null: Passing unterminated string buf to strtol, which expects a null-terminated string. 
Signed-off-by: Andrei Vagin --- criu/net.c | 7 +++++-- criu/tun.c | 15 +++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/criu/net.c b/criu/net.c index 4abfc182a..e422e2f69 100644 --- a/criu/net.c +++ b/criu/net.c @@ -111,15 +111,18 @@ int read_ns_sys_file(char *path, char *buf, int len) } rlen = read(fd, buf, len); + if (rlen == -1) + pr_perror("Can't read ns' %s", path); close(fd); if (rlen == len) { + buf[0] = '\0'; pr_err("Too small buffer to read ns sys file %s\n", path); return -1; } - if (rlen > 0) - buf[rlen - 1] = '\0'; + if (rlen >= 0) + buf[rlen] = '\0'; return rlen; } diff --git a/criu/tun.c b/criu/tun.c index 2e2cc32bf..9d66f9929 100644 --- a/criu/tun.c +++ b/criu/tun.c @@ -455,27 +455,26 @@ int dump_tun_link(NetDeviceEntry *nde, struct cr_imgset *fds, struct nlattr **in TunLinkEntry tle = TUN_LINK_ENTRY__INIT; char spath[64]; char buf[64]; - int ret = 0; struct tun_link *tl; sprintf(spath, "class/net/%s/tun_flags", nde->name); - ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) + return -1; tle.flags = strtol(buf, NULL, 0); sprintf(spath, "class/net/%s/owner", nde->name); - ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) + return -1; tle.owner = strtol(buf, NULL, 10); sprintf(spath, "class/net/%s/group", nde->name); - ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) + return -1; tle.group = strtol(buf, NULL, 10); - if (ret < 0) - return ret; - tl = get_tun_link_fd(nde->name, nde->peer_nsid, tle.flags); if (!tl) - return ret; + return -1; tle.vnethdr = tl->dmp.vnethdr; tle.sndbuf = tl->dmp.sndbuf; From 45670b65566c9c5a95f356e417d2d7fe88f3a30f Mon Sep 17 00:00:00 2001 From: Taemin Ha Date: Mon, 27 Mar 2023 21:07:13 -0500 Subject: [PATCH 346/775] apparmor: remove the redundant check This check is redundant as line 201 checks for this condition. 
Signed-off-by: Taemin Ha Signed-off-by: Andrei Vagin --- criu/apparmor.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/criu/apparmor.c b/criu/apparmor.c index 5b62759e2..e46e239f5 100644 --- a/criu/apparmor.c +++ b/criu/apparmor.c @@ -207,8 +207,6 @@ static int by_time(const struct dirent **de1, const struct dirent **de2) } else { if (sb1.st_mtim.tv_sec < sb2.st_mtim.tv_sec) return -1; - if (sb1.st_mtim.tv_sec == sb2.st_mtim.tv_sec) - return 0; return 1; } } From 48d6a59a22a916051422358849979224c7b1ef38 Mon Sep 17 00:00:00 2001 From: Taemin Ha Date: Mon, 27 Mar 2023 21:08:37 -0500 Subject: [PATCH 347/775] arch/x86: remove the redundant check The is_native field is a boolean. Therefore, the else if() can be changed to a simple else{}. Signed-off-by: Taemin Ha Signed-off-by: Andrei Vagin --- criu/arch/x86/sigframe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/arch/x86/sigframe.c b/criu/arch/x86/sigframe.c index 4fa7eb3dc..46612e70d 100644 --- a/criu/arch/x86/sigframe.c +++ b/criu/arch/x86/sigframe.c @@ -23,7 +23,7 @@ int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *r } sigframe->native.uc.uc_mcontext.fpstate = (uint64_t)addr; - } else if (!sigframe->is_native) { + } else { unsigned long addr = (unsigned long)(void *)&fpu_state->fpu_state_ia32.xsave; sigframe->compat.uc.uc_mcontext.fpstate = (uint32_t)(unsigned long)(void *)&fpu_state->fpu_state_ia32; if ((addr % 64ul)) { From 36a84751eddd27b86d9b64a93136040198eb3eb9 Mon Sep 17 00:00:00 2001 From: Taemin Ha Date: Mon, 27 Mar 2023 21:18:16 -0500 Subject: [PATCH 348/775] zdtm/cow00: fix typo The condition was meant to check fd2 instead of fd1, which is checked in line 24. 
Signed-off-by: Taemin Ha Signed-off-by: Andrei Vagin --- test/zdtm/static/cow00.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/static/cow00.c b/test/zdtm/static/cow00.c index cb0c6733e..456b6a7b4 100644 --- a/test/zdtm/static/cow00.c +++ b/test/zdtm/static/cow00.c @@ -29,7 +29,7 @@ static int is_cow(void *addr, pid_t p1, pid_t p2) snprintf(buf, sizeof(buf), "/proc/%d/pagemap", p2); fd2 = open(buf, O_RDONLY); - if (fd1 < 0) { + if (fd2 < 0) { pr_perror("Unable to open file %s", buf); return -1; } From cfaacfb58208a4da3adf134a33d0946b106c163a Mon Sep 17 00:00:00 2001 From: Taemin Ha Date: Mon, 27 Mar 2023 21:19:09 -0500 Subject: [PATCH 349/775] zdtm/thread_different_uid_gid: remove the redundant check line 131 checks if (ret >= 0). line 133 could be replaced by a simple else statement Signed-off-by: Taemin Ha Signed-off-by: Andrei Vagin --- test/zdtm/static/thread_different_uid_gid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/static/thread_different_uid_gid.c b/test/zdtm/static/thread_different_uid_gid.c index 3a0b6291b..88f99659b 100644 --- a/test/zdtm/static/thread_different_uid_gid.c +++ b/test/zdtm/static/thread_different_uid_gid.c @@ -130,7 +130,7 @@ int main(int argc, char **argv) ret = syscall(SYS_setresgid, maingroup, maingroup, maingroup); if (ret >= 0) { ret = syscall(SYS_setresuid, mainuser, mainuser, mainuser); - } else if (ret < 0) { + } else { pr_perror("Failed to drop privileges"); exit(1); } From 5d4179ddb1c569af26669d3089996f95e0510482 Mon Sep 17 00:00:00 2001 From: Taemin Ha Date: Mon, 27 Mar 2023 21:17:38 -0500 Subject: [PATCH 350/775] criu/proc_parse: refactor the eventpoll parser Eventpollentry's fields are set only when ret == 3 or ret == 6. 
The remaining cases can be grouped together to an error Signed-off-by: Taemin Ha Signed-off-by: Andrei Vagin --- criu/proc_parse.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index d113a21b8..9d43e2394 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -1972,10 +1972,7 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) " pos:%lli ino:%lx sdev:%x", &e->tfd, &e->events, (long long *)&e->data, (long long *)&e->pos, (long *)&e->inode, &e->dev); - if (ret < 3 || ret > 6) { - eventpoll_tfd_entry__free_unpacked(e, NULL); - goto parse_err; - } else if (ret == 3) { + if (ret == 3) { e->has_dev = false; e->has_inode = false; e->has_pos = false; @@ -1983,7 +1980,7 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) e->has_dev = true; e->has_inode = true; e->has_pos = true; - } else if (ret < 6) { + } else { eventpoll_tfd_entry__free_unpacked(e, NULL); goto parse_err; } From a77185daeea60acda31576a75a751ce5067d0a17 Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Wed, 11 Oct 2023 14:21:34 +0200 Subject: [PATCH 351/775] files-reg: don't change the file pos in get_build_id At this point the correct position is already restored, so reading from the fd results in the position being moved forward by 5 bytes. 
Fixes: 9191f8728d62 ("criu/files-reg.c: add build-id validation functionality") Signed-off-by: Michal Clapinski --- criu/files-reg.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index 9fbab0d42..fc6149350 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -1650,22 +1650,10 @@ static int get_build_id_64(Elf64_Ehdr *file_header, unsigned char **build_id, co */ static int get_build_id(const int fd, const struct stat *fd_status, unsigned char **build_id) { - char buf[SELFMAG + 1]; - void *start_addr; + char *start_addr; size_t mapped_size; int ret = -1; - if (read(fd, buf, SELFMAG + 1) != SELFMAG + 1) - return -1; - - /* - * The first 4 bytes contain a magic number identifying the file as an - * ELF file. They should contain the characters ‘\x7f’, ‘E’, ‘L’, and - * ‘F’, respectively. These characters are together defined as ELFMAG. - */ - if (strncmp(buf, ELFMAG, SELFMAG)) - return -1; - /* * If the build-id exists, then it will most likely be present in the * beginning of the file. Therefore at most only the first 1 MB of the @@ -1673,16 +1661,25 @@ static int get_build_id(const int fd, const struct stat *fd_status, unsigned cha */ mapped_size = min_t(size_t, fd_status->st_size, BUILD_ID_MAP_SIZE); start_addr = mmap(0, mapped_size, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0); - if (start_addr == MAP_FAILED) { + if ((void*)start_addr == MAP_FAILED) { pr_warn("Couldn't mmap file with fd %d\n", fd); return -1; } - if (buf[EI_CLASS] == ELFCLASS32) - ret = get_build_id_32(start_addr, build_id, fd, mapped_size); - if (buf[EI_CLASS] == ELFCLASS64) - ret = get_build_id_64(start_addr, build_id, fd, mapped_size); + /* + * The first 4 bytes contain a magic number identifying the file as an + * ELF file. They should contain the characters ‘\x7f’, ‘E’, ‘L’, and + * ‘F’, respectively. These characters are together defined as ELFMAG. 
+ */ + if (memcmp(start_addr, ELFMAG, SELFMAG)) + goto out; + if (start_addr[EI_CLASS] == ELFCLASS32) + ret = get_build_id_32((Elf32_Ehdr *)start_addr, build_id, fd, mapped_size); + if (start_addr[EI_CLASS] == ELFCLASS64) + ret = get_build_id_64((Elf64_Ehdr *)start_addr, build_id, fd, mapped_size); + +out: munmap(start_addr, mapped_size); return ret; } From 29026496d40f6033fd7f0465c74921bd638cd83e Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Wed, 11 Oct 2023 14:26:39 +0200 Subject: [PATCH 352/775] zdtm/lib: add missing signal.h header Signed-off-by: Michal Clapinski --- test/zdtm/lib/lock.h | 1 + 1 file changed, 1 insertion(+) diff --git a/test/zdtm/lib/lock.h b/test/zdtm/lib/lock.h index 2b23550be..cc5306e06 100644 --- a/test/zdtm/lib/lock.h +++ b/test/zdtm/lib/lock.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "asm/atomic.h" #define BUG_ON(condition) \ From 41938f14b62cc517eab63f5d4245255b64d99a1f Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Wed, 11 Oct 2023 14:27:29 +0200 Subject: [PATCH 353/775] zdtm/static: test the offset migration of ELF files Signed-off-by: Michal Clapinski --- test/zdtm/static/Makefile | 1 + test/zdtm/static/fd_offset.c | 42 ++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 test/zdtm/static/fd_offset.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 4c7ca72fd..07d3bc6e2 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -269,6 +269,7 @@ TST_NOFILE := \ sigtrap \ sigtrap01 \ change_mnt_context \ + fd_offset \ # jobctl00 \ PKG_CONFIG ?= pkg-config diff --git a/test/zdtm/static/fd_offset.c b/test/zdtm/static/fd_offset.c new file mode 100644 index 000000000..96255a4a1 --- /dev/null +++ b/test/zdtm/static/fd_offset.c @@ -0,0 +1,42 @@ +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check that criu properly restores offsets on ELF files"; +const char *test_author = "Michal Clapinski "; + +void 
check_offset(int fd) +{ + int offset = lseek(fd, 0, SEEK_CUR); + if (offset < 0) { + fail("lseek"); + exit(1); + } + if (offset != 0) { + fail("wrong offset; expected: 0, got: %d", offset); + exit(1); + } +} + +int main(int argc, char **argv) +{ + int fd; + + test_init(argc, argv); + + fd = open("/proc/self/exe", O_RDONLY); + if (fd < 0) { + fail("open"); + exit(1); + } + check_offset(fd); + + test_daemon(); + test_waitsig(); + + check_offset(fd); + + pass(); + return 0; +} From e076c11e22a8b7c527dcc125c752cb62f77a6c3b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 21 Nov 2023 12:02:31 -0800 Subject: [PATCH 354/775] ci: fix codespell errors Signed-off-by: Andrei Vagin --- criu/net.c | 2 +- criu/pagemap-cache.c | 2 +- lib/py/images/pb2dict.py | 2 +- plugins/amdgpu/amdgpu_plugin_topology.c | 2 +- scripts/nmk/scripts/main.mk | 2 +- test/zdtm/static/mntns_open.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/criu/net.c b/criu/net.c index e422e2f69..7109e6876 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3274,7 +3274,7 @@ int macvlan_ext_add(struct external *ext) /* * The setns() syscall (called by switch_ns()) can be extremely * slow. If we call it two or more times from the same task the - * kernel will synchonously go on a very slow routine called + * kernel will synchronously go on a very slow routine called * synchronize_rcu() trying to put a reference on old namespaces. * * To avoid doing this more than once we pre-create all the diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c index 00f088ff3..09dbc6a36 100644 --- a/criu/pagemap-cache.c +++ b/criu/pagemap-cache.c @@ -115,7 +115,7 @@ static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) * fit in solid manner, iow -- either the whole vma fits * the cache window, either plain read is used. * - * The benefit (apart redusing the number of read() calls) + * The benefit (apart reducing the number of read() calls) * is to walk page tables less. 
*/ if (!pagemap_cache_disabled && len < PMC_SIZE && (vma->e->start - low) < PMC_SIZE_GAP) { diff --git a/lib/py/images/pb2dict.py b/lib/py/images/pb2dict.py index fe41642d5..3f5f390e3 100644 --- a/lib/py/images/pb2dict.py +++ b/lib/py/images/pb2dict.py @@ -365,7 +365,7 @@ def pb2dict(pb, pretty=False, is_hex=False): def _dict2pb_cast(field, value): # Not considering TYPE_MESSAGE here, as repeated # and non-repeated messages need special treatment - # in this case, and are hadled separately. + # in this case, and are handled separately. if field.type == FD.TYPE_BYTES: return get_bytes_dec(field)(value) elif field.type == FD.TYPE_ENUM: diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index 6d004247b..ef79e5ef4 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -1063,7 +1063,7 @@ static bool iolink_match(struct tp_iolink *src, struct tp_iolink *dest) * * Nodes compatibility are determined by: * 1. Comparing the node properties - * 2. Making sure iolink mappings to CPUs would be compabitle with existing iolink mappings in maps + * 2. Making sure iolink mappings to CPUs would be compatible with existing iolink mappings in maps * * If src_node and dest_node are mappable, then map_device will push the new mapping * for src_node -> dest_node into new_maps. 
diff --git a/scripts/nmk/scripts/main.mk b/scripts/nmk/scripts/main.mk index 493a164f8..7f11bda23 100644 --- a/scripts/nmk/scripts/main.mk +++ b/scripts/nmk/scripts/main.mk @@ -1,7 +1,7 @@ ifndef ____nmk_defined__main # -# Genaral inclusion statement +# General inclusion statement ifndef ____nmk_defined__include include $(__nmk_dir)include.mk diff --git a/test/zdtm/static/mntns_open.c b/test/zdtm/static/mntns_open.c index 7d8bbbaa4..0430f5b99 100644 --- a/test/zdtm/static/mntns_open.c +++ b/test/zdtm/static/mntns_open.c @@ -17,7 +17,7 @@ #define CLONE_NEWNS 0x00020000 #endif -const char *test_doc = "Check that mnt_id is repsected"; +const char *test_doc = "Check that mnt_id is respected"; const char *test_author = "Pavel Emelianov "; #define MPTS_FILE "F" From 97b8b659c9497c66c7653ab226ec9bb093910795 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Wed, 31 May 2023 13:31:34 +0200 Subject: [PATCH 355/775] zdtm: cgroup_ifpriomap: Improve skip check's robustness. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cgroup_ifpriomap test needs net_prio cgroup, which might not be available. Make the .checkskip script check it. Signed-off-by: Michał Mirosław --- test/zdtm/static/cgroup_ifpriomap.checkskip | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/zdtm/static/cgroup_ifpriomap.checkskip b/test/zdtm/static/cgroup_ifpriomap.checkskip index 205f8fc53..f401ad1b2 100755 --- a/test/zdtm/static/cgroup_ifpriomap.checkskip +++ b/test/zdtm/static/cgroup_ifpriomap.checkskip @@ -1,3 +1,6 @@ #!/bin/bash +set -e -! test -f /sys/fs/cgroup/cgroup.controllers +test ! 
-f /sys/fs/cgroup/cgroup.controllers + +grep -q '^net_prio\s' /proc/cgroups From 0b62f4267a9939a192f4b88257fb19befa8b8bfb Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 9 Oct 2023 15:55:25 +0100 Subject: [PATCH 356/775] lib: use separate packages for pycriu and crit Newer versions of pip use an isolated virtual environment when building Python projects. However, when the source code of CRIT is copied into the isolated environment, the symlink for `../lib/py` (pycriu) becomes invalid. As a workaround, we used the `--no-build-isolation` option for `pip install`. However, this functionality has issues in some versions of PIP [1, 2]. To fix this problem, this patch adds separate packages for pycriu and crit, and each package is installed independently. [1] https://github.com/pypa/pip/pull/8221 [2] https://github.com/pypa/pip/issues/8165#issuecomment-625401463 Signed-off-by: Radostin Stoyanov --- Makefile | 16 ++++++--- Makefile.install | 7 +++- coredump/pycriu | 2 +- crit/.gitignore | 2 ++ crit/Makefile | 40 ++++++++++++++++++++++ crit/crit/__init__.py | 1 + lib/py/cli.py => crit/crit/__main__.py | 20 +++++------ crit/pycriu | 1 - crit/pyproject.toml | 23 +++++++++++-- crit/requirements.txt | 7 ---- crit/setup.cfg | 20 +++++++++++ crit/setup.py | 19 +++------- lib/.gitignore | 1 + lib/Makefile | 32 ++++++++--------- lib/{py => pycriu}/.gitignore | 1 + lib/{py => pycriu}/Makefile | 0 lib/{py => pycriu}/__init__.py | 2 +- lib/{py => pycriu}/criu.py | 0 lib/{py => pycriu}/images/.gitignore | 0 lib/{py => pycriu}/images/Makefile | 0 lib/{py => pycriu}/images/__init__.py | 0 lib/{py => pycriu}/images/images.py | 0 lib/{py => pycriu}/images/pb2dict.py | 0 lib/pyproject.toml | 19 ++++++++++ lib/setup.cfg | 16 +++++++++ crit/crit => lib/setup.py | 4 +-- test/others/env.sh | 11 ++++-- test/pycriu | 2 +- test/zdtm/static/socket-tcp-fin-wait1.hook | 2 +- 29 files changed, 182 insertions(+), 66 deletions(-) create mode 100644 crit/Makefile create mode 100644 
crit/crit/__init__.py rename lib/py/cli.py => crit/crit/__main__.py (95%) delete mode 120000 crit/pycriu delete mode 100644 crit/requirements.txt create mode 100644 crit/setup.cfg create mode 100644 lib/.gitignore rename lib/{py => pycriu}/.gitignore (68%) rename lib/{py => pycriu}/Makefile (100%) rename lib/{py => pycriu}/__init__.py (67%) rename lib/{py => pycriu}/criu.py (100%) rename lib/{py => pycriu}/images/.gitignore (100%) rename lib/{py => pycriu}/images/Makefile (100%) rename lib/{py => pycriu}/images/__init__.py (100%) rename lib/{py => pycriu}/images/images.py (100%) rename lib/{py => pycriu}/images/pb2dict.py (100%) create mode 100644 lib/pyproject.toml create mode 100644 lib/setup.cfg rename crit/crit => lib/setup.py (55%) mode change 100755 => 100644 diff --git a/Makefile b/Makefile index 4b3329473..88a23617b 100644 --- a/Makefile +++ b/Makefile @@ -164,7 +164,7 @@ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS # Default target -all: criu lib +all: criu lib crit .PHONY: all # @@ -288,9 +288,9 @@ clean mrproper: $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=soccr $@ $(Q) $(MAKE) $(build)=lib $@ + $(Q) $(MAKE) $(build)=crit $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ - $(Q) $(MAKE) $(build)=lib $@ .PHONY: clean mrproper clean-amdgpu_plugin: @@ -337,6 +337,10 @@ amdgpu_plugin: criu $(Q) $(MAKE) -C plugins/amdgpu all .PHONY: amdgpu_plugin +crit: lib + $(Q) $(MAKE) -C crit +.PHONY: crit + # # Generating tar requires tag matched CRIU_VERSION. 
# If not found then simply use GIT's describe with @@ -402,6 +406,7 @@ help: @echo ' Targets:' @echo ' all - Build all [*] targets' @echo ' * criu - Build criu' + @echo ' * crit - Build crit' @echo ' zdtm - Build zdtm test-suite' @echo ' docs - Build documentation' @echo ' install - Install CRIU (see INSTALL.md)' @@ -425,11 +430,12 @@ lint: flake8 --config=scripts/flake8.cfg test/zdtm.py flake8 --config=scripts/flake8.cfg test/inhfd/*.py flake8 --config=scripts/flake8.cfg test/others/rpc/config_file.py - flake8 --config=scripts/flake8.cfg lib/py/images/pb2dict.py - flake8 --config=scripts/flake8.cfg lib/py/images/images.py + flake8 --config=scripts/flake8.cfg lib/pycriu/images/pb2dict.py + flake8 --config=scripts/flake8.cfg lib/pycriu/images/images.py flake8 --config=scripts/flake8.cfg scripts/criu-ns flake8 --config=scripts/flake8.cfg test/others/criu-ns/run.py - flake8 --config=scripts/flake8.cfg crit/setup.py + flake8 --config=scripts/flake8.cfg crit/*.py + flake8 --config=scripts/flake8.cfg crit/crit/*.py flake8 --config=scripts/flake8.cfg scripts/uninstall_module.py flake8 --config=scripts/flake8.cfg coredump/ coredump/coredump flake8 --config=scripts/flake8.cfg scripts/github-indent-warnings.py diff --git a/Makefile.install b/Makefile.install index c798637be..6f5b31924 100644 --- a/Makefile.install +++ b/Makefile.install @@ -37,6 +37,10 @@ install-lib: lib $(Q) $(MAKE) $(build)=lib install .PHONY: install-lib +install-crit: lib + $(Q) $(MAKE) $(build)=crit install +.PHONY: install-crit + install-criu: criu $(Q) $(MAKE) $(build)=criu install .PHONY: install-criu @@ -50,12 +54,13 @@ install-compel: $(compel-install-targets) $(Q) $(MAKE) $(build)=compel/plugins install .PHONY: install-compel -install: install-man install-lib install-criu install-compel install-amdgpu_plugin ; +install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin ; .PHONY: install uninstall: $(Q) $(MAKE) -C Documentation $@ $(Q) $(MAKE) $(build)=lib $@ + 
$(Q) $(MAKE) $(build)=crit $@ $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ diff --git a/coredump/pycriu b/coredump/pycriu index d13a8790a..d1b6ed5c4 120000 --- a/coredump/pycriu +++ b/coredump/pycriu @@ -1 +1 @@ -../lib/py/ \ No newline at end of file +../lib/pycriu \ No newline at end of file diff --git a/crit/.gitignore b/crit/.gitignore index 810661179..10c8ab186 100644 --- a/crit/.gitignore +++ b/crit/.gitignore @@ -1,2 +1,4 @@ crit.egg-info/ build/ +dist/ +version.py diff --git a/crit/Makefile b/crit/Makefile new file mode 100644 index 000000000..9a856db6d --- /dev/null +++ b/crit/Makefile @@ -0,0 +1,40 @@ +PYTHON_EXTERNALLY_MANAGED := $(shell $(PYTHON) -c 'import os, sysconfig; print(int(os.path.isfile(os.path.join(sysconfig.get_path("stdlib"), "EXTERNALLY-MANAGED"))))') +PIP_BREAK_SYSTEM_PACKAGES := 0 + +VERSION_FILE := $(if $(obj),$(addprefix $(obj)/,crit/version.py),crit/version.py) + +all-y += ${VERSION_FILE} +cleanup-y += ${VERSION_FILE} + +${VERSION_FILE}: + $(Q) echo "__version__ = '${CRIU_VERSION}'" > $@ + +install: ${VERSION_FILE} +ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) +ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) + $(E) " SKIP INSTALL crit: Externally managed python environment (See PEP 668 for more information)" + $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make install" +else + $(E) " INSTALL " crit + $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit +endif +else + $(E) " INSTALL " crit + $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit +endif +.PHONY: install + +uninstall: +ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) +ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) + $(E) " SKIP UNINSTALL crit: Externally managed python environment (See PEP 668 for more information)" + $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make uninstall" +else + $(E) " UNINSTALL" crit + $(Q) $(PYTHON) ./scripts/uninstall_module.py 
--prefix=$(DESTDIR)$(PREFIX) crit +endif +else + $(E) " UNINSTALL" crit + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit +endif +.PHONY: uninstall diff --git a/crit/crit/__init__.py b/crit/crit/__init__.py new file mode 100644 index 000000000..58f3ace6c --- /dev/null +++ b/crit/crit/__init__.py @@ -0,0 +1 @@ +from .version import __version__ diff --git a/lib/py/cli.py b/crit/crit/__main__.py similarity index 95% rename from lib/py/cli.py rename to crit/crit/__main__.py index a3a0870f8..e15327f50 100755 --- a/lib/py/cli.py +++ b/crit/crit/__main__.py @@ -5,6 +5,7 @@ import json import os import pycriu +from . import __version__ def inf(opts): @@ -41,9 +42,9 @@ def decode(opts): try: img = pycriu.images.load(inf(opts), opts['pretty'], opts['nopl']) except pycriu.images.MagicException as exc: - print("Unknown magic %#x.\n"\ - "Maybe you are feeding me an image with "\ - "raw data(i.e. pages.img)?" % exc.magic, file=sys.stderr) + print("Unknown magic %#x.\n" + "Maybe you are feeding me an image with " + "raw data(i.e. pages.img)?" % exc.magic, file=sys.stderr) sys.exit(1) if opts['pretty']: @@ -59,9 +60,9 @@ def encode(opts): try: img = json.load(inf(opts)) except UnicodeDecodeError: - print("Cannot read JSON.\n"\ - "Maybe you are feeding me an image with protobuf data? "\ - "Encode expects JSON input.", file=sys.stderr) + print("Cannot read JSON.\n" + "Maybe you are feeding me an image with protobuf data? 
" + "Encode expects JSON input.", file=sys.stderr) sys.exit(1) pycriu.images.dump(img, outf(opts, False)) @@ -131,7 +132,7 @@ def ftype_find_in_files(opts, ft, fid): if files_img is None: try: files_img = pycriu.images.load(dinf(opts, "files.img"))['entries'] - except: + except Exception: files_img = [] if len(files_img) == 0: @@ -364,7 +365,7 @@ def main(): desc = 'CRiu Image Tool' parser = argparse.ArgumentParser( description=desc, formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument('--version', action='version', version=pycriu.__version__) + parser.add_argument('--version', action='version', version=__version__) subparsers = parser.add_subparsers( help='Use crit CMD --help for command-specific help') @@ -374,8 +375,7 @@ def main(): 'decode', help='convert criu image from binary type to json') decode_parser.add_argument( '--pretty', - help= - 'Multiline with indents and some numerical fields in field-specific format', + help='Multiline with indents and some numerical fields in field-specific format', action='store_true') decode_parser.add_argument( '-i', diff --git a/crit/pycriu b/crit/pycriu deleted file mode 120000 index d13a8790a..000000000 --- a/crit/pycriu +++ /dev/null @@ -1 +0,0 @@ -../lib/py/ \ No newline at end of file diff --git a/crit/pyproject.toml b/crit/pyproject.toml index 019b0d848..9089f0a39 100644 --- a/crit/pyproject.toml +++ b/crit/pyproject.toml @@ -1,3 +1,22 @@ [build-system] -# Minimum requirements for the build system to execute. -requires = ["setuptools", "wheel"] # PEP 508 specifications. 
+requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "crit" +description = "CRiu Image Tool" +authors = [ + {name = "CRIU team", email = "criu@openvz.org"}, +] +license = {text = "GPLv2"} +dynamic = ["version"] +requires-python = ">=3.6" + +[project.scripts] +crit = "crit.__main__:main" + +[tool.setuptools] +packages = ["crit"] + +[tool.setuptools.dynamic] +version = {attr = "crit.__version__"} diff --git a/crit/requirements.txt b/crit/requirements.txt deleted file mode 100644 index c27e6d4f0..000000000 --- a/crit/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -# We need pip version 20.1 or newer to correctly build with 'pycriu' symlink. -# - Building of local directories with pip 20.1 or newer is done in place, -# instead of a temporary location containing a copy of the directory tree. -# (https://github.com/pypa/pip/issues/7555) -pip>=20.1 -setuptools>=42.0.0 -wheel diff --git a/crit/setup.cfg b/crit/setup.cfg new file mode 100644 index 000000000..fbc9a5143 --- /dev/null +++ b/crit/setup.cfg @@ -0,0 +1,20 @@ +# Configuring setuptools using pyproject.toml files was introduced in setuptools 61.0.0 +# https://setuptools.pypa.io/en/latest/history.html#v61-0-0 +# For older versions of setuptools, we need to use the setup.cfg file +# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html#declarative-config + +[metadata] +name = crit +description = CRiu Image Tool +author = CRIU team +author_email = criu@openvz.org +license = GPLv2 +version = attr: crit.__version__ + +[options] +packages = crit +python_requires = >=3.6 + +[options.entry_points] +console_scripts = + crit = crit.__main__:main diff --git a/crit/setup.py b/crit/setup.py index 2f584678f..618ac1de4 100644 --- a/crit/setup.py +++ b/crit/setup.py @@ -1,15 +1,6 @@ -from setuptools import setup, find_packages -import pycriu +#!/usr/bin/env python3 +import setuptools -setup( - name='crit', - version=pycriu.__version__, - description='CRiu Image Tool', - 
author='CRIU team', - author_email='criu@openvz.org', - license='GPLv2', - url='https://github.com/checkpoint-restore/criu', - packages=find_packages('.'), - scripts=['crit'], - install_requires=[], -) + +if __name__ == '__main__': + setuptools.setup() diff --git a/lib/.gitignore b/lib/.gitignore new file mode 100644 index 000000000..a10181b80 --- /dev/null +++ b/lib/.gitignore @@ -0,0 +1 @@ +pycriu.egg-info/ diff --git a/lib/Makefile b/lib/Makefile index 32d238de4..ae371e78e 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -28,17 +28,17 @@ lib-a: lib/c/$(CRIU_A) # # Python bindings. -lib/py/Makefile: ; -lib/py/%: .FORCE +lib/pycriu/Makefile: ; +lib/pycriu/%: .FORCE $(call msg-gen, $@) - $(Q) $(MAKE) $(build)=lib/py $@ + $(Q) $(MAKE) $(build)=lib/pycriu $@ lib-py: - $(Q) $(MAKE) $(build)=lib/py all + $(Q) $(MAKE) $(build)=lib/pycriu all .PHONY: lib-py clean-lib: $(Q) $(MAKE) $(build)=lib/c clean - $(Q) $(MAKE) $(build)=lib/py clean + $(Q) $(MAKE) $(build)=lib/pycriu clean .PHONY: clean-lib clean: clean-lib cleanup-y += lib/c/$(CRIU_SO) lib/c/$(CRIU_A) lib/c/criu.pc @@ -59,17 +59,15 @@ install: lib-c lib-a lib-py lib/c/criu.pc.in $(Q) install -m 644 lib/c/criu.pc $(DESTDIR)$(LIBDIR)/pkgconfig ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) - $(E) " SKIP INSTALL crit: Externally managed python environment (See PEP 668 for more information)" + $(E) " SKIP INSTALL pycriu: Externally managed python environment (See PEP 668 for more information)" $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make install" else - $(E) " INSTALL " crit - $(Q) $(PYTHON) -m pip install -r ./crit/requirements.txt - $(Q) $(PYTHON) -m pip install --no-build-isolation --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit + $(E) " INSTALL " pycriu + $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./lib endif else - $(E) " INSTALL " crit - $(Q) $(PYTHON) -m pip install -r ./crit/requirements.txt - $(Q) $(PYTHON) -m 
pip install --no-build-isolation --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit + $(E) " INSTALL " pycriu + $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./lib endif .PHONY: install @@ -84,14 +82,14 @@ uninstall: $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/pkgconfig/,criu.pc) ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) - $(E) " SKIP UNINSTALL crit: Externally managed python environment (See PEP 668 for more information)" + $(E) " SKIP UNINSTALL pycriu: Externally managed python environment (See PEP 668 for more information)" $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make uninstall" else - $(E) " UNINSTALL" crit - $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit + $(E) " UNINSTALL" pycriu + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) pycriu endif else - $(E) " UNINSTALL" crit - $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit + $(E) " UNINSTALL" pycriu + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) pycriu endif .PHONY: uninstall diff --git a/lib/py/.gitignore b/lib/pycriu/.gitignore similarity index 68% rename from lib/py/.gitignore rename to lib/pycriu/.gitignore index fba7e3864..111642787 100644 --- a/lib/py/.gitignore +++ b/lib/pycriu/.gitignore @@ -1,3 +1,4 @@ +__pycache__ *_pb2.py *.pyc version.py diff --git a/lib/py/Makefile b/lib/pycriu/Makefile similarity index 100% rename from lib/py/Makefile rename to lib/pycriu/Makefile diff --git a/lib/py/__init__.py b/lib/pycriu/__init__.py similarity index 67% rename from lib/py/__init__.py rename to lib/pycriu/__init__.py index 44f66ffa4..2abcf029d 100644 --- a/lib/py/__init__.py +++ b/lib/pycriu/__init__.py @@ -1,4 +1,4 @@ from . import rpc_pb2 as rpc from . 
import images from .criu import * -from .version import __version__ +from .version import __version__ \ No newline at end of file diff --git a/lib/py/criu.py b/lib/pycriu/criu.py similarity index 100% rename from lib/py/criu.py rename to lib/pycriu/criu.py diff --git a/lib/py/images/.gitignore b/lib/pycriu/images/.gitignore similarity index 100% rename from lib/py/images/.gitignore rename to lib/pycriu/images/.gitignore diff --git a/lib/py/images/Makefile b/lib/pycriu/images/Makefile similarity index 100% rename from lib/py/images/Makefile rename to lib/pycriu/images/Makefile diff --git a/lib/py/images/__init__.py b/lib/pycriu/images/__init__.py similarity index 100% rename from lib/py/images/__init__.py rename to lib/pycriu/images/__init__.py diff --git a/lib/py/images/images.py b/lib/pycriu/images/images.py similarity index 100% rename from lib/py/images/images.py rename to lib/pycriu/images/images.py diff --git a/lib/py/images/pb2dict.py b/lib/pycriu/images/pb2dict.py similarity index 100% rename from lib/py/images/pb2dict.py rename to lib/pycriu/images/pb2dict.py diff --git a/lib/pyproject.toml b/lib/pyproject.toml new file mode 100644 index 000000000..8eb4b7084 --- /dev/null +++ b/lib/pyproject.toml @@ -0,0 +1,19 @@ +[build-system] +requires = ["setuptools", "protobuf<4.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "pycriu" +description = "Python bindings for CRIU" +authors = [ + {name = "CRIU team", email = "criu@openvz.org"}, +] +license = {text = "GPLv2"} +dynamic = ["version"] +requires-python = ">=3.6" + +[tool.setuptools] +packages = ["pycriu", "pycriu.images"] + +[tool.setuptools.dynamic] +version = {attr = "pycriu.__version__"} diff --git a/lib/setup.cfg b/lib/setup.cfg new file mode 100644 index 000000000..23ee48dd5 --- /dev/null +++ b/lib/setup.cfg @@ -0,0 +1,16 @@ +# Configuring setuptools using pyproject.toml files was introduced in setuptools 61.0.0 +# https://setuptools.pypa.io/en/latest/history.html#v61-0-0 +# For older 
versions of setuptools, we need to use the setup.cfg file +# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html#declarative-config + +[metadata] +name = pycriu +description = Python bindings for CRIU +author = CRIU team +author_email = criu@openvz.org +license = GPLv2 +version = attr: pycriu.__version__ + +[options] +packages = find: +python_requires = >=3.6 diff --git a/crit/crit b/lib/setup.py old mode 100755 new mode 100644 similarity index 55% rename from crit/crit rename to lib/setup.py index 3b15ca654..618ac1de4 --- a/crit/crit +++ b/lib/setup.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 +import setuptools -from pycriu import cli if __name__ == '__main__': - cli.main() + setuptools.setup() diff --git a/test/others/env.sh b/test/others/env.sh index 6d830fb58..6fa2c9691 100755 --- a/test/others/env.sh +++ b/test/others/env.sh @@ -1,8 +1,13 @@ #!/bin/sh -CRIU=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../criu/criu) +BASE_DIR="$(readlink -f "$(dirname "${BASH_SOURCE[0]}")/../../")" + +CRIU="${BASE_DIR}/criu/criu" criu=$CRIU -CRIT=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../crit/crit) + +export PYTHONPATH="${BASE_DIR}/lib:${BASE_DIR}/crit:${PYTHONPATH-}" +CRIT="python3 -m crit" crit=$CRIT -CRIU_COREDUMP=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../coredump/coredump) + +CRIU_COREDUMP="${BASE_DIR}/coredump/coredump" criu_coredump=$CRIU_COREDUMP diff --git a/test/pycriu b/test/pycriu index d13a8790a..d1b6ed5c4 120000 --- a/test/pycriu +++ b/test/pycriu @@ -1 +1 @@ -../lib/py/ \ No newline at end of file +../lib/pycriu \ No newline at end of file diff --git a/test/zdtm/static/socket-tcp-fin-wait1.hook b/test/zdtm/static/socket-tcp-fin-wait1.hook index 9dcd08999..30f8ce071 100755 --- a/test/zdtm/static/socket-tcp-fin-wait1.hook +++ b/test/zdtm/static/socket-tcp-fin-wait1.hook @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import sys -sys.path.append("../crit") +sys.path.append("../lib") import pycriu import os, os.path From 
f104b3d6d7c12447ce457a518825b7966c6ef53b Mon Sep 17 00:00:00 2001 From: Marcus Folkesson Date: Fri, 20 Oct 2023 08:10:35 +0200 Subject: [PATCH 357/775] Makefile: introduce ARCHCFLAGS for arch specific cflags Do not use $(USERCFLAGS) for anything other than what the user provides. Signed-off-by: Marcus Folkesson --- Makefile | 8 ++++---- test/zdtm/Makefile.inc | 8 ++++---- test/zdtm/lib/Makefile | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 88a23617b..9f0928b01 100644 --- a/Makefile +++ b/Makefile @@ -35,18 +35,18 @@ ifeq ($(ARCH),arm) ARMV := $(shell echo $(SUBARCH) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') ifeq ($(ARMV),6) - USERCFLAGS += -march=armv6 + ARCHCFLAGS += -march=armv6 endif ifeq ($(ARMV),7) - USERCFLAGS += -march=armv7-a+fp + ARCHCFLAGS += -march=armv7-a+fp endif ifeq ($(ARMV),8) # Running 'setarch linux32 uname -m' returns armv8l on travis aarch64. # This tells CRIU to handle armv8l just as armv7hf. Right now this is # only used for compile testing. No further verification of armv8l exists.
- USERCFLAGS += -march=armv7-a + ARCHCFLAGS += -march=armv7-a ARMV := 7 endif @@ -159,7 +159,7 @@ export GMON GMONLDOPT endif AFLAGS += -D__ASSEMBLY__ -CFLAGS += $(USERCFLAGS) $(WARNINGS) $(DEFINES) -iquote include/ +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) $(WARNINGS) $(DEFINES) -iquote include/ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index d34523315..2456260e6 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -23,12 +23,12 @@ ifeq ($(ARCH),arm) ARMV := $(shell echo $(SUBARCH) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') ifeq ($(ARMV),6) - USERCFLAGS += -march=armv6 + ARCHCFLAGS += -march=armv6 else ifeq ($(ARMV),7) - USERCFLAGS += -march=armv7-a+fp + ARCHCFLAGS += -march=armv7-a+fp else ifeq ($(ARMV),8) # To build aarch32 on armv8 Travis-CI (see criu Makefile) - USERCFLAGS += -march=armv7-a + ARCHCFLAGS += -march=armv7-a ARMV := 7 endif endif @@ -40,7 +40,7 @@ endif PKG_CONFIG ?= pkg-config CFLAGS += -g -O2 -Wall -Werror -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 CFLAGS += -Wdeclaration-after-statement -Wstrict-prototypes -CFLAGS += $(USERCFLAGS) +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) CFLAGS += -D_GNU_SOURCE CPPFLAGS += -iquote $(LIBDIR)/arch/$(ARCH)/include diff --git a/test/zdtm/lib/Makefile b/test/zdtm/lib/Makefile index b574e1d3e..428d726d6 100644 --- a/test/zdtm/lib/Makefile +++ b/test/zdtm/lib/Makefile @@ -1,6 +1,6 @@ LIBDIR := . -CFLAGS += $(USERCFLAGS) +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) LIB := libzdtmtst.a From f8b14286b092853a4485813e1efd564109df9123 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 16 Nov 2023 12:52:16 -0800 Subject: [PATCH 358/775] criu: Version 3.19 (Bronze Peacock) Two major highlights of this release: * LoongArch64 support * A lot of fixes and improvements from the Google backlog. The full changelog can be found here: https://criu.org/Download/criu/3.19.
This marks the final release of the 3.x series. The upcoming version will be 4.0! Additionally, the naming pattern will be changed. Any ideas are welcome. Signed-off-by: Andrei Vagin --- Makefile.versions | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.versions b/Makefile.versions index 4c645cd6c..5f21c11c2 100644 --- a/Makefile.versions +++ b/Makefile.versions @@ -1,10 +1,10 @@ # # CRIU version. CRIU_VERSION_MAJOR := 3 -CRIU_VERSION_MINOR := 18 +CRIU_VERSION_MINOR := 19 CRIU_VERSION_SUBLEVEL := CRIU_VERSION_EXTRA := -CRIU_VERSION_NAME := Silver Sandpiper +CRIU_VERSION_NAME := Bronze Peacock CRIU_VERSION := $(CRIU_VERSION_MAJOR)$(if $(CRIU_VERSION_MINOR),.$(CRIU_VERSION_MINOR))$(if $(CRIU_VERSION_SUBLEVEL),.$(CRIU_VERSION_SUBLEVEL))$(if $(CRIU_VERSION_EXTRA),.$(CRIU_VERSION_EXTRA)) export CRIU_VERSION_MAJOR CRIU_VERSION_MINOR CRIU_VERSION_SUBLEVEL From 9f9737c800fd19ebc133e654d69cc8b81aeb2e5e Mon Sep 17 00:00:00 2001 From: sally kang Date: Mon, 27 Nov 2023 22:23:55 +0800 Subject: [PATCH 359/775] compel: correct the syscall number of bind on ARM64 In the compel/arch/arm/plugins/std/syscalls/syscall.def, the syscall number of bind on ARM64 should be 200 instead of 235. Signed-off-by: Sally Kang --- compel/arch/arm/plugins/std/syscalls/syscall.def | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index 7489ee0c1..217e346a3 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -39,7 +39,7 @@ recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, str sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags) recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags) shutdown 210 293 (int sockfd, int how) -bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen) +bind 200 282 (int sockfd, const struct sockaddr *addr, int
addrlen) setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen) getsockopt 209 295 (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) From fc94b2d158961611d39b7e692c019487da43e01b Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 29 Nov 2023 08:27:29 +0000 Subject: [PATCH 360/775] ci: fix rawhide netlink error The rawhide netlink errors are fixed with a newer kernel than the default 6.2 available in Fedora 38. Signed-off-by: Adrian Reber --- scripts/ci/vagrant.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 328903f38..c0c8e88c1 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -57,6 +57,11 @@ fedora-no-vdso() { } fedora-rawhide() { + # The 6.2 kernel of Fedora 38 in combination with rawhide userspace breaks + # zdtm/static/socket-tcp-nfconntrack. To activate the new kernel previously + # installed this reboots the VM. + vagrant reload + ssh default uname -a # # Workaround the problem: # error running container: error from /usr/bin/crun creating container for [...]: sd-bus call: Transport endpoint is not connected From 900909d95e0bf13dba0ba93488bcebbdcdebc86b Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 29 Nov 2023 12:30:42 +0000 Subject: [PATCH 361/775] test: check for btrfs in the current directory The old test was checking if '/' is btrfs but we should check if the current directory is btrfs. 
Signed-off-by: Adrian Reber --- test/jenkins/criu-fault.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index 7f503e817..4a6d55e6b 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -9,7 +9,7 @@ prep ./test/zdtm.py run -t zdtm/static/maps00 --fault 3 --report report -f h || fail # FIXME: fhandles looks broken on btrfs -grep -P "/.* / " /proc/self/mountinfo | grep -q btrfs || NOBTRFS=$? +findmnt --noheadings --target . | grep -q btrfs || NOBTRFS=$? if [ $NOBTRFS -eq 1 ] ; then ./test/zdtm.py run -t zdtm/static/inotify_irmap --fault 128 --pre 2 -f uns || fail fi From 088390ea89b86ea2cca89ebcecd4e4046b52ace3 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 29 Nov 2023 14:56:41 +0000 Subject: [PATCH 362/775] ci: switch to permissive selinux mode during test Signed-off-by: Adrian Reber --- scripts/ci/run-ci-tests.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 1aae555f7..e05ead668 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -292,10 +292,18 @@ if capsh --supports=cap_checkpoint_restore && unshare -c /bin/true; then make -C test/zdtm/ cleanout rm -rf test/dump setcap cap_checkpoint_restore,cap_sys_ptrace+eip criu/criu + if [ -d /sys/fs/selinux ]; then + # Note: selinux in Enforcing mode prevents us from calling clone3() or writing to ns_last_pid on restore; hence set to Permissive for the test and then set back. + selinuxmode=$(getenforce) + setenforce Permissive + fi # Run it as non-root in a user namespace. Since CAP_CHECKPOINT_RESTORE behaves differently in non-user namespaces (e.g. no access to map_files) this tests that we can dump and restore # under those conditions. Note that the "... 
&& true" part is necessary; we need at least one statement after the tests so that bash can reap zombies in the user namespace, # otherwise it will exec the last statement and get replaced and nobody will be left to reap our zombies. sudo --user=#65534 --group=#65534 unshare -Ucfpm --mount-proc -- bash -c "./test/zdtm.py run -t zdtm/static/maps00 -f h --rootless && true" + if [ -d /sys/fs/selinux ]; then + setenforce "$selinuxmode" + fi setcap -r criu/criu else echo "Skipping unprivileged mode tests" From 1004625facde4f2b747321ed7ef7260763d3650d Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 28 Nov 2023 13:18:23 +0000 Subject: [PATCH 363/775] docker-test: fix condition for max tries Replace a recursive call with a loop. Reported-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- scripts/ci/docker-test.sh | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index 22d326a37..174c2e109 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -87,27 +87,25 @@ print_logs () { } declare -i max_restore_container_tries=3 -current_iteration= restore_container () { CHECKPOINT_NAME=$1 - docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log || { + for i in $(seq $max_restore_container_tries); do + docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log && break + # FIXME: There is a race condition in docker/containerd that causes # docker to occasionally fail when starting a container from a # checkpoint immediately after the checkpoint has been created. 
# https://github.com/moby/moby/issues/42900 - if [ "$current_iteration" -gt "$max_restore_container_tries" ]; then + if grep -Eq '^Error response from daemon: failed to upload checkpoint to containerd: commit failed: content sha256:.*: already exists$' log; then + echo "Retry container restore: $i/$max_restore_container_tries" + sleep 1; + else print_logs fi - grep -Eq '^Error response from daemon: failed to upload checkpoint to containerd: commit failed: content sha256:.*: already exists$' log && { - ((current_iteration+=1)) - echo "Retry container restore: $current_iteration" - sleep 1; - restore_container "$CHECKPOINT_NAME" - } || - print_logs - } && current_iteration=0 + + done } # Scenario: Create multiple containers and checkpoint and restore them once From 37d62fa47552b1c70cd8cb8ea5564660d67c69ef Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 29 Nov 2023 11:46:08 +0000 Subject: [PATCH 364/775] docker-test: downgrade docker to v24.0.7 Checkpoint/restore with version 25.0.0-beta.1 fails with the following error: $ docker start --checkpoint=c1 cr Error response from daemon: failed to create task for container: content digest fdb1054b00a8c07f08574ce52198c5501d1f552b6a5fb46105c688c70a9acb45: not found: unknown Release notes: https://github.com/moby/moby/discussions/46816 Signed-off-by: Radostin Stoyanov --- scripts/ci/docker-test.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index 174c2e109..7e7ef7197 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -15,10 +15,11 @@ add-apt-repository \ $(lsb_release -cs) \ stable test" -./apt-install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin - -# shellcheck source=/dev/null -. /etc/lsb-release +# checkpoint/restore is broken in Docker Engine (Community) version 25.0.0-beta.1 +# https://github.com/moby/moby/discussions/46816 +# Downgrade to the latest stable version. 
+VERSION_STRING=5:24.0.7-1~ubuntu.20.04~focal +./apt-install docker-ce=$VERSION_STRING docker-ce-cli=$VERSION_STRING containerd.io docker-buildx-plugin docker-compose-plugin # docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json From 8a51639e3d5b5cda0cc81c19ce0645a9f2f46019 Mon Sep 17 00:00:00 2001 From: "Ivan A. Melnikov" Date: Wed, 6 Dec 2023 18:01:08 +0400 Subject: [PATCH 365/775] Makefile: Use common warnings settings for loongarch64 WARNINGS variable should be amended, not redefined. We still need, e.g., `-Wno-dangling-pointer` to build criu on loongarch64 with gcc13. Signed-off-by: Ivan A. Melnikov --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9f0928b01..05f1964a2 100644 --- a/Makefile +++ b/Makefile @@ -127,7 +127,7 @@ WARNINGS := -rdynamic endif ifeq ($(ARCH),loongarch64) -WARNINGS := -Wno-implicit-function-declaration +WARNINGS += -Wno-implicit-function-declaration endif ifneq ($(GCOV),) From f86f1b84911784ffd45db05204cea01a2a82afb3 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 8 Dec 2023 17:36:32 +0100 Subject: [PATCH 366/775] tty: skip ioctl(TIOCSLCKTRMIOS) if possible If ioctl(TIOCSLCKTRMIOS) fails with EPERM it means that a CRIU process lacks the CAP_SYS_ADMIN capability. But we can use ioctl(TIOCGLCKTRMIOS) to *read* current ->termios_locked value from the kernel and if it's the same as we already have we can skip failing ioctl(TIOCSLCKTRMIOS) safely. Adrian has recently posted [1] a very good patch to allow ioctl(TIOCSLCKTRMIOS) for processes that have CAP_CHECKPOINT_RESTORE (right now it requires CAP_SYS_ADMIN).
[1] https://lore.kernel.org/all/20231206134340.7093-1-areber@redhat.com/ Suggested-by: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- criu/tty.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/criu/tty.c b/criu/tty.c index 9faf602f2..ae23094b7 100644 --- a/criu/tty.c +++ b/criu/tty.c @@ -817,8 +817,26 @@ static int do_restore_tty_parms(void *arg, int fd, pid_t pid) * on termios too. Just to be on the safe side. */ - if ((p->has & HAS_TERMIOS_L) && ioctl(fd, TIOCSLCKTRMIOS, &p->tl) < 0) - goto err; + if ((p->has & HAS_TERMIOS_L) && ioctl(fd, TIOCSLCKTRMIOS, &p->tl) < 0) { + struct termios t; + + if (errno != EPERM) + goto err; + + memzero(&t, sizeof(t)); + if (ioctl(fd, TIOCGLCKTRMIOS, &t) < 0) { + pr_perror("Can't get tty locked params on %#x", p->tty_id); + goto err; + } + + /* + * The ioctl(TIOCSLCKTRMIOS) requires a CRIU process to be privileged + * in the init_user_ns, but if the current "termios_locked" value equal + * to the "termios_locked" value from the image, we can safely skip setting it. + */ + if (memcmp(&t, &p->tl, sizeof(struct termios)) != 0) + goto err; + } if ((p->has & HAS_TERMIOS) && ioctl(fd, TCSETS, &p->t) < 0) goto err; From 6679d60ffd2da0282340a501ccfa1b439d452116 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 30 Nov 2023 13:29:56 +0000 Subject: [PATCH 367/775] ci: do not use 'tail' for skip-file-rwx-check test Newer versions of 'tail' rely on inotify and after a restore 'tail' is unhappy with the state of inotify and just stops. This replaces 'tail' with a minimal shell based test (thanks Andrei). 
Signed-off-by: Adrian Reber --- test/others/skip-file-rwx-check/run.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/others/skip-file-rwx-check/run.sh b/test/others/skip-file-rwx-check/run.sh index 0803d78ec..0776ebf61 100755 --- a/test/others/skip-file-rwx-check/run.sh +++ b/test/others/skip-file-rwx-check/run.sh @@ -10,11 +10,11 @@ source ../env.sh make clean touch testfile chmod +w testfile -tail --follow testfile & -tailpid=$! -if ! "$criu" dump --tree=$tailpid --shell-job --verbosity=4 --log-file=dump.log +bash -c 'exec 3 Date: Thu, 30 Nov 2023 14:13:37 +0000 Subject: [PATCH 368/775] ci: fix centos-stream 9 ci errors The image has a too old version of nettle which does not work with gnutls. Just upgrade to the latest to make the error go away. Signed-off-by: Adrian Reber --- .cirrus.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.cirrus.yml b/.cirrus.yml index 6a586d58b..adaa9be33 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -37,6 +37,9 @@ task: dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata python-flake8 xmlto libdrm-devel + # The image has a too old version of nettle which does not work with gnutls. + # Just upgrade to the latest to make the error go away. + dnf -y upgrade nettle nettle-devel systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed. 
# The Cirrus CI user runs as a service from selinux point of view and is From 2d1f4ec719b8bcdf60ffa57d7954965c41e61ad7 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 30 Nov 2023 15:10:23 +0000 Subject: [PATCH 369/775] ci: disable non-root in user namespace test in container Signed-off-by: Adrian Reber --- scripts/ci/prepare-for-fedora-rawhide.sh | 1 + scripts/ci/run-ci-tests.sh | 11 ++++++++--- scripts/ci/vagrant.sh | 4 ++++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index e31814a95..d812c5faa 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -18,6 +18,7 @@ dnf install -y \ libnet-devel \ libnl3-devel \ libbsd-devel \ + libselinux-utils \ make \ procps-ng \ protobuf-c-devel \ diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index e05ead668..ef7e869e0 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -288,11 +288,16 @@ ip net add test # Rootless tests # Check if cap_checkpoint_restore is supported and also if unshare -c is supported. -if capsh --supports=cap_checkpoint_restore && unshare -c /bin/true; then +# +# Do not run this test in a container (see https://github.com/checkpoint-restore/criu/issues/2312). +# This is a temporary workaround until fixed in the kernel. +# The kernel currently does not show correct device and inode numbers in /proc/pid/maps +# for stackable file systems. +if capsh --supports=cap_checkpoint_restore && unshare -c /bin/true && [ ! -e /run/.containerenv ]; then make -C test/zdtm/ cleanout rm -rf test/dump setcap cap_checkpoint_restore,cap_sys_ptrace+eip criu/criu - if [ -d /sys/fs/selinux ]; then + if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then # Note: selinux in Enforcing mode prevents us from calling clone3() or writing to ns_last_pid on restore; hence set to Permissive for the test and then set back. 
selinuxmode=$(getenforce) setenforce Permissive @@ -301,7 +306,7 @@ if capsh --supports=cap_checkpoint_restore && unshare -c /bin/true; then # under those conditions. Note that the "... && true" part is necessary; we need at least one statement after the tests so that bash can reap zombies in the user namespace, # otherwise it will exec the last statement and get replaced and nobody will be left to reap our zombies. sudo --user=#65534 --group=#65534 unshare -Ucfpm --mount-proc -- bash -c "./test/zdtm.py run -t zdtm/static/maps00 -f h --rootless && true" - if [ -d /sys/fs/selinux ]; then + if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then setenforce "$selinuxmode" fi setcap -r criu/criu diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index c0c8e88c1..c8cf0be74 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -70,6 +70,10 @@ fedora-rawhide() { # ssh default 'sudo dnf remove -y crun || true' ssh default sudo dnf install -y podman runc + # Some tests in the container need selinux to be disabled. + # In the container it is not possible to change the state of selinux. + # Let's just disable it for this test run completely. + ssh default 'sudo setenforce Permissive' ssh default 'cd /vagrant; tar xf criu.tar; cd criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' } From 7b689b7a423e16864b7cd99cc1a216a4464f9591 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 20 Oct 2023 10:59:19 +0100 Subject: [PATCH 370/775] gitignore: remove historical left-over files In commit [1] was introduced a mechanism to auto-generate the files: sys-exec-tbl*.c, syscalls*.S, syscall-codes*.h, and syscall*.h. This commit also updated the gitignore rules to ignore auto-generated files. However, after commit [2], the path for these files has changed and the patterns specified in gitignore are no longer needed. 
[1] bbc2f133 (x86/build: generate syscalls-{64,32}.built-in.o) [2] 19fadee9 (compel: plugins,std -- Implement syscalls in std plugin) Reported-by: @felicitia Signed-off-by: Radostin Stoyanov --- .gitignore | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.gitignore b/.gitignore index 2f2ab2029..854657d1c 100644 --- a/.gitignore +++ b/.gitignore @@ -25,12 +25,6 @@ images/google/protobuf/*.h .gitid criu/criu criu/unittest/unittest -criu/arch/*/sys-exec-tbl*.c -# x86 syscalls-table is not generated -!criu/arch/x86/sys-exec-tbl.c -criu/arch/*/syscalls*.S -criu/include/syscall-codes*.h -criu/include/syscall*.h criu/include/version.h criu/pie/restorer-blob.h criu/pie/parasite-blob.h From b419f3dfdc7f8b8caa2af44e4af93d01c7bd34d1 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 27 Dec 2023 20:33:30 -0800 Subject: [PATCH 371/775] make: fix compilation on alpine Starting with the musl v1.2.4~69, _GNU_SOURCE doesn't set _LARGEFILE64_SOURCE. Fixes #2313 Signed-off-by: Andrei Vagin --- Makefile | 1 + scripts/build/Dockerfile.alpine | 2 +- test/zdtm/Makefile.inc | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 05f1964a2..31dbe202f 100644 --- a/Makefile +++ b/Makefile @@ -106,6 +106,7 @@ export PROTOUFIX DEFINES # # Independent options for all tools. 
DEFINES += -D_FILE_OFFSET_BITS=64 +DEFINES += -D_LARGEFILE64_SOURCE DEFINES += -D_GNU_SOURCE WARNINGS := -Wall -Wformat-security -Wdeclaration-after-statement -Wstrict-prototypes diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index cb746757a..593e19031 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -47,6 +47,6 @@ RUN apk add \ # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 -D test -RUN pip3 install junit_xml +RUN pip3 install junit_xml --break-system-packages RUN make -C test/zdtm diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index 2456260e6..24f32c606 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -41,7 +41,7 @@ PKG_CONFIG ?= pkg-config CFLAGS += -g -O2 -Wall -Werror -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 CFLAGS += -Wdeclaration-after-statement -Wstrict-prototypes CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) -CFLAGS += -D_GNU_SOURCE +CFLAGS += -D_GNU_SOURCE -D_LARGEFILE64_SOURCE CPPFLAGS += -iquote $(LIBDIR)/arch/$(ARCH)/include ifeq ($(strip $(V)),) From d9c427d70c03e62b23c3e9491bebdbd5c60ef16f Mon Sep 17 00:00:00 2001 From: robert Date: Sun, 7 Jan 2024 15:32:00 -0800 Subject: [PATCH 372/775] irmap: hardcode some more interesting paths Signed-off-by: robert --- criu/irmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/criu/irmap.c b/criu/irmap.c index e12df5cb5..37d098db1 100644 --- a/criu/irmap.c +++ b/criu/irmap.c @@ -67,6 +67,7 @@ static struct irmap hints[] = { .path = "/var/log", .nr_kids = -1, }, + { .path = "/usr/share/dbus-1/services", .nr_kids = -1 }, { .path = "/usr/share/dbus-1/system-services", .nr_kids = -1 }, { .path = "/var/lib/polkit-1/localauthority", .nr_kids = -1 }, { .path = "/usr/share/polkit-1/actions", .nr_kids = -1 }, From 0ab2f9e976952679239f937642dc8ac020bbdca9 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 5 Jan 2024 14:40:25 +0000 Subject: [PATCH 373/775] net: fix network unlock 
with iptables-nft When iptables-nft is used as backend for iptables, the rules for network locking are translated into the following nft rules: ``` $ iptables-restore-translate -f lock.txt add table ip filter add chain ip filter CRIU insert rule ip filter INPUT counter jump CRIU insert rule ip filter OUTPUT counter jump CRIU add rule ip filter CRIU mark 0xc114 counter accept add rule ip filter CRIU counter drop ``` These rules create the following chains: ``` table ip filter { # handle 1 chain CRIU { # handle 1 meta mark 0x0000c114 counter packets 16 bytes 890 accept # handle 6 counter packets 1 bytes 60 drop # handle 7 meta mark 0x0000c114 counter packets 0 bytes 0 accept # handle 8 counter packets 0 bytes 0 drop # handle 9 } chain INPUT { # handle 2 type filter hook input priority filter; policy accept; counter packets 8 bytes 445 jump CRIU # handle 3 counter packets 0 bytes 0 jump CRIU # handle 10 } chain OUTPUT { # handle 4 type filter hook output priority filter; policy accept; counter packets 9 bytes 505 jump CRIU # handle 5 counter packets 0 bytes 0 jump CRIU # handle 11 } } ``` In order to delete the CRIU chain, we need to first delete all four jump targets. 
Otherwise, `-X CRIU` would fail with the following error: iptables-restore v1.8.10 (nf_tables): line 5: CHAIN_DEL failed (Resource busy): chain CRIU Reported-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- criu/net.c | 50 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/criu/net.c b/criu/net.c index 7109e6876..b34c379ba 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3178,19 +3178,53 @@ static inline int nftables_network_unlock(void) #endif } +static int iptables_has_criu_jump_target(void) +{ + int fd, ret; + char *argv[4] = { "sh", "-c", "iptables -C INPUT -j CRIU", NULL }; + + fd = open("/dev/null", O_RDWR); + if (fd < 0) { + fd = -1; + pr_perror("failed to open /dev/null, using log fd"); + } + + ret = cr_system(fd, fd, fd, "sh", argv, CRS_CAN_FAIL); + close_safe(&fd); + return ret; +} + static int iptables_network_unlock_internal(void) { - char conf[] = "*filter\n" - ":CRIU - [0:0]\n" - "-D INPUT -j CRIU\n" - "-D OUTPUT -j CRIU\n" - "-X CRIU\n" - "COMMIT\n"; + char delete_jump_targets[] = "*filter\n" + ":CRIU - [0:0]\n" + "-D INPUT -j CRIU\n" + "-D OUTPUT -j CRIU\n" + "COMMIT\n"; + + char delete_criu_chain[] = "*filter\n" + ":CRIU - [0:0]\n" + "-X CRIU\n" + "COMMIT\n"; + int ret = 0; - ret |= iptables_restore(false, conf, sizeof(conf) - 1); + ret |= iptables_restore(false, delete_jump_targets, sizeof(delete_jump_targets) - 1); if (kdat.ipv6) - ret |= iptables_restore(true, conf, sizeof(conf) - 1); + ret |= iptables_restore(true, delete_jump_targets, sizeof(delete_jump_targets) - 1); + + /* For compatibility with iptables-nft backend, we need to make sure that all jump + * targets have been removed before deleting the CRIU chain. 
+ */ + if (!iptables_has_criu_jump_target()) { + ret |= iptables_restore(false, delete_jump_targets, sizeof(delete_jump_targets) - 1); + if (kdat.ipv6) + ret |= iptables_restore(true, delete_jump_targets, sizeof(delete_jump_targets) - 1); + } + + ret |= iptables_restore(false, delete_criu_chain, sizeof(delete_criu_chain) - 1); + if (kdat.ipv6) + ret |= iptables_restore(true, delete_criu_chain, sizeof(delete_criu_chain) - 1); return ret; } From d94251df755b5fa5751b2f6cc56e45bde7703938 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 5 Jan 2024 18:07:59 +0000 Subject: [PATCH 374/775] test/nfconntrack: use nft or iptables-legacy nft does not support xtables compat expressions https://git.netfilter.org/nftables/commit/?id=79195a8cc9e9d9cf2d17165bf07ac4cc9d55539f Signed-off-by: Radostin Stoyanov --- scripts/build/Dockerfile.alpine | 1 + test/zdtm/static/Makefile | 8 +++--- ...nntrack.c => socket-tcp-ipt-nfconntrack.c} | 0 .../static/socket-tcp-ipt-nfconntrack.desc | 6 +++++ test/zdtm/static/socket-tcp-nfconntrack.desc | 1 - test/zdtm/static/socket-tcp-nft-nfconntrack.c | 1 + .../static/socket-tcp-nft-nfconntrack.desc | 7 +++++ test/zdtm/static/socket-tcp.c | 27 ++++++++++++++++--- 8 files changed, 44 insertions(+), 7 deletions(-) rename test/zdtm/static/{socket-tcp-nfconntrack.c => socket-tcp-ipt-nfconntrack.c} (100%) create mode 100644 test/zdtm/static/socket-tcp-ipt-nfconntrack.desc delete mode 100644 test/zdtm/static/socket-tcp-nfconntrack.desc create mode 120000 test/zdtm/static/socket-tcp-nft-nfconntrack.c create mode 100644 test/zdtm/static/socket-tcp-nft-nfconntrack.desc diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index 593e19031..2c58c910e 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -33,6 +33,7 @@ RUN make mrproper && date && make -j $(nproc) CC="$CC" && date RUN apk add \ ip6tables \ iptables \ + iptables-legacy \ nftables \ iproute2 \ tar \ diff --git 
a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 07d3bc6e2..fb856d55b 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -85,7 +85,8 @@ TST_NOFILE := \ socket-tcp4v6 \ socket-tcp-local \ socket-tcp-reuseport \ - socket-tcp-nfconntrack \ + socket-tcp-ipt-nfconntrack \ + socket-tcp-nft-nfconntrack \ socket-tcp6-local \ socket-tcp4v6-local \ socket-tcpbuf \ @@ -277,7 +278,7 @@ pkg-config-check = $(shell sh -c '$(PKG_CONFIG) $(1) && echo y') ifeq ($(call pkg-config-check,libbpf),y) TST_NOFILE += \ bpf_hash \ - bpf_array + bpf_array endif ifneq ($(ARCH),arm) @@ -598,7 +599,8 @@ socket-tcpbuf6-local: CFLAGS += -D ZDTM_TCP_LOCAL -D ZDTM_IPV6 socket-tcp6-local: CFLAGS += -D ZDTM_TCP_LOCAL -D ZDTM_IPV6 socket-tcp4v6-local: CFLAGS += -D ZDTM_TCP_LOCAL -D ZDTM_IPV4V6 socket-tcp-local: CFLAGS += -D ZDTM_TCP_LOCAL -socket-tcp-nfconntrack: CFLAGS += -D ZDTM_TCP_LOCAL -DZDTM_CONNTRACK +socket-tcp-ipt-nfconntrack: CFLAGS += -D ZDTM_TCP_LOCAL -DZDTM_IPT_CONNTRACK +socket-tcp-nft-nfconntrack: CFLAGS += -D ZDTM_TCP_LOCAL -DZDTM_NFT_CONNTRACK socket_listen6: CFLAGS += -D ZDTM_IPV6 socket_listen4v6: CFLAGS += -D ZDTM_IPV4V6 socket-tcp6-closed: CFLAGS += -D ZDTM_IPV6 diff --git a/test/zdtm/static/socket-tcp-nfconntrack.c b/test/zdtm/static/socket-tcp-ipt-nfconntrack.c similarity index 100% rename from test/zdtm/static/socket-tcp-nfconntrack.c rename to test/zdtm/static/socket-tcp-ipt-nfconntrack.c diff --git a/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc b/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc new file mode 100644 index 000000000..53dd82285 --- /dev/null +++ b/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc @@ -0,0 +1,6 @@ +{ + 'feature': 'has_ipt_legacy', + 'flavor': 'h', + 'opts': '--tcp-established', + 'flags': 'suid' +} diff --git a/test/zdtm/static/socket-tcp-nfconntrack.desc b/test/zdtm/static/socket-tcp-nfconntrack.desc deleted file mode 100644 index add2513f8..000000000 --- a/test/zdtm/static/socket-tcp-nfconntrack.desc +++ 
/dev/null @@ -1 +0,0 @@ -{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'suid'} diff --git a/test/zdtm/static/socket-tcp-nft-nfconntrack.c b/test/zdtm/static/socket-tcp-nft-nfconntrack.c new file mode 120000 index 000000000..8cb60dd03 --- /dev/null +++ b/test/zdtm/static/socket-tcp-nft-nfconntrack.c @@ -0,0 +1 @@ +socket-tcp.c \ No newline at end of file diff --git a/test/zdtm/static/socket-tcp-nft-nfconntrack.desc b/test/zdtm/static/socket-tcp-nft-nfconntrack.desc new file mode 100644 index 000000000..38a4eb389 --- /dev/null +++ b/test/zdtm/static/socket-tcp-nft-nfconntrack.desc @@ -0,0 +1,7 @@ +{ + 'flavor': 'h', + 'feature': 'network_lock_nftables', + 'opts': '--tcp-established', + 'dopts': '--network-lock nftables', + 'flags': 'suid' +} diff --git a/test/zdtm/static/socket-tcp.c b/test/zdtm/static/socket-tcp.c index f6ef47385..9830c7860 100644 --- a/test/zdtm/static/socket-tcp.c +++ b/test/zdtm/static/socket-tcp.c @@ -67,17 +67,38 @@ int main(int argc, char **argv) int val; socklen_t optlen; -#ifdef ZDTM_CONNTRACK +#ifdef ZDTM_IPT_CONNTRACK if (unshare(CLONE_NEWNET)) { pr_perror("unshare"); return 1; } if (system("ip link set up dev lo")) return 1; - if (system("iptables -w -A INPUT -i lo -p tcp -m state --state NEW,ESTABLISHED -j ACCEPT")) + + if (system("iptables-legacy -w -A INPUT -i lo -p tcp -m state --state NEW,ESTABLISHED -j ACCEPT")) return 1; - if (system("iptables -w -A INPUT -j DROP")) + if (system("iptables-legacy -w -A INPUT -j DROP")) return 1; + +#endif + +#ifdef ZDTM_NFT_CONNTRACK + if (unshare(CLONE_NEWNET)) { + pr_perror("unshare"); + return 1; + } + if (system("ip link set up dev lo")) + return 1; + + if (system("nft add table ip filter")) + return 1; + if (system("nft add chain ip filter INPUT")) + return 1; + if (system("nft add rule ip filter INPUT iifname \"lo\" ip protocol tcp ct state new,established counter accept")) + return 1; + if (system("nft add rule ip filter INPUT counter drop")) + return 1; + #endif #ifdef ZDTM_TCP_LOCAL 
From 842289c7ebb303f9dd42f2e20ea6787218123bde Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 7 Jan 2024 18:09:28 +0000 Subject: [PATCH 375/775] net: add error messages for restore of nftables Show appropriate error messages when restore of nftables fails. Signed-off-by: Radostin Stoyanov --- criu/net.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/criu/net.c b/criu/net.c index b34c379ba..0f7280bb5 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2438,27 +2438,39 @@ static inline int do_restore_nftables(struct cr_img *img) off_t img_data_size; char *buf; - if ((img_data_size = img_raw_size(img)) < 0) + if ((img_data_size = img_raw_size(img)) < 0) { + pr_err("image size mismatch\n"); goto out; + } - if (read_img_str(img, &buf, img_data_size) < 0) + if (read_img_str(img, &buf, img_data_size) < 0) { + pr_err("Failed to read nftables data\n"); goto out; + } nft = nft_ctx_new(NFT_CTX_DEFAULT); - if (!nft) + if (!nft) { + pr_err("Failed to create nft context object\n"); goto buf_free_out; - - if (nft_ctx_buffer_output(nft) || nft_ctx_buffer_error(nft) || -#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) - nft_run_cmd_from_buffer(nft, buf, strlen(buf))) -#elif defined(CONFIG_HAS_NFTABLES_LIB_API_1) - nft_run_cmd_from_buffer(nft, buf)) -#else - { - BUILD_BUG_ON(1); } -#endif + + if (nft_ctx_buffer_output(nft) || nft_ctx_buffer_error(nft)) { + pr_err("Failed to enable std/err output buffering\n"); goto nft_ctx_free_out; + } + +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) + if (nft_run_cmd_from_buffer(nft, buf, strlen(buf))) +#elif defined(CONFIG_HAS_NFTABLES_LIB_API_1) + if (nft_run_cmd_from_buffer(nft, buf)) +#else + BUILD_BUG_ON(1); +#endif + { + pr_err("nft command error:\n%s\n%s\n", + nft_ctx_get_error_buffer(nft), buf); + goto nft_ctx_free_out; + } exit_code = 0; From 20628bc8a1f04b5f975373dd5bccff778e908f5c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 25 Oct 2023 01:07:27 +0000 Subject: [PATCH 
376/775] kerndat: check the PAGEMAP_SCAN ioctl PAGEMAP_SCAN is a new ioctl that allows getting page attributes in a more efficient way than reading pagemap files. Signed-off-by: Andrei Vagin --- criu/cr-check.c | 10 ++++++ criu/include/kerndat.h | 1 + criu/include/pagemap_scan.h | 68 +++++++++++++++++++++++++++++++++++++ criu/kerndat.c | 20 +++++++++++ 4 files changed, 99 insertions(+) create mode 100644 criu/include/pagemap_scan.h diff --git a/criu/cr-check.c b/criu/cr-check.c index cb083b16c..fea1ce674 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1382,6 +1382,14 @@ static int check_ipv6_freebind(void) return 0; } +static int check_pagemap_scan(void) +{ + if (!kdat.has_pagemap_scan) + return -1; + + return 0; +} + static int (*chk_feature)(void); /* @@ -1502,6 +1510,7 @@ int cr_check(void) ret |= check_openat2(); ret |= check_ptrace_get_rseq_conf(); ret |= check_ipv6_freebind(); + ret |= check_pagemap_scan(); if (kdat.lsm == LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); @@ -1623,6 +1632,7 @@ static struct feature_list feature_list[] = { { "openat2", check_openat2 }, { "get_rseq_conf", check_ptrace_get_rseq_conf }, { "ipv6_freebind", check_ipv6_freebind }, + { "pagemap_scan", check_pagemap_scan }, { NULL, NULL }, }; diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index f5d409acb..91dbd494b 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -86,6 +86,7 @@ struct kerndat_s { struct __ptrace_rseq_configuration libc_rseq_conf; bool has_ipv6_freebind; bool has_membarrier_get_registrations; + bool has_pagemap_scan; }; extern struct kerndat_s kdat; diff --git a/criu/include/pagemap_scan.h b/criu/include/pagemap_scan.h new file mode 100644 index 000000000..0ad4c9bc0 --- /dev/null +++ b/criu/include/pagemap_scan.h @@ -0,0 +1,68 @@ +#ifndef __CR_PAGEMAP_SCAN_H__ +#define __CR_PAGEMAP_SCAN_H__ + +#ifndef PAGEMAP_SCAN +#include +#include "int.h" + +/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories.
*/ +#define PAGE_IS_WPALLOWED (1 << 0) +#define PAGE_IS_WRITTEN (1 << 1) +#define PAGE_IS_FILE (1 << 2) +#define PAGE_IS_PRESENT (1 << 3) +#define PAGE_IS_SWAPPED (1 << 4) +#define PAGE_IS_PFNZERO (1 << 5) +#define PAGE_IS_HUGE (1 << 6) +#define PAGE_IS_SOFT_DIRTY (1 << 7) + +/* + * struct page_region - Page region with flags + * @start: Start of the region + * @end: End of the region (exclusive) + * @categories: PAGE_IS_* category bitmask for the region + */ +struct page_region { + u64 start; + u64 end; + u64 categories; +}; + +#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) + +/* Flags for PAGEMAP_SCAN ioctl */ +#define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */ +#define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */ + +/* + * struct pm_scan_arg - Pagemap ioctl argument + * @size: Size of the structure + * @flags: Flags for the IOCTL + * @start: Starting address of the region + * @end: Ending address of the region + * @walk_end Address where the scan stopped (written by kernel). + * walk_end == end (address tags cleared) informs that the scan completed on entire range. 
+ * @vec: Address of page_region struct array for output + * @vec_len: Length of the page_region struct array + * @max_pages: Optional limit for number of returned pages (0 = disabled) + * @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1 + * @category_mask: Skip pages for which any category doesn't match + * @category_anyof_mask: Skip pages for which no category matches + * @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned + */ +struct pm_scan_arg { + u64 size; + u64 flags; + u64 start; + u64 end; + u64 walk_end; + u64 vec; + u64 vec_len; + u64 max_pages; + u64 category_inverted; + u64 category_mask; + u64 category_anyof_mask; + u64 return_mask; +}; +#endif /* PAGEMAP_SCAN */ + +#endif /* __CR_PAGEMAP_SCAN_H__ */ diff --git a/criu/kerndat.c b/criu/kerndat.c index fef5a46c1..95e7226b2 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -54,6 +54,7 @@ #include "memfd.h" #include "mount-v2.h" #include "util-caps.h" +#include "pagemap_scan.h" struct kerndat_s kdat = {}; volatile int dummy_var; @@ -74,6 +75,25 @@ static int check_pagemap(void) return -1; } + if (ioctl(fd, PAGEMAP_SCAN, NULL) == 0) { + pr_err("PAGEMAP_SCAN succeeded unexpectedly\n"); + return -1; + } else { + switch (errno) { + case EFAULT: + pr_debug("PAGEMAP_SCAN is supported\n"); + kdat.has_pagemap_scan = true; + break; + case EINVAL: + case ENOTTY: + pr_debug("PAGEMAP_SCAN isn't supported\n"); + break; + default: + pr_perror("PAGEMAP_SCAN failed with unexpected errno"); + return -1; + } + } + retry = 3; while (retry--) { ++dummy_var; From cb64d73adadf63c49a0fd6d458f5fe05a90daf1e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 23 Oct 2023 18:52:13 +0000 Subject: [PATCH 377/775] page-cache: use the PAGEMAP_SCAN ioctl when it is available Signed-off-by: Andrei Vagin --- criu/include/mem.h | 4 +- criu/include/pagemap-cache.h | 13 +++- criu/include/shmem.h | 3 +- criu/mem.c | 112 +++++++++++++++++++++++------------ 
criu/pagemap-cache.c | 88 +++++++++++++++++++++------ criu/shmem.c | 23 ++++--- 6 files changed, 172 insertions(+), 71 deletions(-) diff --git a/criu/include/mem.h b/criu/include/mem.h index 03574ea3d..3618c9cc3 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -7,6 +7,7 @@ #include "pid.h" #include "proc_parse.h" #include "inventory.pb-c.h" +#include "pagemap-cache.h" struct parasite_ctl; struct vm_area_list; @@ -47,5 +48,6 @@ int open_vmas(struct pstree_item *t); int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); int unmap_guard_pages(struct pstree_item *t); int prepare_mappings(struct pstree_item *t); -bool should_dump_page(VmaEntry *vmae, u64 pme); + +u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty); #endif /* __CR_MEM_H__ */ diff --git a/criu/include/pagemap-cache.h b/criu/include/pagemap-cache.h index 1d8bbffaf..875e69e56 100644 --- a/criu/include/pagemap-cache.h +++ b/criu/include/pagemap-cache.h @@ -1,10 +1,12 @@ #ifndef __CR_PAGEMAP_H__ #define __CR_PAGEMAP_H__ +#include #include #include "int.h" #include "common/list.h" +#include "pagemap_scan.h" struct vma_area; @@ -15,9 +17,15 @@ typedef struct { unsigned long start; /* start of area */ unsigned long end; /* end of area */ const struct list_head *vma_head; /* list head of VMAs we're serving */ + int fd; /* file to read PMs from */ + u64 *map; /* local buffer */ size_t map_len; /* length of a buffer */ - int fd; /* file to read PMs from */ + + struct page_region *regs; /* buffer for the PAGEMAP_SCAN ioctl */ + size_t regs_len; /* actual length of regs */ + size_t regs_max_len; /* maximum length of regs */ + size_t regs_idx; /* current index in the regs array */ } pmc_t; #define PMC_INIT \ @@ -26,7 +34,8 @@ typedef struct { } extern int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size); -extern u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma); +extern int pmc_get_map(pmc_t *pmc, const struct vma_area *vma); 
extern void pmc_fini(pmc_t *pmc); +extern int pmc_fill(pmc_t *pmc, u64 start, u64 end); #endif /* __CR_PAGEMAP_H__ */ diff --git a/criu/include/shmem.h b/criu/include/shmem.h index 813ef630e..15cab1146 100644 --- a/criu/include/shmem.h +++ b/criu/include/shmem.h @@ -4,13 +4,14 @@ #include "int.h" #include "common/lock.h" #include "images/vma.pb-c.h" +#include "pagemap-cache.h" struct vma_area; extern int collect_shmem(int pid, struct vma_area *vma); extern int collect_sysv_shmem(unsigned long shmid, unsigned long size); extern int cr_dump_shmem(void); -extern int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map); +extern int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc); extern int fixup_sysv_shmems(void); extern int dump_one_memfd_shmem(int fd, unsigned long shmid, unsigned long size); extern int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid); diff --git a/criu/mem.c b/criu/mem.c index 417e0a21d..f56ed826b 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -99,7 +99,7 @@ static inline bool __page_in_parent(bool dirty) return opts.track_mem && opts.img_parent && !dirty; } -bool should_dump_page(VmaEntry *vmae, u64 pme) +static bool should_dump_entire_vma(VmaEntry *vmae) { /* * vDSO area must be always dumped because on restore @@ -107,30 +107,53 @@ bool should_dump_page(VmaEntry *vmae, u64 pme) */ if (vma_entry_is(vmae, VMA_AREA_VDSO)) return true; - /* - * In turn VVAR area is special and referenced from - * vDSO area by IP addressing (at least on x86) thus - * never ever dump its content but always use one provided - * by the kernel on restore, ie runtime VVAR area must - * be remapped into proper place.. 
- */ - if (vma_entry_is(vmae, VMA_AREA_VVAR)) - return false; - - /* - * Optimisation for private mapping pages, that haven't - * yet being COW-ed - */ - if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) - return false; if (vma_entry_is(vmae, VMA_AREA_AIORING)) return true; - if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) - return true; return false; } +/* + * should_dump_page returns vaddr if an addressed page has to be dumped. + * Otherwise, it returns an address that has to be inspected next. + */ +u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty) +{ + if (vaddr >= pmc->end && pmc_fill(pmc, vaddr, vmae->end)) + return -1; + + if (pmc->regs) { + while (1) { + if (pmc->regs_idx == pmc->regs_len) + return pmc->end; + if (vaddr < pmc->regs[pmc->regs_idx].end) + break; + pmc->regs_idx++; + } + if (vaddr < pmc->regs[pmc->regs_idx].start) + return pmc->regs[pmc->regs_idx].start; + if (softdirty) + *softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; + return vaddr; + } else { + u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)]; + + /* + * Optimisation for private mapping pages, that haven't + * yet being COW-ed + */ + if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) + return vaddr + PAGE_SIZE; + if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) { + if (softdirty) + *softdirty = pme & PME_SOFT_DIRTY; + return vaddr; + } + + return vaddr + PAGE_SIZE; + } +} + bool page_is_zero(u64 pme) { return __page_is_zero(pme); @@ -164,25 +187,30 @@ static bool is_stack(struct pstree_item *item, unsigned long vaddr) * the memory contents is present in the parent image set. 
*/ -static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off, +static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, pmc_t *pmc, u64 *pvaddr, bool has_parent) { - u64 *at = &map[PAGE_PFN(*off)]; - unsigned long pfn, nr_to_scan; + unsigned long nr_scanned; unsigned long pages[3] = {}; + unsigned long vaddr; + bool dump_all_pages; int ret = 0; - nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE; + dump_all_pages = should_dump_entire_vma(vma->e); - for (pfn = 0; pfn < nr_to_scan; pfn++) { - unsigned long vaddr; + nr_scanned = 0; + for (vaddr = *pvaddr; vaddr < vma->e->end; vaddr += PAGE_SIZE, nr_scanned++) { unsigned int ppb_flags = 0; + bool softdirty = false; + u64 next; int st; - if (!should_dump_page(vma->e, at[pfn])) + /* If dump_all_pages is true, should_dump_page is called to get pme. */ + next = should_dump_page(pmc, vma->e, vaddr, &softdirty); + if (!dump_all_pages && next != vaddr) { + vaddr = next - PAGE_SIZE; continue; - - vaddr = vma->e->start + *off + pfn * PAGE_SIZE; + } if (vma_entry_can_be_lazy(vma->e) && !is_stack(item, vaddr)) ppb_flags |= PPB_LAZY; @@ -194,7 +222,7 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct * page. The latter would be checked in page-xfer. 
*/ - if (has_parent && page_in_parent(at[pfn] & PME_SOFT_DIRTY)) { + if (has_parent && page_in_parent(softdirty)) { ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT); st = 0; } else { @@ -214,9 +242,8 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct pages[st]++; } - *off += pfn * PAGE_SIZE; - - cnt_add(CNT_PAGES_SCANNED, nr_to_scan); + *pvaddr = vaddr; + cnt_add(CNT_PAGES_SCANNED, nr_scanned); cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]); cnt_add(CNT_PAGES_LAZY, pages[1]); cnt_add(CNT_PAGES_WRITTEN, pages[2]); @@ -356,12 +383,20 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str struct page_xfer *xfer, struct parasite_dump_pages_args *args, struct parasite_ctl *ctl, pmc_t *pmc, bool has_parent, bool pre_dump, int parent_predump_mode) { - u64 off = 0; - u64 *map; + u64 vaddr; int ret; if (!vma_area_is_private(vma, kdat.task_size) && !vma_area_is(vma, VMA_ANON_SHARED)) return 0; + /* + * In turn VVAR area is special and referenced from + * vDSO area by IP addressing (at least on x86) thus + * never ever dump its content but always use one provided + * by the kernel on restore, ie runtime VVAR area must + * be remapped into proper place.. 
+ */ + if (vma_entry_is(vma->e, VMA_AREA_VVAR)) + return 0; /* * To facilitate any combination of pre-dump modes to run after @@ -421,15 +456,14 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str has_parent = false; } - map = pmc_get_map(pmc, vma); - if (!map) + if (pmc_get_map(pmc, vma)) return -1; if (vma_area_is(vma, VMA_ANON_SHARED)) - return add_shmem_area(item->pid->real, vma->e, map); - + return add_shmem_area(item->pid->real, vma->e, pmc); + vaddr = vma->e->start; again: - ret = generate_iovs(item, vma, pp, map, &off, has_parent); + ret = generate_iovs(item, vma, pp, pmc, &vaddr, has_parent); if (ret == -EAGAIN) { BUG_ON(!(pp->flags & PP_CHUNK_MODE)); diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c index 09dbc6a36..d9bd1bc86 100644 --- a/criu/pagemap-cache.c +++ b/criu/pagemap-cache.c @@ -1,5 +1,6 @@ #include #include +#include #include "page.h" #include "pagemap-cache.h" @@ -22,6 +23,8 @@ #define PAGEMAP_LEN(addr) (PAGE_PFN(addr) * sizeof(u64)) +#define PAGE_REGIONS_MAX_NR 32768 + /* * It's a workaround for a kernel bug. In the 3.19 kernel when pagemap are read * for a few vma-s for one read call, it returns incorrect data. 
@@ -50,10 +53,23 @@ int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t siz pmc->pid = pid; pmc->map_len = PAGEMAP_LEN(map_size); pmc->vma_head = vma_head; + pmc->regs_max_len = PAGE_PFN(map_size); + if (pmc->regs_max_len > PAGE_REGIONS_MAX_NR) + pmc->regs_max_len = PAGE_REGIONS_MAX_NR; + pmc->regs_len = 0; + pmc->regs_idx = 0; + pmc->regs = NULL; + pmc->map = NULL; - pmc->map = xmalloc(pmc->map_len); - if (!pmc->map) - goto err; + if (kdat.has_pagemap_scan) { + pmc->regs = xmalloc(pmc->regs_max_len * sizeof(struct page_region)); + if (!pmc->regs) + goto err; + } else { + pmc->map = xmalloc(pmc->map_len); + if (!pmc->map) + goto err; + } if (pagemap_cache_disabled) pr_warn_once("The pagemap cache is disabled\n"); @@ -87,17 +103,11 @@ err: return -1; } -static inline u64 *__pmc_get_map(pmc_t *pmc, unsigned long addr) -{ - return &pmc->map[PAGE_PFN(addr - pmc->start)]; -} - static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) { unsigned long low = vma->e->start & PMC_MASK; unsigned long high = low + PMC_SIZE; size_t len = vma_area_len(vma); - size_t size_map; if (high > kdat.task_size) high = kdat.task_size; @@ -149,39 +159,79 @@ static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) pr_debug("\t%d: simple mode [l:%lx h:%lx]\n", pmc->pid, pmc->start, pmc->end); } + return pmc_fill(pmc, pmc->start, pmc->end); +} + +int pmc_fill(pmc_t *pmc, u64 start, u64 end) +{ + size_t size_map; + + pmc->start = start; + pmc->end = end; + size_map = PAGEMAP_LEN(pmc->end - pmc->start); BUG_ON(pmc->map_len < size_map); BUG_ON(pmc->fd < 0); - if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) { - pmc_zap(pmc); - pr_perror("Can't read %d's pagemap file", pmc->pid); - return -1; + if (pmc->regs) { + struct pm_scan_arg args = { + .size = sizeof(struct pm_scan_arg), + .flags = 0, + .start = pmc->start, + .end = pmc->end, + .vec = (long)pmc->regs, + .vec_len = pmc->regs_max_len, + .max_pages = 0, + /* + * Request 
pages that are in RAM or swap, excluding + * zero-filled and file-backed pages. + */ + .category_inverted = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_mask = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED, + .return_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_SOFT_DIRTY, + }; + long ret; + + ret = ioctl(pmc->fd, PAGEMAP_SCAN, &args); + if (ret == -1) { + pr_perror("PAGEMAP_SCAN"); + pmc_zap(pmc); + return -1; + } + pmc->regs_len = ret; + pmc->regs_idx = 0; + pmc->end = args.walk_end; + } else { + if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) { + pmc_zap(pmc); + pr_perror("Can't read %d's pagemap file", pmc->pid); + return -1; + } } return 0; } -u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma) +int pmc_get_map(pmc_t *pmc, const struct vma_area *vma) { /* Hit */ if (likely(pmc->start <= vma->e->start && pmc->end >= vma->e->end)) - return __pmc_get_map(pmc, vma->e->start); + return 0; /* Miss, refill the cache */ if (pmc_fill_cache(pmc, vma)) { pr_err("Failed to fill cache for %d (%lx-%lx)\n", pmc->pid, (long)vma->e->start, (long)vma->e->end); - return NULL; + return -1; } - - /* Hit for sure */ - return __pmc_get_map(pmc, vma->e->start); + return 0; } void pmc_fini(pmc_t *pmc) { close_safe(&pmc->fd); xfree(pmc->map); + xfree(pmc->regs); pmc_reset(pmc); } diff --git a/criu/shmem.c b/criu/shmem.c index c13a39b66..9e3178352 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -206,23 +206,28 @@ static int expand_shmem(struct shmem_info *si, unsigned long new_size) return 0; } -static void update_shmem_pmaps(struct shmem_info *si, u64 *map, VmaEntry *vma) +static void update_shmem_pmaps(struct shmem_info *si, pmc_t *pmc, VmaEntry *vma) { unsigned long shmem_pfn, vma_pfn, vma_pgcnt; + u64 vaddr; if (!is_shmem_tracking_en()) return; vma_pgcnt = DIV_ROUND_UP(si->size - vma->pgoff, PAGE_SIZE); - for (vma_pfn = 0; vma_pfn < vma_pgcnt; ++vma_pfn) { - if (!should_dump_page(vma, 
map[vma_pfn])) + for (vma_pfn = 0, vaddr = vma->start; vma_pfn < vma_pgcnt; ++vma_pfn, vaddr += PAGE_SIZE) { + bool softdirty = false; + u64 next; + + next = should_dump_page(pmc, vma, vaddr, &softdirty); + if (next != vaddr) { + vaddr = next - PAGE_SIZE; continue; + } shmem_pfn = vma_pfn + DIV_ROUND_UP(vma->pgoff, PAGE_SIZE); - if (map[vma_pfn] & PME_SOFT_DIRTY) + if (softdirty) set_pstate(si->pstate_map, shmem_pfn, PST_DIRTY); - else if (page_is_zero(map[vma_pfn])) - set_pstate(si->pstate_map, shmem_pfn, PST_ZERO); else set_pstate(si->pstate_map, shmem_pfn, PST_DUMP); } @@ -648,7 +653,7 @@ err: return -1; } -int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) +int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) { struct shmem_info *si; unsigned long size = vma->pgoff + (vma->end - vma->start); @@ -662,7 +667,7 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) if (expand_shmem(si, size)) return -1; } - update_shmem_pmaps(si, map, vma); + update_shmem_pmaps(si, pmc, vma); return 0; } @@ -679,7 +684,7 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) if (expand_shmem(si, size)) return -1; - update_shmem_pmaps(si, map, vma); + update_shmem_pmaps(si, pmc, vma); return 0; } From afc0efcf78d922daf058ea49c083db38f85d2933 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 29 Nov 2023 23:25:44 +0000 Subject: [PATCH 378/775] pagemap-cache: add an ability to run tests without PAGEMAP_SCAN This change adds a new injectable fault (135) to disable PAGEMAP_SCAN and fault back to read pagemap files. 
Signed-off-by: Andrei Vagin --- criu/include/fault-injection.h | 1 + criu/pagemap-cache.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index 69d670be9..fe75dfe86 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -19,6 +19,7 @@ enum faults { FI_HUGE_ANON_SHMEM_ID = 132, FI_CANNOT_MAP_VDSO = 133, FI_CORRUPT_EXTREGS = 134, + FI_DONT_USE_PAGEMAP_SCAN = 135, FI_MAX, }; diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c index d9bd1bc86..978a6b1ac 100644 --- a/criu/pagemap-cache.c +++ b/criu/pagemap-cache.c @@ -11,6 +11,7 @@ #include "vma.h" #include "mem.h" #include "kerndat.h" +#include "fault-injection.h" #undef LOG_PREFIX #define LOG_PREFIX "pagemap-cache: " @@ -61,7 +62,7 @@ int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t siz pmc->regs = NULL; pmc->map = NULL; - if (kdat.has_pagemap_scan) { + if (kdat.has_pagemap_scan && !fault_injected(FI_DONT_USE_PAGEMAP_SCAN)) { pmc->regs = xmalloc(pmc->regs_max_len * sizeof(struct page_region)); if (!pmc->regs) goto err; From d2511707fa29fc0ccb588bfb83196ff5c613166c Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 22 Jan 2024 17:50:39 +0800 Subject: [PATCH 379/775] zdtm: socket-tcp-nft-nfconntrack: add a hook to the chain in nft case Let's use hooked nft chain which actually affects packets. 
Fixes: e5f4d8c6f ("test/nfconntrack: use nft or iptables-legacy") Signed-off-by: Pavel Tikhomirov --- test/zdtm/static/socket-tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/static/socket-tcp.c b/test/zdtm/static/socket-tcp.c index 9830c7860..bc2075496 100644 --- a/test/zdtm/static/socket-tcp.c +++ b/test/zdtm/static/socket-tcp.c @@ -92,7 +92,7 @@ int main(int argc, char **argv) if (system("nft add table ip filter")) return 1; - if (system("nft add chain ip filter INPUT")) + if (system("nft 'add chain ip filter INPUT { type filter hook input priority 0 ; }'")) return 1; if (system("nft add rule ip filter INPUT iifname \"lo\" ip protocol tcp ct state new,established counter accept")) return 1; From a62f827302655efb12d0594e7766fe0a6af78ff6 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 24 Jan 2024 10:39:00 +0000 Subject: [PATCH 380/775] criu-log: remove unused declaration This patch removes a leftover declaration for log_closedir() which has been removed in the following commit: dc80d6f125e1e919363a0b8f938b8679ff0dbc2b log: get rid of LOG_DIR_FD_OFF and opening cwd in log_init() Signed-off-by: Radostin Stoyanov --- criu/include/criu-log.h | 1 - 1 file changed, 1 deletion(-) diff --git a/criu/include/criu-log.h b/criu/include/criu-log.h index ae2f38489..9d52fbdb1 100644 --- a/criu/include/criu-log.h +++ b/criu/include/criu-log.h @@ -26,7 +26,6 @@ extern int log_init(const char *output); extern void log_fini(void); extern int log_init_by_pid(pid_t pid); -extern void log_closedir(void); extern int log_keep_err(void); extern char *log_first_err(void); From 92e8f9293bd796d7d06c91dea29116b945a0aaa4 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 23 Jan 2024 07:28:40 +0000 Subject: [PATCH 381/775] net: return bool with iptable_has_criu_jump_target To improve readability, this patch changes the return type of iptables_has_criu_jump_target() to a boolean, where 'true' indicates that iptables has CRIU jump target and 
'false' indicates otherwise. Suggested-by: Pavel Tikhomirov Signed-off-by: Radostin Stoyanov --- criu/net.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/criu/net.c b/criu/net.c index 0f7280bb5..b5c4a6ee3 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3190,7 +3190,7 @@ static inline int nftables_network_unlock(void) #endif } -static int iptables_has_criu_jump_target(void) +static bool iptables_has_criu_jump_target(void) { int fd, ret; char *argv[4] = { "sh", "-c", "iptables -C INPUT -j CRIU", NULL }; @@ -3203,7 +3203,7 @@ static int iptables_has_criu_jump_target(void) ret = cr_system(fd, fd, fd, "sh", argv, CRS_CAN_FAIL); close_safe(&fd); - return ret; + return !ret; } static int iptables_network_unlock_internal(void) @@ -3228,7 +3228,7 @@ static int iptables_network_unlock_internal(void) /* For compatibility with iptables-nft backend, we need to make sure that all jump * targets have been removed before deleting the CRIU chain. */ - if (!iptables_has_criu_jump_target()) { + if (iptables_has_criu_jump_target()) { ret |= iptables_restore(false, delete_jump_targets, sizeof(delete_jump_targets) - 1); if (kdat.ipv6) ret |= iptables_restore(true, delete_jump_targets, sizeof(delete_jump_targets) - 1); From 59599dacdd5fe91262125927ce0b3e42e3aaecd0 Mon Sep 17 00:00:00 2001 From: David Francis Date: Tue, 30 Jan 2024 14:59:48 -0500 Subject: [PATCH 382/775] plugin/amdgpu: Don't print error for "No such process" during resume During the late stages of restore, each process being resumed gets an ioctl call to KFD_CRIU_OP_RESUME. If the process has no kfd process info, this call will fail with -ESRCH. This is normal behaviour, so we shouldn't print an error message for it. 
Signed-off-by: David Francis --- plugins/amdgpu/amdgpu_plugin.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 32ff8f936..3675353a7 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1999,7 +1999,10 @@ int amdgpu_plugin_resume_devices_late(int target_pid) args.op = KFD_CRIU_OP_RESUME; pr_info("Calling IOCTL to start notifiers and queues\n"); if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { - pr_perror("restore late ioctl failed"); + if (errno == ESRCH) + pr_info("Pid %d has no kfd process info\n", target_pid); + else + pr_perror("restore late ioctl failed"); ret = -1; } From b689a6710cb8bfc2a0784cbce2b77c8aa157fad9 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 5 Feb 2024 11:17:40 +0800 Subject: [PATCH 383/775] plugin/amdgpu: Also don't print 'plugin failed' in criu We already don't treat it as an error in the plugin itself, but after returning -1 from the RESUME_DEVICES_LATE hook we print a debug message in criu about a failed plugin; let's return 0 instead. While at it, let's rename ret to exit_code. 
Fixes: a9cbdad76 ("plugin/amdgpu: Don't print error for "No such process" during resume") Signed-off-by: Pavel Tikhomirov --- plugins/amdgpu/amdgpu_plugin.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 3675353a7..23253632d 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1985,7 +1985,7 @@ CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, amdgpu_plugin_update_vma int amdgpu_plugin_resume_devices_late(int target_pid) { struct kfd_ioctl_criu_args args = { 0 }; - int fd, ret = 0; + int fd, exit_code = 0; pr_info("Inside %s for target pid = %d\n", __func__, target_pid); @@ -1999,15 +1999,16 @@ int amdgpu_plugin_resume_devices_late(int target_pid) args.op = KFD_CRIU_OP_RESUME; pr_info("Calling IOCTL to start notifiers and queues\n"); if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { - if (errno == ESRCH) + if (errno == ESRCH) { pr_info("Pid %d has no kfd process info\n", target_pid); - else + } else { pr_perror("restore late ioctl failed"); - ret = -1; + exit_code = -1; + } } close(fd); - return ret; + return exit_code; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) From 733ef9631570cecd94efe72d98e7255ca0e05956 Mon Sep 17 00:00:00 2001 From: Ramesh Errabolu Date: Fri, 10 Nov 2023 11:36:34 -0600 Subject: [PATCH 384/775] amdgpu_plugin: Refactor code in preparation to support C&R for DRM devices Add a new compilation unit to host symbols and methods that will be needed to C&R DRM devices. 
Refactor code that indicates support for C&R and checkpoints KFD and DRM devices Signed-off-by: Ramesh Errabolu --- plugins/amdgpu/Makefile | 2 +- plugins/amdgpu/amdgpu_plugin.c | 245 ++++-------------------- plugins/amdgpu/amdgpu_plugin_drm.c | 63 ++++++ plugins/amdgpu/amdgpu_plugin_drm.h | 22 +++ plugins/amdgpu/amdgpu_plugin_topology.c | 41 ++-- plugins/amdgpu/amdgpu_plugin_topology.h | 2 + plugins/amdgpu/amdgpu_plugin_util.c | 208 ++++++++++++++++++++ plugins/amdgpu/amdgpu_plugin_util.h | 106 ++++++++++ plugins/amdgpu/criu-amdgpu.proto | 18 +- 9 files changed, 460 insertions(+), 247 deletions(-) create mode 100644 plugins/amdgpu/amdgpu_plugin_drm.c create mode 100644 plugins/amdgpu/amdgpu_plugin_drm.h create mode 100755 plugins/amdgpu/amdgpu_plugin_util.c create mode 100755 plugins/amdgpu/amdgpu_plugin_util.h diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 64a923d38..5efa8fb0b 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -28,7 +28,7 @@ endif criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc-c --proto_path=. --c_out=. 
criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_topology.c criu-amdgpu.pb-c.c +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 23253632d..60e04f973 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -30,55 +30,14 @@ #include "files.h" #include "common/list.h" +#include "amdgpu_plugin_drm.h" +#include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" #include "img-streamer.h" #include "image.h" #include "cr_options.h" -#define AMDGPU_KFD_DEVICE "/dev/kfd" -#define PROCPIDMEM "/proc/%d/mem" -#define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem" -#define HSAKMT_SHM "/hsakmt_shared_mem" -#define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore" -#define HSAKMT_SEM "hsakmt_semaphore" - -#define KFD_IOCTL_MAJOR_VERSION 1 -#define MIN_KFD_IOCTL_MINOR_VERSION 8 - -#define IMG_KFD_FILE "amdgpu-kfd-%d.img" -#define IMG_RENDERD_FILE "amdgpu-renderD-%d.img" -#define IMG_PAGES_FILE "amdgpu-pages-%d-%04x.img" - -#ifndef _GNU_SOURCE -#define _GNU_SOURCE 1 -#endif - -#ifdef LOG_PREFIX -#undef LOG_PREFIX -#endif -#define LOG_PREFIX "amdgpu_plugin: " - -#ifdef DEBUG -#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__) -#else -#define plugin_log_msg(fmt, ...) 
\ - { \ - } -#endif - -#define SDMA_PACKET(op, sub_op, e) ((((e)&0xFFFF) << 16) | (((sub_op)&0xFF) << 8) | (((op)&0xFF) << 0)) - -#define SDMA_OPCODE_COPY 1 -#define SDMA_COPY_SUB_OPCODE_LINEAR 0 -#define SDMA_NOP 0 -#define SDMA_LINEAR_COPY_MAX_SIZE (1ULL << 21) - -enum sdma_op_type { - SDMA_OP_VRAM_READ, - SDMA_OP_VRAM_WRITE, -}; - struct vma_metadata { struct list_head list; uint64_t old_pgoff; @@ -89,143 +48,13 @@ struct vma_metadata { }; /************************************ Global Variables ********************************************/ -struct tp_system src_topology; -struct tp_system dest_topology; - -struct device_maps checkpoint_maps; -struct device_maps restore_maps; - -extern int fd_next; static LIST_HEAD(update_vma_info_list); -extern bool kfd_fw_version_check; -extern bool kfd_sdma_fw_version_check; -extern bool kfd_caches_count_check; -extern bool kfd_num_gws_check; -extern bool kfd_vram_size_check; -extern bool kfd_numa_check; -extern bool kfd_capability_check; - size_t kfd_max_buffer_size; /**************************************************************************************************/ -int write_fp(FILE *fp, const void *buf, const size_t buf_len) -{ - size_t len_write; - - len_write = fwrite(buf, 1, buf_len, fp); - if (len_write != buf_len) { - pr_perror("Unable to write file (wrote:%ld buf_len:%ld)", len_write, buf_len); - return -EIO; - } - return 0; -} - -int read_fp(FILE *fp, void *buf, const size_t buf_len) -{ - size_t len_read; - - len_read = fread(buf, 1, buf_len, fp); - if (len_read != buf_len) { - pr_perror("Unable to read file (read:%ld buf_len:%ld)", len_read, buf_len); - return -EIO; - } - return 0; -} - -/** - * @brief Open an image file - * - * We store the size of the actual contents in the first 8-bytes of the file. This allows us to - * determine the file size when using criu_image_streamer when fseek and fstat are not available. - * The FILE * returned is already at the location of the first actual contents. 
- * - * @param path The file path - * @param write False for read, true for write - * @param size Size of actual contents - * @return FILE *if successful, NULL if failed - */ -FILE *open_img_file(char *path, bool write, size_t *size) -{ - FILE *fp = NULL; - int fd, ret; - - if (opts.stream) - fd = img_streamer_open(path, write ? O_DUMP : O_RSTR); - else - fd = openat(criu_get_image_dir(), path, write ? (O_WRONLY | O_CREAT) : O_RDONLY, 0600); - - if (fd < 0) { - pr_perror("%s: Failed to open for %s", path, write ? "write" : "read"); - return NULL; - } - - fp = fdopen(fd, write ? "w" : "r"); - if (!fp) { - pr_perror("%s: Failed get pointer for %s", path, write ? "write" : "read"); - close(fd); - return NULL; - } - - if (write) - ret = write_fp(fp, size, sizeof(*size)); - else - ret = read_fp(fp, size, sizeof(*size)); - - if (ret) { - pr_perror("%s:Failed to access file size", path); - fclose(fp); - return NULL; - } - - pr_debug("%s:Opened file for %s with size:%ld\n", path, write ? "write" : "read", *size); - return fp; -} - -/** - * @brief Write an image file - * - * We store the size of the actual contents in the first 8-bytes of the file. This allows us to - * determine the file size when using criu_image_streamer when fseek and fstat are not available. - * - * @param path The file path - * @param buf pointer to data to be written - * @param buf_len size of buf - * @return 0 if successful. 
-errno on failure - */ -int write_img_file(char *path, const void *buf, const size_t buf_len) -{ - int ret; - FILE *fp; - size_t len = buf_len; - - fp = open_img_file(path, true, &len); - if (!fp) - return -errno; - - ret = write_fp(fp, buf, buf_len); - fclose(fp); /* this will also close fd */ - return ret; -} - -int read_file(const char *file_path, void *buf, const size_t buf_len) -{ - int ret; - FILE *fp; - - fp = fopen(file_path, "r"); - if (!fp) { - pr_perror("Cannot fopen %s", file_path); - return -errno; - } - - ret = read_fp(fp, buf, buf_len); - fclose(fp); /* this will also close fd */ - return ret; -} - /* Call ioctl, restarting if it is interrupted */ int kmtIoctl(int fd, unsigned long request, void *arg) { @@ -263,21 +92,21 @@ static void free_e(CriuKfd *e) static int allocate_device_entries(CriuKfd *e, int num_of_devices) { - e->device_entries = xmalloc(sizeof(DeviceEntry *) * num_of_devices); + e->device_entries = xmalloc(sizeof(KfdDeviceEntry *) * num_of_devices); if (!e->device_entries) { pr_err("Failed to allocate device_entries\n"); return -ENOMEM; } for (int i = 0; i < num_of_devices; i++) { - DeviceEntry *entry = xzalloc(sizeof(*entry)); + KfdDeviceEntry *entry = xzalloc(sizeof(*entry)); if (!entry) { pr_err("Failed to allocate entry\n"); return -ENOMEM; } - device_entry__init(entry); + kfd_device_entry__init(entry); e->device_entries[i] = entry; e->n_device_entries++; @@ -287,21 +116,21 @@ static int allocate_device_entries(CriuKfd *e, int num_of_devices) static int allocate_bo_entries(CriuKfd *e, int num_bos, struct kfd_criu_bo_bucket *bo_bucket_ptr) { - e->bo_entries = xmalloc(sizeof(BoEntry *) * num_bos); + e->bo_entries = xmalloc(sizeof(KfdBoEntry *) * num_bos); if (!e->bo_entries) { pr_err("Failed to allocate bo_info\n"); return -ENOMEM; } for (int i = 0; i < num_bos; i++) { - BoEntry *entry = xzalloc(sizeof(*entry)); + KfdBoEntry *entry = xzalloc(sizeof(*entry)); if (!entry) { pr_err("Failed to allocate botest\n"); return -ENOMEM; } - 
bo_entry__init(entry); + kfd_bo_entry__init(entry); e->bo_entries[i] = entry; e->n_bo_entries++; @@ -309,13 +138,13 @@ static int allocate_bo_entries(CriuKfd *e, int num_bos, struct kfd_criu_bo_bucke return 0; } -int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, DeviceEntry **deviceEntries) +int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, KfdDeviceEntry **deviceEntries) { uint32_t devinfo_index = 0; struct tp_node *node; list_for_each_entry(node, &sys->nodes, listm_system) { - DeviceEntry *devinfo = deviceEntries[devinfo_index++]; + KfdDeviceEntry *devinfo = deviceEntries[devinfo_index++]; devinfo->node_id = node->id; @@ -383,11 +212,11 @@ int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, DeviceE return 0; } -int devinfo_to_topology(DeviceEntry *devinfos[], uint32_t num_devices, struct tp_system *sys) +int devinfo_to_topology(KfdDeviceEntry *devinfos[], uint32_t num_devices, struct tp_system *sys) { for (int i = 0; i < num_devices; i++) { struct tp_node *node; - DeviceEntry *devinfo = devinfos[i]; + KfdDeviceEntry *devinfo = devinfos[i]; node = sys_add_node(sys, devinfo->node_id, devinfo->gpu_id); if (!node) @@ -549,7 +378,7 @@ struct thread_data { uint32_t gpu_id; pid_t pid; struct kfd_criu_bo_bucket *bo_buckets; - BoEntry **bo_entries; + KfdBoEntry **bo_entries; int drm_fd; int ret; int id; /* File ID used by CRIU to identify KFD image for this process */ @@ -557,8 +386,7 @@ struct thread_data { int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) { - struct stat st_kfd, st_dri_min; - char img_path[128]; + struct stat st_kfd; int ret = 0; pr_debug("Enter %s\n", __func__); @@ -568,27 +396,18 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) return ret; } - snprintf(img_path, sizeof(img_path), "/dev/dri/renderD%d", DRM_FIRST_RENDER_NODE); - - ret = stat(img_path, &st_dri_min); - if (ret == -1) { - pr_perror("stat error for %s", img_path); - return ret; - 
} - - if (major(st_buf->st_rdev) == major(st_kfd.st_rdev) || ((major(st_buf->st_rdev) == major(st_dri_min.st_rdev)) && - (minor(st_buf->st_rdev) >= minor(st_dri_min.st_rdev) && - minor(st_buf->st_rdev) >= DRM_FIRST_RENDER_NODE))) { + /* If input device is KFD return device as supported */ + if (major(st_buf->st_rdev) == major(st_kfd.st_rdev)) { pr_debug("Known non-regular mapping, kfd-renderD%d -> OK\n", minor(st_buf->st_rdev)); - pr_debug("AMD KFD(maj) = %d, DRI(maj,min) = %d:%d VMA Device fd(maj,min) = %d:%d\n", - major(st_kfd.st_rdev), major(st_dri_min.st_rdev), minor(st_dri_min.st_rdev), - major(st_buf->st_rdev), minor(st_buf->st_rdev)); - /* VMA belongs to kfd */ return 0; } - pr_perror("Can't handle the VMA mapping"); - return -ENOTSUP; + /* Determine if input is a DRM device and therefore is supported */ + ret = amdgpu_plugin_drm_handle_device_vma(fd, st_buf); + if (ret) + pr_perror("%s(), Can't handle VMAs of input device\n", __func__); + + return ret; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma) @@ -655,8 +474,9 @@ void free_and_unmap(uint64_t size, amdgpu_bo_handle h_bo, amdgpu_va_handle h_va, amdgpu_bo_free(h_bo); } -int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp, void *buffer, size_t buffer_size, - amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) +static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp, + void *buffer, size_t buffer_size, amdgpu_device_handle h_dev, + uint64_t max_copy_size, enum sdma_op_type type) { uint64_t size, src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain; uint64_t gpu_addr_src, gpu_addr_dst, gpu_addr_ib, copy_src, copy_dst, copy_size; @@ -954,7 +774,7 @@ void *dump_bo_contents(void *_thread_data) goto exit; } - snprintf(img_path, sizeof(img_path), IMG_PAGES_FILE, thread_data->id, thread_data->gpu_id); + snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, thread_data->id, 
thread_data->gpu_id); bo_contents_fp = open_img_file(img_path, true, &image_size); if (!bo_contents_fp) { pr_perror("Cannot fopen %s", img_path); @@ -1027,7 +847,7 @@ void *restore_bo_contents(void *_thread_data) max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE : SDMA_LINEAR_COPY_MAX_SIZE - 1; - snprintf(img_path, sizeof(img_path), IMG_PAGES_FILE, thread_data->id, thread_data->gpu_id); + snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, thread_data->id, thread_data->gpu_id); bo_contents_fp = open_img_file(img_path, false, &image_size); if (!bo_contents_fp) { pr_perror("Cannot fopen %s", img_path); @@ -1234,7 +1054,7 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd for (i = 0; i < e->num_of_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; - BoEntry *boinfo = e->bo_entries[i]; + KfdBoEntry *boinfo = e->bo_entries[i]; boinfo->gpu_id = bo_bucket->gpu_id; boinfo->addr = bo_bucket->addr; @@ -1391,7 +1211,7 @@ int amdgpu_plugin_dump_file(int fd, int id) criu_render_node__pack(&rd, buf); - snprintf(img_path, sizeof(img_path), IMG_RENDERD_FILE, id); + snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id); ret = write_img_file(img_path, buf, len); if (ret) { xfree(buf); @@ -1399,6 +1219,7 @@ int amdgpu_plugin_dump_file(int fd, int id) } xfree(buf); + /* Need to return success here so that criu can call plugins for renderD nodes */ return ret; } @@ -1531,7 +1352,7 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e) for (int entries_i = 0; entries_i < e->num_of_cpus + e->num_of_gpus; entries_i++) { struct kfd_criu_device_bucket *device_bucket; - DeviceEntry *devinfo = e->device_entries[entries_i]; + KfdDeviceEntry *devinfo = e->device_entries[entries_i]; struct tp_node *tp_node; if (!devinfo->gpu_id) @@ -1581,7 +1402,7 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) for (int i = 0; i < args->num_bos; i++) { struct kfd_criu_bo_bucket 
*bo_bucket = &bo_buckets[i]; - BoEntry *bo_entry = e->bo_entries[i]; + KfdBoEntry *bo_entry = e->bo_entries[i]; bo_bucket->gpu_id = bo_entry->gpu_id; bo_bucket->addr = bo_entry->addr; @@ -1736,7 +1557,7 @@ int amdgpu_plugin_restore_file(int id) * TODO: Currently, this code will only work if this function is called for /dev/kfd * first as we assume restore_maps is already filled. Need to fix this later. */ - snprintf(img_path, sizeof(img_path), IMG_RENDERD_FILE, id); + snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id); pr_info("Restoring RenderD %s\n", img_path); img_fp = open_img_file(img_path, false, &img_size); diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c new file mode 100644 index 000000000..a48dc68f0 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -0,0 +1,63 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "common/list.h" + +#include "criu-amdgpu.pb-c.h" + +#include +#include + +#include "xmalloc.h" +#include "criu-log.h" +#include "kfd_ioctl.h" +#include "amdgpu_plugin_drm.h" +#include "amdgpu_plugin_util.h" +#include "amdgpu_plugin_topology.h" + + +int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st) +{ + char path[PATH_MAX]; + struct stat drm; + int ret = 0; + + snprintf(path, sizeof(path), AMDGPU_DRM_DEVICE, DRM_FIRST_RENDER_NODE); + ret = stat(path, &drm); + if (ret == -1) { + pr_err("Error in getting stat for: %s", path); + return ret; + } + + if ((major(st->st_rdev) != major(drm.st_rdev)) || + (minor(st->st_rdev) < minor(drm.st_rdev)) || + (minor(st->st_rdev) > DRM_LAST_RENDER_NODE)) { + pr_err("Can't handle VMA mapping of input device\n"); + return -ENOTSUP; + } + + pr_debug("AMD DRI(maj,min) = %d:%d VMA Device FD(maj,min) = %d:%d\n", + major(drm.st_rdev), minor(drm.st_rdev), + major(st->st_rdev), minor(st->st_rdev)); + + return 0; +} + 
+ diff --git a/plugins/amdgpu/amdgpu_plugin_drm.h b/plugins/amdgpu/amdgpu_plugin_drm.h new file mode 100644 index 000000000..37009c8ba --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_drm.h @@ -0,0 +1,22 @@ +#ifndef __AMDGPU_PLUGIN_DRM_H__ +#define __AMDGPU_PLUGIN_DRM_H__ + +#include +#include "common/list.h" + +#include "xmalloc.h" +#include "criu-log.h" +#include "kfd_ioctl.h" +#include "amdgpu_plugin_util.h" +#include "amdgpu_plugin_topology.h" + + +/** + * Determines if VMA's of input file descriptor belong to amdgpu's + * DRM device and are therefore supported + */ +int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm); + + +#endif /* __AMDGPU_PLUGIN_DRM_H__ */ + diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index ef79e5ef4..c5fa51fda 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -16,35 +16,11 @@ #include "xmalloc.h" #include "kfd_ioctl.h" +#include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" #define TOPOLOGY_PATH "/sys/class/kfd/kfd/topology/nodes/" -#ifndef _GNU_SOURCE -#define _GNU_SOURCE 1 -#endif - -#ifdef COMPILE_TESTS -#undef pr_err -#define pr_err(format, arg...) fprintf(stdout, "%s:%d ERROR:" format, __FILE__, __LINE__, ##arg) -#undef pr_info -#define pr_info(format, arg...) fprintf(stdout, "%s:%d INFO:" format, __FILE__, __LINE__, ##arg) -#undef pr_debug -#define pr_debug(format, arg...) fprintf(stdout, "%s:%d DBG:" format, __FILE__, __LINE__, ##arg) - -#undef pr_perror -#define pr_perror(format, arg...) \ - fprintf(stdout, "%s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) -#endif - -#ifdef DEBUG -#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__) -#else -#define plugin_log_msg(fmt, ...) 
\ - { \ - } -#endif - /* User override options */ /* Skip firmware version check */ bool kfd_fw_version_check = true; @@ -840,6 +816,9 @@ void topology_free(struct tp_system *sys) list_del(&p2pgroup->listm_system); xfree(p2pgroup); } + + /* Update Topology as being freed */ + sys->parsed = false; } /** @@ -1461,3 +1440,15 @@ int set_restore_gpu_maps(struct tp_system *src_sys, struct tp_system *dest_sys, return ret; } + +int topology_gpu_count(struct tp_system *sys) +{ + struct tp_node *node; + int count = 0; + + list_for_each_entry(node, &sys->nodes, listm_system) + if (NODE_IS_GPU(node)) + count++; + return count; +} + diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h index 9d99cda1c..c890e3dda 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.h +++ b/plugins/amdgpu/amdgpu_plugin_topology.h @@ -107,6 +107,8 @@ int topology_parse(struct tp_system *topology, const char *msg); int topology_determine_iolinks(struct tp_system *sys); void topology_print(const struct tp_system *sys, const char *msg); +int topology_gpu_count(struct tp_system *topology); + struct id_map *maps_add_gpu_entry(struct device_maps *maps, const uint32_t src_id, const uint32_t dest_id); struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id); diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c new file mode 100755 index 000000000..48ff70555 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -0,0 +1,208 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "common/list.h" + +#include +#include + +#include "criu-plugin.h" +#include "plugin.h" +#include "criu-amdgpu.pb-c.h" + +#include "img-streamer.h" +#include "image.h" +#include "cr_options.h" + +#include "xmalloc.h" +#include "criu-log.h" +#include "kfd_ioctl.h" +#include "amdgpu_drm.h" 
+#include "amdgpu_plugin_util.h" +#include "amdgpu_plugin_topology.h" + +/* Tracks number of device files that need to be checkpointed */ +static int dev_file_cnt = 0; + +/* Helper structures to encode device topology of SRC and DEST platforms */ +struct tp_system src_topology; +struct tp_system dest_topology; + +/* Helper structures to encode device maps during Checkpoint and Restore operations */ +struct device_maps checkpoint_maps; +struct device_maps restore_maps; + +bool checkpoint_is_complete() +{ + return (dev_file_cnt == 0); +} + +void decrement_checkpoint_count() +{ + dev_file_cnt--; +} + +void init_gpu_count(struct tp_system *topo) +{ + if (dev_file_cnt != 0) + return; + + /* We add ONE to include checkpointing of KFD device */ + dev_file_cnt = 1 + topology_gpu_count(topo); +} + +int read_fp(FILE *fp, void *buf, const size_t buf_len) +{ + size_t len_read; + + len_read = fread(buf, 1, buf_len, fp); + if (len_read != buf_len) { + pr_err("Unable to read file (read:%ld buf_len:%ld)", len_read, buf_len); + return -EIO; + } + return 0; +} + +int write_fp(FILE *fp, const void *buf, const size_t buf_len) +{ + size_t len_write; + + len_write = fwrite(buf, 1, buf_len, fp); + if (len_write != buf_len) { + pr_err("Unable to write file (wrote:%ld buf_len:%ld)", len_write, buf_len); + return -EIO; + } + return 0; +} + +/** + * @brief Open an image file + * + * We store the size of the actual contents in the first 8-bytes of + * the file. This allows us to determine the file size when using + * criu_image_streamer when fseek and fstat are not available. The + * FILE * returned is already at the location of the first actual + * contents. + * + * @param path The file path + * @param write False for read, true for write + * @param size Size of actual contents + * @return FILE *if successful, NULL if failed + */ +FILE *open_img_file(char *path, bool write, size_t *size) +{ + FILE *fp = NULL; + int fd, ret; + + if (opts.stream) + fd = img_streamer_open(path, write ? 
O_DUMP : O_RSTR); + else + fd = openat(criu_get_image_dir(), path, write ? (O_WRONLY | O_CREAT) : O_RDONLY, 0600); + + if (fd < 0) { + pr_err("%s: Failed to open for %s", path, write ? "write" : "read"); + return NULL; + } + + fp = fdopen(fd, write ? "w" : "r"); + if (!fp) { + pr_err("%s: Failed get pointer for %s", path, write ? "write" : "read"); + return NULL; + } + + if (write) + ret = write_fp(fp, size, sizeof(*size)); + else + ret = read_fp(fp, size, sizeof(*size)); + + if (ret) { + pr_err("%s:Failed to access file size", path); + fclose(fp); + return NULL; + } + + pr_debug("%s:Opened file for %s with size:%ld\n", path, write ? "write" : "read", *size); + return fp; +} + +int read_file(const char *file_path, void *buf, const size_t buf_len) +{ + int ret; + FILE *fp; + + fp = fopen(file_path, "r"); + if (!fp) { + pr_err("Cannot fopen %s", file_path); + return -errno; + } + + ret = read_fp(fp, buf, buf_len); + fclose(fp); /* this will also close fd */ + return ret; +} + + +/** + * @brief Write an image file + * + * We store the size of the actual contents in the first 8-bytes of the file. This allows us to + * determine the file size when using criu_image_streamer when fseek and fstat are not available. + * + * @param path The file path + * @param buf pointer to data to be written + * @param buf_len size of buf + * @return 0 if successful. -errno on failure + */ +int write_img_file(char *path, const void *buf, const size_t buf_len) +{ + int ret; + FILE *fp; + size_t len = buf_len; + + fp = open_img_file(path, true, &len); + if (!fp) + return -errno; + + ret = write_fp(fp, buf, buf_len); + fclose(fp); /* this will also close fd */ + return ret; +} + +void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list) +{ + struct kfd_criu_bo_bucket *bo; + + pr_info("\n"); + for (int idx = 0; idx < bo_cnt; idx++) { + bo = &bo_list[idx]; + pr_info("\n"); + pr_info("%s(), %d. KFD BO Addr: %llx \n", __func__, idx, bo->addr); + pr_info("%s(), %d. 
KFD BO Size: %llx \n", __func__, idx, bo->size); + pr_info("%s(), %d. KFD BO Offset: %llx \n", __func__, idx, bo->offset); + pr_info("%s(), %d. KFD BO Restored Offset: %llx \n", __func__, idx, bo->restored_offset); + pr_info("%s(), %d. KFD BO Alloc Flags: %x \n", __func__, idx, bo->alloc_flags); + pr_info("%s(), %d. KFD BO Gpu ID: %x \n", __func__, idx, bo->gpu_id); + pr_info("%s(), %d. KFD BO Dmabuf FD: %x \n", __func__, idx, bo->dmabuf_fd); + pr_info("\n"); + } + pr_info("\n"); +} + + diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h new file mode 100755 index 000000000..aacca3a28 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_util.h @@ -0,0 +1,106 @@ +#ifndef __AMDGPU_PLUGIN_UTIL_H__ +#define __AMDGPU_PLUGIN_UTIL_H__ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif + +#ifdef COMPILE_TESTS +#undef pr_err +#define pr_err(format, arg...) fprintf(stdout, "%s:%d ERROR:" format, __FILE__, __LINE__, ##arg) +#undef pr_info +#define pr_info(format, arg...) fprintf(stdout, "%s:%d INFO:" format, __FILE__, __LINE__, ##arg) +#undef pr_debug +#define pr_debug(format, arg...) fprintf(stdout, "%s:%d DBG:" format, __FILE__, __LINE__, ##arg) + +#undef pr_perror +#define pr_perror(format, arg...) \ + fprintf(stdout, "%s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) +#endif + +#ifdef LOG_PREFIX +#undef LOG_PREFIX +#endif +#define LOG_PREFIX "amdgpu_plugin: " + +#ifdef DEBUG +#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__) +#else +#define plugin_log_msg(fmt, ...) 
\ + { \ + } +#endif + + +/* Path where KFD device is surfaced */ +#define AMDGPU_KFD_DEVICE "/dev/kfd" + +/* Path where DRM devices are surfaced */ +#define AMDGPU_DRM_DEVICE "/dev/dri/renderD%d" + +/* Minimum version of KFD IOCTL's that supports C&R */ +#define KFD_IOCTL_MAJOR_VERSION 1 +#define MIN_KFD_IOCTL_MINOR_VERSION 8 + +/* Name of file having serialized data of KFD device */ +#define IMG_KFD_FILE "amdgpu-kfd-%d.img" + +/* Name of file having serialized data of KFD buffer objects (BOs) */ +#define IMG_KFD_PAGES_FILE "amdgpu-pages-%d-%04x.img" + +/* Name of file having serialized data of DRM device */ +#define IMG_DRM_FILE "amdgpu-renderD-%d.img" + +/* Name of file having serialized data of DRM device buffer objects (BOs) */ +#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%04x.img" + +/* Helper macros to Checkpoint and Restore a ROCm file */ +#define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem" +#define HSAKMT_SHM "/hsakmt_shared_mem" +#define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore" +#define HSAKMT_SEM "hsakmt_semaphore" + +/* Help macros to build sDMA command packets */ +#define SDMA_PACKET(op, sub_op, e) ((((e)&0xFFFF) << 16) | (((sub_op)&0xFF) << 8) | (((op)&0xFF) << 0)) + +#define SDMA_OPCODE_COPY 1 +#define SDMA_COPY_SUB_OPCODE_LINEAR 0 +#define SDMA_NOP 0 +#define SDMA_LINEAR_COPY_MAX_SIZE (1ULL << 21) + +enum sdma_op_type { + SDMA_OP_VRAM_READ, + SDMA_OP_VRAM_WRITE, +}; + +/* Helper structures to encode device topology of SRC and DEST platforms */ +extern struct tp_system src_topology; +extern struct tp_system dest_topology; + +/* Helper structures to encode device maps during Checkpoint and Restore operations */ +extern struct device_maps checkpoint_maps; +extern struct device_maps restore_maps; + +extern int fd_next; + +extern bool kfd_fw_version_check; +extern bool kfd_sdma_fw_version_check; +extern bool kfd_caches_count_check; +extern bool kfd_num_gws_check; +extern bool kfd_vram_size_check; +extern bool kfd_numa_check; +extern bool 
kfd_capability_check; + +int read_fp(FILE *fp, void *buf, const size_t buf_len); +int write_fp(FILE *fp, const void *buf, const size_t buf_len); +int read_file(const char *file_path, void *buf, const size_t buf_len); +int write_img_file(char *path, const void *buf, const size_t buf_len); +FILE *open_img_file(char *path, bool write, size_t *size); + +bool checkpoint_is_complete(); +void decrement_checkpoint_count(); +void init_gpu_count(struct tp_system *topology); + +void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list); + +#endif /* __AMDGPU_PLUGIN_UTIL_H__ */ diff --git a/plugins/amdgpu/criu-amdgpu.proto b/plugins/amdgpu/criu-amdgpu.proto index 81d00d3ff..078b67650 100644 --- a/plugins/amdgpu/criu-amdgpu.proto +++ b/plugins/amdgpu/criu-amdgpu.proto @@ -5,7 +5,7 @@ message dev_iolink { required uint32 node_to_id = 2; } -message device_entry { +message kfd_device_entry { required uint32 node_id = 1; required uint32 gpu_id = 2; required uint32 cpu_cores_count = 3; @@ -40,10 +40,10 @@ message device_entry { repeated dev_iolink iolinks = 32; } -message bo_entry { - required uint64 addr = 1; - required uint64 size = 2; - required uint64 offset = 3; +message kfd_bo_entry { + required uint64 addr = 1; + required uint64 size = 2; + required uint64 offset = 3; required uint32 alloc_flags = 4; required uint32 gpu_id = 5; } @@ -52,10 +52,10 @@ message criu_kfd { required uint32 pid = 1; required uint32 num_of_gpus = 2; required uint32 num_of_cpus = 3; - repeated device_entry device_entries = 4; - required uint64 num_of_bos = 5; - repeated bo_entry bo_entries = 6; - required uint32 num_of_objects = 7; + repeated kfd_device_entry device_entries = 4; + required uint64 num_of_bos = 5; + repeated kfd_bo_entry bo_entries = 6; + required uint32 num_of_objects = 7; required uint64 shared_mem_size = 8; required uint32 shared_mem_magic = 9; required bytes priv_data = 10; From 0d5923c95ef703da0a59dd03e8526af0c152b2a0 Mon Sep 17 00:00:00 2001 From: Ramesh Errabolu Date: 
Fri, 10 Nov 2023 13:02:49 -0600 Subject: [PATCH 385/775] amdgpu_plugin: Refactor code used to implement Checkpoint Refactor code used to Checkpoint DRM devices. Code is moved into amdgpu_plugin_drm.c file which hosts various methods to checkpoint and restore a workload. Signed-off-by: Ramesh Errabolu --- plugins/amdgpu/amdgpu_plugin.c | 64 ++++++++++++++---------------- plugins/amdgpu/amdgpu_plugin_drm.c | 38 ++++++++++++++++++ plugins/amdgpu/amdgpu_plugin_drm.h | 6 +++ 3 files changed, 74 insertions(+), 34 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 60e04f973..a579158d0 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -49,6 +49,13 @@ struct vma_metadata { /************************************ Global Variables ********************************************/ +/** + * FD of KFD device used to checkpoint. On a multi-process + * tree the order of checkpointing goes from parent to child + * and so on - so saving the FD will not be overwritten + */ +static int kfd_checkpoint_fd; + static LIST_HEAD(update_vma_info_list); size_t kfd_max_buffer_size; @@ -990,6 +997,10 @@ static int unpause_process(int fd) goto exit; } + // Reset the KFD FD + kfd_checkpoint_fd = -1; + sys_close_drm_render_devices(&src_topology); + exit: pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret); @@ -1181,44 +1192,25 @@ int amdgpu_plugin_dump_file(int fd, int id) return -1; } + /* Initialize number of device files that will be checkpointed */ + init_gpu_count(&src_topology); + /* Check whether this plugin was called for kfd or render nodes */ if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) { + /* This is RenderD dumper plugin, for now just save renderD * minor number to be used during restore. In later phases this * needs to save more data for video decode etc. 
*/ - - CriuRenderNode rd = CRIU_RENDER_NODE__INIT; - struct tp_node *tp_node; - - pr_info("Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev), fd, id); - - tp_node = sys_get_node_by_render_minor(&src_topology, minor(st.st_rdev)); - if (!tp_node) { - pr_err("Failed to find a device with minor number = %d\n", minor(st.st_rdev)); - - return -ENODEV; - } - - rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id); - if (!rd.gpu_id) - return -ENODEV; - - len = criu_render_node__get_packed_size(&rd); - buf = xmalloc(len); - if (!buf) - return -ENOMEM; - - criu_render_node__pack(&rd, buf); - - snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id); - ret = write_img_file(img_path, buf, len); - if (ret) { - xfree(buf); + ret = amdgpu_plugin_drm_dump_file(fd, id, &st); + if (ret) return ret; - } - xfree(buf); + /* Invoke unpause process if needed */ + decrement_checkpoint_count(); + if (checkpoint_is_complete()) { + ret = unpause_process(kfd_checkpoint_fd); + } /* Need to return success here so that criu can call plugins for renderD nodes */ return ret; @@ -1315,11 +1307,15 @@ int amdgpu_plugin_dump_file(int fd, int id) ret = write_img_file(img_path, buf, len); xfree(buf); -exit: - /* Restore all queues */ - unpause_process(fd); - sys_close_drm_render_devices(&src_topology); +exit: + /* Restore all queues if conditions permit */ + kfd_checkpoint_fd = fd; + decrement_checkpoint_count(); + if (checkpoint_is_complete()) { + ret = unpause_process(fd); + } + xfree((void *)args.devices); xfree((void *)args.bos); xfree((void *)args.priv_data); diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index a48dc68f0..689d62072 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -61,3 +61,41 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st) } +int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm) +{ + CriuRenderNode rd = CRIU_RENDER_NODE__INIT; + 
struct tp_node *tp_node; + char path[PATH_MAX]; + unsigned char *buf; + int minor; + int len; + int ret; + + /* Get the topology node of the DRM device */ + minor = minor(drm->st_rdev); + tp_node = sys_get_node_by_render_minor(&src_topology, minor); + if (!tp_node) { + pr_err("Failed to find a device with minor number = %d\n", minor); + return -ENODEV; + } + + /* Get the GPU_ID of the DRM device */ + rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id); + if (!rd.gpu_id) { + pr_err("Failed to find valid gpu_id for the device = %d\n", rd.gpu_id); + return -ENODEV; + } + + len = criu_render_node__get_packed_size(&rd); + buf = xmalloc(len); + if (!buf) + return -ENOMEM; + + criu_render_node__pack(&rd, buf); + + snprintf(path, sizeof(path), IMG_DRM_FILE, id); + ret = write_img_file(path, buf, len); + xfree(buf); + return ret; +} + diff --git a/plugins/amdgpu/amdgpu_plugin_drm.h b/plugins/amdgpu/amdgpu_plugin_drm.h index 37009c8ba..6f0c1a9a6 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.h +++ b/plugins/amdgpu/amdgpu_plugin_drm.h @@ -17,6 +17,12 @@ */ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm); +/** + * Serialize meta-data about a particular DRM device, its number of BOs, + * etc into a file. 
The serialized filename has in it the value ID that + * is passed in as a parameter + */ +int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm); #endif /* __AMDGPU_PLUGIN_DRM_H__ */ From 71102e7f7399ad9297d14ebcae6bcbb8b30a8787 Mon Sep 17 00:00:00 2001 From: rahulk789 Date: Sun, 26 Nov 2023 14:31:42 +0530 Subject: [PATCH 386/775] sk-inet: Added IP_TTL socket option Signed-off-by: rahulk789 --- criu/sk-inet.c | 7 ++++++- images/sk-inet.proto | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 24e92a852..b8154e860 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -417,10 +417,12 @@ static int dump_ip_opts(int sk, int family, int type, int proto, IpOptsEntry *io ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); ret |= dump_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); ret |= dump_opt(sk, SOL_IP, IP_TOS, &ioe->tos); + ret |= dump_opt(sk, SOL_IP, IP_TTL, &ioe->ttl); } ioe->has_freebind = ioe->freebind; ioe->has_pktinfo = !!ioe->pktinfo; ioe->has_tos = !!ioe->tos; + ioe->has_ttl = !!ioe->ttl; return ret; } @@ -817,7 +819,10 @@ int restore_ip_opts(int sk, int family, int proto, IpOptsEntry *ioe) ret |= restore_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); if (ioe->has_tos) ret |= restore_opt(sk, SOL_IP, IP_TOS, &ioe->tos); - } + if (ioe->has_ttl) + ret |= restore_opt(sk, SOL_IP, IP_TTL, &ioe->ttl); + + } if (ioe->raw) ret |= restore_ip_raw_opts(sk, family, proto, ioe->raw); diff --git a/images/sk-inet.proto b/images/sk-inet.proto index 666326fa4..03a679e7f 100644 --- a/images/sk-inet.proto +++ b/images/sk-inet.proto @@ -20,6 +20,7 @@ message ip_opts_entry { optional bool pktinfo = 5; optional uint32 tos = 6; + optional uint32 ttl = 7; } message inet_sk_entry { From 895a16c13c4b155b6ca03b8ce4a323dc9acf1104 Mon Sep 17 00:00:00 2001 From: rahulk789 Date: Sun, 26 Nov 2023 14:35:47 +0530 Subject: [PATCH 387/775] zdtm: Added tests for IP_TTL restore Signed-off-by: rahulk789 --- 
test/zdtm/static/sock_ip_opts00.c | 1 + 1 file changed, 1 insertion(+) diff --git a/test/zdtm/static/sock_ip_opts00.c b/test/zdtm/static/sock_ip_opts00.c index d890410d8..cb464365d 100644 --- a/test/zdtm/static/sock_ip_opts00.c +++ b/test/zdtm/static/sock_ip_opts00.c @@ -26,6 +26,7 @@ struct sk_opt { struct sk_opt sk_opts_v4[] = { { SOL_IP, IP_FREEBIND, IP_OPT_VAL }, { SOL_IP, IP_PKTINFO, IP_OPT_VAL }, + { SOL_IP, IP_TTL, 32 }, { SOL_IP, IP_TOS, IPTOS_TOS(IPTOS_THROUGHPUT) }, }; From bd17bd43e39f9bc3e5d07f665d8e3c878b0c0a2b Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 6 Feb 2024 12:04:07 +0800 Subject: [PATCH 388/775] sk-inet: fix coding style in restore_ip_opts Commit [1] introduced coding-style breakage, let's fix it. Fixes: 66cab1f49 ("sk-inet: Added IP_TTL socket option") [1] Signed-off-by: Pavel Tikhomirov --- criu/sk-inet.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index b8154e860..a6a767c73 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -821,8 +821,7 @@ int restore_ip_opts(int sk, int family, int proto, IpOptsEntry *ioe) ret |= restore_opt(sk, SOL_IP, IP_TOS, &ioe->tos); if (ioe->has_ttl) ret |= restore_opt(sk, SOL_IP, IP_TTL, &ioe->ttl); - - } + } if (ioe->raw) ret |= restore_ip_raw_opts(sk, family, proto, ioe->raw); From a808f09bea416e9b8bf8415497f4c1481b2a10c5 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 9 Feb 2024 14:14:10 +0000 Subject: [PATCH 389/775] amdgpu_plugin: fix lint errors $ make lint ... # Do not append \n to pr_perror, pr_pwarn or fail ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>.*\\n"' plugins/amdgpu/amdgpu_plugin.c: pr_perror("%s(), Can't handle VMAs of input device\n", __func__); ! 
git --no-pager grep -En '^\s*\.*);$' | grep -v '\\n' plugins/amdgpu/amdgpu_plugin_drm.c:45: pr_err("Error in getting stat for: %s", path); plugins/amdgpu/amdgpu_plugin_util.c:77: pr_err("Unable to read file (read:%ld buf_len:%ld)", len_read, buf_len); plugins/amdgpu/amdgpu_plugin_util.c:89: pr_err("Unable to write file (wrote:%ld buf_len:%ld)", len_write, buf_len); plugins/amdgpu/amdgpu_plugin_util.c:120: pr_err("%s: Failed to open for %s", path, write ? "write" : "read"); plugins/amdgpu/amdgpu_plugin_util.c:126: pr_err("%s: Failed get pointer for %s", path, write ? "write" : "read"); plugins/amdgpu/amdgpu_plugin_util.c:136: pr_err("%s:Failed to access file size", path); plugins/amdgpu/amdgpu_plugin_util.c:152: pr_err("Cannot fopen %s", file_path); make: *** [Makefile:470: lint] Error 1 Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 2 +- plugins/amdgpu/amdgpu_plugin_drm.c | 3 +-- plugins/amdgpu/amdgpu_plugin_util.c | 16 +++++++--------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index a579158d0..a41469a50 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -412,7 +412,7 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) /* Determine if input is a DRM device and therefore is supported */ ret = amdgpu_plugin_drm_handle_device_vma(fd, st_buf); if (ret) - pr_perror("%s(), Can't handle VMAs of input device\n", __func__); + pr_perror("%s(), Can't handle VMAs of input device", __func__); return ret; } diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index 689d62072..d54cd937d 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -42,7 +42,7 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st) snprintf(path, sizeof(path), AMDGPU_DRM_DEVICE, DRM_FIRST_RENDER_NODE); ret = stat(path, &drm); if (ret == -1) { - pr_err("Error 
in getting stat for: %s", path); + pr_err("Error in getting stat for: %s\n", path); return ret; } @@ -98,4 +98,3 @@ int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm) xfree(buf); return ret; } - diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c index 48ff70555..62e569fc8 100755 --- a/plugins/amdgpu/amdgpu_plugin_util.c +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -65,7 +65,7 @@ void init_gpu_count(struct tp_system *topo) return; /* We add ONE to include checkpointing of KFD device */ - dev_file_cnt = 1 + topology_gpu_count(topo); + dev_file_cnt = 1 + topology_gpu_count(topo); } int read_fp(FILE *fp, void *buf, const size_t buf_len) @@ -74,7 +74,7 @@ int read_fp(FILE *fp, void *buf, const size_t buf_len) len_read = fread(buf, 1, buf_len, fp); if (len_read != buf_len) { - pr_err("Unable to read file (read:%ld buf_len:%ld)", len_read, buf_len); + pr_err("Unable to read file (read:%ld buf_len:%ld)\n", len_read, buf_len); return -EIO; } return 0; @@ -86,7 +86,7 @@ int write_fp(FILE *fp, const void *buf, const size_t buf_len) len_write = fwrite(buf, 1, buf_len, fp); if (len_write != buf_len) { - pr_err("Unable to write file (wrote:%ld buf_len:%ld)", len_write, buf_len); + pr_err("Unable to write file (wrote:%ld buf_len:%ld)\n", len_write, buf_len); return -EIO; } return 0; @@ -117,13 +117,13 @@ FILE *open_img_file(char *path, bool write, size_t *size) fd = openat(criu_get_image_dir(), path, write ? (O_WRONLY | O_CREAT) : O_RDONLY, 0600); if (fd < 0) { - pr_err("%s: Failed to open for %s", path, write ? "write" : "read"); + pr_err("%s: Failed to open for %s\n", path, write ? "write" : "read"); return NULL; } fp = fdopen(fd, write ? "w" : "r"); if (!fp) { - pr_err("%s: Failed get pointer for %s", path, write ? "write" : "read"); + pr_err("%s: Failed get pointer for %s\n", path, write ? 
"write" : "read"); return NULL; } @@ -133,7 +133,7 @@ FILE *open_img_file(char *path, bool write, size_t *size) ret = read_fp(fp, size, sizeof(*size)); if (ret) { - pr_err("%s:Failed to access file size", path); + pr_err("%s:Failed to access file size\n", path); fclose(fp); return NULL; } @@ -149,7 +149,7 @@ int read_file(const char *file_path, void *buf, const size_t buf_len) fp = fopen(file_path, "r"); if (!fp) { - pr_err("Cannot fopen %s", file_path); + pr_err("Cannot fopen %s\n", file_path); return -errno; } @@ -204,5 +204,3 @@ void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list) } pr_info("\n"); } - - From e0f91e66ee7d4b5e52ae52cc6b891d1ffc0a0f7c Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 12 Feb 2024 11:52:41 +0000 Subject: [PATCH 390/775] kerndat: check support for PAGE_IS_SOFT_DIRTY The commit introducing PAGE_IS_SOFT_DIRTY has not been merged in kernel v6.7.x. fs/proc/task_mmu: report SOFT_DIRTY bits through the PAGEMAP_SCAN ioctl https://github.com/torvalds/linux/commit/e6a9a2cbc13bf As a result, CRIU fails with the following error: Error (criu/pagemap-cache.c:199): pagemap-cache: PAGEMAP_SCAN: Invalid argument' Error (criu/pagemap-cache.c:225): pagemap-cache: Failed to fill cache for 63 (400000-402000)' This patch updates check_pagemap() in kerndat to check if PAGE_IS_SOFT_DIRTY is supported. 
Fixes: #2334 Signed-off-by: Radostin Stoyanov --- criu/kerndat.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index 95e7226b2..e3b378a9c 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -63,6 +63,14 @@ static int check_pagemap(void) { int ret, fd, retry; u64 pfn = 0; + struct pm_scan_arg args = { + .size = sizeof(struct pm_scan_arg), + .flags = 0, + .category_inverted = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_mask = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED, + .return_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_SOFT_DIRTY, + }; fd = __open_proc(PROC_SELF, EPERM, O_RDONLY, "pagemap"); if (fd < 0) { @@ -75,15 +83,11 @@ static int check_pagemap(void) return -1; } - if (ioctl(fd, PAGEMAP_SCAN, NULL) == 0) { - pr_err("PAGEMAP_SCAN succeeded unexpectedly\n"); - return -1; + if (ioctl(fd, PAGEMAP_SCAN, &args) == 0) { + pr_debug("PAGEMAP_SCAN is supported\n"); + kdat.has_pagemap_scan = true; } else { switch (errno) { - case EFAULT: - pr_debug("PAGEMAP_SCAN is supported\n"); - kdat.has_pagemap_scan = true; - break; case EINVAL: case ENOTTY: pr_debug("PAGEMAP_SCAN isn't supported\n"); From 7fd4a15e68f1af91adf4ce6ebd5a40cd57e35661 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 18 Jan 2024 14:26:33 +0000 Subject: [PATCH 391/775] pb2dict: fix flake8 error This patch fixes the following flake8 error: python3 -m flake8 --config=scripts/flake8.cfg lib/pycriu/images/pb2dict.py lib/pycriu/images/pb2dict.py:361:43: E721 do not compare types, for exact checks use `is` / `is not`, for instance checks use `isinstance()` Signed-off-by: Radostin Stoyanov --- lib/pycriu/images/pb2dict.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index 3f5f390e3..d29fdf66c 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -358,7 +358,10 @@ def 
pb2dict(pb, pretty=False, is_hex=False): else: d_val = _pb2dict_cast(field, value, pretty, is_hex) - d[field.name] = d_val.decode() if type(d_val) == bytes else d_val + try: + d[field.name] = d_val.decode() + except (UnicodeDecodeError, AttributeError): + d[field.name] = d_val return d From e0b74f558b3e32d8d512836307ab3ea26ed41659 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 9 Feb 2024 13:04:58 +0000 Subject: [PATCH 392/775] make: replace flake8 with ruff Ruff (https://github.com/astral-sh/ruff) is a Python linter written in Rust, designed to replace Flake8. It is significantly faster and actively maintained. In addition to replacing flake8 with ruff, this patch also creates separate makefile targets for ruff, shellcheck and codespell, so that they can be tested independently. RUFF_FLAGS can be used to specify options such as '--fix'. Example: make lint make ruff RUFF_FLAGS=--fix Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 4 +-- .github/workflows/lint.yml | 2 +- CONTRIBUTING.md | 2 +- Makefile | 37 ++++++++++++++---------- scripts/build/Dockerfile.alpine | 1 - scripts/build/Dockerfile.archlinux | 1 - scripts/build/Dockerfile.centos8 | 1 - scripts/ci/prepare-for-fedora-rawhide.sh | 1 - scripts/ci/run-ci-tests.sh | 2 +- scripts/ci/vagrant.sh | 4 +-- scripts/ruff.toml | 4 +++ 11 files changed, 33 insertions(+), 26 deletions(-) create mode 100644 scripts/ruff.toml diff --git a/.cirrus.yml b/.cirrus.yml index adaa9be33..72135590d 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -36,7 +36,7 @@ task: ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml 
python3-importlib-metadata python-flake8 xmlto libdrm-devel + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel # The image has a too old version of nettle which does not work with gnutls. # Just upgrade to the latest to make the error go away. dnf -y upgrade nettle nettle-devel @@ -111,7 +111,7 @@ task: yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm || : yum install -y dnf-plugins-core yum config-manager --set-enabled powertools - yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-protobuf python3-importlib-metadata python3-junit_xml xmlto libdrm-devel + yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-PyYAML python3-protobuf python3-importlib-metadata python3-junit_xml xmlto libdrm-devel alternatives --set python /usr/bin/python3 systemctl stop sssd # Even with selinux in permissive mode the selinux tests will be executed diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index f52bce812..489259474 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -14,7 +14,7 @@ jobs: image: registry.fedoraproject.org/fedora:latest steps: - name: Install tools - run: sudo dnf -y install git make python3-flake8 xz clang-tools-extra which codespell git-clang-format ShellCheck + run: sudo dnf -y install git make ruff xz clang-tools-extra which codespell git-clang-format ShellCheck - uses: actions/checkout@v2 diff --git 
a/CONTRIBUTING.md b/CONTRIBUTING.md index a70506bfb..37965e5fb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -59,7 +59,7 @@ Other conventions can be learned from the source code itself. In short, make sur Important: These tools are there to advise you, but should not be considered as a "source of truth", as tools also make nasty mistakes from time to time which can completely break code readability. -The following command can be used to automatically run a code linter for Python files (flake8), Shell scripts (shellcheck), +The following command can be used to automatically run a code linter for Python files (ruff), Shell scripts (shellcheck), text spelling (codespell), and a number of CRIU-specific checks (usage of print macros and EOL whitespace for C files). ``` diff --git a/Makefile b/Makefile index 31dbe202f..e49dace7a 100644 --- a/Makefile +++ b/Makefile @@ -426,20 +426,23 @@ help: @echo ' amdgpu_plugin - Make AMD GPU plugin' .PHONY: help -lint: - flake8 --version - flake8 --config=scripts/flake8.cfg test/zdtm.py - flake8 --config=scripts/flake8.cfg test/inhfd/*.py - flake8 --config=scripts/flake8.cfg test/others/rpc/config_file.py - flake8 --config=scripts/flake8.cfg lib/pycriu/images/pb2dict.py - flake8 --config=scripts/flake8.cfg lib/pycriu/images/images.py - flake8 --config=scripts/flake8.cfg scripts/criu-ns - flake8 --config=scripts/flake8.cfg test/others/criu-ns/run.py - flake8 --config=scripts/flake8.cfg crit/*.py - flake8 --config=scripts/flake8.cfg crit/crit/*.py - flake8 --config=scripts/flake8.cfg scripts/uninstall_module.py - flake8 --config=scripts/flake8.cfg coredump/ coredump/coredump - flake8 --config=scripts/flake8.cfg scripts/github-indent-warnings.py +ruff: + @ruff --version + ruff ${RUFF_FLAGS} --config=scripts/ruff.toml \ + test/zdtm.py \ + test/inhfd/*.py \ + test/others/rpc/config_file.py \ + lib/pycriu/images/pb2dict.py \ + lib/pycriu/images/images.py \ + scripts/criu-ns \ + test/others/criu-ns/run.py \ + crit/*.py \ + 
crit/crit/*.py \ + scripts/uninstall_module.py \ + coredump/ coredump/coredump \ + scripts/github-indent-warnings.py + +shellcheck: shellcheck --version shellcheck scripts/*.sh shellcheck scripts/ci/*.sh scripts/ci/apt-install @@ -448,7 +451,11 @@ lint: shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh shellcheck -x test/others/config-file/*.sh shellcheck -x test/others/action-script/*.sh + +codespell: codespell -S tags + +lint: ruff shellcheck codespell # Do not append \n to pr_perror, pr_pwarn or fail ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>.*\\n"' # Do not use %m with pr_* or fail @@ -459,7 +466,7 @@ lint: ! git --no-pager grep -En '^\s*\.*);$$' | grep -v '\\n' # No EOL whitespace for C files ! git --no-pager grep -E '\s+$$' \*.c \*.h -.PHONY: lint +.PHONY: lint ruff shellcheck codespell codecov: SHELL := $(shell which bash) codecov: diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index 2c58c910e..329d7791d 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -41,7 +41,6 @@ RUN apk add \ go \ e2fsprogs \ py-yaml \ - py3-flake8 \ py3-importlib-metadata \ asciidoctor diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index b9968e876..405651489 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -31,7 +31,6 @@ RUN pacman -Syu --noconfirm \ bash \ go \ python-yaml \ - flake8 \ asciidoctor \ python-junit-xml \ python-importlib-metadata \ diff --git a/scripts/build/Dockerfile.centos8 b/scripts/build/Dockerfile.centos8 index b06524674..a67212344 100644 --- a/scripts/build/Dockerfile.centos8 +++ b/scripts/build/Dockerfile.centos8 @@ -26,7 +26,6 @@ RUN yum install -y --allowerasing \ protobuf-c-devel \ protobuf-devel \ python3-devel \ - python3-flake8 \ python3-PyYAML \ python3-protobuf \ python3-pip \ diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh 
index d812c5faa..09085c403 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -23,7 +23,6 @@ dnf install -y \ procps-ng \ protobuf-c-devel \ protobuf-devel \ - python3-flake8 \ python3-PyYAML \ python3-protobuf \ python3-junit_xml \ diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index ef7e869e0..2fdecbc97 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -4,7 +4,7 @@ set -x -e CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor - libnl-route-3-dev time flake8 libbsd-dev python3-yaml + libnl-route-3-dev time libbsd-dev python3-yaml libperl-dev pkg-config python3-protobuf python3-pip python3-importlib-metadata python3-junit.xml libdrm-dev) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index c8cf0be74..4c1be3544 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -38,8 +38,8 @@ setup() { ssh default sudo dnf upgrade -y ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ - protobuf-devel python3-flake8 python3-protobuf python3-importlib-metadata \ - python3-junit_xml rubygem-asciidoctor iptables libselinux-devel libbpf-devel + protobuf-devel python3-protobuf python3-importlib-metadata python3-junit_xml \ + rubygem-asciidoctor iptables libselinux-devel libbpf-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd ssh default cat /proc/cmdline diff --git a/scripts/ruff.toml b/scripts/ruff.toml new file mode 100644 index 000000000..2b0385976 --- /dev/null +++ b/scripts/ruff.toml @@ -0,0 +1,4 @@ +# Ignore `F401` (unused imports) in all `__init__.py` files +[lint.per-file-ignores] +"__init__.py" = ["F401"] + From
835afb1b88b1dd05e0628143ec48263b0edd2e16 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 9 Feb 2024 13:47:55 +0000 Subject: [PATCH 393/775] criu-ns: fix lint error This patch fixes the following lint error: scripts/criu-ns:219:16: E713 [*] Test for membership should be `not in` The change in this patch is auto-generated with `ruff --fix`. Signed-off-by: Radostin Stoyanov --- scripts/criu-ns | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/criu-ns b/scripts/criu-ns index 4c032aa14..5950d7c50 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -216,7 +216,7 @@ def wrap_restore(): def get_varg(args): for i in range(1, len(sys.argv)): - if not sys.argv[i] in args: + if sys.argv[i] not in args: continue if i + 1 >= len(sys.argv): From a0a6ec3dc078262c28673ebb00eb91c6064820df Mon Sep 17 00:00:00 2001 From: Stepan Pieshkin Date: Fri, 2 Feb 2024 07:03:05 +0000 Subject: [PATCH 394/775] cgroup: Add support for restoring a thread in a correct v1 cgroup Currently we have checkpoint/restore support only for cgroup v2 threaded controllers. Threads originating in cgroup v1 environments will be restored to the main thread's cgroup. This change extends the support to cgroup v1. Signed-off-by: Stepan Pieshkin --- criu/cgroup.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index 67282f269..6d1f74457 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -427,10 +427,11 @@ static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const } /* - * Set the is_threaded flag if cgroup.type's value is threaded, - * ignore all other values. + * Set the is_threaded flag if cgroup.type's value is threaded + * or it is a cgroup v1 (it has a 'tasks' property). + * Ignore all other values.
*/ - if (!strcmp("cgroup.type", prop->name) && !strcmp("threaded", prop->value)) + if ((!strcmp("cgroup.type", prop->name) && !strcmp("threaded", prop->value)) || !strcmp("tasks", prop->name)) controller->is_threaded = true; pr_info("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name); @@ -1922,7 +1923,7 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) if (ctrl->cnames[0][0] == 0) fstype = "cgroup2"; - pr_debug("\tMaking controller dir %s (%s)\n", paux, opt); + pr_debug("\tMaking controller dir %s (%s), type %s\n", paux, opt, fstype); if (mkdir(paux, 0700)) { pr_perror("\tCan't make controller dir %s", paux); return -1; @@ -1985,6 +1986,7 @@ static int cgroupd(int sk) CgMemberEntry *ce = cg_set_entry->ctls[i]; char aux[PATH_MAX]; CgControllerEntry *ctrl = NULL; + const char *format; for (j = 0; j < n_controllers; j++) { CgControllerEntry *cur = controllers[j]; @@ -2008,7 +2010,8 @@ static int cgroupd(int sk) continue; aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0); - snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/cgroup.threads", ce->path); + format = ctrl->cnames[0][0] ? 
"/%s/tasks" : "/%s/cgroup.threads"; + snprintf(aux + aux_off, sizeof(aux) - aux_off, format, ce->path); /* * Cgroupd runs outside of the namespaces so we don't From f590c2b638cd83bd3ce3c1af2759ce0393e77bca Mon Sep 17 00:00:00 2001 From: Stepan Pieshkin Date: Fri, 2 Feb 2024 07:05:51 +0000 Subject: [PATCH 395/775] zdtm/static: check that cgroup layout of threads is preserved Co-developed-by: Stepan Pieshkin Signed-off-by: Stepan Pieshkin Signed-off-by: Michal Clapinski Signed-off-by: Andrei Vagin --- test/zdtm/static/Makefile | 3 + test/zdtm/static/cgroup_threads.c | 184 +++++++++++++++++++++++++++ test/zdtm/static/cgroup_threads.desc | 1 + test/zdtm/static/cgroup_threads.hook | 19 +++ 4 files changed, 207 insertions(+) create mode 100644 test/zdtm/static/cgroup_threads.c create mode 100644 test/zdtm/static/cgroup_threads.desc create mode 100755 test/zdtm/static/cgroup_threads.hook diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index fb856d55b..548cefac2 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -402,6 +402,7 @@ TST_DIR = \ cgroup_ignore \ cgroup_stray \ cgroup_yard \ + cgroup_threads \ unlink_fstat04 \ unlink_fstat041 \ mntns_remap \ @@ -684,6 +685,8 @@ s390x_gs_threads: LDFLAGS += -pthread thread_different_uid_gid: LDLIBS += -pthread -lcap +cgroup_threads: LDFLAGS += -pthread + bpf_hash: LDLIBS += -lbpf bpf_array: LDLIBS += -lbpf diff --git a/test/zdtm/static/cgroup_threads.c b/test/zdtm/static/cgroup_threads.c new file mode 100644 index 000000000..2c17e13a7 --- /dev/null +++ b/test/zdtm/static/cgroup_threads.c @@ -0,0 +1,184 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Check that cgroup layout of threads is preserved"; +const char *test_author = "Michał Cłapiński "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup directory name", 1); +static const char *cgname = "zdtmtst"; +#define SUBNAME "subcg_threads" 
+#define SUBNAME2 SUBNAME "/subsubcg" + +#define exit_group(code) syscall(__NR_exit_group, code) + +static int cg_move(char *name) +{ + int cgfd, l; + char paux[256]; + + sprintf(paux, "%s/%s", dirname, name); + if (mkdir(paux, 0600)) { + pr_perror("Can't create %s", paux); + return -1; + } + + sprintf(paux, "%s/%s/tasks", dirname, name); + + cgfd = open(paux, O_WRONLY); + if (cgfd < 0) { + pr_perror("Can't open tasks"); + return -1; + } + + l = write(cgfd, "0", 2); + close(cgfd); + + if (l < 0) { + pr_perror("Can't move self to subcg"); + return -1; + } + + return 0; +} + +static int cg_check(char *name) +{ + int found = 0; + FILE *cgf; + char paux[256], aux[128]; + + cgf = fopen("/proc/thread-self/cgroup", "r"); + if (cgf == NULL) + return -1; + + sprintf(aux, "name=%s:/%s", cgname, name); + while (fgets(paux, sizeof(paux), cgf)) { + char *s; + + s = strchr(paux, ':') + 1; + s[strlen(s) - 1] = '\0'; + test_msg("CMP [%s] vs [%s]\n", s, aux); + if (!strcmp(s, aux)) { + found = 1; + break; + } + } + + fclose(cgf); + + return found ? 0 : -1; +} + +int th_sync[2], rst_sync[2]; + +void *thread_fn(void *args) +{ + int status = cg_move(SUBNAME2); + + if (write(th_sync[1], &status, sizeof(status)) != sizeof(status)) { + pr_perror("write"); + exit_group(1); + } + + if (status == 0) { + if (read(rst_sync[0], &status, sizeof(status)) < 0) { + pr_perror("read"); + exit_group(1); + } + + status = cg_check(SUBNAME2); + if (write(th_sync[1], &status, sizeof(status)) != sizeof(status)) { + pr_perror("write"); + exit_group(1); + } + } + + pthread_exit(0); +} + +int main(int argc, char **argv) +{ + int status, exit_code = 1; + pthread_t thread; + char aux[64]; + + test_init(argc, argv); + + /* + * Pipe to talk to the kid. + * First, it reports that it's ready (int), + * then it reports the restore status (int). 
+ */ + + if (pipe(th_sync)) { + pr_perror("pipe"); + return 1; + } + + /* "Restore happened" pipe */ + if (pipe(rst_sync)) { + pr_perror("pipe"); + return 1; + } + + if (mkdir(dirname, 0700) < 0) { + pr_perror("Can't make dir"); + goto out; + } + + sprintf(aux, "none,name=%s", cgname); + if (mount("none", dirname, "cgroup", 0, aux)) { + pr_perror("Can't mount cgroups"); + goto out_rd; + } + + if (cg_move(SUBNAME)) + goto out_rs; + + if (pthread_create(&thread, NULL, thread_fn, NULL)) { + pr_perror("Can't create a new thread"); + goto out_rs; + } + + status = -1; + read(th_sync[0], &status, sizeof(status)); + if (status != 0) { + pr_perror("Error moving into cgroups"); + close(rst_sync[0]); + goto out_rs; + } + + test_daemon(); + test_waitsig(); + + close(rst_sync[1]); + + status = -1; + if (read(th_sync[0], &status, sizeof(status)) < 0) { + pr_perror("read"); + goto out_rs; + } + if (status != 0) { + fail("child cg changed"); + goto out_rs; + } + + pass(); + exit_code = 0; + +out_rs: + umount(dirname); +out_rd: + rmdir(dirname); +out: + return exit_code; +} diff --git a/test/zdtm/static/cgroup_threads.desc b/test/zdtm/static/cgroup_threads.desc new file mode 100644 index 000000000..3c6c4a7e2 --- /dev/null +++ b/test/zdtm/static/cgroup_threads.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroup_threads.hook b/test/zdtm/static/cgroup_threads.hook new file mode 100755 index 000000000..f4b553d34 --- /dev/null +++ b/test/zdtm/static/cgroup_threads.hook @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 + +tname=$(mktemp -d cgclean.XXXXXX) +trap 'rmdir "${tname}"' EXIT + +mount -t cgroup none $tname -o "none,name=zdtmtst" +trap 'umount "${tname}"; rmdir "${tname}"' EXIT + +echo "Cleaning $tname" + +rmdir "$tname/subcg_threads/subsubcg/" || true +rmdir "$tname/subcg_threads/" || true + +echo "Left there is:" +ls "$tname" From 
0b8c51eaad9d3293aa77cf64cf56742fb2ccf1ed Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Wed, 25 May 2022 20:04:59 +0300 Subject: [PATCH 396/775] compiler: add ALIGN_DOWN macro Signed-off-by: Mike Rapoport (IBM) --- include/common/compiler.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/common/compiler.h b/include/common/compiler.h index 1347b6236..3e66709f9 100644 --- a/include/common/compiler.h +++ b/include/common/compiler.h @@ -89,6 +89,7 @@ #define round_down(x, y) ((x) & ~__round_mask(x, y)) #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) #define ALIGN(x, a) (((x) + (a)-1) & ~((a)-1)) +#define ALIGN_DOWN(x, a) ALIGN((x) - ((a) - 1), (a)) #define min(x, y) \ ({ \ From 17f4dd09593e7b5a42648985f8fa72905349271d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 15 May 2022 10:04:16 +0300 Subject: [PATCH 397/775] compel: always pass user_fpregs_struct_t to compel_get_task_regs() All architectures create on-stack structure for floating point save area in compel_get_task_regs() if the caller passes NULL rather than a valid pointer. The only place that calls compel_get_task_regs() with NULL for floating point save area is parasite_start_daemon() and it is simpler to define this structure on stack of parasite_start_daemon(). The availability of floating point save data is required in parasite_start_daemon() to detect shadow stack presence early during parasite infection and will be used in later patches.
Signed-off-by: Mike Rapoport (IBM) --- compel/arch/aarch64/src/lib/infect.c | 3 +-- compel/arch/arm/src/lib/infect.c | 3 +-- compel/arch/mips/src/lib/infect.c | 3 +-- compel/arch/ppc64/src/lib/infect.c | 3 +-- compel/arch/s390/src/lib/infect.c | 3 +-- compel/arch/x86/src/lib/infect.c | 3 +-- compel/src/lib/infect.c | 3 ++- 7 files changed, 8 insertions(+), 13 deletions(-) diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index d0189f003..812ba34a3 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -59,10 +59,9 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t tmp, *fpsimd = ext_regs ? ext_regs : &tmp; struct iovec iov; int ret; diff --git a/compel/arch/arm/src/lib/infect.c b/compel/arch/arm/src/lib/infect.c index 7700f52ca..8b810a88f 100644 --- a/compel/arch/arm/src/lib/infect.c +++ b/compel/arch/arm/src/lib/infect.c @@ -65,10 +65,9 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr } #define PTRACE_GETVFPREGS 27 -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *vfp, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t tmp, *vfp = ext_regs ? 
ext_regs : &tmp; int ret = -1; pr_info("Dumping GP/FPU registers for %d\n", pid); diff --git a/compel/arch/mips/src/lib/infect.c b/compel/arch/mips/src/lib/infect.c index afa0f5ed5..0e98aaee3 100644 --- a/compel/arch/mips/src/lib/infect.c +++ b/compel/arch/mips/src/lib/infect.c @@ -119,10 +119,9 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *xs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t xsave = {}, *xs = ext_regs ? ext_regs : &xsave; int ret = -1; pr_info("Dumping GP/FPU registers for %d\n", pid); diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c index 1603ac92e..84c2b1d7c 100644 --- a/compel/arch/ppc64/src/lib/infect.c +++ b/compel/arch/ppc64/src/lib/infect.c @@ -391,10 +391,9 @@ static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_stru return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t tmp, *fpregs = ext_regs ? 
ext_regs : &tmp; int ret; ret = __get_task_regs(pid, regs, fpregs); diff --git a/compel/arch/s390/src/lib/infect.c b/compel/arch/s390/src/lib/infect.c index 3cd25e71d..85dfc3a4d 100644 --- a/compel/arch/s390/src/lib/infect.c +++ b/compel/arch/s390/src/lib/infect.c @@ -293,10 +293,9 @@ static int s390_disable_ri_bit(pid_t pid, user_regs_struct_t *regs) /* * Prepare task registers for restart */ -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t tmp, *fpregs = ext_regs ? ext_regs : &tmp; struct iovec iov; int rewind; diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index 88bdb4047..2febbf3f7 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -345,10 +345,9 @@ static int corrupt_extregs(pid_t pid) return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *xs, save_regs_t save, void *arg, unsigned long flags) { - user_fpregs_struct_t xsave = {}, *xs = ext_regs ? ext_regs : &xsave; int ret = -1; pr_info("Dumping general registers for %d in %s mode\n", pid, user_regs_native(regs) ? "native" : "compat"); diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index b9a913fa1..696daa7f1 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -739,6 +739,7 @@ static int parasite_start_daemon(struct parasite_ctl *ctl) { pid_t pid = ctl->rpid; struct infect_ctx *ictx = &ctl->ictx; + user_fpregs_struct_t ext_regs; /* * Get task registers before going daemon, since the @@ -746,7 +747,7 @@ static int parasite_start_daemon(struct parasite_ctl *ctl) * while in daemon it is not such. 
*/ - if (compel_get_task_regs(pid, &ctl->orig.regs, NULL, ictx->save_regs, ictx->regs_arg, ictx->flags)) { + if (compel_get_task_regs(pid, &ctl->orig.regs, &ext_regs, ictx->save_regs, ictx->regs_arg, ictx->flags)) { pr_err("Can't obtain regs for thread %d\n", pid); return -1; } From 6e491a19a33496abc93ba3ce59a53c01d03a1ca6 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Mon, 21 Mar 2022 09:34:41 +0200 Subject: [PATCH 398/775] compel: shstk: save CET state when CPU supports it Signed-off-by: Mike Rapoport (IBM) --- .../arch/x86/src/lib/include/uapi/asm/cpu.h | 1 + .../arch/x86/src/lib/include/uapi/asm/fpu.h | 11 +++- .../src/lib/include/uapi/asm/infect-types.h | 3 + compel/arch/x86/src/lib/infect.c | 65 ++++++++++++++++++- compel/include/uapi/infect.h | 8 +++ criu/arch/x86/crtools.c | 17 +++++ images/core-x86.proto | 8 +++ 7 files changed, 111 insertions(+), 2 deletions(-) diff --git a/compel/arch/x86/src/lib/include/uapi/asm/cpu.h b/compel/arch/x86/src/lib/include/uapi/asm/cpu.h index 63ff83dbe..11c50e0e5 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/cpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/cpu.h @@ -244,6 +244,7 @@ enum cpuid_leafs { #define X86_FEATURE_PKU (11 * 32 + 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (11 * 32 + 4) /* OS Protection Keys Enable */ #define X86_FEATURE_AVX512_VBMI2 (11 * 32 + 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_SHSTK (11 * 32 + 7) /* Shadow Stack */ #define X86_FEATURE_GFNI (11 * 32 + 8) /* Galois Field New Instructions */ #define X86_FEATURE_VAES (11 * 32 + 9) /* Vector AES */ #define X86_FEATURE_VPCLMULQDQ (11 * 32 + 10) /* Carry-Less Multiplication Double Quadword */ diff --git a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h index 8c83dd9ae..d595a68fc 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h @@ -245,6 +245,14 @@ struct 
pkru_state { uint32_t pad; } __packed; +/* + * State component 11 is Control-flow Enforcement user states + */ +struct cet_user_state { + uint64_t cet; /* user control-flow settings */ + uint64_t ssp; /* user shadow stack pointer */ +}; + /* * This is our most modern FPU state format, as saved by the XSAVE * and restored by the XRSTOR instructions. @@ -260,7 +268,7 @@ struct pkru_state { * Of course it was not ;-) Now using four pages... * */ -#define EXTENDED_STATE_AREA_SIZE (XSAVE_SIZE - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct)) +#define EXTENDED_STATE_AREA_SIZE (XSAVE_SIZE - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct) - sizeof(struct cet_user_state)) /* * cpu requires it to be 64 byte aligned @@ -276,6 +284,7 @@ struct xsave_struct { struct ymmh_struct ymmh; uint8_t extended_state_area[EXTENDED_STATE_AREA_SIZE]; }; + struct cet_user_state cet; } __aligned(FP_MIN_ALIGN_BYTES) __packed; struct xsave_struct_ia32 { diff --git a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h index b35504ff8..2619fe64a 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h @@ -143,4 +143,7 @@ typedef struct xsave_struct user_fpregs_struct_t; */ #define __NR32_mmap __NR32_mmap2 +extern bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs); +#define compel_shstk_enabled __compel_shstk_enabled + #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index 2febbf3f7..aabb4f371 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -26,6 +26,16 @@ #ifndef NT_X86_XSTATE #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ #endif + +#ifndef NT_X86_SHSTK +#define NT_X86_SHSTK 0x204 /* x86 shstk state */ +#endif + +#ifndef ARCH_SHSTK_STATUS +#define ARCH_SHSTK_STATUS 0x5005 +#define 
ARCH_SHSTK_SHSTK (1ULL << 0) +#endif + #ifndef NT_PRSTATUS #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ #endif @@ -250,7 +260,49 @@ static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) // [1] Intel® 64 and IA-32 Architectures Software Developer's // Manual Volume 1: Basic Architecture // Section 13.6: Processor tracking of XSAVE-managed state - return get_task_fpregs(pid, xsave); + if (get_task_fpregs(pid, xsave)) + return -1; + } + + /* + * xsave may be on stack, if we don't clear it explicitly we get + * funky shadow stack state + */ + memset(&xsave->cet, 0, sizeof(xsave->cet)); + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + unsigned long ssp = 0; + unsigned long features = 0; + + if (ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long)&features, ARCH_SHSTK_STATUS)) { + /* + * kernels that don't support shadow stack return + * -EINVAL + */ + if (errno == EINVAL) + return 0; + + pr_perror("shstk: can't get shadow stack status for %d", pid); + return -1; + } + + if (!(features & ARCH_SHSTK_SHSTK)) + return 0; + + iov.iov_base = &ssp; + iov.iov_len = sizeof(ssp); + + if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { + /* ENODEV means CET is not supported by the CPU */ + if (errno != ENODEV) { + pr_perror("shstk: can't get SSP for %d", pid); + return -1; + } + } + + xsave->cet.cet = features; + xsave->cet.ssp = ssp; + + pr_debug("%d: shstk: cet: %lx ssp: %lx\n", pid, xsave->cet.cet, xsave->cet.ssp); } return 0; @@ -697,3 +749,14 @@ unsigned long compel_task_size(void) { return TASK_SIZE; } + +bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs) +{ + if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) + return false; + + if (ext_regs->cet.cet & ARCH_SHSTK_SHSTK) + return true; + + return false; +} diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 3bd36dda1..848d36c57 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -182,4 +182,12 @@ void 
compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v); extern void compel_get_stack(struct parasite_ctl *ctl, void **rstack, void **r_thread_stack); +#ifndef compel_shstk_enabled +static inline bool compel_shstk_enabled(user_fpregs_struct_t *ext_regs) +{ + return false; +} +#define compel_shstk_enabled +#endif + #endif diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c index 912a4348b..e068a9a02 100644 --- a/criu/arch/x86/crtools.c +++ b/criu/arch/x86/crtools.c @@ -133,6 +133,14 @@ int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpre #undef assign_array #undef assign_xsave + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + UserX86CetEntry *cet = core->thread_info->fpregs->xsave->cet; + struct cet_user_state *regs = &fpregs->cet; + + cet->cet = regs->cet; + cet->ssp = regs->ssp; + } + return 0; } @@ -199,6 +207,13 @@ static int alloc_xsave_extends(UserX86XsaveEntry *xsave) goto err; } + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + xsave->cet = xzalloc(sizeof(UserX86CetEntry)); + if (!xsave->cet) + goto err; + user_x86_cet_entry__init(xsave->cet); + } + return 0; err: return -1; @@ -220,6 +235,8 @@ int arch_alloc_thread_info(CoreEntry *core) with_xsave = compel_cpu_has_feature(X86_FEATURE_OSXSAVE); if (with_xsave) sz += sizeof(UserX86XsaveEntry); + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) + sz += sizeof(UserX86CetEntry); } m = xmalloc(sz); diff --git a/images/core-x86.proto b/images/core-x86.proto index 815cf21ff..762418d73 100644 --- a/images/core-x86.proto +++ b/images/core-x86.proto @@ -41,6 +41,11 @@ message user_x86_regs_entry { optional user_x86_regs_mode mode = 28 [default = NATIVE]; } +message user_x86_cet_entry { + required uint64 cet = 1[(criu).hex = true]; + required uint64 ssp = 2[(criu).hex = true]; +} + message user_x86_xsave_entry { /* standard xsave features */ required uint64 xstate_bv = 1; @@ -60,6 +65,9 @@ message user_x86_xsave_entry { /* Protected keys */ repeated uint32 pkru = 8; 
+ /* CET */ + optional user_x86_cet_entry cet = 9; + /* * Processor trace (PT) and hardware duty cycling (HDC) * are supervisor state components and only managed by From 63a45e1c8aac0e46149ad82637de0d22227c9663 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 15 May 2022 19:58:04 +0300 Subject: [PATCH 399/775] compel: infect: prepare parasite_service() for addition of CET support To support sigreturn with CET enabled parasite must rewind its stack before calling sigreturn so that shadow stack will be compatible with actual calling sequence. In addition, calling sigreturn from top level routine (__export_parasite_head_start) will significantly simplify the shadow stack manipulations required to execute sigreturn. For x86 make fini_sigreturn() return the stack pointer for the signal frame that will be used by sigreturn and propagate that return value up to __export_parasite_head_start. In non-daemon mode parasite_trap_cmd() returns non-positive value which allows to distinguish daemon and non-daemon mode and properly stop at int3 in non-daemon mode. Architectures other than x86 remain unchanged and will still call sigreturn from fini_sigreturn(). Signed-off-by: Mike Rapoport (IBM) --- compel/arch/x86/plugins/std/parasite-head.S | 14 +++++++++ .../x86/src/lib/include/uapi/asm/sigframe.h | 10 ++++++- compel/plugins/include/uapi/std/infect.h | 2 +- compel/plugins/std/infect.c | 30 +++++++++---------- criu/pie/restorer.c | 6 +++- 5 files changed, 44 insertions(+), 18 deletions(-) diff --git a/compel/arch/x86/plugins/std/parasite-head.S b/compel/arch/x86/plugins/std/parasite-head.S index 4fb38d1f1..42cad4808 100644 --- a/compel/arch/x86/plugins/std/parasite-head.S +++ b/compel/arch/x86/plugins/std/parasite-head.S @@ -34,7 +34,21 @@ END(__export_parasite_head_start_compat) .code64 #endif +/* + * When parasite_service() runs in the daemon mode it will return the stack + * pointer for the sigreturn frame in %rax and we call sigreturn directly + * from here. 
+ * Since a valid stack pointer is positive, it is safe to presume that + * return value <= 0 means that parasite_service() called parasite_trap_cmd() + * in non-daemon mode, and the parasite should stop at int3. + */ ENTRY(__export_parasite_head_start) call parasite_service + cmp $0, %rax + jle 1f + movq %rax, %rsp + movq $15, %rax + syscall +1: int $0x03 END(__export_parasite_head_start) diff --git a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h index ec8c156fa..9a540694b 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h @@ -203,13 +203,21 @@ static inline void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) : "rdi"(new_sp) \ : "eax", "r8", "r9", "r10", "r11", "memory") -#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ +#define ARCH_RT_SIGRETURN_RST(new_sp, rt_sigframe) \ do { \ if ((rt_sigframe)->is_native) \ ARCH_RT_SIGRETURN_NATIVE(new_sp); \ else \ ARCH_RT_SIGRETURN_COMPAT(new_sp); \ } while (0) + +#define ARCH_RT_SIGRETURN_DUMP(new_sp, rt_sigframe) \ +do { \ + if ((rt_sigframe)->is_native) \ + return new_sp; \ + else \ + ARCH_RT_SIGRETURN_COMPAT(new_sp); \ +} while (0) /* clang-format off */ int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, diff --git a/compel/plugins/include/uapi/std/infect.h b/compel/plugins/include/uapi/std/infect.h index 08a5a7a80..a729abbd2 100644 --- a/compel/plugins/include/uapi/std/infect.h +++ b/compel/plugins/include/uapi/std/infect.h @@ -7,7 +7,7 @@ extern int parasite_get_rpc_sock(void); extern unsigned int __export_parasite_service_cmd; extern void *__export_parasite_service_args_ptr; -extern int __must_check parasite_service(void); +extern unsigned long __must_check parasite_service(void); /* * Must be supplied by user plugins. 
diff --git a/compel/plugins/std/infect.c b/compel/plugins/std/infect.c index 60b21d313..034201320 100644 --- a/compel/plugins/std/infect.c +++ b/compel/plugins/std/infect.c @@ -16,6 +16,10 @@ #include "rpc-pie-priv.h" +#ifndef ARCH_RT_SIGRETURN_DUMP +#define ARCH_RT_SIGRETURN_DUMP ARCH_RT_SIGRETURN +#endif + static int tsock = -1; static struct rt_sigframe *sigframe; @@ -79,12 +83,13 @@ static int __parasite_daemon_wait_msg(struct ctl_msg *m) /* Core infect code */ -static noinline void fini_sigreturn(unsigned long new_sp) +static noinline unsigned long fini_sigreturn(unsigned long new_sp) { - ARCH_RT_SIGRETURN(new_sp, sigframe); + ARCH_RT_SIGRETURN_DUMP(new_sp, sigframe); + return new_sp; } -static int fini(void) +static unsigned long fini(void) { unsigned long new_sp; @@ -96,14 +101,14 @@ static int fini(void) sys_close(tsock); std_log_set_fd(-1); - fini_sigreturn(new_sp); + return fini_sigreturn(new_sp); BUG(); return -1; } -static noinline __used int noinline parasite_daemon(void *args) +static noinline __used unsigned long parasite_daemon(void *args) { struct ctl_msg m; int ret = -1; @@ -140,12 +145,10 @@ static noinline __used int noinline parasite_daemon(void *args) } out: - fini(); - - return 0; + return fini(); } -static noinline __used int parasite_init_daemon(void *data) +static noinline __used unsigned long parasite_init_daemon(void *data) { struct parasite_init_args *args = data; int ret; @@ -178,14 +181,11 @@ static noinline __used int parasite_init_daemon(void *data) } else goto err; - parasite_daemon(data); + return parasite_daemon(data); err: futex_set_and_wake(&args->daemon_connected, ret); - fini(); - BUG(); - - return -1; + return fini(); } #ifndef __parasite_entry @@ -203,7 +203,7 @@ err: unsigned int __export_parasite_service_cmd = 0; void *__export_parasite_service_args_ptr = NULL; -int __used __parasite_entry parasite_service(void) +unsigned long __used __parasite_entry parasite_service(void) { unsigned int cmd = 
__export_parasite_service_cmd; void *args = __export_parasite_service_args_ptr; diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 02971657e..20c6801c5 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -78,6 +78,10 @@ #define FALLOC_FL_PUNCH_HOLE 0x02 #endif +#ifndef ARCH_RT_SIGRETURN_RST +#define ARCH_RT_SIGRETURN_RST ARCH_RT_SIGRETURN +#endif + #define sys_prctl_safe(opcode, val1, val2, val3) \ ({ \ long __ret = sys_prctl(opcode, val1, val2, val3, 0); \ @@ -631,7 +635,7 @@ static int restore_thread_common(struct thread_restore_args *args) static void noinline rst_sigreturn(unsigned long new_sp, struct rt_sigframe *sigframe) { - ARCH_RT_SIGRETURN(new_sp, sigframe); + ARCH_RT_SIGRETURN_RST(new_sp, sigframe); } static int send_cg_set(int sk, int cg_set) From 0aba3dcfa17929dbee316a398c6207d4cdf14f58 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 15 May 2022 21:06:50 +0300 Subject: [PATCH 400/775] compel: shstk: prepare shadow stack signal frame When calling sigreturn with CET enabled, the kernel verifies that the shadow stack has proper address of sa_restorer and a "restore token". Normally, they are pushed to the shadow stack when signal processing is started. Since compel calls sigreturn directly, the shadow stack should be updated to match the kernel expectations for sigreturn invocation. Add parasite_setup_shstk() that sets up the shadow stack with the address of __export_parasite_head_start as sa_restorer and with the required restore token. 
Signed-off-by: Mike Rapoport (IBM) --- .../src/lib/include/uapi/asm/infect-types.h | 4 ++ compel/arch/x86/src/lib/infect.c | 45 +++++++++++++++++++ compel/include/uapi/infect.h | 9 ++++ compel/src/lib/infect.c | 3 ++ 4 files changed, 61 insertions(+) diff --git a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h index 2619fe64a..b998c488c 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h @@ -146,4 +146,8 @@ typedef struct xsave_struct user_fpregs_struct_t; extern bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs); #define compel_shstk_enabled __compel_shstk_enabled +extern int __parasite_setup_shstk(struct parasite_ctl *ctl, + user_fpregs_struct_t *ext_regs); +#define parasite_setup_shstk __parasite_setup_shstk + #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index aabb4f371..a07b1c9f3 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -760,3 +760,48 @@ bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs) return false; } + +int parasite_setup_shstk(struct parasite_ctl *ctl, user_fpregs_struct_t *ext_regs) +{ + pid_t pid = ctl->rpid; + unsigned long sa_restorer = ctl->parasite_ip; + unsigned long long ssp; + unsigned long token; + struct iovec iov; + + if (!compel_shstk_enabled(ext_regs)) + return 0; + + iov.iov_base = &ssp; + iov.iov_len = sizeof(ssp); + if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { + /* ENODEV means CET is not supported by the CPU */ + if (errno != ENODEV) { + pr_perror("shstk: %d: cannot get SSP", pid); + return -1; + } + } + + /* The token is for 64-bit */ + token = ALIGN_DOWN(ssp, 8); + token |= (1UL << 63); + ssp = ALIGN_DOWN(ssp, 8) - 8; + if (ptrace(PTRACE_POKEDATA, pid, (void *)ssp, token)) { + pr_perror("shstk: %d: failed to inject shadow stack 
token", pid); + return -1; + } + + ssp = ssp - sizeof(uint64_t); + if (ptrace(PTRACE_POKEDATA, pid, (void *)ssp, sa_restorer)) { + pr_perror("shstk: %d: failed to inject restorer address", pid); + return -1; + } + + ssp = ssp + sizeof(uint64_t); + if (ptrace(PTRACE_SETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { + pr_perror("shstk: %d: cannot write SSP", pid); + return -1; + } + + return 0; +} diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 848d36c57..cd6255909 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -190,4 +190,13 @@ static inline bool compel_shstk_enabled(user_fpregs_struct_t *ext_regs) #define compel_shstk_enabled #endif +#ifndef parasite_setup_shstk +static inline int parasite_setup_shstk(struct parasite_ctl *ctl, + user_fpregs_struct_t *ext_regs) +{ + return 0; +} +#define parasite_setup_shstk parasite_setup_shstk +#endif + #endif diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 696daa7f1..79d00c9a1 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -760,6 +760,9 @@ static int parasite_start_daemon(struct parasite_ctl *ctl) if (ictx->make_sigframe(ictx->regs_arg, ctl->sigframe, ctl->rsigframe, &ctl->orig.sigmask)) return -1; + if (parasite_setup_shstk(ctl, &ext_regs)) + return -1; + if (parasite_init_daemon(ctl)) return -1; From 17eda3ce57e41117771a0f52c6f54cca3a62db88 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sun, 15 May 2022 21:19:58 +0300 Subject: [PATCH 401/775] criu: shstk: add VMA_AREA_SHSTK flag The shadow stack VMAs require special care because they can only be created and populated using special system calls. 
Add VMA_AREA_SHSTK flag and set it for VMAs that are marked as "ss" in /proc/pid/smaps Signed-off-by: Mike Rapoport (IBM) --- criu/include/image.h | 3 +++ criu/proc_parse.c | 17 ++++++++++++++--- lib/pycriu/images/pb2dict.py | 1 + 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/criu/include/image.h b/criu/include/image.h index 9a275565f..a17aae35c 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -35,6 +35,8 @@ * - stack * the memory area is used in application stack so we * should be careful about guard page here + * - shadow stack + * the memory area is used by shadow stack * - vsyscall * special memory area injected into the task memory * space by the kernel itself, represent virtual syscall @@ -84,6 +86,7 @@ #define VMA_AREA_VVAR (1 << 12) #define VMA_AREA_AIORING (1 << 13) #define VMA_AREA_MEMFD (1 << 14) +#define VMA_AREA_SHSTK (1 << 15) #define VMA_EXT_PLUGIN (1 << 27) #define VMA_CLOSE (1 << 28) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 9d43e2394..92655a484 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -118,7 +118,8 @@ bool handle_vma_plugin(int *fd, struct stat *stat) return true; } -static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) +static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, + int *shstk) { char *tok; @@ -162,6 +163,9 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) if (_vmflag_match(tok, "io") || _vmflag_match(tok, "pf")) *io_pf = 1; + if (_vmflag_match(tok, "ss")) + *shstk = 1; + /* * Anything else is just ignored. 
*/ @@ -172,14 +176,21 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) void parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) { - __parse_vmflags(buf, flags, madv, io_pf); + int shstk = 0; + + __parse_vmflags(buf, flags, madv, io_pf, &shstk); } static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) { int io_pf = 0; + int shstk = 0; - __parse_vmflags(buf, &vma_area->e->flags, &vma_area->e->madv, &io_pf); + __parse_vmflags(buf, &vma_area->e->flags, &vma_area->e->madv, &io_pf, + &shstk); + + if (shstk) + vma_area->e->status |= VMA_AREA_SHSTK; /* * vmsplice doesn't work for VM_IO and VM_PFNMAP mappings, the diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index d29fdf66c..0d1a24692 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -103,6 +103,7 @@ mmap_status_map = [ ('VMA_AREA_VVAR', 1 << 12), ('VMA_AREA_AIORING', 1 << 13), ('VMA_AREA_MEMFD', 1 << 14), + ('VMA_AREA_SHSTK', 1 << 15), ('VMA_UNSUPP', 1 << 31), ] From 4b6dda7ec077d516939544d12900ded231dcb248 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Tue, 24 May 2022 21:25:14 +0300 Subject: [PATCH 402/775] criu: shstk: premap and prepopulate shadow stack VMAs Shadow stack VMAs cannot be mmap()ed, they must be created using map_shadow_stack() system call and populated using special wrss instruction available only when shadow stack is enabled. Premap them to reserve virtual address space and populate them to have their contents available for later copying after enabling shadow stack. Along with the space required by shadow stack VMAs also reserve an extra page that will be later used as a temporary shadow stack. 
Signed-off-by: Mike Rapoport (IBM) --- criu/include/vma.h | 1 + criu/mem.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/criu/include/vma.h b/criu/include/vma.h index 4b663ee50..b8ddfc142 100644 --- a/criu/include/vma.h +++ b/criu/include/vma.h @@ -106,6 +106,7 @@ static inline bool vma_entry_is_private(VmaEntry *entry, unsigned long task_size return (vma_entry_is(entry, VMA_AREA_REGULAR) && (vma_entry_is(entry, VMA_ANON_PRIVATE) || vma_entry_is(entry, VMA_FILE_PRIVATE)) && (entry->end <= task_size)) || + vma_entry_is(entry, VMA_AREA_SHSTK) || vma_entry_is(entry, VMA_AREA_AIORING); } diff --git a/criu/mem.c b/criu/mem.c index f56ed826b..0236c5e1e 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -741,6 +741,8 @@ int prepare_mm_pid(struct pstree_item *i) ri->vmas.rst_priv_size += vma_area_len(vma); if (vma_has_guard_gap_hidden(vma)) ri->vmas.rst_priv_size += PAGE_SIZE; + if (vma_area_is(vma, VMA_AREA_SHSTK)) + ri->vmas.rst_priv_size += PAGE_SIZE; } pr_info("vma 0x%" PRIx64 " 0x%" PRIx64 "\n", vma->e->start, vma->e->end); @@ -882,6 +884,14 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void vma->e->start -= PAGE_SIZE; size = vma_entry_len(vma->e); + + /* + * map an extra page for shadow stack VMAs, it will be used as a + * temporary shadow stack + */ + if (vma_area_is(vma, VMA_AREA_SHSTK)) + size += PAGE_SIZE; + if (!vma_inherited(vma)) { int flag = 0; /* @@ -957,6 +967,15 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void static inline bool vma_force_premap(struct vma_area *vma, struct list_head *head) { + /* + * Shadow stack VMAs cannot be mmap()ed, they must be created using + * map_shadow_stack() system call. + * Premap them to reserve virtual address space and populate them + * to have there contents available for later copying. 
+ */ + if (vma_area_is(vma, VMA_AREA_SHSTK)) + return true; + /* * On kernels with 4K guard pages, growsdown VMAs * always have one guard page at the From 2ebd1a4f0b6e1c4e35f044a65549356388e4d1df Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Wed, 25 May 2022 10:30:06 +0300 Subject: [PATCH 403/775] criu: shstk: prepare shadow stack parameters for restorer blob Shadow stacks must be populated using special WRSS instruction. This instruction is only available when shadow stack is enabled, calling it with disabled shadow stack causes #UD. Moreover, shadow stack VMAs cannot be mremap()ed and they must be created using map_shadow_stack() system call. This requires delaying the restore of shadow stacks to restorer blob after the CRIU mappings are cleared. Introduce rst_shstk_info structure to hold shadow stack parameters required in the restorer blob and populate this structure in arch_prepare_shstk() method. Signed-off-by: Mike Rapoport (IBM) Signed-off-by: Andrei Vagin --- criu/arch/x86/Makefile | 1 + criu/arch/x86/include/asm/restorer.h | 1 + criu/arch/x86/include/asm/shstk.h | 69 +++++++++++++++++++++ criu/arch/x86/shstk.c | 90 ++++++++++++++++++++++++++++ criu/cr-restore.c | 3 + criu/include/restore.h | 13 ++++ criu/include/restorer.h | 8 +++ 7 files changed, 185 insertions(+) create mode 100644 criu/arch/x86/include/asm/shstk.h create mode 100644 criu/arch/x86/shstk.c diff --git a/criu/arch/x86/Makefile b/criu/arch/x86/Makefile index 618e85bb3..46f00e9e9 100644 --- a/criu/arch/x86/Makefile +++ b/criu/arch/x86/Makefile @@ -9,6 +9,7 @@ obj-y += cpu.o obj-y += crtools.o obj-y += kerndat.o obj-y += sigframe.o +obj-y += shstk.o ifeq ($(CONFIG_COMPAT),y) obj-y += sigaction_compat.o endif diff --git a/criu/arch/x86/include/asm/restorer.h b/criu/arch/x86/include/asm/restorer.h index f7a6d5058..3a673958d 100644 --- a/criu/arch/x86/include/asm/restorer.h +++ b/criu/arch/x86/include/asm/restorer.h @@ -8,6 +8,7 @@ #include #include #include "asm/compat.h" +#include 
"asm/shstk.h" #ifdef CONFIG_COMPAT extern void restore_tls(tls_t *ptls); diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h new file mode 100644 index 000000000..a81062010 --- /dev/null +++ b/criu/arch/x86/include/asm/shstk.h @@ -0,0 +1,69 @@ +#ifndef __CR_ASM_SHSTK_H__ +#define __CR_ASM_SHSTK_H__ + +/* + * Shadow stack constants from Linux + */ +/* arch/x86/include/uapi/asm/mman.h */ +#ifndef SHADOW_STACK_SET_TOKEN +#define SHADOW_STACK_SET_TOKEN 0x1 /* Set up a restore token in the shadow stack */ +#endif + +/* arch/x86/include/uapi/asm/prctl.h */ +#define ARCH_SHSTK_ENABLE 0x5001 +#define ARCH_SHSTK_DISABLE 0x5002 +#define ARCH_SHSTK_LOCK 0x5003 +#define ARCH_SHSTK_UNLOCK 0x5004 +#define ARCH_SHSTK_STATUS 0x5005 + +#define ARCH_SHSTK_SHSTK (1ULL << 0) +#define ARCH_SHSTK_WRSS (1ULL << 1) + +#define ARCH_HAS_SHSTK + +/* from arch/x86/kernel/shstk.c */ +#define SHSTK_DATA_BIT (1UL << 63) /* BIT(63) */ + +/* + * Shadow stack memory cannot be restored with memcpy/pread but only using + * a special instruction that can write to shadow stack. + * That instruction is only available when shadow stack is enabled, + * otherwise it causes #UD. + * + * Also, shadow stack VMAs cannot be mmap()ed or mrepmap()ed, they must be + * created using map_shadow_stack() system call. This pushes creation of + * shadow stack VMAs to the restorer blob after CRIU mappings are freed. 
+ * + * And there is an additional jungling with shadow stacks to ensure that we + * don't unmap an active shadow stack + * + * The overall sequence of restoring shadow stack is + * - Enable shadow stack early after clone()ing the task + * - Unlock shadow stack features using ptrace + * - In the restorer blob: + * - switch to a temporary shadow stack to be able to unmap shadow stack + * with the CRIU mappings + * - after memory mappigns are restored, recreate shadow stack VMAs, + * populate them using wrss instruction and switch to the task shadow + * stack + * - lock shadow stack features + */ +struct rst_shstk_info { + unsigned long vma_start; /* start of shadow stack VMA */ + unsigned long vma_size; /* size of shadow stack VMA */ + unsigned long premmaped_addr; /* address of shadow stack copy in + the premmaped area */ + unsigned long tmp_shstk; /* address of temporary shadow stack */ + u64 ssp; /* shadow stack pointer */ + u64 cet; /* CET conrtol state */ +}; +#define rst_shstk_info rst_shstk_info + +struct task_restore_args; +struct pstree_item; + +int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta); +#define arch_shstk_prepare arch_shstk_prepare + +#endif /* __CR_ASM_SHSTK_H__ */ diff --git a/criu/arch/x86/shstk.c b/criu/arch/x86/shstk.c new file mode 100644 index 000000000..f6bc81dc6 --- /dev/null +++ b/criu/arch/x86/shstk.c @@ -0,0 +1,90 @@ +#include + +#include + +#include "pstree.h" +#include "restorer.h" +#include "rst-malloc.h" +#include "vma.h" + +static bool task_needs_shstk(struct pstree_item *item, CoreEntry *core) +{ + UserX86FpregsEntry *fpregs; + + if (!task_alive(item)) + return false; + + fpregs = core->thread_info->fpregs; + if (fpregs->xsave && fpregs->xsave->cet) { + if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + pr_warn_once("Restoring task with shadow stack on non-CET machine\n"); + return false; + } + + if (fpregs->xsave->cet->cet & ARCH_SHSTK_SHSTK) + return true; + } + + return false; 
+} + +static int shstk_prepare_task(struct vm_area_list *vmas, + struct rst_shstk_info *shstk) +{ + struct vma_area *vma; + + list_for_each_entry(vma, &vmas->h, list) { + if (vma_area_is(vma, VMA_AREA_SHSTK) && + in_vma_area(vma, shstk->ssp)) { + unsigned long premmaped_addr = vma->premmaped_addr; + unsigned long size = vma_area_len(vma); + + shstk->vma_start = vma->e->start; + shstk->vma_size = size; + shstk->premmaped_addr = premmaped_addr; + shstk->tmp_shstk = premmaped_addr + size; + + break; + } + } + + return 0; +} + +int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta) +{ + struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]); + UserX86FpregsEntry *fpregs = core->thread_info->fpregs; + struct vm_area_list *vmas = &rsti(item)->vmas; + struct rst_shstk_info *shstk = &ta->shstk; + int i; + + if (!task_needs_shstk(item, core)) + return 0; + + shstk->cet = fpregs->xsave->cet->cet; + shstk->ssp = fpregs->xsave->cet->ssp; + + if (shstk_prepare_task(vmas, shstk)) { + pr_err("Failed to prepare shadow stack memory\n"); + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + struct thread_restore_args *thread_args = &args_array[i]; + + core = item->core[i]; + fpregs = core->thread_info->fpregs; + shstk = &thread_args->shstk; + + shstk->cet = fpregs->xsave->cet->cet; + shstk->ssp = fpregs->xsave->cet->ssp; + if (shstk_prepare_task(vmas, shstk)) { + pr_err("Failed to prepare shadow stack memory\n"); + return -1; + } + } + + return 0; +} diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 270049721..e43cc1742 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -975,6 +975,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core) if (setup_uffd(pid, ta)) return -1; + if (arch_shstk_prepare(current, core, ta)) + return -1; + return sigreturn_restore(pid, ta, args_len, core); } diff --git a/criu/include/restore.h b/criu/include/restore.h index 8ef0dbddf..7d29496f2 100644 --- 
a/criu/include/restore.h +++ b/criu/include/restore.h @@ -7,4 +7,17 @@ extern int arch_set_thread_regs_nosigrt(struct pid *pid); +struct task_restore_args; +struct pstree_item; + +#ifndef arch_shstk_prepare +static inline int arch_shstk_prepare(struct pstree_item *item, + CoreEntry *core, + struct task_restore_args *ta) +{ + return 0; +} +#define arch_shstk_prepare arch_shstk_prepare +#endif + #endif diff --git a/criu/include/restorer.h b/criu/include/restorer.h index f398d8d8f..73565d1de 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -56,6 +56,10 @@ struct restore_posix_timer { int overrun; }; +#ifndef rst_shstk_info +struct rst_shstk_info {}; +#endif + /* * We should be able to construct fpu sigframe in sigreturn_prep_fpu_frame, * so the mem_zone.rt_sigframe should be 64-bytes aligned. To make things @@ -119,6 +123,8 @@ struct thread_restore_args { unsigned int seccomp_filters_n; bool seccomp_force_tsync; + struct rst_shstk_info shstk; + char comm[TASK_COMM_LEN]; int cg_set; int cgroupd_sk; @@ -240,6 +246,8 @@ struct task_restore_args { uid_t uid; u32 cap_eff[CR_CAP_SIZE]; + + struct rst_shstk_info shstk; } __aligned(64); /* From f47899c9ef7e01cb18acb669cfec4b48f71e0683 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Wed, 29 Nov 2023 18:55:35 +0200 Subject: [PATCH 404/775] criu: kerndat: add kdat_has_shstk() Detect if CRIU runs with shadow stack enabled and store the result in kerndat. Unlike most kerndat knobs, kdat_has_shstk() does not check for availability of the shadow stack in the kernel, but rather checks if criu runs with shadow stack enabled. This depends on hardware availability, kernel and glibc support, compiler options and glibc tunables, so kdat_has_shstk() must be called every time CRIU starts and its result cannot be cached. The result will be used by the code that controls shadow stack enablement in the next commit. 
Signed-off-by: Mike Rapoport (IBM) --- criu/arch/x86/include/asm/kerndat.h | 1 + criu/arch/x86/kerndat.c | 27 +++++++++++++++++++++++++++ criu/include/kerndat.h | 1 + criu/kerndat.c | 28 ++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+) diff --git a/criu/arch/x86/include/asm/kerndat.h b/criu/arch/x86/include/asm/kerndat.h index 903bc80f7..5c3717230 100644 --- a/criu/arch/x86/include/asm/kerndat.h +++ b/criu/arch/x86/include/asm/kerndat.h @@ -4,5 +4,6 @@ extern int kdat_compatible_cr(void); extern int kdat_can_map_vdso(void); extern int kdat_x86_has_ptrace_fpu_xsave_bug(void); +extern int kdat_has_shstk(void); #endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/x86/kerndat.c b/criu/arch/x86/kerndat.c index a98797d39..3a58bbea7 100644 --- a/criu/arch/x86/kerndat.c +++ b/criu/arch/x86/kerndat.c @@ -17,6 +17,7 @@ #include "asm/compat.h" #include "asm/dump.h" +#include "asm/shstk.h" int kdat_can_map_vdso(void) { @@ -251,3 +252,29 @@ out_kill: return ret; } + +/* + * Unlike most kerndat knobs, this does not check for availability of the + * shadow stack in the kernel, but rather checks if criu runs with shadow + * stack enabled. + * + * This depends on hardware availability, kernel and glibc support, compiler + * options and glibc tunables. 
+ */ +int kdat_has_shstk(void) +{ + unsigned long features; + + if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) + return 0; + + if (syscall(__NR_arch_prctl, ARCH_SHSTK_STATUS, &features)) { + /* kernels that don't support shadow stack return -EINVAL */ + if (errno == EINVAL) + return 0; + pr_perror("Cannot get shadow stack status"); + return 1; + } + + return !!(features & ARCH_SHSTK_SHSTK); +} diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 91dbd494b..41524ed66 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -87,6 +87,7 @@ struct kerndat_s { bool has_ipv6_freebind; bool has_membarrier_get_registrations; bool has_pagemap_scan; + bool has_shstk; }; extern struct kerndat_s kdat; diff --git a/criu/kerndat.c b/criu/kerndat.c index e3b378a9c..6f4fea46b 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1151,6 +1151,24 @@ static int kerndat_has_openat2(void) return 0; } +int __attribute__((weak)) kdat_has_shstk(void) +{ + return 0; +} + +static int kerndat_has_shstk(void) +{ + int ret = kdat_has_shstk(); + + if (ret < 0) { + pr_err("kdat_has_shstk failed\n"); + return ret; + } + + kdat.has_shstk = !!ret; + return 0; +} + #define KERNDAT_CACHE_NAME "criu.kdat" #define KERNDAT_CACHE_FILE KDAT_RUNDIR "/" KERNDAT_CACHE_NAME @@ -1705,6 +1723,12 @@ int kerndat_try_load_new(void) return ret; } + ret = kerndat_has_shstk(); + if (ret < 0) { + pr_err("kerndat_has_shstk failed when initializing kerndat.\n"); + return ret; + } + /* New information is found, we need to save to the cache */ if (ret) kerndat_save_cache(); @@ -1926,6 +1950,10 @@ int kerndat_init(void) pr_err("kerndat_has_membarrier_get_registrations failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_has_shstk()) { + pr_err("kerndat_has_shstk failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); From 7dd583002368acedec40d90757a4d47672c6180b Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Tue, 31 May 2022 
12:45:09 +0300 Subject: [PATCH 405/775] restore: add infrastructure to enable shadow stack There are several gotchas when restoring a task with shadow stack: * depending on the compiler options, glibc version and glibc tunables CRIU can run with or without shadow stack. * shadow stack VMAs are special, they must be created using a dedicated map_shadow_stack() system call and can be modified only by a special instruction (wrss) that is only available when shadow stack is enabled. * once shadow stack is enabled, it is not writable even with wrss; writes to shadow stack can be only enabled with ptrace() and only when shadow stack is enabled in the tracee. * if the shadow stack is enabled during restore rather than by glibc, calling retq after arch_prctl() that enables the shadow stack causes #CP, so the function that enables shadow stack can never return. Add the infrastructure required to cope with all of those: * modify the restore code to allow trampoline (arch_shstk_trampoline) that will enable shadow stack and call restore_task_with_children(). * add call to arch_shstk_unlock() right after the tasks are clone()ed; this will allow unlocking shadow stack features and making shadow stack writable. 
* add stubs for architectures that do not support shadow stacks * add implementation of arch_shstk_trampoline() and arch_shstk_unlock() for x86, but keep it disabled; it will be enabled along with addition of the code that will restore shadow stack in the restorer blob Signed-off-by: Mike Rapoport (IBM) --- criu/arch/x86/include/asm/shstk.h | 9 ++ criu/arch/x86/shstk.c | 133 ++++++++++++++++++++++++++++++ criu/cr-restore.c | 14 +++- criu/include/restore.h | 18 ++++ criu/include/rst_info.h | 3 + 5 files changed, 176 insertions(+), 1 deletion(-) diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h index a81062010..7849dd7a6 100644 --- a/criu/arch/x86/include/asm/shstk.h +++ b/criu/arch/x86/include/asm/shstk.h @@ -66,4 +66,13 @@ int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, struct task_restore_args *ta); #define arch_shstk_prepare arch_shstk_prepare +#if 0 +int arch_shstk_unlock(struct pstree_item *item, CoreEntry *core, pid_t pid); +#define arch_shstk_unlock arch_shstk_unlock + +int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg); +#define arch_shstk_trampoline arch_shstk_trampoline +#endif + #endif /* __CR_ASM_SHSTK_H__ */ diff --git a/criu/arch/x86/shstk.c b/criu/arch/x86/shstk.c index f6bc81dc6..b752f114a 100644 --- a/criu/arch/x86/shstk.c +++ b/criu/arch/x86/shstk.c @@ -1,3 +1,6 @@ +#include +#include + #include #include @@ -88,3 +91,133 @@ int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, return 0; } + +int arch_shstk_unlock(struct pstree_item *item, CoreEntry *core, pid_t pid) +{ + unsigned long features; + int status; + int ret = -1; + + /* + * CRIU runs with no shadow stack and the task does not need one, + * nothing to do. 
+ */ + if (!kdat.has_shstk && !task_needs_shstk(item, core)) + return 0; + + futex_wait_until(&rsti(item)->shstk_enable, 1); + + if (ptrace(PTRACE_SEIZE, pid, 0, 0)) { + pr_perror("Cannot attach to %d", pid); + goto futex_wake; + } + + if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { + pr_perror("Cannot interrupt the %d task", pid); + goto detach; + } + + if (wait4(pid, &status, __WALL, NULL) != pid) { + pr_perror("waitpid(%d) failed", pid); + goto detach; + } + + features = ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS; + if (ptrace(PTRACE_ARCH_PRCTL, pid, features, ARCH_SHSTK_UNLOCK)) { + pr_perror("Cannot unlock CET for %d task", pid); + goto detach; + } + +detach: + if (ptrace(PTRACE_DETACH, pid, NULL, 0)) { + pr_perror("Unable to detach %d", pid); + goto futex_wake; + } + + ret = 0; + +futex_wake: + futex_set_and_wake(&rsti(item)->shstk_unlock, 1); + + return ret; +} + +static void shstk_sync_unlock(struct pstree_item *item) +{ + /* notify parent that shadow stack is enabled ... */ + futex_set_and_wake(&rsti(item)->shstk_enable, 1); + + /* ... 
and wait until it unlocks its features with ptrace */ + futex_wait_until(&rsti(item)->shstk_unlock, 1); +} + +static void __arch_shstk_enable(struct pstree_item *item, + int (*func)(void *arg), void *arg) +{ + int ret; + + shstk_sync_unlock(item); + + /* return here would cause #CP, use exit() instead */ + ret = func(arg); + exit(ret); +} + +static int shstk_disable(struct pstree_item *item) +{ + shstk_sync_unlock(item); + + /* disable shadow stack, implicitly clears ARCH_SHSTK_WRSS */ + if (syscall(__NR_arch_prctl, ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK)) { + pr_perror("Failed to disable shadow stack"); + return -1; + } + + if (syscall(__NR_arch_prctl, ARCH_SHSTK_LOCK, + ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS)) { + pr_perror("Failed to lock shadow stack controls"); + return -1; + } + + return 0; +} + +int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg) +{ + unsigned long features = ARCH_SHSTK_SHSTK; + int code = ARCH_SHSTK_ENABLE; + + /* + * If task does not need shadow stack but CRIU runs with shadow + * stack enabled, we should disable it before continuing with + * restore + */ + if (!task_needs_shstk(item, core)) { + if (kdat.has_shstk && shstk_disable(item)) + return -1; + return func(arg); + } + + /* + * Calling sys_arch_prctl() means there will be use of retq + * instruction after shadow stack is enabled and this will cause + * Control Protectiond fault. Open code sys_arch_prctl() in + * assembly. + * + * code and addr should be in %rdi and %rsi and will be passed to + * the system call as is. 
+ */ + asm volatile("movq $"__stringify(__NR_arch_prctl)", %%rax \n" + "syscall \n" + "cmpq $0, %%rax \n" + "je 1f \n" + "retq \n" + "1: \n" + :: "D"(code), "S"(features)); + + __arch_shstk_enable(item, func, arg); + + /* never reached */ + return -1; +} diff --git a/criu/cr-restore.c b/criu/cr-restore.c index e43cc1742..318d34c48 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1498,6 +1498,8 @@ static inline int fork_with_pid(struct pstree_item *item) pr_debug("PID: real %d virt %d\n", item->pid->real, vpid(item)); } + arch_shstk_unlock(item, ca.core, pid); + err_unlock: if (!(ca.clone_flags & CLONE_NEWPID)) unlock_last_pid(); @@ -1764,7 +1766,7 @@ static int create_children_and_session(void) return 0; } -static int restore_task_with_children(void *_arg) +static int __restore_task_with_children(void *_arg) { struct cr_clone_arg *ca = _arg; pid_t pid; @@ -1956,6 +1958,16 @@ err: exit(1); } +static int restore_task_with_children(void *_arg) +{ + struct cr_clone_arg *arg = _arg; + struct pstree_item *item = arg->item; + CoreEntry *core = arg->core; + + return arch_shstk_trampoline(item, core, __restore_task_with_children, + arg); +} + static int attach_to_tasks(bool root_seized) { struct pstree_item *item; diff --git a/criu/include/restore.h b/criu/include/restore.h index 7d29496f2..04d006505 100644 --- a/criu/include/restore.h +++ b/criu/include/restore.h @@ -20,4 +20,22 @@ static inline int arch_shstk_prepare(struct pstree_item *item, #define arch_shstk_prepare arch_shstk_prepare #endif +#ifndef arch_shstk_unlock +static inline int arch_shstk_unlock(struct pstree_item *item, + CoreEntry *core, pid_t pid) +{ + return 0; +} +#define arch_shstk_unlock arch_shstk_unlock +#endif + +#ifndef arch_shstk_trampoline +static inline int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg) +{ + return func(arg); +} +#define arch_shstk_trampoline arch_shstk_trampoline +#endif + #endif diff --git 
a/criu/include/rst_info.h b/criu/include/rst_info.h index 704b42a72..59b891fa2 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -75,6 +75,9 @@ struct rst_info { struct rst_rseq *rseqe; + futex_t shstk_enable; + futex_t shstk_unlock; + void *breakpoint; }; From a48aa33eaa9c4b84793424b15b9d1480f01efab6 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Wed, 25 May 2022 12:30:04 +0300 Subject: [PATCH 406/775] restorer: shstk: implement shadow stack restore The restore of a task with shadow stack enabled adds these steps: * switch from the default shadow stack to a temporary shadow stack allocated in the premapped area * unmap CRIU mappings; nothing changed here, but it's important that CRIU mappings can be removed only after switching to a temporary shadow stack * create shadow stack VMA with map_shadow_stack() * restore shadow stack contents with wrss * switch to "real" shadow stack * lock shadow stack features Signed-off-by: Mike Rapoport (IBM) --- .../x86/plugins/std/syscalls/syscall_64.tbl | 1 + .../x86/src/lib/include/uapi/asm/sigframe.h | 23 +- criu/arch/x86/include/asm/shstk.h | 204 +++++++++++++++++- criu/include/restorer.h | 16 ++ criu/pie/Makefile | 5 + criu/pie/restorer.c | 29 +++ 6 files changed, 271 insertions(+), 7 deletions(-) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl index 57681b79a..4e843bee9 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -118,3 +118,4 @@ __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 334 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) __NR_membarrier 324 sys_membarrier (int cmd, unsigned int flags, int cpu_id) +__NR_map_shadow_stack 453 sys_map_shadow_stack (unsigned long addr, unsigned long size, unsigned int 
flags) diff --git a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h index 9a540694b..4a2e67559 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h @@ -177,6 +177,24 @@ static inline void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) #define USER32_CS 0x23 /* clang-format off */ +/* + * rst_sigreturn in resorer is noninline call which adds an entry to the + * shadow stack above the sigframe token; + * if shadow stack is enabled, increment the shadow stack pointer to remove + * that entry + */ +#define ARCH_SHSTK_POP() \ + asm volatile( \ + "xor %%rax, %%rax\n" \ + "rdsspq %%rax\n" \ + "cmpq $0, %%rax\n" \ + "jz 1f\n" \ + "movq $1, %%rax\n" \ + "incsspq %%rax\n" \ + "1:\n" \ + : : \ + : "rax") + #define ARCH_RT_SIGRETURN_NATIVE(new_sp) \ asm volatile( \ "movq %0, %%rax \n" \ @@ -205,9 +223,10 @@ static inline void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) #define ARCH_RT_SIGRETURN_RST(new_sp, rt_sigframe) \ do { \ - if ((rt_sigframe)->is_native) \ + if ((rt_sigframe)->is_native) { \ + ARCH_SHSTK_POP(); \ ARCH_RT_SIGRETURN_NATIVE(new_sp); \ - else \ + } else \ ARCH_RT_SIGRETURN_COMPAT(new_sp); \ } while (0) diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h index 7849dd7a6..7814c351d 100644 --- a/criu/arch/x86/include/asm/shstk.h +++ b/criu/arch/x86/include/asm/shstk.h @@ -10,11 +10,11 @@ #endif /* arch/x86/include/uapi/asm/prctl.h */ -#define ARCH_SHSTK_ENABLE 0x5001 +#define ARCH_SHSTK_ENABLE 0x5001 #define ARCH_SHSTK_DISABLE 0x5002 #define ARCH_SHSTK_LOCK 0x5003 -#define ARCH_SHSTK_UNLOCK 0x5004 -#define ARCH_SHSTK_STATUS 0x5005 +#define ARCH_SHSTK_UNLOCK 0x5004 +#define ARCH_SHSTK_STATUS 0x5005 #define ARCH_SHSTK_SHSTK (1ULL << 0) #define ARCH_SHSTK_WRSS (1ULL << 1) @@ -66,13 +66,207 @@ int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, struct task_restore_args *ta); 
#define arch_shstk_prepare arch_shstk_prepare -#if 0 int arch_shstk_unlock(struct pstree_item *item, CoreEntry *core, pid_t pid); #define arch_shstk_unlock arch_shstk_unlock int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, int (*func)(void *arg), void *arg); #define arch_shstk_trampoline arch_shstk_trampoline -#endif + +#ifdef CR_NOGLIBC + +#include +#include +#include "vma.h" + +#define SHSTK_BUSY_BIT (1UL << 0) /* BIT(0) */ + +static inline int shstk_map(unsigned long addr, unsigned long size) +{ + long shstk = sys_map_shadow_stack(addr, size, SHADOW_STACK_SET_TOKEN); + + if (shstk < 0) { + pr_err("Failed to map shadow stack at %lx: %ld\n", addr, shstk); + return -1; + } + + if (shstk != addr) { + pr_err("Shadow stack address mismatch: need %lx, got %lx\n", addr, shstk); + return -1; + } + + pr_info("Created shadow stack at %lx\n", shstk); + + return 0; +} + +/* clang-format off */ +static inline unsigned long get_ssp(void) +{ + unsigned long ssp; + + asm volatile("rdsspq %0" : "=r"(ssp) :: ); + + return ssp; +} + +static inline void wrssq(unsigned long addr, unsigned long val) +{ + asm volatile("wrssq %1, (%0)" :: "r"(addr), "r"(val) : "memory"); +} +/* clang-format off */ + +static always_inline void shstk_switch_ssp(unsigned long new_ssp) +{ + unsigned long old_ssp = get_ssp(); + + asm volatile("rstorssp (%0)\n" :: "r"(new_ssp)); + asm volatile("saveprevssp"); + + pr_debug("changed ssp from %lx to %lx\n", old_ssp, new_ssp); +} + +/* + * Disable writes to the shadow stack and lock it's disable/enable control + */ +static inline int shstk_finalize(void) +{ + int ret = 0; + + ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_WRSS); + if (ret) { + pr_err("Failed to disable writes to shadow stack\n"); + return ret; + } + + ret = sys_arch_prctl(ARCH_SHSTK_LOCK, ARCH_SHSTK_SHSTK); + if (ret) + pr_err("Failed to lock shadow stack controls\n"); + + return ret; +} + +/* + * Restore contents of the shadow stack and set shadow stack pointer + */ 
+static always_inline int shstk_restore(struct rst_shstk_info *cet) +{ + unsigned long *shstk_data = (unsigned long *)cet->premmaped_addr; + unsigned long ssp = cet->vma_start + cet->vma_size - 8; + unsigned long shstk_top = cet->vma_size / 8 - 1; + unsigned long val; + long ret; + + if (!(cet->cet & ARCH_SHSTK_SHSTK)) + return 0; + + if (shstk_map(cet->vma_start, cet->vma_size)) + return -1; + + /* + * Switch shadow stack from temporary location to the actual task's + * shadow stack VMA + */ + shstk_switch_ssp(ssp); + + /* restore shadow stack contents */ + for (; ssp >= cet->ssp; ssp -= 8, shstk_top--) + wrssq(ssp, shstk_data[shstk_top]); + + /* + * Add tokens for sigreturn frame and for switch of the shadow stack. + * The sigreturn token will be checked by the kernel during + * processing of sigreturn + * The token for stack switch is required by rstorssp and + * saveprevssp semantics + */ + + /* token for sigreturn frame */ + val = ALIGN_DOWN(cet->ssp, 8) | SHSTK_DATA_BIT; + wrssq(ssp, val); + + /* shadow stack switch token */ + val = ssp | SHSTK_BUSY_BIT; + ssp -= 8; + wrssq(ssp, val); + + /* reset shadow stack pointer to the proper location */ + shstk_switch_ssp(ssp); + + ret = sys_munmap(shstk_data, cet->vma_size + PAGE_SIZE); + if (ret < 0) { + pr_err("Failed to unmap premmaped shadow stack\n"); + return ret; + } + + return shstk_finalize(); +} +#define arch_shstk_restore shstk_restore + +/* + * Disable shadow stack + */ +static inline int shstk_disable(void) +{ + int ret; + + ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_WRSS); + if (ret) { + pr_err("Failed to disable writes to shadow stack\n"); + return ret; + } + + ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); + if (ret) { + pr_err("Failed to disable shadow stack\n"); + return ret; + } + + ret = sys_arch_prctl(ARCH_SHSTK_LOCK, ARCH_SHSTK_SHSTK); + if (ret) + pr_err("Failed to lock shadow stack controls\n"); + + return 0; +} + +/* + * Switch to temporary shadow stack + */ +static 
always_inline int shstk_switch_to_restorer(struct rst_shstk_info *cet) +{ + unsigned long ssp; + long ret; + + if (!(cet->cet & ARCH_SHSTK_SHSTK)) + return 0; + + ret = sys_munmap((void *)cet->tmp_shstk, PAGE_SIZE); + if (ret < 0) { + pr_err("Failed to unmap area for temporary shadow stack\n"); + return -1; + } + + ret = shstk_map(cet->tmp_shstk, PAGE_SIZE); + if (ret < 0) + return -1; + + /* + * Switch shadow stack from the default created by the kernel to a + * temporary shadow stack allocated in the premmaped area + */ + ssp = cet->tmp_shstk + PAGE_SIZE - 8; + shstk_switch_ssp(ssp); + + ret = sys_arch_prctl(ARCH_SHSTK_ENABLE, ARCH_SHSTK_WRSS); + if (ret) { + pr_err("Failed to enable writes to shadow stack\n"); + return ret; + } + + return 0; +} +#define arch_shstk_switch_to_restorer shstk_switch_to_restorer + +#endif /* CR_NOGLIBC */ #endif /* __CR_ASM_SHSTK_H__ */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 73565d1de..3fb5322a4 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -339,4 +339,20 @@ enum { #define __r_sym(name) restorer_sym##name #define restorer_sym(rblob, name) (void *)(rblob + __r_sym(name)) +#ifndef arch_shstk_switch_to_restorer +static inline int arch_shstk_switch_to_restorer(struct rst_shstk_info *shstk) +{ + return 0; +} +#define arch_shstk_switch_to_restorer arch_shstk_switch_to_restorer +#endif + +#ifndef arch_shstk_restore +static inline int arch_shstk_restore(struct rst_shstk_info *shstk) +{ + return 0; +} +#define arch_shstk_restore arch_shstk_restore +#endif + #endif /* __CR_RESTORER_H__ */ diff --git a/criu/pie/Makefile b/criu/pie/Makefile index 265dcf82b..912fab24b 100644 --- a/criu/pie/Makefile +++ b/criu/pie/Makefile @@ -18,6 +18,11 @@ ifeq ($(ARCH),mips) ccflags-y += -mno-abicalls -fno-pic endif +# -mshstk required for CET instructions +ifeq ($(ARCH),x86) + ccflags-y += -mshstk +endif + LDS := compel/arch/$(ARCH)/scripts/compel-pack.lds.S restorer-obj-y += parasite-vdso.o 
./$(ARCH_DIR)/vdso-pie.o diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 20c6801c5..7c34c06d4 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -752,6 +752,10 @@ __visible long __export_restore_thread(struct thread_restore_args *args) goto core_restore_end; } + /* restore original shadow stack */ + if (arch_shstk_restore(&args->shstk)) + goto core_restore_end; + /* All signals must be handled by thread leader */ ksigfillset(&to_block); ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t)); @@ -1672,6 +1676,9 @@ __visible long __export_restore_task(struct task_restore_args *args) pr_debug("lazy-pages: uffd %d\n", args->uffd); } + if (arch_shstk_switch_to_restorer(&args->shstk)) + goto core_restore_end; + /* * Park vdso/vvar in a safe place if architecture doesn't support * mapping them with arch_prctl(). @@ -1723,6 +1730,13 @@ __visible long __export_restore_task(struct task_restore_args *args) if (vma_entry->start > vma_entry->shmid) break; + /* + * shadow stack VMAs cannot be remapped, they must be + * recreated with map_shadow_stack system call + */ + if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) + continue; + if (vma_remap(vma_entry, args->uffd)) goto core_restore_end; } @@ -1740,6 +1754,13 @@ __visible long __export_restore_task(struct task_restore_args *args) if (vma_entry->start < vma_entry->shmid) break; + /* + * shadow stack VMAs cannot be remapped, they must be + * recreated with map_shadow_stack system call + */ + if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) + continue; + if (vma_remap(vma_entry, args->uffd)) goto core_restore_end; } @@ -2166,6 +2187,14 @@ __visible long __export_restore_task(struct task_restore_args *args) futex_set_and_wake(&thread_inprogress, args->nr_threads); + /* + * Shadow stack of the leader can be locked only after all other + * threads were cloned, otherwise they may start with read-only + * shadow stack. 
+ */ + if (arch_shstk_restore(&args->shstk)) + goto core_restore_end; + restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS); if (ret) From a2b018a188c5fe5cd91785eae35b269c3b9bad40 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sun, 24 Mar 2024 11:07:07 +0000 Subject: [PATCH 407/775] ci: try to fix broken docker test Upgrade to 22.04 base image and use the existing version of docker. Signed-off-by: Adrian Reber --- .github/workflows/docker-test.yml | 2 +- scripts/ci/docker-test.sh | 19 ------------------- 2 files changed, 1 insertion(+), 20 deletions(-) diff --git a/.github/workflows/docker-test.yml b/.github/workflows/docker-test.yml index fabf399fd..11d67432b 100644 --- a/.github/workflows/docker-test.yml +++ b/.github/workflows/docker-test.yml @@ -12,7 +12,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-20.04] + os: [ubuntu-22.04] steps: - uses: actions/checkout@v2 - name: Run Docker Test (${{ matrix.os }}) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index 7e7ef7197..aaf443afd 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -2,25 +2,6 @@ set -x -e -o pipefail -./apt-install \ - apt-transport-https \ - ca-certificates \ - curl \ - software-properties-common - -curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - - -add-apt-repository \ - "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) \ - stable test" - -# checkpoint/restore is broken in Docker Engine (Community) version 25.0.0-beta.1 -# https://github.com/moby/moby/discussions/46816 -# Downgrade to the latest stable version. 
-VERSION_STRING=5:24.0.7-1~ubuntu.20.04~focal -./apt-install docker-ce=$VERSION_STRING docker-ce-cli=$VERSION_STRING containerd.io docker-buildx-plugin docker-compose-plugin - # docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json service docker restart From 8f0e200e666440d0377382b3632e76fa5e2d220b Mon Sep 17 00:00:00 2001 From: Artem Trushkin Date: Sun, 24 Mar 2024 17:16:58 +0700 Subject: [PATCH 408/775] mem: fix some VMAs being incorrectly mapped with PROT_WRITE A memory interval is a half-open interval, so the condition when pr->pe->vaddr == vma->e->end should not be interpreted as an intersection and should cause vma to be marked with VMA_NO_PROT_WRITE. Fixes: #2364 Signed-off-by: Artem Trushkin --- criu/mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/mem.c b/criu/mem.c index 0236c5e1e..5f0d57eb6 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -1057,7 +1057,7 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, vo do { if (pr->pe->vaddr + pr->pe->nr_pages * PAGE_SIZE <= vma->e->start) continue; - if (pr->pe->vaddr > vma->e->end) + if (pr->pe->vaddr >= vma->e->end) vma->e->status |= VMA_NO_PROT_WRITE; break; } while (pr->advance(pr)); From 75fed59ef67504920174f14bbd39f628c5e75bef Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 4 Mar 2024 15:10:31 +0000 Subject: [PATCH 409/775] Add support for reset-on-fork scheduling flag This patch extends CRIU with support for SCHED_RESET_ON_FORK. When the SCHED_RESET_ON_FORK flag is set, the following rules apply for subsequently created children: - If the calling thread has a scheduling policy of SCHED_FIFO or SCHED_RR, the policy is reset to SCHED_OTHER in child processes. - If the calling process has a negative nice value, the nice value is reset to zero in child processes. 
(See 'man 7 sched') Fixes: #2359 Signed-off-by: Radostin Stoyanov --- criu/cr-dump.c | 5 +++++ criu/cr-restore.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index ee5974acc..fe5e73798 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -157,6 +157,11 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) tc->has_sched_policy = true; tc->sched_policy = ret; + /* The reset-on-fork flag might be used in combination + * with SCHED_FIFO or SCHED_RR to reset the scheduling + * policy/priority in child processes. + */ + ret &= ~SCHED_RESET_ON_FORK; if ((ret == SCHED_RR) || (ret == SCHED_FIFO)) { ret = syscall(__NR_sched_getparam, pid, &sp); if (ret < 0) { diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 318d34c48..874986ca0 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -3057,7 +3057,7 @@ static int validate_sched_parm(struct rst_sched_param *sp) if ((sp->nice < -20) || (sp->nice > 19)) return 0; - switch (sp->policy) { + switch (sp->policy & ~SCHED_RESET_ON_FORK) { case SCHED_RR: case SCHED_FIFO: return ((sp->prio > 0) && (sp->prio < 100)); From 231ba0cd29c221d6112f83adb4ccd255247f9f29 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 5 Mar 2024 08:53:56 +0000 Subject: [PATCH 410/775] zdtm/sched_policy00: use reset-on-fork flag This patch extends the sched_policy00 test case to verify that the SCHED_RESET_ON_FORK flag is restored correctly. 
Signed-off-by: Radostin Stoyanov --- test/zdtm/static/sched_policy00.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/zdtm/static/sched_policy00.c b/test/zdtm/static/sched_policy00.c index dc71eed94..a35135050 100644 --- a/test/zdtm/static/sched_policy00.c +++ b/test/zdtm/static/sched_policy00.c @@ -51,7 +51,7 @@ int main(int argc, char **argv) } p.sched_priority = param; - if (sched_setscheduler(pid, SCHED_RR, &p)) { + if (sched_setscheduler(pid, SCHED_RR | SCHED_RESET_ON_FORK, &p)) { pr_perror("Can't set policy"); kill(pid, SIGKILL); return -1; @@ -61,7 +61,7 @@ int main(int argc, char **argv) test_waitsig(); ret = sched_getscheduler(pid); - if (ret != SCHED_RR) { + if (ret != (SCHED_RR | SCHED_RESET_ON_FORK)) { fail("Broken/No policy"); err++; } From 52623cca167abf848e7369ae77e7c3030602a646 Mon Sep 17 00:00:00 2001 From: ccccrrrr Date: Thu, 29 Feb 2024 16:02:06 +0800 Subject: [PATCH 411/775] criu: move timers dump/restore code into separate file Fixes: #335 Signed-off-by: ccccrrrr --- criu/Makefile.crtools | 1 + criu/cr-dump.c | 1 + criu/cr-restore.c | 242 +------------------ criu/include/parasite-syscall.h | 5 - criu/include/timer.h | 17 ++ criu/parasite-syscall.c | 150 ------------ criu/timer.c | 399 ++++++++++++++++++++++++++++++++ 7 files changed, 419 insertions(+), 396 deletions(-) create mode 100644 criu/include/timer.h create mode 100644 criu/timer.c diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index f58644917..bf17f1ec9 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -92,6 +92,7 @@ obj-y += servicefd.o obj-y += pie-util-vdso.o obj-y += vdso.o obj-y += timens.o +obj-y += timer.o obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 diff --git a/criu/cr-dump.c b/criu/cr-dump.c index fe5e73798..a29ec82ef 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -86,6 +86,7 @@ #include "pidfd-store.h" #include "apparmor.h" 
#include "asm/dump.h" +#include "timer.h" /* * Architectures can overwrite this function to restore register sets that diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 874986ca0..c19a20b46 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -98,6 +98,7 @@ #include "restore.h" #include "cr-errno.h" +#include "timer.h" #ifndef arch_export_restore_thread #define arch_export_restore_thread __export_restore_thread @@ -118,7 +119,6 @@ static int restore_task_with_children(void *); static int sigreturn_restore(pid_t pid, struct task_restore_args *ta, unsigned long alen, CoreEntry *core); static int prepare_restorer_blob(void); static int prepare_rlimits(int pid, struct task_restore_args *, CoreEntry *core); -static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core); static int prepare_signals(int pid, struct task_restore_args *, CoreEntry *core); /* @@ -882,7 +882,6 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc, struct task_restore_a return 0; } -static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core); static int prepare_mm(pid_t pid, struct task_restore_args *args); static int restore_one_alive_task(int pid, CoreEntry *core) @@ -2719,245 +2718,6 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he return -1; } -static inline int timeval_valid(struct timeval *tv) -{ - return (tv->tv_sec >= 0) && ((unsigned long)tv->tv_usec < USEC_PER_SEC); -} - -static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) -{ - if (ie->isec == 0 && ie->iusec == 0) { - memzero_p(val); - return 0; - } - - val->it_interval.tv_sec = ie->isec; - val->it_interval.tv_usec = ie->iusec; - - if (!timeval_valid(&val->it_interval)) { - pr_err("Invalid timer interval\n"); - return -1; - } - - if (ie->vsec == 0 && ie->vusec == 0) { - /* - * Remaining time was too short. Set it to - * interval to make the timer armed and work. 
- */ - val->it_value.tv_sec = ie->isec; - val->it_value.tv_usec = ie->iusec; - } else { - val->it_value.tv_sec = ie->vsec; - val->it_value.tv_usec = ie->vusec; - } - - if (!timeval_valid(&val->it_value)) { - pr_err("Invalid timer value\n"); - return -1; - } - - pr_info("Restored %s timer to %ld.%ld -> %ld.%ld\n", n, val->it_value.tv_sec, val->it_value.tv_usec, - val->it_interval.tv_sec, val->it_interval.tv_usec); - - return 0; -} - -/* - * Legacy itimers restore from CR_FD_ITIMERS - */ - -static int prepare_itimers_from_fd(int pid, struct task_restore_args *args) -{ - int ret = -1; - struct cr_img *img; - ItimerEntry *ie; - - if (!deprecated_ok("Itimers")) - return -1; - - img = open_image(CR_FD_ITIMERS, O_RSTR, pid); - if (!img) - return -1; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("real", ie, &args->itimers[0]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - goto out; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("virt", ie, &args->itimers[1]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - goto out; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("prof", ie, &args->itimers[2]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - goto out; -out: - close_image(img); - return ret; -} - -static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core) -{ - int ret = 0; - TaskTimersEntry *tte = core->tc->timers; - - if (!tte) - return prepare_itimers_from_fd(pid, args); - - ret |= decode_itimer("real", tte->real, &args->itimers[0]); - ret |= decode_itimer("virt", tte->virt, &args->itimers[1]); - ret |= decode_itimer("prof", tte->prof, &args->itimers[2]); - - return ret; -} - -static inline int timespec_valid(struct timespec *ts) -{ - return (ts->tv_sec >= 0) && ((unsigned long)ts->tv_nsec < NSEC_PER_SEC); -} - -static inline int decode_posix_timer(PosixTimerEntry *pte, struct 
restore_posix_timer *pt) -{ - pt->val.it_interval.tv_sec = pte->isec; - pt->val.it_interval.tv_nsec = pte->insec; - - if (!timespec_valid(&pt->val.it_interval)) { - pr_err("Invalid timer interval(posix)\n"); - return -1; - } - - if (pte->vsec == 0 && pte->vnsec == 0) { - /* - * Remaining time was too short. Set it to - * interval to make the timer armed and work. - */ - pt->val.it_value.tv_sec = pte->isec; - pt->val.it_value.tv_nsec = pte->insec; - } else { - pt->val.it_value.tv_sec = pte->vsec; - pt->val.it_value.tv_nsec = pte->vnsec; - } - - if (!timespec_valid(&pt->val.it_value)) { - pr_err("Invalid timer value(posix)\n"); - return -1; - } - - pt->spt.it_id = pte->it_id; - pt->spt.clock_id = pte->clock_id; - pt->spt.si_signo = pte->si_signo; - pt->spt.it_sigev_notify = pte->it_sigev_notify; - pt->spt.sival_ptr = decode_pointer(pte->sival_ptr); - pt->spt.notify_thread_id = pte->notify_thread_id; - pt->overrun = pte->overrun; - - return 0; -} - -static int cmp_posix_timer_proc_id(const void *p1, const void *p2) -{ - return ((struct restore_posix_timer *)p1)->spt.it_id - ((struct restore_posix_timer *)p2)->spt.it_id; -} - -static void sort_posix_timers(struct task_restore_args *ta) -{ - void *tmem; - - /* - * This is required for restorer's create_posix_timers(), - * it will probe them one-by-one for the desired ID, since - * kernel doesn't provide another API for timer creation - * with given ID. 
- */ - - if (ta->posix_timers_n > 0) { - tmem = rst_mem_remap_ptr((unsigned long)ta->posix_timers, RM_PRIVATE); - qsort(tmem, ta->posix_timers_n, sizeof(struct restore_posix_timer), cmp_posix_timer_proc_id); - } -} - -/* - * Legacy posix timers restoration from CR_FD_POSIX_TIMERS - */ - -static int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) -{ - struct cr_img *img; - int ret = -1; - struct restore_posix_timer *t; - - if (!deprecated_ok("Posix timers")) - return -1; - - img = open_image(CR_FD_POSIX_TIMERS, O_RSTR, pid); - if (!img) - return -1; - - ta->posix_timers_n = 0; - while (1) { - PosixTimerEntry *pte; - - ret = pb_read_one_eof(img, &pte, PB_POSIX_TIMER); - if (ret <= 0) - break; - - t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); - if (!t) - break; - - ret = decode_posix_timer(pte, t); - if (ret < 0) - break; - - posix_timer_entry__free_unpacked(pte, NULL); - ta->posix_timers_n++; - } - - close_image(img); - if (!ret) - sort_posix_timers(ta); - - return ret; -} - -static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) -{ - int i, ret = -1; - TaskTimersEntry *tte = core->tc->timers; - struct restore_posix_timer *t; - - ta->posix_timers = (struct restore_posix_timer *)rst_mem_align_cpos(RM_PRIVATE); - - if (!tte) - return prepare_posix_timers_from_fd(pid, ta); - - ta->posix_timers_n = tte->n_posix; - for (i = 0; i < ta->posix_timers_n; i++) { - t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); - if (!t) - goto out; - - if (decode_posix_timer(tte->posix[i], t)) - goto out; - } - - ret = 0; - sort_posix_timers(ta); -out: - return ret; -} - static int prepare_mm(pid_t pid, struct task_restore_args *args) { int exe_fd, i, ret = -1; diff --git a/criu/include/parasite-syscall.h b/criu/include/parasite-syscall.h index 4540e11ee..70ecbb720 100644 --- a/criu/include/parasite-syscall.h +++ b/criu/include/parasite-syscall.h @@ -22,11 +22,6 @@ struct parasite_ctl; struct 
parasite_thread_ctl; extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *); -extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *); - -struct proc_posix_timers_stat; -extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, - struct pstree_item *); extern int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc); extern int parasite_dump_creds(struct parasite_ctl *ctl, CredsEntry *ce); diff --git a/criu/include/timer.h b/criu/include/timer.h new file mode 100644 index 000000000..09583a901 --- /dev/null +++ b/criu/include/timer.h @@ -0,0 +1,17 @@ +#ifndef __CR_TIMER_H__ +#define __CR_TIMER_H__ + +#include "images/core.pb-c.h" + +struct task_restore_args; +struct pstree_item; +struct parasite_ctl; +struct proc_posix_timers_stat; + +extern int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core); +extern int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core); + +extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item); +extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, + struct pstree_item *item); +#endif \ No newline at end of file diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index 295e404ec..6d2aa9c88 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -278,156 +278,6 @@ int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *i return 0; } -static void encode_itimer(struct itimerval *v, ItimerEntry *ie) -{ - ie->isec = v->it_interval.tv_sec; - ie->iusec = v->it_interval.tv_usec; - ie->vsec = v->it_value.tv_sec; - ie->vusec = v->it_value.tv_usec; -} - -int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item) -{ - CoreEntry *core = item->core[0]; - struct parasite_dump_itimers_args *args; 
- int ret; - - args = compel_parasite_args(ctl, struct parasite_dump_itimers_args); - - ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_ITIMERS, ctl); - if (ret < 0) - return ret; - - encode_itimer((&args->real), (core->tc->timers->real)); - encode_itimer((&args->virt), (core->tc->timers->virt)); - encode_itimer((&args->prof), (core->tc->timers->prof)); - - return 0; -} - -static int core_alloc_posix_timers(TaskTimersEntry *tte, int n, PosixTimerEntry **pte) -{ - int sz; - - /* - * Will be free()-ed in core_entry_free() - */ - - sz = n * (sizeof(PosixTimerEntry *) + sizeof(PosixTimerEntry)); - tte->posix = xmalloc(sz); - if (!tte->posix) - return -1; - - tte->n_posix = n; - *pte = (PosixTimerEntry *)(tte->posix + n); - return 0; -} - -static int encode_notify_thread_id(pid_t rtid, struct pstree_item *item, PosixTimerEntry *pte) -{ - pid_t vtid = 0; - int i; - - if (rtid == 0) - return 0; - - if (!(root_ns_mask & CLONE_NEWPID)) { - /* Non-pid-namespace case */ - pte->notify_thread_id = rtid; - pte->has_notify_thread_id = true; - return 0; - } - - /* Pid-namespace case */ - if (!kdat.has_nspid) { - pr_err("Have no NSpid support to dump notify thread id in pid namespace\n"); - return -1; - } - - for (i = 0; i < item->nr_threads; i++) { - if (item->threads[i].real != rtid) - continue; - - vtid = item->threads[i].ns[0].virt; - break; - } - - if (vtid == 0) { - pr_err("Unable to convert the notify thread id %d\n", rtid); - return -1; - } - - pte->notify_thread_id = vtid; - pte->has_notify_thread_id = true; - return 0; -} - -static int encode_posix_timer(struct pstree_item *item, struct posix_timer *v, struct proc_posix_timer *vp, - PosixTimerEntry *pte) -{ - pte->it_id = vp->spt.it_id; - pte->clock_id = vp->spt.clock_id; - pte->si_signo = vp->spt.si_signo; - pte->it_sigev_notify = vp->spt.it_sigev_notify; - pte->sival_ptr = encode_pointer(vp->spt.sival_ptr); - - pte->overrun = v->overrun; - - pte->isec = v->val.it_interval.tv_sec; - pte->insec = v->val.it_interval.tv_nsec; 
- pte->vsec = v->val.it_value.tv_sec; - pte->vnsec = v->val.it_value.tv_nsec; - - if (encode_notify_thread_id(vp->spt.notify_thread_id, item, pte)) - return -1; - - return 0; -} - -int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, - struct pstree_item *item) -{ - CoreEntry *core = item->core[0]; - TaskTimersEntry *tte = core->tc->timers; - PosixTimerEntry *pte; - struct proc_posix_timer *temp; - struct parasite_dump_posix_timers_args *args; - int ret, exit_code = -1; - int args_size; - int i; - - if (core_alloc_posix_timers(tte, proc_args->timer_n, &pte)) - return -1; - - args_size = posix_timers_dump_size(proc_args->timer_n); - args = compel_parasite_args_s(ctl, args_size); - args->timer_n = proc_args->timer_n; - - i = 0; - list_for_each_entry(temp, &proc_args->timers, list) { - args->timer[i].it_id = temp->spt.it_id; - i++; - } - - ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_POSIX_TIMERS, ctl); - if (ret < 0) - goto end_posix; - - i = 0; - list_for_each_entry(temp, &proc_args->timers, list) { - posix_timer_entry__init(&pte[i]); - if (encode_posix_timer(item, &args->timer[i], temp, &pte[i])) - goto end_posix; - tte->posix[i] = &pte[i]; - i++; - } - - exit_code = 0; -end_posix: - free_posix_timers(proc_args); - return exit_code; -} - int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc) { struct parasite_dump_misc *ma; diff --git a/criu/timer.c b/criu/timer.c new file mode 100644 index 000000000..bdcb059cc --- /dev/null +++ b/criu/timer.c @@ -0,0 +1,399 @@ +#include "types.h" +#include "crtools.h" +#include "infect.h" +#include "protobuf.h" +#include "pstree.h" +#include "posix-timer.h" +#include "parasite.h" +#include "namespaces.h" +#include "rst-malloc.h" +#include "restorer.h" + +static inline int timeval_valid(struct timeval *tv) +{ + return (tv->tv_sec >= 0) && ((unsigned long)tv->tv_usec < USEC_PER_SEC); +} + +static inline int decode_itimer(char *n, ItimerEntry *ie, 
struct itimerval *val) +{ + if (ie->isec == 0 && ie->iusec == 0) { + memzero_p(val); + return 0; + } + + val->it_interval.tv_sec = ie->isec; + val->it_interval.tv_usec = ie->iusec; + + if (!timeval_valid(&val->it_interval)) { + pr_err("Invalid timer interval\n"); + return -1; + } + + if (ie->vsec == 0 && ie->vusec == 0) { + /* + * Remaining time was too short. Set it to + * interval to make the timer armed and work. + */ + val->it_value.tv_sec = ie->isec; + val->it_value.tv_usec = ie->iusec; + } else { + val->it_value.tv_sec = ie->vsec; + val->it_value.tv_usec = ie->vusec; + } + + if (!timeval_valid(&val->it_value)) { + pr_err("Invalid timer value\n"); + return -1; + } + + pr_info("Restored %s timer to %ld.%ld -> %ld.%ld\n", n, val->it_value.tv_sec, val->it_value.tv_usec, + val->it_interval.tv_sec, val->it_interval.tv_usec); + + return 0; +} + +/* + * Legacy itimers restore from CR_FD_ITIMERS + */ + +int prepare_itimers_from_fd(int pid, struct task_restore_args *args) +{ + int ret = -1; + struct cr_img *img; + ItimerEntry *ie; + + if (!deprecated_ok("Itimers")) + return -1; + + img = open_image(CR_FD_ITIMERS, O_RSTR, pid); + if (!img) + return -1; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("real", ie, &args->itimers[0]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("virt", ie, &args->itimers[1]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("prof", ie, &args->itimers[2]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; +out: + close_image(img); + return ret; +} + +int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core) +{ + int ret = 0; + TaskTimersEntry *tte = core->tc->timers; + + if (!tte) + return prepare_itimers_from_fd(pid, args); + + ret |= 
decode_itimer("real", tte->real, &args->itimers[0]); + ret |= decode_itimer("virt", tte->virt, &args->itimers[1]); + ret |= decode_itimer("prof", tte->prof, &args->itimers[2]); + + return ret; +} + +static inline int timespec_valid(struct timespec *ts) +{ + return (ts->tv_sec >= 0) && ((unsigned long)ts->tv_nsec < NSEC_PER_SEC); +} + +static inline int decode_posix_timer(PosixTimerEntry *pte, struct restore_posix_timer *pt) +{ + pt->val.it_interval.tv_sec = pte->isec; + pt->val.it_interval.tv_nsec = pte->insec; + + if (!timespec_valid(&pt->val.it_interval)) { + pr_err("Invalid timer interval(posix)\n"); + return -1; + } + + if (pte->vsec == 0 && pte->vnsec == 0) { + /* + * Remaining time was too short. Set it to + * interval to make the timer armed and work. + */ + pt->val.it_value.tv_sec = pte->isec; + pt->val.it_value.tv_nsec = pte->insec; + } else { + pt->val.it_value.tv_sec = pte->vsec; + pt->val.it_value.tv_nsec = pte->vnsec; + } + + if (!timespec_valid(&pt->val.it_value)) { + pr_err("Invalid timer value(posix)\n"); + return -1; + } + + pt->spt.it_id = pte->it_id; + pt->spt.clock_id = pte->clock_id; + pt->spt.si_signo = pte->si_signo; + pt->spt.it_sigev_notify = pte->it_sigev_notify; + pt->spt.sival_ptr = decode_pointer(pte->sival_ptr); + pt->spt.notify_thread_id = pte->notify_thread_id; + pt->overrun = pte->overrun; + + return 0; +} + +static int cmp_posix_timer_proc_id(const void *p1, const void *p2) +{ + return ((struct restore_posix_timer *)p1)->spt.it_id - ((struct restore_posix_timer *)p2)->spt.it_id; +} + +static void sort_posix_timers(struct task_restore_args *ta) +{ + void *tmem; + + /* + * This is required for restorer's create_posix_timers(), + * it will probe them one-by-one for the desired ID, since + * kernel doesn't provide another API for timer creation + * with given ID. 
+ */ + + if (ta->posix_timers_n > 0) { + tmem = rst_mem_remap_ptr((unsigned long)ta->posix_timers, RM_PRIVATE); + qsort(tmem, ta->posix_timers_n, sizeof(struct restore_posix_timer), cmp_posix_timer_proc_id); + } +} + +/* + * Legacy posix timers restoration from CR_FD_POSIX_TIMERS. Returns 0 on success, negative on error. + */ + +int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) +{ + struct cr_img *img; + int ret = -1; + struct restore_posix_timer *t; + + if (!deprecated_ok("Posix timers")) + return -1; + + img = open_image(CR_FD_POSIX_TIMERS, O_RSTR, pid); + if (!img) + return -1; + + ta->posix_timers_n = 0; + while (1) { + PosixTimerEntry *pte; + + ret = pb_read_one_eof(img, &pte, PB_POSIX_TIMER); + if (ret <= 0) + break; + + t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); + /* Allocation failure must not leak the positive read status as the result. */ + ret = t ? decode_posix_timer(pte, t) : -1; + + /* The unpacked entry is freed on the error paths as well. */ + posix_timer_entry__free_unpacked(pte, NULL); + if (ret < 0) + break; + + ta->posix_timers_n++; + } + + close_image(img); + if (!ret) + sort_posix_timers(ta); + + return ret; +} + +int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) +{ + int i, ret = -1; + TaskTimersEntry *tte = core->tc->timers; + struct restore_posix_timer *t; + + ta->posix_timers = (struct restore_posix_timer *)rst_mem_align_cpos(RM_PRIVATE); + + if (!tte) + return prepare_posix_timers_from_fd(pid, ta); + + ta->posix_timers_n = tte->n_posix; + for (i = 0; i < ta->posix_timers_n; i++) { + t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); + if (!t) + goto out; + + if (decode_posix_timer(tte->posix[i], t)) + goto out; + } + + ret = 0; + sort_posix_timers(ta); +out: + return ret; +} + +static void encode_itimer(struct itimerval *v, ItimerEntry *ie) +{ + ie->isec = v->it_interval.tv_sec; + ie->iusec = v->it_interval.tv_usec; + ie->vsec = v->it_value.tv_sec; + ie->vusec = v->it_value.tv_usec; +} + +int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item) +{ + CoreEntry *core = item->core[0]; + struct 
parasite_dump_itimers_args *args; + int ret; + + args = compel_parasite_args(ctl, struct parasite_dump_itimers_args); + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_ITIMERS, ctl); + if (ret < 0) + return ret; + + encode_itimer((&args->real), (core->tc->timers->real)); + encode_itimer((&args->virt), (core->tc->timers->virt)); + encode_itimer((&args->prof), (core->tc->timers->prof)); + + return 0; +} + +static int core_alloc_posix_timers(TaskTimersEntry *tte, int n, PosixTimerEntry **pte) +{ + int sz; + + /* + * Will be free()-ed in core_entry_free() + */ + + sz = n * (sizeof(PosixTimerEntry *) + sizeof(PosixTimerEntry)); + tte->posix = xmalloc(sz); + if (!tte->posix) + return -1; + + tte->n_posix = n; + *pte = (PosixTimerEntry *)(tte->posix + n); + return 0; +} + +static int encode_notify_thread_id(pid_t rtid, struct pstree_item *item, PosixTimerEntry *pte) +{ + pid_t vtid = 0; + int i; + + if (rtid == 0) + return 0; + + if (!(root_ns_mask & CLONE_NEWPID)) { + /* Non-pid-namespace case */ + pte->notify_thread_id = rtid; + pte->has_notify_thread_id = true; + return 0; + } + + /* Pid-namespace case */ + if (!kdat.has_nspid) { + pr_err("Have no NSpid support to dump notify thread id in pid namespace\n"); + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + if (item->threads[i].real != rtid) + continue; + + vtid = item->threads[i].ns[0].virt; + break; + } + + if (vtid == 0) { + pr_err("Unable to convert the notify thread id %d\n", rtid); + return -1; + } + + pte->notify_thread_id = vtid; + pte->has_notify_thread_id = true; + return 0; +} + +static int encode_posix_timer(struct pstree_item *item, struct posix_timer *v, struct proc_posix_timer *vp, + PosixTimerEntry *pte) +{ + pte->it_id = vp->spt.it_id; + pte->clock_id = vp->spt.clock_id; + pte->si_signo = vp->spt.si_signo; + pte->it_sigev_notify = vp->spt.it_sigev_notify; + pte->sival_ptr = encode_pointer(vp->spt.sival_ptr); + + pte->overrun = v->overrun; + + pte->isec = v->val.it_interval.tv_sec; + 
pte->insec = v->val.it_interval.tv_nsec; + pte->vsec = v->val.it_value.tv_sec; + pte->vnsec = v->val.it_value.tv_nsec; + + if (encode_notify_thread_id(vp->spt.notify_thread_id, item, pte)) + return -1; + + return 0; +} + +int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, + struct pstree_item *item) +{ + CoreEntry *core = item->core[0]; + TaskTimersEntry *tte = core->tc->timers; + PosixTimerEntry *pte; + struct proc_posix_timer *temp; + struct parasite_dump_posix_timers_args *args; + int ret, exit_code = -1; + int args_size; + int i; + + if (core_alloc_posix_timers(tte, proc_args->timer_n, &pte)) + return -1; + + args_size = posix_timers_dump_size(proc_args->timer_n); + args = compel_parasite_args_s(ctl, args_size); + args->timer_n = proc_args->timer_n; + + i = 0; + list_for_each_entry(temp, &proc_args->timers, list) { + args->timer[i].it_id = temp->spt.it_id; + i++; + } + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_POSIX_TIMERS, ctl); + if (ret < 0) + goto end_posix; + + i = 0; + list_for_each_entry(temp, &proc_args->timers, list) { + posix_timer_entry__init(&pte[i]); + if (encode_posix_timer(item, &args->timer[i], temp, &pte[i])) + goto end_posix; + tte->posix[i] = &pte[i]; + i++; + } + + exit_code = 0; +end_posix: + free_posix_timers(proc_args); + return exit_code; +} From 0fc83a79b1efda538ed1d0a83969f62e6e375f19 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sun, 24 Mar 2024 14:26:41 +0000 Subject: [PATCH 412/775] ci: silence CircleCI warning about deprecated image CircleCI currently prints out the following warning: This job is using a deprecated image 'ubuntu-2004:202010-01', please update to a newer image According to https://discuss.circleci.com/t/linux-image-deprecations-and-eol-for-2024/ the recommended image name is: "image: default" Signed-off-by: Adrian Reber --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml 
b/.circleci/config.yml index 47f7ad9b1..785b383e1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2.1 jobs: test-local-gcc: machine: - image: ubuntu-2004:202010-01 + image: default working_directory: ~/criu steps: - checkout @@ -11,7 +11,7 @@ jobs: command: sudo -E make -C scripts/ci local test-local-clang: machine: - image: ubuntu-2004:202010-01 + image: default working_directory: ~/criu steps: - checkout From af4058871e49459c7fbe3f6f1963a36c028f2aa7 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 27 Mar 2024 13:34:26 +0800 Subject: [PATCH 413/775] timer: fix wrapping allignment in function declaration Currently we have tabs + spaces on the wrapped line but the wrapped part is not alligned to the opening bracket. Fixes: bbe26d1b7 ("timer: fix allignment in function definition") Signed-off-by: Pavel Tikhomirov --- criu/include/timer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/include/timer.h b/criu/include/timer.h index 09583a901..d1deb6051 100644 --- a/criu/include/timer.h +++ b/criu/include/timer.h @@ -13,5 +13,5 @@ extern int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item); extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, - struct pstree_item *item); -#endif \ No newline at end of file + struct pstree_item *item); +#endif From e07ffa04b0d58299317afd8abaeebb6f071061a6 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 5 Apr 2024 14:19:28 -0700 Subject: [PATCH 414/775] Makefile.config: fix/improve feature warnings. 1. Tell which RPMs or DEBs are required in all cases. 2. Use $(info ...) everywhere. 3. Drop extra nested $(info), instead use (a document) a simpler kludge. 4. Simplify and unify the language, add missing periods. 
Signed-off-by: Kir Kolyshkin --- Makefile.config | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/Makefile.config b/Makefile.config index 8f2b5208e..52c250b21 100644 --- a/Makefile.config +++ b/Makefile.config @@ -2,12 +2,15 @@ include $(__nmk_dir)utils.mk include $(__nmk_dir)msg.mk include scripts/feature-tests.mak +# This is a kludge for $(info ...) to not eat spaces. +S := + ifeq ($(call try-cc,$(FEATURE_TEST_LIBBSD_DEV),-lbsd),true) LIBS_FEATURES += -lbsd FEATURE_DEFINES += -DCONFIG_HAS_LIBBSD else $(info Note: Building without setproctitle() and strlcpy() support.) - $(info $(info) To enable these features, please install libbsd-devel (RPM) / libbsd-dev (DEB).) + $(info $S Install libbsd-devel (RPM) / libbsd-dev (DEB) to fix.) endif ifeq ($(call pkg-config-check,libselinux),y) @@ -23,10 +26,10 @@ endif ifeq ($(call pkg-config-check,libdrm),y) export CONFIG_AMDGPU := y - $(info Note: Building criu with amdgpu_plugin.) + $(info Note: Building with amdgpu_plugin.) else - $(info Note: Building criu without amdgpu_plugin.) - $(info Note: libdrm and libdrm_amdgpu are required to build amdgpu_plugin.) + $(info Note: Building without amdgpu_plugin.) + $(info $S Install libdrm-devel (RPM) or libdrm-dev (DEB) to fix.) endif ifeq ($(NO_GNUTLS)x$(call pkg-config-check,gnutls),xy) @@ -34,7 +37,8 @@ ifeq ($(NO_GNUTLS)x$(call pkg-config-check,gnutls),xy) export CONFIG_GNUTLS := y FEATURE_DEFINES += -DCONFIG_GNUTLS else - $(info Note: Building without GnuTLS support) + $(info Note: Building without GnuTLS support.) + $(info $S Install gnutls-devel (RPM) or gnutls-dev (DEB) to fix.) 
endif ifeq ($(call pkg-config-check,libnftables),y) @@ -46,12 +50,11 @@ ifeq ($(call pkg-config-check,libnftables),y) LIBS_FEATURES += $(LIB_NFTABLES) FEATURE_DEFINES += -DCONFIG_HAS_NFTABLES_LIB_API_1 else - $(warning Warn: you have libnftables installed but it has incompatible API) - $(warning Warn: Building without nftables support) + $(info Warn: Building without nftables support (incompatible API version).) endif else - $(warning Warn: you have no libnftables installed) - $(warning Warn: Building without nftables support) + $(info Warn: Building without nftables support.) + $(info $S Install nftables-devel (RPM) or libnftables-dev (DEB) to fix.) endif export LIBS += $(LIBS_FEATURES) @@ -67,10 +70,10 @@ ifeq ($(call try-asm,$(FEATURE_TEST_X86_COMPAT)),true) export CONFIG_COMPAT := y FEATURE_DEFINES += -DCONFIG_COMPAT else - $(info Note: Building without ia32 C/R, missed ia32 support in gcc) - $(info $(info) That may be related to missing gcc-multilib in your) - $(info $(info) distribution or you may have Debian with buggy toolchain) - $(info $(info) (issue https://github.com/checkpoint-restore/criu/issues/315)) + $(info Note: Building without ia32 C/R, missing ia32 support in gcc.) + $(info $S It may be related to missing gcc-multilib in your) + $(info $S distribution, or you may have Debian with buggy toolchain.) + $(info $S See https://github.com/checkpoint-restore/criu/issues/315.) endif endif From 1c2a3d7faa16b1583fd2514205c38894f68dd487 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 20 Dec 2023 15:22:48 -0800 Subject: [PATCH 415/775] check: verify ino and dev of overlayfs files in /proc/pid/maps Check that the file device and inode shown in /proc/pid/maps match values returned by stat(2). 
Signed-off-by: Andrei Vagin --- criu/cr-check.c | 204 +++++++++++++++++++++++++++++++++++++ scripts/ci/run-ci-tests.sh | 14 ++- 2 files changed, 214 insertions(+), 4 deletions(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index fea1ce674..507f9915c 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "../soccr/soccr.h" @@ -53,6 +54,8 @@ #include "restorer.h" #include "uffd.h" #include "linux/aio_abi.h" +#include "syscall.h" +#include "mount-v2.h" #include "images/inventory.pb-c.h" @@ -1390,6 +1393,205 @@ static int check_pagemap_scan(void) return 0; } +/* musl doesn't have a statx wrapper... */ +struct staty { + __u32 stx_dev_major; + __u32 stx_dev_minor; + __u64 stx_ino; +}; + +/* + * Find the mapping that starts at addr in /proc/self/maps and store its device + * major:minor and inode number in stx. Returns 0 on success, or -1 if the file + * cannot be opened, a line cannot be parsed or no mapping starts at addr. + */ +static long get_file_dev_and_inode(void *addr, struct staty *stx) +{ + char buf[4096]; + long ret = -1; + FILE *mapf; + + mapf = fopen("/proc/self/maps", "r"); + if (mapf == NULL) { + pr_perror("fopen(/proc/self/maps)"); + return -1; + } + + while (fgets(buf, sizeof(buf), mapf)) { + unsigned long start, end; + uint32_t maj, min; + __u64 ino; + + if (sscanf(buf, "%lx-%lx %*s %*s %x:%x %llu", + &start, &end, &maj, &min, &ino) != 5) { + /* A short match does not set errno; buf keeps its own newline. */ + pr_err("Unable to parse: %s", buf); + goto out; + } + if (start == (unsigned long)addr) { + stx->stx_dev_major = maj; + stx->stx_dev_minor = min; + stx->stx_ino = ino; + ret = 0; + goto out; + } + } + + pr_err("Unable to find the mapping\n"); +out: + fclose(mapf); + return ret; +} + +static int ovl_mount(void) +{ + int tmpfs, fsfd, ovl; + + fsfd = sys_fsopen("tmpfs", 0); + if (fsfd == -1) { + pr_perror("Unable to fsopen tmpfs"); + return -1; + } + + if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { + pr_perror("Unable to create tmpfs mount"); + return -1; + } + + tmpfs = sys_fsmount(fsfd, 0, 0); + if (tmpfs == -1) { + pr_perror("Unable to mount tmpfs"); + return -1; + } + + close(fsfd); + + /* overlayfs can't be constructed on top of a detached mount. 
*/ + if (sys_move_mount(tmpfs, "", AT_FDCWD, "/tmp", MOVE_MOUNT_F_EMPTY_PATH)) { + pr_perror("Unable to attach tmpfs mount"); + return -1; + } + close(tmpfs); + + if (chdir("/tmp")) { + pr_perror("Unable to change working directory"); + return -1; + } + + if (mkdir("/tmp/w", 0755) == -1 || + mkdir("/tmp/u", 0755) == -1 || + mkdir("/tmp/l", 0755) == -1) { + pr_perror("mkdir"); + return -1; + } + + fsfd = sys_fsopen("overlay", 0); + if (fsfd == -1) { + pr_perror("Unable to fsopen overlayfs"); + return -1; + } + if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "test", 0) == -1 || + sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir", "/tmp/l", 0) == -1 || + sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "upperdir", "/tmp/u", 0) == -1 || + sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "workdir", "/tmp/w", 0) == -1) { + pr_perror("Unable to configure overlayfs"); + return -1; + } + if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { + pr_perror("Unable to create overlayfs"); + return -1; + } + ovl = sys_fsmount(fsfd, 0, 0); + if (ovl == -1) { + pr_perror("Unable to mount overlayfs"); + return -1; + } + + return ovl; +} + +/* + * Check that the file device and inode shown in /proc/pid/maps match values + * returned by stat(2). + */ +static int do_check_overlayfs_maps(void) +{ + struct staty stx, mstx; + struct stat st; + int ovl, fd; + void *addr; + + /* Create a new mount namespace to not care about cleaning test mounts. 
*/ + if (unshare(CLONE_NEWNS) == -1) { + pr_warn("Unable to create a new mount namespace\n"); + return 0; + } + + if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { + pr_perror("Unable to remount / with MS_SLAVE"); + return -1; + } + + ovl = ovl_mount(); + if (ovl == -1) + return -1; + + fd = openat(ovl, "test", O_RDWR | O_CREAT, 0644); + if (fd == -1) { + pr_perror("Unable to open a test file"); + return -1; + } + + addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + pr_perror("Unable to map the test file"); + return -1; + } + + if (get_file_dev_and_inode(addr, &mstx)) + return -1; + if (fstat(fd, &st)) { + pr_perror("stat"); + return -1; + } + stx.stx_dev_major = major(st.st_dev); + stx.stx_dev_minor = minor(st.st_dev); + stx.stx_ino = st.st_ino; + + if (stx.stx_dev_major != mstx.stx_dev_major || + stx.stx_dev_minor != mstx.stx_dev_minor || + stx.stx_ino != mstx.stx_ino) { + pr_err("unmatched dev:ino %x:%x:%llx (expected %x:%x:%llx)\n", + mstx.stx_dev_major, mstx.stx_dev_minor, mstx.stx_ino, + stx.stx_dev_major, stx.stx_dev_minor, stx.stx_ino); + return -1; + } + + return 0; +} + +static int check_overlayfs_maps(void) +{ + pid_t pid; + int status; + + pid = fork(); + if (pid == -1) { + pr_perror("Unable to fork a child"); + return -1; + } + if (pid == 0) { + if (do_check_overlayfs_maps()) + exit(1); + exit(0); + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid"); + return -1; + } + return status == 0 ? 
0 : -1; +} + static int (*chk_feature)(void); /* @@ -1511,6 +1703,7 @@ int cr_check(void) ret |= check_ptrace_get_rseq_conf(); ret |= check_ipv6_freebind(); ret |= check_pagemap_scan(); + ret |= check_overlayfs_maps(); if (kdat.lsm == LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); @@ -1633,6 +1826,7 @@ static struct feature_list feature_list[] = { { "get_rseq_conf", check_ptrace_get_rseq_conf }, { "ipv6_freebind", check_ipv6_freebind }, { "pagemap_scan", check_pagemap_scan }, + { "overlayfs_maps", check_overlayfs_maps }, { NULL, NULL }, }; diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 2fdecbc97..c50dc4174 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -290,10 +290,16 @@ ip net add test # Check if cap_checkpoint_restore is supported and also if unshare -c is supported. # # Do not run this test in a container (see https://github.com/checkpoint-restore/criu/issues/2312). -# This is a temporary workaround until fixed in the kernel. -# The kernel currently does not show correct device and inode numbers in /proc/pid/maps -# for stackable file systems. -if capsh --supports=cap_checkpoint_restore && unshare -c /bin/true && [ ! -e /run/.containerenv ]; then +# Before v6.8-rc1~215^2~6, the kernel currently did not show correct device and +# inode numbers in /proc/pid/maps for stackable file systems. 
+skip=0 +findmnt -no FSTYPE / | grep overlay && { + ./criu/criu check --feature overlayfs_maps || skip=1 +} +unshare -c /bin/true || skip=1 +capsh --supports=cap_checkpoint_restore || skip=1 + +if [ "$skip" == 0 ]; then make -C test/zdtm/ cleanout rm -rf test/dump setcap cap_checkpoint_restore,cap_sys_ptrace+eip criu/criu From 5aaf45021362006827b70718f64cf61a62fd959e Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 26 Mar 2024 10:16:40 +0000 Subject: [PATCH 416/775] ci: update base OS to ubuntu 22.04 Signed-off-by: Radostin Stoyanov --- .github/workflows/alpine-test.yml | 2 +- .github/workflows/archlinux-test.yml | 2 +- .github/workflows/compat-test.yml | 2 +- .github/workflows/fedora-asan-test.yml | 2 +- .github/workflows/fedora-rawhide-test.yml | 2 +- .github/workflows/gcov-test.yml | 2 +- .github/workflows/java-test.yml | 2 +- .github/workflows/podman-test.yml | 2 +- .github/workflows/stream-test.yml | 2 +- .github/workflows/x86-64-clang-test.yml | 2 +- .github/workflows/x86-64-gcc-test.yml | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/alpine-test.yml b/.github/workflows/alpine-test.yml index 06f466c51..5757fa82b 100644 --- a/.github/workflows/alpine-test.yml +++ b/.github/workflows/alpine-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 strategy: matrix: target: [GCC=1, CLANG=1] diff --git a/.github/workflows/archlinux-test.yml b/.github/workflows/archlinux-test.yml index 328cc9d0f..9e8b60136 100644 --- a/.github/workflows/archlinux-test.yml +++ b/.github/workflows/archlinux-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 - name: Run Arch Linux Test diff --git a/.github/workflows/compat-test.yml b/.github/workflows/compat-test.yml index 79f8f0010..e8b5a897b 100644 --- a/.github/workflows/compat-test.yml +++ b/.github/workflows/compat-test.yml @@ -9,7 +9,7 @@ concurrency: 
jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 strategy: matrix: target: [GCC, CLANG] diff --git a/.github/workflows/fedora-asan-test.yml b/.github/workflows/fedora-asan-test.yml index 8b1bfcf32..11233f457 100644 --- a/.github/workflows/fedora-asan-test.yml +++ b/.github/workflows/fedora-asan-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/fedora-rawhide-test.yml b/.github/workflows/fedora-rawhide-test.yml index 5355aa192..fae544900 100644 --- a/.github/workflows/fedora-rawhide-test.yml +++ b/.github/workflows/fedora-rawhide-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/gcov-test.yml b/.github/workflows/gcov-test.yml index fcab47837..f221fabb5 100644 --- a/.github/workflows/gcov-test.yml +++ b/.github/workflows/gcov-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/java-test.yml b/.github/workflows/java-test.yml index abed793bf..af1f71046 100644 --- a/.github/workflows/java-test.yml +++ b/.github/workflows/java-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 - name: Run Java Test diff --git a/.github/workflows/podman-test.yml b/.github/workflows/podman-test.yml index a7013a216..077cf63e2 100644 --- a/.github/workflows/podman-test.yml +++ b/.github/workflows/podman-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 - name: Run Podman Test diff --git a/.github/workflows/stream-test.yml b/.github/workflows/stream-test.yml index 0f5b307db..efb217e16 100644 --- a/.github/workflows/stream-test.yml +++ b/.github/workflows/stream-test.yml @@ -9,7 
+9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/x86-64-clang-test.yml b/.github/workflows/x86-64-clang-test.yml index b3b50829a..c9a1d3151 100644 --- a/.github/workflows/x86-64-clang-test.yml +++ b/.github/workflows/x86-64-clang-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 - name: Run X86_64 CLANG Test diff --git a/.github/workflows/x86-64-gcc-test.yml b/.github/workflows/x86-64-gcc-test.yml index ec70b61fb..8d1815d57 100644 --- a/.github/workflows/x86-64-gcc-test.yml +++ b/.github/workflows/x86-64-gcc-test.yml @@ -9,7 +9,7 @@ concurrency: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 - name: Run X86_64 GCC Test From e68a06cfd10ea4d9e07a6a7909e07570f7f89a29 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 26 Mar 2024 10:17:44 +0000 Subject: [PATCH 417/775] ci: update actions/checkout to v4 Signed-off-by: Radostin Stoyanov --- .github/workflows/alpine-test.yml | 2 +- .github/workflows/archlinux-test.yml | 2 +- .github/workflows/compat-test.yml | 2 +- .github/workflows/cross-compile-daily.yml | 2 +- .github/workflows/cross-compile.yml | 2 +- .github/workflows/docker-test.yml | 2 +- .github/workflows/fedora-asan-test.yml | 2 +- .github/workflows/fedora-rawhide-test.yml | 2 +- .github/workflows/gcov-test.yml | 2 +- .github/workflows/java-test.yml | 2 +- .github/workflows/lint.yml | 2 +- .github/workflows/loongarch64-qemu-test.yml | 2 +- .github/workflows/podman-test.yml | 2 +- .github/workflows/stream-test.yml | 2 +- .github/workflows/x86-64-clang-test.yml | 2 +- .github/workflows/x86-64-gcc-test.yml | 2 +- 16 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/alpine-test.yml b/.github/workflows/alpine-test.yml index 5757fa82b..73530d79a 100644 --- a/.github/workflows/alpine-test.yml +++ 
b/.github/workflows/alpine-test.yml @@ -15,6 +15,6 @@ jobs: target: [GCC=1, CLANG=1] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Alpine ${{ matrix.target }} Test run: sudo -E make -C scripts/ci alpine ${{ matrix.target }} diff --git a/.github/workflows/archlinux-test.yml b/.github/workflows/archlinux-test.yml index 9e8b60136..425f0662b 100644 --- a/.github/workflows/archlinux-test.yml +++ b/.github/workflows/archlinux-test.yml @@ -11,6 +11,6 @@ jobs: build: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Arch Linux Test run: sudo -E make -C scripts/ci archlinux diff --git a/.github/workflows/compat-test.yml b/.github/workflows/compat-test.yml index e8b5a897b..8a64ce185 100644 --- a/.github/workflows/compat-test.yml +++ b/.github/workflows/compat-test.yml @@ -16,6 +16,6 @@ jobs: steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Compat Tests (${{ matrix.target }}) run: sudo -E make -C scripts/ci local COMPAT_TEST=y ${{ matrix.target }}=1 diff --git a/.github/workflows/cross-compile-daily.yml b/.github/workflows/cross-compile-daily.yml index 927ddced2..b8c8c86d4 100644 --- a/.github/workflows/cross-compile-daily.yml +++ b/.github/workflows/cross-compile-daily.yml @@ -14,7 +14,7 @@ jobs: branches: [criu-dev, master] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: ref: ${{ matrix.branches }} - name: Run Cross Compilation Targets diff --git a/.github/workflows/cross-compile.yml b/.github/workflows/cross-compile.yml index 4da5d397c..06b812823 100644 --- a/.github/workflows/cross-compile.yml +++ b/.github/workflows/cross-compile.yml @@ -33,7 +33,7 @@ jobs: target: mips64el-unstable-cross steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Cross Compilation Targets run: > sudo make -C scripts/ci ${{ matrix.target }} diff --git a/.github/workflows/docker-test.yml b/.github/workflows/docker-test.yml index 
11d67432b..23696905a 100644 --- a/.github/workflows/docker-test.yml +++ b/.github/workflows/docker-test.yml @@ -14,6 +14,6 @@ jobs: matrix: os: [ubuntu-22.04] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Docker Test (${{ matrix.os }}) run: sudo make -C scripts/ci docker-test diff --git a/.github/workflows/fedora-asan-test.yml b/.github/workflows/fedora-asan-test.yml index 11233f457..02dc9a1b3 100644 --- a/.github/workflows/fedora-asan-test.yml +++ b/.github/workflows/fedora-asan-test.yml @@ -12,6 +12,6 @@ jobs: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Fedora ASAN Test run: sudo -E make -C scripts/ci fedora-asan diff --git a/.github/workflows/fedora-rawhide-test.yml b/.github/workflows/fedora-rawhide-test.yml index fae544900..83e2ead82 100644 --- a/.github/workflows/fedora-rawhide-test.yml +++ b/.github/workflows/fedora-rawhide-test.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Fedora Rawhide Test # We need to pass environment variables from the CI environment to # distinguish between CI environments. 
However, we need to make sure that diff --git a/.github/workflows/gcov-test.yml b/.github/workflows/gcov-test.yml index f221fabb5..cc4e1d44a 100644 --- a/.github/workflows/gcov-test.yml +++ b/.github/workflows/gcov-test.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Coverage Tests run: sudo -E make -C scripts/ci local GCOV=1 - name: Run gcov diff --git a/.github/workflows/java-test.yml b/.github/workflows/java-test.yml index af1f71046..cbd3c1f23 100644 --- a/.github/workflows/java-test.yml +++ b/.github/workflows/java-test.yml @@ -11,6 +11,6 @@ jobs: build: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Java Test run: sudo make -C scripts/ci java-test diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 489259474..862d68245 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -16,7 +16,7 @@ jobs: - name: Install tools run: sudo dnf -y install git make ruff xz clang-tools-extra which codespell git-clang-format ShellCheck - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set git safe directory # https://github.com/actions/checkout/issues/760 diff --git a/.github/workflows/loongarch64-qemu-test.yml b/.github/workflows/loongarch64-qemu-test.yml index ba22fa25f..d7c554c87 100644 --- a/.github/workflows/loongarch64-qemu-test.yml +++ b/.github/workflows/loongarch64-qemu-test.yml @@ -11,5 +11,5 @@ jobs: build: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - run: sudo make -C scripts/ci loongarch64-qemu-test diff --git a/.github/workflows/podman-test.yml b/.github/workflows/podman-test.yml index 077cf63e2..a07edbe5b 100644 --- a/.github/workflows/podman-test.yml +++ b/.github/workflows/podman-test.yml @@ -11,6 +11,6 @@ jobs: build: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run 
Podman Test run: sudo make -C scripts/ci podman-test diff --git a/.github/workflows/stream-test.yml b/.github/workflows/stream-test.yml index efb217e16..76bd96edf 100644 --- a/.github/workflows/stream-test.yml +++ b/.github/workflows/stream-test.yml @@ -12,6 +12,6 @@ jobs: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run CRIU Image Streamer Test run: sudo -E make -C scripts/ci local STREAM_TEST=1 diff --git a/.github/workflows/x86-64-clang-test.yml b/.github/workflows/x86-64-clang-test.yml index c9a1d3151..1f0a469bd 100644 --- a/.github/workflows/x86-64-clang-test.yml +++ b/.github/workflows/x86-64-clang-test.yml @@ -11,6 +11,6 @@ jobs: build: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run X86_64 CLANG Test run: sudo make -C scripts/ci x86_64 CLANG=1 diff --git a/.github/workflows/x86-64-gcc-test.yml b/.github/workflows/x86-64-gcc-test.yml index 8d1815d57..15e84a0df 100644 --- a/.github/workflows/x86-64-gcc-test.yml +++ b/.github/workflows/x86-64-gcc-test.yml @@ -11,6 +11,6 @@ jobs: build: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run X86_64 GCC Test run: sudo make -C scripts/ci x86_64 From f4290868bb2cb325df6326d36e7e0edf3d532a53 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 26 Mar 2024 13:01:16 +0000 Subject: [PATCH 418/775] ci/vdso01: fix typo Signed-off-by: Radostin Stoyanov --- test/zdtm/static/vdso01.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/static/vdso01.c b/test/zdtm/static/vdso01.c index d8d64155a..4e33d30a8 100644 --- a/test/zdtm/static/vdso01.c +++ b/test/zdtm/static/vdso01.c @@ -372,7 +372,7 @@ static int vdso_time_handler(void *func) t1 = time(NULL); t2 = vdso_time(NULL); - test_msg("time: %li vdso_time: %li\n", (long)t1, (long)t1); + test_msg("time: %li vdso_time: %li\n", (long)t1, (long)t2); if (labs(t1 - t2) > TIME_DELTA_SEC) { 
pr_perror("Delta is too big"); From 4607b535664cf898ccbc319236ced37bda5c141f Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 16 Apr 2024 12:40:20 +0100 Subject: [PATCH 419/775] mem: optimize debug logging of enqueued pages During restore, CRIU prints "Enqueue page-read" messages for each page-read request [1]. However, this message does not provide useful information, increases performance overhead during restore and the size of log file. $ ./zdtm.py run -t zdtm/static/maps06 -f h -k always $ grep 'Enqueue page-read' dump/zdtm/static/maps06/56/1/restore.log | wc -l 20493 This commit replaces these log messages with a single message that shows the number of enqueued page-read requests. $ grep 'enqueued' dump/zdtm/static/maps06/56/1/restore.log (00.061449) 56: nr_enqueued: 20493 [1] https://github.com/checkpoint-restore/criu/commit/91388fc Signed-off-by: Radostin Stoyanov --- criu/mem.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/criu/mem.c b/criu/mem.c index 5f0d57eb6..c9578ef44 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -1087,6 +1087,7 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) unsigned int nr_shared = 0; unsigned int nr_dropped = 0; unsigned int nr_compared = 0; + unsigned int nr_enqueued = 0; unsigned int nr_lazy = 0; unsigned long va; @@ -1162,7 +1163,8 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) len >>= PAGE_SHIFT; nr_restored += len; i += len - 1; - pr_debug("Enqueue page-read\n"); + + nr_enqueued++; continue; } @@ -1258,7 +1260,8 @@ err_read: pr_info("nr_restored_pages: %d\n", nr_restored); pr_info("nr_shared_pages: %d\n", nr_shared); - pr_info("nr_dropped_pages: %d\n", nr_dropped); + pr_info("nr_dropped_pages: %d\n", nr_dropped); + pr_info("nr_enqueued: %d\n", nr_enqueued); pr_info("nr_lazy: %d\n", nr_lazy); return 0; From df178c7e534c0637b5215318cdaddc6d85097c17 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 16 Apr 2024 
14:37:01 +0800 Subject: [PATCH 420/775] sk-tcp: cleanup dump_tcp_conn_state error handling 1) In dump_tcp_conn_state, if return from libsoccr_save is >=0, we check that sizeof(struct libsoccr_sk_data) returned from libsoccr_save is equal to sizeof(struct libsoccr_sk_data) we see in dump_tcp_conn_state (probably to check if we use the right library version). And if sizes are different we go to err_r, which just returns ret, which can theoretically be 0 (if size in library is zero) and that would lead dump_one_tcp to treat this as success though it is an obvious error. 2) In case dump_opt or open_image fails we don't explicitly set ret and rely that sizeof(struct libsoccr_sk_data) previously set to ret is not 0, I don't really like it, it makes reading code too complex. 3) We have a lot of err_* labels which do exactly the same thing, there is no point in having all of them, also it is better to choose the name of the label based on what it really does. So let's refactor error handling to avoid these inconsistencies. 
Signed-off-by: Pavel Tikhomirov --- criu/sk-tcp.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c index 630a182a2..b8d9ba46e 100644 --- a/criu/sk-tcp.c +++ b/criu/sk-tcp.c @@ -135,6 +135,7 @@ void cpt_unlock_tcp_connections(void) static int dump_tcp_conn_state(struct inet_sk_desc *sk) { struct libsoccr_sk *socr = sk->priv; + int exit_code = -1; int ret, aux; struct cr_img *img; TcpStreamEntry tse = TCP_STREAM_ENTRY__INIT; @@ -144,11 +145,11 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) ret = libsoccr_save(socr, &data, sizeof(data)); if (ret < 0) { pr_err("libsoccr_save() failed with %d\n", ret); - goto err_r; + goto err; } if (ret != sizeof(data)) { pr_err("This libsocr is not supported (%d vs %d)\n", ret, (int)sizeof(data)); - goto err_r; + goto err; } sk->state = data.state; @@ -190,7 +191,7 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) */ if (dump_opt(sk->rfd, SOL_TCP, TCP_NODELAY, &aux)) - goto err_opt; + goto err; if (aux) { tse.has_nodelay = true; @@ -198,7 +199,7 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) } if (dump_opt(sk->rfd, SOL_TCP, TCP_CORK, &aux)) - goto err_opt; + goto err; if (aux) { tse.has_cork = true; @@ -208,20 +209,19 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) /* * Push the stuff to image */ - img = open_image(CR_FD_TCP_STREAM, O_DUMP, sk->sd.ino); if (!img) - goto err_img; + goto err; ret = pb_write_one(img, &tse, PB_TCP_STREAM); if (ret < 0) - goto err_iw; + goto err_close; buf = libsoccr_get_queue_bytes(socr, TCP_RECV_QUEUE, SOCCR_MEM_EXCL); if (buf) { ret = write_img_buf(img, buf, tse.inq_len); if (ret < 0) - goto err_iw; + goto err_close; xfree(buf); } @@ -230,18 +230,17 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) if (buf) { ret = write_img_buf(img, buf, tse.outq_len); if (ret < 0) - goto err_iw; + goto err_close; xfree(buf); } pr_info("Done\n"); -err_iw: + exit_code = 0; +err_close: 
close_image(img); -err_img: -err_opt: -err_r: - return ret; +err: + return exit_code; } int dump_one_tcp(int fd, struct inet_sk_desc *sk, SkOptsEntry *soe) From 13854a988ca6ab3263a377f02b174944cd7e7a0e Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 9 May 2024 16:16:50 -0700 Subject: [PATCH 421/775] criu: fix a fatal failure if nft doesn't work On some systems, nft binary might not be installed, or some kernel options might be unconfigured, resulting in something like this: sudo unshare -n nft create table inet CRIU Error: Could not process rule: Operation not supported create table inet CRIU ^^^^^^^^^^^^^^^^^^^^^^^ This is similar to what kerndat_has_nftables_concat() does, and if the outcome is the same, it returns an error to kerndat_init(), and an error from kerndat_init() is considered fatal. Let's relax the check, returning mere "feature not working" instead of a fatal error. Signed-off-by: Kir Kolyshkin --- criu/kerndat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index 6f4fea46b..f899ef642 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1602,7 +1602,9 @@ static int __has_nftables_concat(void *arg) return 1; if (NFT_RUN_CMD(nft, "create table inet CRIU")) { - pr_err("Can't create nftables table\n"); + pr_warn("Can't create nftables table\n"); + *has = false; /* kdat.has_nftables_concat = false */ + ret = 0; goto nft_ctx_free_out; } From 1cb75c0b1ead6f9ec415ea3c0372e77b4fa6c7c6 Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Sun, 19 May 2024 12:49:18 +0100 Subject: [PATCH 422/775] sk-tcp: Move TCP socket options from TcpStreamEntry to TcpOptsEntry Currently some of the TCP socket option information is stored in the TcpStreamEntry, but the information in the TcpStreamEntry is only restored after the TCP socket has established connection, which results in these TCP socket options not being restored for unconnected TCP sockets. 
In this commit move the TCP socket options from TcpStreamEntry to TcpOptsEntry and add dump_tcp_opts() and restore_tcp_opts() for TCP socket options dump and restore. Signed-off-by: Juntong Deng --- criu/include/sk-inet.h | 3 +++ criu/sk-inet.c | 18 +++++++++++++- criu/sk-tcp.c | 55 +++++++++++++++++++++++++---------------- images/sk-inet.proto | 2 ++ images/tcp-stream.proto | 6 +++++ 5 files changed, 62 insertions(+), 22 deletions(-) diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h index b3a70fb27..69ee8589e 100644 --- a/criu/include/sk-inet.h +++ b/criu/include/sk-inet.h @@ -87,6 +87,9 @@ extern void cpt_unlock_tcp_connections(void); extern int dump_one_tcp(int sk, struct inet_sk_desc *sd, SkOptsEntry *soe); extern int restore_one_tcp(int sk, struct inet_sk_info *si); +extern int dump_tcp_opts(int sk, TcpOptsEntry *toe); +extern int restore_tcp_opts(int sk, TcpOptsEntry *toe); + #define SK_EST_PARAM "tcp-established" #define SK_INFLIGHT_PARAM "skip-in-flight" #define SK_CLOSE_PARAM "tcp-close" diff --git a/criu/sk-inet.c b/criu/sk-inet.c index a6a767c73..92f53e569 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -454,6 +454,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa IpOptsEntry ipopts = IP_OPTS_ENTRY__INIT; IpOptsRawEntry ipopts_raw = IP_OPTS_RAW_ENTRY__INIT; SkOptsEntry skopts = SK_OPTS_ENTRY__INIT; + TcpOptsEntry tcpopts = TCP_OPTS_ENTRY__INIT; int ret = -1, err = -1, proto, aux, type; ret = do_dump_opt(lfd, SOL_SOCKET, SO_PROTOCOL, &proto, sizeof(proto)); @@ -521,6 +522,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa ie.opts = &skopts; ie.ip_opts = &ipopts; ie.ip_opts->raw = &ipopts_raw; + ie.tcp_opts = &tcpopts; ie.n_src_addr = PB_ALEN_INET; ie.n_dst_addr = PB_ALEN_INET; @@ -581,9 +583,20 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa switch (proto) { case IPPROTO_TCP: - err = (type != SOCK_RAW) ? 
dump_one_tcp(lfd, sk, &skopts) : 0; if (sk->shutdown) sk_encode_shutdown(&ie, sk->shutdown); + + if (type == SOCK_RAW) { + err = 0; + } else { + err = dump_tcp_opts(lfd, &tcpopts); + if (err < 0) + goto err; + + err = dump_one_tcp(lfd, sk, &skopts); + if (err < 0) + goto err; + } break; case IPPROTO_UDP: case IPPROTO_UDPLITE: @@ -939,6 +952,9 @@ done: if (restore_socket_opts(sk, ie->opts)) goto err; + if (ie->proto == IPPROTO_TCP && restore_tcp_opts(sk, ie->tcp_opts)) + goto err; + if (ie->has_shutdown && (ie->proto == IPPROTO_UDP || ie->proto == IPPROTO_UDPLITE || ie->proto == IPPROTO_TCP)) { if (shutdown(sk, sk_decode_shutdown(ie->shutdown))) { diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c index b8d9ba46e..f80a4cb9c 100644 --- a/criu/sk-tcp.c +++ b/criu/sk-tcp.c @@ -136,7 +136,7 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) { struct libsoccr_sk *socr = sk->priv; int exit_code = -1; - int ret, aux; + int ret; struct cr_img *img; TcpStreamEntry tse = TCP_STREAM_ENTRY__INIT; char *buf; @@ -186,26 +186,6 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) tse.rcv_wup = data.rcv_wup; } - /* - * TCP socket options - */ - - if (dump_opt(sk->rfd, SOL_TCP, TCP_NODELAY, &aux)) - goto err; - - if (aux) { - tse.has_nodelay = true; - tse.nodelay = true; - } - - if (dump_opt(sk->rfd, SOL_TCP, TCP_CORK, &aux)) - goto err; - - if (aux) { - tse.has_cork = true; - tse.cork = true; - } - /* * Push the stuff to image */ @@ -243,6 +223,19 @@ err: return exit_code; } +int dump_tcp_opts(int fd, TcpOptsEntry *toe) +{ + int ret = 0; + + ret |= dump_opt(fd, SOL_TCP, TCP_NODELAY, &toe->nodelay); + ret |= dump_opt(fd, SOL_TCP, TCP_CORK, &toe->cork); + + toe->has_nodelay = !!toe->nodelay; + toe->has_cork = !!toe->cork; + + return ret; +} + int dump_one_tcp(int fd, struct inet_sk_desc *sk, SkOptsEntry *soe) { soe->has_tcp_keepcnt = true; @@ -396,6 +389,11 @@ static int restore_tcp_conn_state(int sk, struct libsoccr_sk *socr, struct inet_ if (libsoccr_restore(socr, &data, 
sizeof(data))) goto err_c; + /* + * Restoring TCP socket options in TcpStreamEntry is + * for backward compatibility only, newer versions + * of CRIU use TcpOptsEntry. + */ if (tse->has_nodelay && tse->nodelay) { aux = 1; if (restore_opt(sk, SOL_TCP, TCP_NODELAY, &aux)) @@ -448,6 +446,21 @@ int prepare_tcp_socks(struct task_restore_args *ta) return 0; } +int restore_tcp_opts(int sk, TcpOptsEntry *toe) +{ + int ret = 0; + + if(!toe) + return ret; + + if (toe->has_nodelay) + ret |= restore_opt(sk, SOL_TCP, TCP_NODELAY, &toe->nodelay); + if (toe->has_cork) + ret |= restore_opt(sk, SOL_TCP, TCP_CORK, &toe->cork); + + return ret; +} + int restore_one_tcp(int fd, struct inet_sk_info *ii) { struct libsoccr_sk *sk; diff --git a/images/sk-inet.proto b/images/sk-inet.proto index 03a679e7f..2c709e018 100644 --- a/images/sk-inet.proto +++ b/images/sk-inet.proto @@ -5,6 +5,7 @@ syntax = "proto2"; import "opts.proto"; import "fown.proto"; import "sk-opts.proto"; +import "tcp-stream.proto"; message ip_opts_raw_entry { optional bool hdrincl = 1; @@ -56,4 +57,5 @@ message inet_sk_entry { optional string ifname = 17; optional uint32 ns_id = 18; optional sk_shutdown shutdown = 19; + optional tcp_opts_entry tcp_opts = 20; } diff --git a/images/tcp-stream.proto b/images/tcp-stream.proto index c2244ba3b..4f85282e2 100644 --- a/images/tcp-stream.proto +++ b/images/tcp-stream.proto @@ -4,6 +4,11 @@ syntax = "proto2"; import "opts.proto"; +message tcp_opts_entry { + optional bool cork = 1; + optional bool nodelay = 2; +} + message tcp_stream_entry { required uint32 inq_len = 1; required uint32 inq_seq = 2; @@ -16,6 +21,7 @@ message tcp_stream_entry { optional uint32 rcv_wscale = 8; optional uint32 timestamp = 9; + /* These two are deprecated, use tcp_opts_entry instead */ optional bool cork = 10; optional bool nodelay = 11; From 9ba9aff77f07777a7525da4206b63a72649d3268 Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Sun, 19 May 2024 12:55:02 +0100 Subject: [PATCH 423/775] sk-tcp: Move 
TCP socket options from SkOptsEntry to TcpOptsEntry Currently some TCP socket option information is stored in SkOptsEntry, which is a little confusing. SkOptsEntry should only contain socket options that are common to all sockets. In this commit move the TCP-specific socket options from SkOptsEntry to TcpOptsEntry. Signed-off-by: Juntong Deng --- criu/sk-tcp.c | 30 ++++++++++++------------------ criu/sockets.c | 6 ++++++ images/sk-opts.proto | 3 +++ images/tcp-stream.proto | 3 +++ 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c index f80a4cb9c..9c8bad1c3 100644 --- a/criu/sk-tcp.c +++ b/criu/sk-tcp.c @@ -229,33 +229,21 @@ int dump_tcp_opts(int fd, TcpOptsEntry *toe) ret |= dump_opt(fd, SOL_TCP, TCP_NODELAY, &toe->nodelay); ret |= dump_opt(fd, SOL_TCP, TCP_CORK, &toe->cork); + ret |= dump_opt(fd, SOL_TCP, TCP_KEEPCNT, &toe->keepcnt); + ret |= dump_opt(fd, SOL_TCP, TCP_KEEPIDLE, &toe->keepidle); + ret |= dump_opt(fd, SOL_TCP, TCP_KEEPINTVL, &toe->keepintvl); toe->has_nodelay = !!toe->nodelay; toe->has_cork = !!toe->cork; + toe->has_keepcnt = !!toe->keepcnt; + toe->has_keepidle = !!toe->keepidle; + toe->has_keepintvl = !!toe->keepintvl; return ret; } int dump_one_tcp(int fd, struct inet_sk_desc *sk, SkOptsEntry *soe) { - soe->has_tcp_keepcnt = true; - if (dump_opt(fd, SOL_TCP, TCP_KEEPCNT, &soe->tcp_keepcnt)) { - pr_perror("Can't read TCP_KEEPCNT"); - return -1; - } - - soe->has_tcp_keepidle = true; - if (dump_opt(fd, SOL_TCP, TCP_KEEPIDLE, &soe->tcp_keepidle)) { - pr_perror("Can't read TCP_KEEPIDLE"); - return -1; - } - - soe->has_tcp_keepintvl = true; - if (dump_opt(fd, SOL_TCP, TCP_KEEPINTVL, &soe->tcp_keepintvl)) { - pr_perror("Can't read TCP_KEEPINTVL"); - return -1; - } - if (sk->dst_port == 0) return 0; @@ -457,6 +445,12 @@ int restore_tcp_opts(int sk, TcpOptsEntry *toe) ret |= restore_opt(sk, SOL_TCP, TCP_NODELAY, &toe->nodelay); if (toe->has_cork) ret |= restore_opt(sk, SOL_TCP, TCP_CORK, &toe->cork); + if 
(toe->has_keepcnt) + ret |= restore_opt(sk, SOL_TCP, TCP_KEEPCNT, &toe->keepcnt); + if (toe->has_keepidle) + ret |= restore_opt(sk, SOL_TCP, TCP_KEEPIDLE, &toe->keepidle); + if (toe->has_keepintvl) + ret |= restore_opt(sk, SOL_TCP, TCP_KEEPINTVL, &toe->keepintvl); return ret; } diff --git a/criu/sockets.c b/criu/sockets.c index 560c76517..f9ce999be 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -585,6 +585,12 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) pr_debug("\tset keepalive for socket\n"); ret |= restore_opt(sk, SOL_SOCKET, SO_KEEPALIVE, &val); } + + /* + * Restoring TCP socket options in SkOptsEntry is + * for backward compatibility only, newer versions + * of CRIU use TcpOptsEntry. + */ if (soe->has_tcp_keepcnt) { pr_debug("\tset keepcnt for socket\n"); ret |= restore_opt(sk, SOL_TCP, TCP_KEEPCNT, &soe->tcp_keepcnt); diff --git a/images/sk-opts.proto b/images/sk-opts.proto index 1d24d47cc..2f9d4e5c3 100644 --- a/images/sk-opts.proto +++ b/images/sk-opts.proto @@ -26,9 +26,12 @@ message sk_opts_entry { optional bool so_reuseport = 17; optional bool so_broadcast = 18; optional bool so_keepalive = 19; + + /* These three are deprecated, use tcp_opts_entry instead */ optional uint32 tcp_keepcnt = 20; optional uint32 tcp_keepidle = 21; optional uint32 tcp_keepintvl = 22; + optional uint32 so_oobinline = 23; optional uint32 so_linger = 24; diff --git a/images/tcp-stream.proto b/images/tcp-stream.proto index 4f85282e2..3d834159f 100644 --- a/images/tcp-stream.proto +++ b/images/tcp-stream.proto @@ -7,6 +7,9 @@ import "opts.proto"; message tcp_opts_entry { optional bool cork = 1; optional bool nodelay = 2; + optional uint32 keepcnt = 3; + optional uint32 keepidle = 4; + optional uint32 keepintvl = 5; } message tcp_stream_entry { From 708f872a6d8f9ec2ac5a94596983ce273482dc01 Mon Sep 17 00:00:00 2001 From: Juntong Deng Date: Sun, 19 May 2024 12:57:17 +0100 Subject: [PATCH 424/775] sk-tcp: Add test cases for TCP_CORK and TCP_NODELAY socket options Currently 
there are no socket option test cases for TCP_CORK and TCP_NODELAY, this commit adds related test cases. The socket option test cases for TCP_KEEPCNT, TCP_KEEPIDLE, and TCP_KEEPINTVL already exist in socket-tcp_keepalive.c, so they are not included in this test case. Signed-off-by: Juntong Deng --- test/zdtm/static/Makefile | 3 + test/zdtm/static/sock_tcp_opts00.c | 96 +++++++++++++++++++++++++++ test/zdtm/static/sock_tcp_opts00.desc | 1 + test/zdtm/static/sock_tcp_opts01.c | 1 + test/zdtm/static/sock_tcp_opts01.desc | 1 + 5 files changed, 102 insertions(+) create mode 100644 test/zdtm/static/sock_tcp_opts00.c create mode 100644 test/zdtm/static/sock_tcp_opts00.desc create mode 120000 test/zdtm/static/sock_tcp_opts01.c create mode 120000 test/zdtm/static/sock_tcp_opts01.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 548cefac2..1e891f0ba 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -127,6 +127,8 @@ TST_NOFILE := \ sock_opts02 \ sock_ip_opts00 \ sock_ip_opts01 \ + sock_tcp_opts00 \ + sock_tcp_opts01 \ sk-unix-unconn \ sk-unix-unconn-seqpacket \ ipc_namespace \ @@ -609,6 +611,7 @@ socket-tcp6-closed: CFLAGS += -D ZDTM_IPV4V6 socket-tcp-closed-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK socket-tcp-skip-in-flight: CFLAGS += -D ZDTM_IPV4V6 sock_ip_opts01: CFLAGS += -DZDTM_VAL_ZERO +sock_tcp_opts01: CFLAGS += -DZDTM_VAL_ZERO tun_ns: CFLAGS += -DTUN_NS mnt_ext_manual: CFLAGS += -D ZDTM_EXTMAP_MANUAL mntns_pivot_root_ro: CFLAGS += -DMNTNS_PIVOT_ROOT_RO diff --git a/test/zdtm/static/sock_tcp_opts00.c b/test/zdtm/static/sock_tcp_opts00.c new file mode 100644 index 000000000..8061bc9ea --- /dev/null +++ b/test/zdtm/static/sock_tcp_opts00.c @@ -0,0 +1,96 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that different tcp socket options are restored"; +const char *test_author = "Juntong Deng "; + +#ifdef ZDTM_VAL_ZERO +#define TCP_OPT_VAL 0 +#else +#define TCP_OPT_VAL 1 
+#endif + +#ifndef SOL_TCP +#define SOL_TCP 6 +#endif + +struct sk_opt { + int level; + int opt; + int val; +}; + +struct sk_opt tcp_sk_opts[] = { + { SOL_TCP, TCP_CORK, TCP_OPT_VAL }, + { SOL_TCP, TCP_NODELAY, TCP_OPT_VAL }, +}; + +struct sk_conf { + int domain; + int type; + int protocol; + int sk; +} sk_confs[] = { + { AF_INET, SOCK_STREAM, IPPROTO_TCP }, + { AF_INET6, SOCK_STREAM, IPPROTO_TCP }, +}; + +int main(int argc, char **argv) +{ + struct sk_opt *opts = tcp_sk_opts; + int n_opts = ARRAY_SIZE(tcp_sk_opts); + int exit_code = 1; + int i, j, val; + socklen_t len; + + test_init(argc, argv); + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + sk_confs[i].sk = socket(sk_confs[i].domain, sk_confs[i].type, sk_confs[i].protocol); + if (sk_confs[i].sk == -1) { + pr_perror("socket(%d,%d,%d) failed", sk_confs[i].domain, sk_confs[i].type, + sk_confs[i].protocol); + goto close; + } + } + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + for (j = 0; j < n_opts; j++) { + val = opts[j].val; + if (setsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, sizeof(int)) == -1) { + pr_perror("setsockopt(%d, %d) failed", opts[j].level, opts[j].opt); + goto close; + } + } + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + for (j = 0; j < n_opts; j++) { + len = sizeof(int); + if (getsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, &len) == -1) { + pr_perror("getsockopt(%d, %d) failed", opts[j].level, opts[j].opt); + goto close; + } + + if (val != opts[j].val) { + fail("Unexpected value socket(%d,%d,%d) opts(%d,%d)", sk_confs[i].domain, + sk_confs[i].type, sk_confs[i].protocol, opts[j].level, opts[j].opt); + goto close; + } + } + } + + pass(); + exit_code = 0; +close: + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) + close(sk_confs[i].sk); + return exit_code; +} diff --git a/test/zdtm/static/sock_tcp_opts00.desc b/test/zdtm/static/sock_tcp_opts00.desc new file mode 100644 index 000000000..2eac7e654 --- /dev/null +++ 
b/test/zdtm/static/sock_tcp_opts00.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/test/zdtm/static/sock_tcp_opts01.c b/test/zdtm/static/sock_tcp_opts01.c new file mode 120000 index 000000000..5219c2e98 --- /dev/null +++ b/test/zdtm/static/sock_tcp_opts01.c @@ -0,0 +1 @@ +./sock_tcp_opts00.c \ No newline at end of file diff --git a/test/zdtm/static/sock_tcp_opts01.desc b/test/zdtm/static/sock_tcp_opts01.desc new file mode 120000 index 000000000..fb1dfdcd1 --- /dev/null +++ b/test/zdtm/static/sock_tcp_opts01.desc @@ -0,0 +1 @@ +./sock_tcp_opts00.desc \ No newline at end of file From 30aa8dbe4d49624aeea9f906fccae002aab158f2 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 23 May 2024 17:17:03 +0100 Subject: [PATCH 425/775] mount: fix unbounded write Replace sprintf() with snprintf() and specify maximum length of characters to avoid potential overflow. Reported-by: GitHub CodeQL (https://codeql.github.com/) Signed-off-by: Radostin Stoyanov --- criu/mount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/mount.c b/criu/mount.c index afbd24281..82bbd52d6 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -98,7 +98,7 @@ static char *ext_mount_lookup(char *key) int len = strlen(key); char mkey[len + 6]; - sprintf(mkey, "mnt[%s]", key); + snprintf(mkey, sizeof(mkey), "mnt[%s]", key); v = external_lookup_by_key(mkey); if (IS_ERR(v)) v = NULL; From b3c3422cd99840b95f4c488472d6fc8ad3c96bf6 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 May 2024 19:13:01 +0100 Subject: [PATCH 426/775] test/make: remove unused target A fault-injection test was introduced in commit [1] and later removed in commit [2]. This patch removes the obsolete Makefile target. 
[1] b95407e264fcf58f4f73f78abef6dac60436e7dd test: check, that parasite can rollback itself (v2) [2] 2cb4532e266d0c9f8e87839d5b5eb728a3e4d10d tests: remove zdtm.sh (v2) Signed-off-by: Radostin Stoyanov --- test/Makefile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/Makefile b/test/Makefile index 5784b6a49..0bfdab680 100644 --- a/test/Makefile +++ b/test/Makefile @@ -45,10 +45,6 @@ zdtm-freezer: ./zdtm.py run --test zdtm/transition/thread-bomb --pre 3 --freezecg zdtm:f .PHONY: zdtm-freezer -fault-injection: - $(MAKE) -C fault-injection -.PHONY: fault-injection - override CFLAGS += -D_GNU_SOURCE clean_root: From 9c8a6927aa2bc96e0a33149d84a9244cd348f32a Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 21 May 2024 09:25:30 +0100 Subject: [PATCH 427/775] ci: update check for SELinux The rawhide tests runs in a container. Containers always have SELinux disabled from the inside. Somehow /sys/fs/selinux is now mounted. We used the existence of that directory if SELinux is available. This seems to be no longer true. Signed-off-by: Adrian Reber Signed-off-by: Radostin Stoyanov --- scripts/ci/run-ci-tests.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index c50dc4174..8ee734fbc 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -306,14 +306,19 @@ if [ "$skip" == 0 ]; then if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then # Note: selinux in Enforcing mode prevents us from calling clone3() or writing to ns_last_pid on restore; hence set to Permissive for the test and then set back. selinuxmode=$(getenforce) - setenforce Permissive + if [ "$selinuxmode" != "Disabled" ]; then + setenforce Permissive + fi + fi # Run it as non-root in a user namespace. Since CAP_CHECKPOINT_RESTORE behaves differently in non-user namespaces (e.g. no access to map_files) this tests that we can dump and restore # under those conditions. Note that the "... 
&& true" part is necessary; we need at least one statement after the tests so that bash can reap zombies in the user namespace, # otherwise it will exec the last statement and get replaced and nobody will be left to reap our zombies. sudo --user=#65534 --group=#65534 unshare -Ucfpm --mount-proc -- bash -c "./test/zdtm.py run -t zdtm/static/maps00 -f h --rootless && true" if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then - setenforce "$selinuxmode" + if [ "$selinuxmode" != "Disabled" ]; then + setenforce "$selinuxmode" + fi fi setcap -r criu/criu else From 95f66d13db7119625da9ccb4eee9db5fa6aaf3e4 Mon Sep 17 00:00:00 2001 From: Arnav Bhatt Date: Sun, 10 Mar 2024 13:13:12 +0530 Subject: [PATCH 428/775] criu: move sigact dump/restore code into sigact.c Seperate sigact dump/restore code from cr-restore.c and parasite-syscall.c into sigact.c Signed-off-by: Arnav Bhatt --- criu/Makefile.crtools | 1 + criu/cr-dump.c | 1 + criu/cr-restore.c | 263 +------------------------- criu/include/parasite-syscall.h | 2 - criu/include/sigact.h | 14 ++ criu/parasite-syscall.c | 51 ----- criu/sigact.c | 319 ++++++++++++++++++++++++++++++++ 7 files changed, 336 insertions(+), 315 deletions(-) create mode 100644 criu/include/sigact.h create mode 100644 criu/sigact.c diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index bf17f1ec9..3ddf45cd7 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -93,6 +93,7 @@ obj-y += pie-util-vdso.o obj-y += vdso.o obj-y += timens.o obj-y += timer.o +obj-y += sigact.o obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 diff --git a/criu/cr-dump.c b/criu/cr-dump.c index a29ec82ef..199ff2e32 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -87,6 +87,7 @@ #include "apparmor.h" #include "asm/dump.h" #include "timer.h" +#include "sigact.h" /* * Architectures can overwrite this function to restore register sets that diff --git 
a/criu/cr-restore.c b/criu/cr-restore.c index c19a20b46..deecb1294 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -99,6 +99,7 @@ #include "cr-errno.h" #include "timer.h" +#include "sigact.h" #ifndef arch_export_restore_thread #define arch_export_restore_thread __export_restore_thread @@ -407,268 +408,6 @@ static int populate_pid_proc(void) return 0; } -static rt_sigaction_t sigchld_act; -/* - * If parent's sigaction has blocked SIGKILL (which is non-sense), - * this parent action is non-valid and shouldn't be inherited. - * Used to mark parent_act* no more valid. - */ -static rt_sigaction_t parent_act[SIGMAX]; -#ifdef CONFIG_COMPAT -static rt_sigaction_t_compat parent_act_compat[SIGMAX]; -#endif - -static bool sa_inherited(int sig, rt_sigaction_t *sa) -{ - rt_sigaction_t *pa; - int i; - - if (current == root_item) - return false; /* XXX -- inherit from CRIU? */ - - pa = &parent_act[sig]; - - /* Omitting non-valid sigaction */ - if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) - return false; - - for (i = 0; i < _KNSIG_WORDS; i++) - if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) - return false; - - return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && - pa->rt_sa_restorer == sa->rt_sa_restorer; -} - -static int restore_native_sigaction(int sig, SaEntry *e) -{ - rt_sigaction_t act; - int ret; - - ASSIGN_TYPED(act.rt_sa_handler, decode_pointer(e->sigaction)); - ASSIGN_TYPED(act.rt_sa_flags, e->flags); - ASSIGN_TYPED(act.rt_sa_restorer, decode_pointer(e->restorer)); -#ifdef CONFIG_MIPS - e->has_mask_extended = 1; - BUILD_BUG_ON(sizeof(e->mask) * 2 != sizeof(act.rt_sa_mask.sig)); - - memcpy(&(act.rt_sa_mask.sig[0]), &e->mask, sizeof(act.rt_sa_mask.sig[0])); - memcpy(&(act.rt_sa_mask.sig[1]), &e->mask_extended, sizeof(act.rt_sa_mask.sig[1])); -#else - BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); - memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); -#endif - if (sig == SIGCHLD) { - sigchld_act = 
act; - return 0; - } - - if (sa_inherited(sig - 1, &act)) - return 1; - - /* - * A pure syscall is used, because glibc - * sigaction overwrites se_restorer. - */ - ret = syscall(SYS_rt_sigaction, sig, &act, NULL, sizeof(k_rtsigset_t)); - if (ret < 0) { - pr_perror("Can't restore sigaction"); - return ret; - } - - parent_act[sig - 1] = act; - /* Mark SIGKILL blocked which makes compat sigaction non-valid */ -#ifdef CONFIG_COMPAT - parent_act_compat[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; -#endif - - return 1; -} - -static void *stack32; - -#ifdef CONFIG_COMPAT -static bool sa_compat_inherited(int sig, rt_sigaction_t_compat *sa) -{ - rt_sigaction_t_compat *pa; - int i; - - if (current == root_item) - return false; - - pa = &parent_act_compat[sig]; - - /* Omitting non-valid sigaction */ - if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) - return false; - - for (i = 0; i < _KNSIG_WORDS; i++) - if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) - return false; - - return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && - pa->rt_sa_restorer == sa->rt_sa_restorer; -} - -static int restore_compat_sigaction(int sig, SaEntry *e) -{ - rt_sigaction_t_compat act; - int ret; - - ASSIGN_TYPED(act.rt_sa_handler, (u32)e->sigaction); - ASSIGN_TYPED(act.rt_sa_flags, e->flags); - ASSIGN_TYPED(act.rt_sa_restorer, (u32)e->restorer); - BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); - memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); - - if (sig == SIGCHLD) { - memcpy(&sigchld_act, &act, sizeof(rt_sigaction_t_compat)); - return 0; - } - - if (sa_compat_inherited(sig - 1, &act)) - return 1; - - if (!stack32) { - stack32 = alloc_compat_syscall_stack(); - if (!stack32) - return -1; - } - - ret = arch_compat_rt_sigaction(stack32, sig, &act); - if (ret < 0) { - pr_err("Can't restore compat sigaction: %d\n", ret); - return ret; - } - - parent_act_compat[sig - 1] = act; - /* Mark SIGKILL blocked which makes native sigaction non-valid 
*/ - parent_act[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; - - return 1; -} -#else -static int restore_compat_sigaction(int sig, SaEntry *e) -{ - return -1; -} -#endif - -static int prepare_sigactions_from_core(TaskCoreEntry *tc) -{ - int sig, i; - - if (tc->n_sigactions != SIGMAX - 2) { - pr_err("Bad number of sigactions in the image (%d, want %d)\n", (int)tc->n_sigactions, SIGMAX - 2); - return -1; - } - - pr_info("Restore on-core sigactions for %d\n", vpid(current)); - - for (sig = 1, i = 0; sig <= SIGMAX; sig++) { - int ret; - SaEntry *e; - bool sigaction_is_compat; - - if (sig == SIGKILL || sig == SIGSTOP) - continue; - - e = tc->sigactions[i++]; - sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; - if (sigaction_is_compat) - ret = restore_compat_sigaction(sig, e); - else - ret = restore_native_sigaction(sig, e); - - if (ret < 0) - return ret; - } - - return 0; -} - -/* Returns number of restored signals, -1 or negative errno on fail */ -static int restore_one_sigaction(int sig, struct cr_img *img, int pid) -{ - bool sigaction_is_compat; - SaEntry *e; - int ret = 0; - - BUG_ON(sig == SIGKILL || sig == SIGSTOP); - - ret = pb_read_one_eof(img, &e, PB_SIGACT); - if (ret == 0) { - if (sig != SIGMAX_OLD + 1) { /* backward compatibility */ - pr_err("Unexpected EOF %d\n", sig); - return -1; - } - pr_warn("This format of sigacts-%d.img is deprecated\n", pid); - return -1; - } - if (ret < 0) - return ret; - - sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; - if (sigaction_is_compat) - ret = restore_compat_sigaction(sig, e); - else - ret = restore_native_sigaction(sig, e); - - sa_entry__free_unpacked(e, NULL); - - return ret; -} - -static int prepare_sigactions_from_image(void) -{ - int pid = vpid(current); - struct cr_img *img; - int sig, rst = 0; - int ret = 0; - - pr_info("Restore sigacts for %d\n", pid); - - img = open_image(CR_FD_SIGACT, O_RSTR, pid); - if (!img) - return -1; - - for (sig = 1; sig <= SIGMAX; sig++) { - if 
(sig == SIGKILL || sig == SIGSTOP) - continue; - - ret = restore_one_sigaction(sig, img, pid); - if (ret < 0) - break; - if (ret) - rst++; - } - - pr_info("Restored %d/%d sigacts\n", rst, SIGMAX - 3 /* KILL, STOP and CHLD */); - - close_image(img); - return ret; -} - -static int prepare_sigactions(CoreEntry *core) -{ - int ret; - - if (!task_alive(current)) - return 0; - - if (core->tc->n_sigactions != 0) - ret = prepare_sigactions_from_core(core->tc); - else - ret = prepare_sigactions_from_image(); - - if (stack32) { - free_compat_syscall_stack(stack32); - stack32 = NULL; - } - - return ret; -} - static int __collect_child_pids(struct pstree_item *p, int state, unsigned int *n) { struct pstree_item *pi; diff --git a/criu/include/parasite-syscall.h b/criu/include/parasite-syscall.h index 70ecbb720..4a8ec2fee 100644 --- a/criu/include/parasite-syscall.h +++ b/criu/include/parasite-syscall.h @@ -21,8 +21,6 @@ struct rt_sigframe; struct parasite_ctl; struct parasite_thread_ctl; -extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *); - extern int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc); extern int parasite_dump_creds(struct parasite_ctl *ctl, CredsEntry *ce); extern int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core); diff --git a/criu/include/sigact.h b/criu/include/sigact.h new file mode 100644 index 000000000..4df011f96 --- /dev/null +++ b/criu/include/sigact.h @@ -0,0 +1,14 @@ +#ifndef __CR_SIGACT_H__ +#define __CR_SIGACT_H__ + +#include "images/core.pb-c.h" + +extern rt_sigaction_t sigchld_act; + +struct parasite_ctl; +struct pstree_item; + +extern int prepare_sigactions(CoreEntry *core); +extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *); + +#endif diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index 6d2aa9c88..a88f8a66f 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -9,7 
+9,6 @@ #include "common/compiler.h" #include "types.h" #include "protobuf.h" -#include "images/sa.pb-c.h" #include "images/timer.pb-c.h" #include "images/creds.pb-c.h" #include "images/core.pb-c.h" @@ -228,56 +227,6 @@ int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, struct parasit return dump_thread_core(pid, core, args); } -int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *item) -{ - TaskCoreEntry *tc = item->core[0]->tc; - struct parasite_dump_sa_args *args; - int ret, sig; - SaEntry *sa, **psa; - - args = compel_parasite_args(ctl, struct parasite_dump_sa_args); - - ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_SIGACTS, ctl); - if (ret < 0) - return ret; - - psa = xmalloc((SIGMAX - 2) * (sizeof(SaEntry *) + sizeof(SaEntry))); - if (!psa) - return -1; - - sa = (SaEntry *)(psa + SIGMAX - 2); - - tc->n_sigactions = SIGMAX - 2; - tc->sigactions = psa; - - for (sig = 1; sig <= SIGMAX; sig++) { - int i = sig - 1; - - if (sig == SIGSTOP || sig == SIGKILL) - continue; - - sa_entry__init(sa); - ASSIGN_TYPED(sa->sigaction, encode_pointer(args->sas[i].rt_sa_handler)); - ASSIGN_TYPED(sa->flags, args->sas[i].rt_sa_flags); - ASSIGN_TYPED(sa->restorer, encode_pointer(args->sas[i].rt_sa_restorer)); -#ifdef CONFIG_MIPS - sa->has_mask_extended = 1; - BUILD_BUG_ON(sizeof(sa->mask) * 2 != sizeof(args->sas[0].rt_sa_mask.sig)); - memcpy(&sa->mask, &(args->sas[i].rt_sa_mask.sig[0]), sizeof(sa->mask)); - memcpy(&sa->mask_extended, &(args->sas[i].rt_sa_mask.sig[1]), sizeof(sa->mask)); -#else - BUILD_BUG_ON(sizeof(sa->mask) != sizeof(args->sas[0].rt_sa_mask.sig)); - memcpy(&sa->mask, args->sas[i].rt_sa_mask.sig, sizeof(sa->mask)); -#endif - sa->has_compat_sigaction = true; - sa->compat_sigaction = !compel_mode_native(ctl); - - *(psa++) = sa++; - } - - return 0; -} - int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc) { struct parasite_dump_misc *ma; diff --git a/criu/sigact.c b/criu/sigact.c new file mode 
100644 index 000000000..5174644d2 --- /dev/null +++ b/criu/sigact.c @@ -0,0 +1,319 @@ +#include "types.h" +#include "infect.h" +#include "protobuf.h" +#include "pstree.h" +#include "parasite.h" +#include "restorer.h" +#include "sigact.h" + +/* + * If parent's sigaction has blocked SIGKILL (which is non-sense), + * this parent action is non-valid and shouldn't be inherited. + * Used to mark parent_act* no more valid. + */ +static rt_sigaction_t parent_act[SIGMAX]; +#ifdef CONFIG_COMPAT +static rt_sigaction_t_compat parent_act_compat[SIGMAX]; +#endif + +static bool sa_inherited(int sig, rt_sigaction_t *sa) +{ + rt_sigaction_t *pa; + int i; + + if (current == root_item) + return false; /* XXX -- inherit from CRIU? */ + + pa = &parent_act[sig]; + + /* Omitting non-valid sigaction */ + if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) + return false; + + for (i = 0; i < _KNSIG_WORDS; i++) + if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) + return false; + + return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && + pa->rt_sa_restorer == sa->rt_sa_restorer; +} + +static void *stack32; +rt_sigaction_t sigchld_act; + +#ifdef CONFIG_COMPAT +static bool sa_compat_inherited(int sig, rt_sigaction_t_compat *sa) +{ + rt_sigaction_t_compat *pa; + int i; + + if (current == root_item) + return false; + + pa = &parent_act_compat[sig]; + + /* Omitting non-valid sigaction */ + if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) + return false; + + for (i = 0; i < _KNSIG_WORDS; i++) + if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) + return false; + + return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && + pa->rt_sa_restorer == sa->rt_sa_restorer; +} + +static int restore_compat_sigaction(int sig, SaEntry *e) +{ + rt_sigaction_t_compat act; + int ret; + + ASSIGN_TYPED(act.rt_sa_handler, (u32)e->sigaction); + ASSIGN_TYPED(act.rt_sa_flags, e->flags); + ASSIGN_TYPED(act.rt_sa_restorer, (u32)e->restorer); + 
BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); + memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); + + if (sig == SIGCHLD) { + memcpy(&sigchld_act, &act, sizeof(rt_sigaction_t_compat)); + return 0; + } + + if (sa_compat_inherited(sig - 1, &act)) + return 1; + + if (!stack32) { + stack32 = alloc_compat_syscall_stack(); + if (!stack32) + return -1; + } + + ret = arch_compat_rt_sigaction(stack32, sig, &act); + if (ret < 0) { + pr_err("Can't restore compat sigaction: %d\n", ret); + return ret; + } + + parent_act_compat[sig - 1] = act; + /* Mark SIGKILL blocked which makes native sigaction non-valid */ + parent_act[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; + + return 1; +} +#else +static int restore_compat_sigaction(int sig, SaEntry *e) +{ + return -1; +} +#endif + +static int restore_native_sigaction(int sig, SaEntry *e) +{ + rt_sigaction_t act; + int ret; + + ASSIGN_TYPED(act.rt_sa_handler, decode_pointer(e->sigaction)); + ASSIGN_TYPED(act.rt_sa_flags, e->flags); + ASSIGN_TYPED(act.rt_sa_restorer, decode_pointer(e->restorer)); +#ifdef CONFIG_MIPS + e->has_mask_extended = 1; + BUILD_BUG_ON(sizeof(e->mask) * 2 != sizeof(act.rt_sa_mask.sig)); + + memcpy(&(act.rt_sa_mask.sig[0]), &e->mask, sizeof(act.rt_sa_mask.sig[0])); + memcpy(&(act.rt_sa_mask.sig[1]), &e->mask_extended, sizeof(act.rt_sa_mask.sig[1])); +#else + BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); + memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); +#endif + if (sig == SIGCHLD) { + sigchld_act = act; + return 0; + } + + if (sa_inherited(sig - 1, &act)) + return 1; + + /* + * A pure syscall is used, because glibc + * sigaction overwrites se_restorer. 
+ */ + ret = syscall(SYS_rt_sigaction, sig, &act, NULL, sizeof(k_rtsigset_t)); + if (ret < 0) { + pr_perror("Can't restore sigaction"); + return ret; + } + + parent_act[sig - 1] = act; + /* Mark SIGKILL blocked which makes compat sigaction non-valid */ +#ifdef CONFIG_COMPAT + parent_act_compat[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; +#endif + + return 1; +} + +static int prepare_sigactions_from_core(TaskCoreEntry *tc) +{ + int sig, i; + + if (tc->n_sigactions != SIGMAX - 2) { + pr_err("Bad number of sigactions in the image (%d, want %d)\n", (int)tc->n_sigactions, SIGMAX - 2); + return -1; + } + + pr_info("Restore on-core sigactions for %d\n", vpid(current)); + + for (sig = 1, i = 0; sig <= SIGMAX; sig++) { + int ret; + SaEntry *e; + bool sigaction_is_compat; + + if (sig == SIGKILL || sig == SIGSTOP) + continue; + + e = tc->sigactions[i++]; + sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; + if (sigaction_is_compat) + ret = restore_compat_sigaction(sig, e); + else + ret = restore_native_sigaction(sig, e); + + if (ret < 0) + return ret; + } + + return 0; +} + +/* Returns number of restored signals, -1 or negative errno on fail */ +static int restore_one_sigaction(int sig, struct cr_img *img, int pid) +{ + bool sigaction_is_compat; + SaEntry *e; + int ret = 0; + + BUG_ON(sig == SIGKILL || sig == SIGSTOP); + + ret = pb_read_one_eof(img, &e, PB_SIGACT); + if (ret == 0) { + if (sig != SIGMAX_OLD + 1) { /* backward compatibility */ + pr_err("Unexpected EOF %d\n", sig); + return -1; + } + pr_warn("This format of sigacts-%d.img is deprecated\n", pid); + return -1; + } + if (ret < 0) + return ret; + + sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; + if (sigaction_is_compat) + ret = restore_compat_sigaction(sig, e); + else + ret = restore_native_sigaction(sig, e); + + sa_entry__free_unpacked(e, NULL); + + return ret; +} + +static int prepare_sigactions_from_image(void) +{ + int pid = vpid(current); + struct cr_img *img; + int 
sig, rst = 0; + int ret = 0; + + pr_info("Restore sigacts for %d\n", pid); + + img = open_image(CR_FD_SIGACT, O_RSTR, pid); + if (!img) + return -1; + + for (sig = 1; sig <= SIGMAX; sig++) { + if (sig == SIGKILL || sig == SIGSTOP) + continue; + + ret = restore_one_sigaction(sig, img, pid); + if (ret < 0) + break; + if (ret) + rst++; + } + + pr_info("Restored %d/%d sigacts\n", rst, SIGMAX - 3 /* KILL, STOP and CHLD */); + + close_image(img); + return ret; +} + +int prepare_sigactions(CoreEntry *core) +{ + int ret; + + if (!task_alive(current)) + return 0; + + if (core->tc->n_sigactions != 0) + ret = prepare_sigactions_from_core(core->tc); + else + ret = prepare_sigactions_from_image(); + + if (stack32) { + free_compat_syscall_stack(stack32); + stack32 = NULL; + } + + return ret; +} + +int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *item) +{ + TaskCoreEntry *tc = item->core[0]->tc; + struct parasite_dump_sa_args *args; + int ret, sig; + SaEntry *sa, **psa; + + args = compel_parasite_args(ctl, struct parasite_dump_sa_args); + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_SIGACTS, ctl); + if (ret < 0) + return ret; + + psa = xmalloc((SIGMAX - 2) * (sizeof(SaEntry *) + sizeof(SaEntry))); + if (!psa) + return -1; + + sa = (SaEntry *)(psa + SIGMAX - 2); + + tc->n_sigactions = SIGMAX - 2; + tc->sigactions = psa; + + for (sig = 1; sig <= SIGMAX; sig++) { + int i = sig - 1; + + if (sig == SIGSTOP || sig == SIGKILL) + continue; + + sa_entry__init(sa); + ASSIGN_TYPED(sa->sigaction, encode_pointer(args->sas[i].rt_sa_handler)); + ASSIGN_TYPED(sa->flags, args->sas[i].rt_sa_flags); + ASSIGN_TYPED(sa->restorer, encode_pointer(args->sas[i].rt_sa_restorer)); +#ifdef CONFIG_MIPS + sa->has_mask_extended = 1; + BUILD_BUG_ON(sizeof(sa->mask) * 2 != sizeof(args->sas[0].rt_sa_mask.sig)); + memcpy(&sa->mask, &(args->sas[i].rt_sa_mask.sig[0]), sizeof(sa->mask)); + memcpy(&sa->mask_extended, &(args->sas[i].rt_sa_mask.sig[1]), sizeof(sa->mask)); +#else + 
BUILD_BUG_ON(sizeof(sa->mask) != sizeof(args->sas[0].rt_sa_mask.sig)); + memcpy(&sa->mask, args->sas[i].rt_sa_mask.sig, sizeof(sa->mask)); +#endif + sa->has_compat_sigaction = true; + sa->compat_sigaction = !compel_mode_native(ctl); + + *(psa++) = sa++; + } + + return 0; +} From 457bc6a8ff1008284662a8fa5b5bfeb33a10e39a Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 26 May 2024 14:44:14 +0200 Subject: [PATCH 429/775] criu: use proper format specifier to accommodate time_t 64-bit change See also: https://wiki.debian.org/ReleaseGoals/64bit-time Signed-off-by: Alexander Mikhalitsyn --- criu/autofs.c | 4 ++-- criu/timens.c | 8 ++++---- criu/timer.c | 5 +++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/criu/autofs.c b/criu/autofs.c index 6a7d8db0d..a1775cbc9 100644 --- a/criu/autofs.c +++ b/criu/autofs.c @@ -658,7 +658,7 @@ static int autofs_mnt_make_catatonic(const char *mnt_path, int mnt_fd) static int autofs_mnt_set_timeout(time_t timeout, const char *mnt_path, int mnt_fd) { - pr_info("%s: set timeout %ld for %s\n", __func__, timeout, mnt_path); + pr_info("%s: set timeout %" PRId64 " for %s\n", __func__, (int64_t)timeout, mnt_path); return autofs_ioctl(mnt_path, mnt_fd, AUTOFS_IOC_SETTIMEOUT, &timeout); } @@ -770,7 +770,7 @@ static int autofs_post_mount(const char *mnt_path, dev_t mnt_dev, time_t timeout } if (autofs_mnt_set_timeout(timeout, mnt_path, mnt_fd)) { - pr_err("Failed to set timeout %ld for %s\n", timeout, mnt_path); + pr_err("Failed to set timeout %" PRId64 " for %s\n", (int64_t)timeout, mnt_path); return -1; } diff --git a/criu/timens.c b/criu/timens.c index 66c0c02a4..257782e5a 100644 --- a/criu/timens.c +++ b/criu/timens.c @@ -96,8 +96,8 @@ int prepare_timens(int id) ts.tv_nsec = te->monotonic->tv_nsec - ts.tv_nsec; normalize_timespec(&ts); - pr_debug("timens: monotonic %ld %ld\n", ts.tv_sec, ts.tv_nsec); - if (dprintf(fd, "%d %ld %ld\n", CLOCK_MONOTONIC, ts.tv_sec, ts.tv_nsec) < 0) { + pr_debug("timens: monotonic %" 
PRId64 " %ld\n", (int64_t)ts.tv_sec, ts.tv_nsec); + if (dprintf(fd, "%d %" PRId64 " %ld\n", CLOCK_MONOTONIC, (int64_t)ts.tv_sec, ts.tv_nsec) < 0) { pr_perror("Unable to set a monotonic clock offset"); goto err; } @@ -111,8 +111,8 @@ int prepare_timens(int id) ts.tv_nsec = te->boottime->tv_nsec - ts.tv_nsec; normalize_timespec(&ts); - pr_debug("timens: boottime %ld %ld\n", ts.tv_sec, ts.tv_nsec); - if (dprintf(fd, "%d %ld %ld\n", CLOCK_BOOTTIME, ts.tv_sec, ts.tv_nsec) < 0) { + pr_debug("timens: boottime %" PRId64 " %ld\n", (int64_t)ts.tv_sec, ts.tv_nsec); + if (dprintf(fd, "%d %" PRId64 " %ld\n", CLOCK_BOOTTIME, (int64_t)ts.tv_sec, ts.tv_nsec) < 0) { pr_perror("Unable to set a boottime clock offset"); goto err; } diff --git a/criu/timer.c b/criu/timer.c index bdcb059cc..4b286635d 100644 --- a/criu/timer.c +++ b/criu/timer.c @@ -46,8 +46,9 @@ static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) return -1; } - pr_info("Restored %s timer to %ld.%ld -> %ld.%ld\n", n, val->it_value.tv_sec, val->it_value.tv_usec, - val->it_interval.tv_sec, val->it_interval.tv_usec); + pr_info("Restored %s timer to %" PRId64 ".%ld -> %" PRId64 ".%ld\n", n, + (int64_t)val->it_value.tv_sec, val->it_value.tv_usec, + (int64_t)val->it_interval.tv_sec, val->it_interval.tv_usec); return 0; } From cc88b1e1ffbadc325f0919b49ba27db048164512 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 25 May 2024 05:11:21 +0000 Subject: [PATCH 430/775] net: Fix TOCTOU race condition in unix_conf_op The unix_conf_op function reads the size of the sysctl entry array twice. gcc thinks that it can lead to a time-of-check to time-of-use (TOCTOU) race condition if the array size changes between the two reads. 
Fixes #2398 Signed-off-by: Andrei Vagin --- criu/net.c | 15 ++++++++------- scripts/build/Dockerfile.x86_64.hdr | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/criu/net.c b/criu/net.c index b5c4a6ee3..eee331108 100644 --- a/criu/net.c +++ b/criu/net.c @@ -359,22 +359,23 @@ static int ipv6_conf_op(char *tgt, SysctlEntry **conf, int n, int op, SysctlEntr return net_conf_op(tgt, conf, n, op, "ipv6", req, path, ARRAY_SIZE(devconfs6), devconfs6, def_conf); } -static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) +static int unix_conf_op(SysctlEntry ***rconf, size_t *pn, int op) { int i, ret = -1, flags = 0; char path[ARRAY_SIZE(unix_conf_entries)][MAX_CONF_UNIX_PATH] = {}; struct sysctl_req req[ARRAY_SIZE(unix_conf_entries)] = {}; SysctlEntry **conf = *rconf; + size_t n = *pn; - if (*n != ARRAY_SIZE(unix_conf_entries)) { - pr_err("unix: Unexpected entries in config (%zu %zu)\n", *n, ARRAY_SIZE(unix_conf_entries)); + if (n != ARRAY_SIZE(unix_conf_entries)) { + pr_err("unix: Unexpected entries in config (%zu %zu)\n", n, ARRAY_SIZE(unix_conf_entries)); return -EINVAL; } if (opts.weak_sysctls || op == CTL_READ) flags = CTL_FLAGS_OPTIONAL; - for (i = 0; i < *n; i++) { + for (i = 0; i < n; i++) { snprintf(path[i], MAX_CONF_UNIX_PATH, CONF_UNIX_FMT, unix_conf_entries[i]); req[i].name = path[i]; req[i].flags = flags; @@ -390,7 +391,7 @@ static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) } } - ret = sysctl_op(req, *n, op, CLONE_NEWNET); + ret = sysctl_op(req, n, op, CLONE_NEWNET); if (ret < 0) { pr_err("unix: Failed to %s %s/\n", (op == CTL_READ) ? 
"read" : "write", CONF_UNIX_BASE); return -1; @@ -399,7 +400,7 @@ static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) if (op == CTL_READ) { bool has_entries = false; - for (i = 0; i < *n; i++) { + for (i = 0; i < n; i++) { if (req[i].flags & CTL_FLAGS_HAS) { conf[i]->has_iarg = true; if (!has_entries) @@ -412,7 +413,7 @@ static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) * Unix conf is optional. */ if (!has_entries) { - *n = 0; + *pn = 0; *rconf = NULL; } } diff --git a/scripts/build/Dockerfile.x86_64.hdr b/scripts/build/Dockerfile.x86_64.hdr index 32fc2978a..566b4c916 100644 --- a/scripts/build/Dockerfile.x86_64.hdr +++ b/scripts/build/Dockerfile.x86_64.hdr @@ -1,4 +1,4 @@ -FROM ubuntu:focal +FROM ubuntu:24.04 COPY scripts/ci/apt-install /bin/apt-install From e7276cf63b5d531eaeefdeebd02d6da0898caafa Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 23 May 2024 14:21:17 +0100 Subject: [PATCH 431/775] pagemap-cache: handle short reads It is possible for pread() to return fewer number of bytes than requested. In such case, we need to repeat the read operation with appropriate offset. 
Signed-off-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- criu/pagemap-cache.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c index 978a6b1ac..f04a517de 100644 --- a/criu/pagemap-cache.c +++ b/criu/pagemap-cache.c @@ -165,7 +165,7 @@ static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) int pmc_fill(pmc_t *pmc, u64 start, u64 end) { - size_t size_map; + size_t size_map, off; pmc->start = start; pmc->end = end; @@ -204,10 +204,17 @@ int pmc_fill(pmc_t *pmc, u64 start, u64 end) pmc->regs_idx = 0; pmc->end = args.walk_end; } else { - if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) { - pmc_zap(pmc); - pr_perror("Can't read %d's pagemap file", pmc->pid); - return -1; + for (off = 0; off != size_map;) { + ssize_t ret; + char *ptr = (char *)pmc->map; + + ret = pread(pmc->fd, ptr + off, size_map - off, PAGEMAP_PFN_OFF(pmc->start) + off); + if (ret == -1) { + pmc_zap(pmc); + pr_perror("Can't read %d's pagemap file", pmc->pid); + return -1; + } + off += ret; } } From 1da29f27f6e0f2f91336f200ea3791e96edafcb3 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 23 May 2024 07:57:14 +0100 Subject: [PATCH 432/775] zdtm: add support for LD_PRELOAD tests This commit adds a `--preload-libfault` option to ZDTM's run command. This option runs CRIU with LD_PRELOAD to intercept libc functions such as pread(). This method allows to simulate special cases, for example, when a successful call to pread() transfers fewer bytes than requested. 
Signed-off-by: Radostin Stoyanov --- scripts/ci/run-ci-tests.sh | 3 +++ test/libfault/Makefile | 21 +++++++++++++++++++++ test/libfault/libfault.c | 31 +++++++++++++++++++++++++++++++ test/zdtm.py | 23 +++++++++++++++++++++-- test/zdtm/criu_config.py | 1 + 5 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 test/libfault/Makefile create mode 100644 test/libfault/libfault.c diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 8ee734fbc..ef2dffb1a 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -262,6 +262,9 @@ make -C test/others/rpc/ run ./test/zdtm.py run -t zdtm/static/env00 --sibling +./test/zdtm.py run -t zdtm/static/maps00 --preload-libfault +./test/zdtm.py run -t zdtm/static/maps02 --preload-libfault + ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --dedup ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --noauto-dedup ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server diff --git a/test/libfault/Makefile b/test/libfault/Makefile new file mode 100644 index 000000000..cbe47fdf2 --- /dev/null +++ b/test/libfault/Makefile @@ -0,0 +1,21 @@ +CC = gcc +CFLAGS = -c -fPIC -ldl + +SRC = libfault.c +OBJ = $(SRC:.c=.o) + +LIB = libfault.so + +.PHONY: all clean run + +all: $(LIB) + +$(LIB): $(OBJ) + $(CC) -shared -o $(LIB) $(OBJ) + +$(OBJ): $(SRC) + $(CC) $(CFLAGS) $< + +clean: + rm -f $(OBJ) $(LIB) + diff --git a/test/libfault/libfault.c b/test/libfault/libfault.c new file mode 100644 index 000000000..650bf08ca --- /dev/null +++ b/test/libfault/libfault.c @@ -0,0 +1,31 @@ +#define _GNU_SOURCE +#include <unistd.h> +#include <dlfcn.h> +#include <errno.h> + +ssize_t (*original_pread)(int fd, void *buf, size_t count, off_t offset) = NULL; + +/** + * This function is a wrapper around pread() that is used for testing CRIU's + * handling of cases where pread() returns less data than requested. + * + * pmc_fill() in criu/pagemap-cache.c is a good example of where this can happen. 
+ */ +ssize_t pread64(int fd, void *buf, size_t count, off_t offset) +{ + if (!original_pread) { + original_pread = dlsym(RTLD_NEXT, "pread"); + if (!original_pread) { + errno = EIO; + return -1; + } + } + + /* The following aims to simulate the case when pread() returns less + * data than requested. We need to ensure that CRIU handles such cases. */ + if (count > 2048) { + count -= 1024; + } + + return original_pread(fd, buf, count, offset); +} diff --git a/test/zdtm.py b/test/zdtm.py index 7a7cdfd3b..fbb3400c4 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -32,6 +32,14 @@ from zdtm.criu_config import criu_config # File to store content of streamed images STREAMED_IMG_FILE_NAME = "img.criu" +# A library used to preload C functions to simulate +# cases such as partial read with pread(). +LIBFAULT_PATH = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "libfault", + "libfault.so" +) + prev_line = None uuid = uuid.uuid4() @@ -628,6 +636,8 @@ class zdtm_test: ["make", "zdtm_ct"], env=dict(os.environ, MAKEFLAGS="")) if not os.access("zdtm/lib/libzdtmtst.a", os.F_OK): subprocess.check_call(["make", "-C", "zdtm/"]) + if 'preload_libfault' in opts and opts['preload_libfault']: + subprocess.check_call(["make", "-C", "libfault/"]) if 'rootless' in opts and opts['rootless']: return subprocess.check_call( @@ -880,6 +890,7 @@ class criu_cli: fault=None, strace=[], preexec=None, + preload_libfault=False, nowait=False, timeout=60): env = dict( @@ -890,6 +901,9 @@ class criu_cli: print("Forcing %s fault" % fault) env['CRIU_FAULT'] = fault + if preload_libfault: + env['LD_PRELOAD'] = LIBFAULT_PATH + cr = subprocess.Popen(strace + [criu_bin, action, "--no-default-config"] + args, env=env, @@ -980,6 +994,7 @@ class criu_rpc: fault=None, strace=[], preexec=None, + preload_libfault=False, nowait=False, timeout=None): if fault: @@ -1065,6 +1080,7 @@ class criu: self.__criu_bin = opts['criu_bin'] self.__crit_bin = opts['crit_bin'] self.__pre_dump_mode = 
opts['pre_dump_mode'] + self.__preload_libfault = bool(opts['preload_libfault']) self.__mntns_compat_mode = bool(opts['mntns_compat_mode']) if opts['rpc']: @@ -1192,8 +1208,10 @@ class criu: with open("/proc/sys/kernel/ns_last_pid") as ns_last_pid_fd: ns_last_pid = ns_last_pid_fd.read() + preload_libfault = self.__preload_libfault and action in ['dump', 'pre-dump', 'restore'] + ret = self.__criu.run(action, s_args, self.__criu_bin, self.__fault, - strace, preexec, nowait) + strace, preexec, preload_libfault, nowait) if nowait: os.close(status_fds[1]) @@ -2083,7 +2101,7 @@ class Launcher: 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', 'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode', - 'rootless') + 'rootless', 'preload_libfault') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2788,6 +2806,7 @@ def get_cli_args(): help="Select tests for a shard (0-based)") rp.add_argument("--test-shard-count", type=int, default=0, help="Specify how many shards are being run (0=sharding disabled; must be the same for all shards)") + rp.add_argument("--preload-libfault", action="store_true", help="Run criu with library preload to simulate special cases") lp = sp.add_parser("list", help="List tests") lp.set_defaults(action=list_tests) diff --git a/test/zdtm/criu_config.py b/test/zdtm/criu_config.py index 487becfb4..221c23292 100644 --- a/test/zdtm/criu_config.py +++ b/test/zdtm/criu_config.py @@ -11,6 +11,7 @@ class criu_config: fault=None, strace=[], preexec=None, + preload=False, nowait=False): config_path = tempfile.mktemp(".conf", "criu-%s-" % action) From 6feb57a8406df708ba952f3e340381ed4c1494d5 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 4 Jun 2024 08:14:51 +0200 Subject: [PATCH 433/775] ci: remove CentOS Stream 8 test (EOL) Signed-off-by: Adrian Reber --- .cirrus.yml | 31 ------------------------------- 1 file changed, 31 
deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 72135590d..5e30ca2c2 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -92,37 +92,6 @@ task: build_script: | make -C scripts/ci vagrant-fedora-non-root -task: - name: CentOS Stream 8 based test - environment: - HOME: "/root" - CIRRUS_WORKING_DIR: "/tmp/criu" - - compute_engine_instance: - image_project: centos-cloud - image: family/centos-stream-8 - platform: linux - cpu: 4 - memory: 8G - - setup_script: | - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto - # Do not fail if latest epel repository definition is already installed - yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm || : - yum install -y dnf-plugins-core - yum config-manager --set-enabled powertools - yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-PyYAML python3-protobuf python3-importlib-metadata python3-junit_xml xmlto libdrm-devel - alternatives --set python /usr/bin/python3 - systemctl stop sssd - # Even with selinux in permissive mode the selinux tests will be executed - # The Cirrus CI user runs as a service from selinux point of view and is - # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0) - # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode - setenforce 0 - - build_script: | - make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" - task: name: aarch64 build GCC (native) arm_container: From a252a240c33c154f3d12a3a73561192b6be1e7f8 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Fri, 29 Mar 2024 00:06:58 +0530 Subject: [PATCH 434/775] zdtm: Distinguish between fail and crash of dump Adds a exit_signal static method to criu_cli, criu_config and criu_rpc used to detect a crash. 
Fixes: #350 Signed-off-by: Bhavik Sachdev --- test/zdtm.py | 19 +++++++++++++++---- test/zdtm/criu_config.py | 4 ++++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index fbb3400c4..df23ea03d 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -913,6 +913,10 @@ class criu_cli: return cr return cr.wait(timeout=timeout) + @staticmethod + def exit_signal(ret): + return ret < 0 + class criu_rpc_process: def wait(self): @@ -1033,8 +1037,11 @@ class criu_rpc: else: raise test_fail_exc('RPC for %s required' % action) except crpc.CRIUExceptionExternal as e: - print("Fail", e) - ret = -1 + if e.typ != e.resp_typ: + ret = -2 + else: + print("Fail", e) + ret = -1 else: ret = 0 @@ -1047,6 +1054,10 @@ class criu_rpc: return ret + @staticmethod + def exit_signal(ret): + return ret == -2 + class criu: def __init__(self, opts): @@ -1251,8 +1262,8 @@ class criu: return rst_succeeded = os.access( os.path.join(__ddir, "restore-succeeded"), os.F_OK) - if self.__test.blocking() or (self.__sat and action == 'restore' and - rst_succeeded): + if (self.__test.blocking() and not self.__criu.exit_signal(ret)) or \ + (self.__sat and action == 'restore' and rst_succeeded): raise test_fail_expected_exc(action) else: raise test_fail_exc("CRIU %s" % action) diff --git a/test/zdtm/criu_config.py b/test/zdtm/criu_config.py index 221c23292..9fd292747 100644 --- a/test/zdtm/criu_config.py +++ b/test/zdtm/criu_config.py @@ -41,3 +41,7 @@ class criu_config: if nowait: return cr return cr.wait() + + @staticmethod + def exit_signal(ret): + return ret < 0 From f1716492648be6aaacdd41d8c00942a516a889f5 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Sat, 20 Apr 2024 00:24:27 +0530 Subject: [PATCH 435/775] test/dump-crash: check code path when dump crashes Signed-off-by: Bhavik Sachdev --- criu/cr-dump.c | 4 ++++ criu/include/fault-injection.h | 1 + test/jenkins/criu-fault.sh | 4 ++++ 3 files changed, 9 insertions(+) diff --git a/criu/cr-dump.c b/criu/cr-dump.c 
index 199ff2e32..ef3b5480f 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2102,6 +2102,10 @@ static int cr_dump_finish(int ret) close_image_dir(); if (ret || post_dump_ret) { + if (fault_injected(FI_DUMP_CRASH)) { + pr_info("fault: CRIU dump crashed!\n"); + abort(); + } pr_err("Dumping FAILED.\n"); } else { write_stats(DUMP_STATS); diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index fe75dfe86..552ee4338 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -20,6 +20,7 @@ enum faults { FI_CANNOT_MAP_VDSO = 133, FI_CORRUPT_EXTREGS = 134, FI_DONT_USE_PAGEMAP_SCAN = 135, + FI_DUMP_CRASH = 136, FI_MAX, }; diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index 4a6d55e6b..1fda40a96 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -39,3 +39,7 @@ fi ./test/zdtm.py run -t zdtm/static/fpu03 --fault 134 -f h --norst || fail # also check for the main thread corruption ./test/zdtm.py run -t zdtm/static/fpu00 --fault 134 -f h --norst || fail + +if ./test/zdtm.py run -t zdtm/static/vfork00 --fault 136 --report report -f h ; then + fail +fi From fdf546dbd500141752b16b4706b39291a5c786ad Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sun, 9 Jun 2024 18:25:28 +0200 Subject: [PATCH 436/775] ci: upgrade to Fedora 40 Vagrant images (38 is EOL) Signed-off-by: Adrian Reber --- scripts/ci/vagrant.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 4c1be3544..3904c51d2 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -6,9 +6,9 @@ set -e set -x -VAGRANT_VERSION=2.3.7 -FEDORA_VERSION=38 -FEDORA_BOX_VERSION=38.20230413.1 +VAGRANT_VERSION=2.4.1 +FEDORA_VERSION=40 +FEDORA_BOX_VERSION=40.20240414.0 setup() { if [ -n "$TRAVIS" ]; then @@ -39,7 +39,7 @@ setup() { ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ libasan libcap-devel libnet-devel libnl3-devel 
libbsd-devel make protobuf-c-devel \ protobuf-devel python3-protobuf python3-importlib-metadata python3-junit_xml \ - rubygem-asciidoctor iptables libselinux-devel libbpf-devel + rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd ssh default cat /proc/cmdline From 4f15fe8c5910d9c495b6a0c672790b3a49d02dc7 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 May 2024 09:48:34 +0100 Subject: [PATCH 437/775] make: improve check for externally managed Python Move PYTHON_EXTERNALLY_MANAGED and PIP_BREAK_SYSTEM_PACKAGES into Makefile.install to avoid code duplication. In addition, add PIPFLAGS variable to enable specifying pip options during installation. This is particularly useful for packaging, where it is common for `pip install` to run in an environment with pre-installed dependencies and without internet access. In such environment, we need to specify the following options: --no-build-isolation --no-index --no-deps Signed-off-by: Radostin Stoyanov --- Makefile.install | 23 +++++++++++++++++++++++ crit/Makefile | 25 +++++-------------------- lib/Makefile | 25 +++++-------------------- 3 files changed, 33 insertions(+), 40 deletions(-) diff --git a/Makefile.install b/Makefile.install index 6f5b31924..680b26c62 100644 --- a/Makefile.install +++ b/Makefile.install @@ -29,6 +29,29 @@ LIBDIR ?= $(PREFIX)/lib export PREFIX BINDIR SBINDIR MANDIR RUNDIR export LIBDIR INCLUDEDIR LIBEXECDIR PLUGINDIR +# Detect externally managed Python environment (PEP 668). +PYTHON_EXTERNALLY_MANAGED := $(shell $(PYTHON) -c 'import os, sysconfig; print(int(os.path.isfile(os.path.join(sysconfig.get_path("stdlib"), "EXTERNALLY-MANAGED"))))') +PIP_BREAK_SYSTEM_PACKAGES ?= 0 + +# If Python environment is externally managed and PIP_BREAK_SYSTEM_PACKAGES is not set, skip pip install. 
+SKIP_PIP_INSTALL := 0 +ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) +ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) + +SKIP_PIP_INSTALL := 1 +$(info Warn: Externally managed python environment) +$(info Consider using PIP_BREAK_SYSTEM_PACKAGES=1) + +endif +endif + +# Default flags for pip install: +# --upgrade: Upgrade crit/pycriu packages +# --ignore-installed: Ignore existing packages and reinstall them +PIPFLAGS ?= --upgrade --ignore-installed + +export SKIP_PIP_INSTALL PIPFLAGS + install-man: $(Q) $(MAKE) -C Documentation install .PHONY: install-man diff --git a/crit/Makefile b/crit/Makefile index 9a856db6d..33bd68eed 100644 --- a/crit/Makefile +++ b/crit/Makefile @@ -1,6 +1,3 @@ -PYTHON_EXTERNALLY_MANAGED := $(shell $(PYTHON) -c 'import os, sysconfig; print(int(os.path.isfile(os.path.join(sysconfig.get_path("stdlib"), "EXTERNALLY-MANAGED"))))') -PIP_BREAK_SYSTEM_PACKAGES := 0 - VERSION_FILE := $(if $(obj),$(addprefix $(obj)/,crit/version.py),crit/version.py) all-y += ${VERSION_FILE} @@ -10,31 +7,19 @@ ${VERSION_FILE}: $(Q) echo "__version__ = '${CRIU_VERSION}'" > $@ install: ${VERSION_FILE} -ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) -ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) - $(E) " SKIP INSTALL crit: Externally managed python environment (See PEP 668 for more information)" - $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make install" -else +ifeq ($(SKIP_PIP_INSTALL),0) $(E) " INSTALL " crit - $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit -endif + $(Q) $(PYTHON) -m pip install $(PIPFLAGS) --prefix=$(DESTDIR)$(PREFIX) ./crit else - $(E) " INSTALL " crit - $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./crit + $(E) " SKIP INSTALL crit" endif .PHONY: install uninstall: -ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) -ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) - $(E) " SKIP UNINSTALL crit: Externally managed python environment (See PEP 668 for more information)" - $(E) " Consider using 
PIP_BREAK_SYSTEM_PACKAGES=1 make uninstall" -else +ifeq ($(SKIP_PIP_INSTALL),0) $(E) " UNINSTALL" crit $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit -endif else - $(E) " UNINSTALL" crit - $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit + $(E) " SKIP UNINSTALL crit" endif .PHONY: uninstall diff --git a/lib/Makefile b/lib/Makefile index ae371e78e..4b8a6cbb8 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -4,9 +4,6 @@ UAPI_HEADERS := lib/c/criu.h images/rpc.proto images/rpc.pb-c.h criu/include/ve all-y += lib-c lib-a lib-py -PYTHON_EXTERNALLY_MANAGED := $(shell $(PYTHON) -c 'import os, sysconfig; print(int(os.path.isfile(os.path.join(sysconfig.get_path("stdlib"), "EXTERNALLY-MANAGED"))))') -PIP_BREAK_SYSTEM_PACKAGES := 0 - # # C language bindings. lib/c/Makefile: ; @@ -57,17 +54,11 @@ install: lib-c lib-a lib-py lib/c/criu.pc.in $(Q) mkdir -p $(DESTDIR)$(LIBDIR)/pkgconfig $(Q) sed -e 's,@version@,$(CRIU_VERSION),' -e 's,@libdir@,$(LIBDIR),' -e 's,@includedir@,$(dir $(INCLUDEDIR)/criu/),' lib/c/criu.pc.in > lib/c/criu.pc $(Q) install -m 644 lib/c/criu.pc $(DESTDIR)$(LIBDIR)/pkgconfig -ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) -ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) - $(E) " SKIP INSTALL pycriu: Externally managed python environment (See PEP 668 for more information)" - $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make install" -else +ifeq ($(SKIP_PIP_INSTALL),0) $(E) " INSTALL " pycriu - $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./lib -endif + $(Q) $(PYTHON) -m pip install $(PIPFLAGS) --prefix=$(DESTDIR)$(PREFIX) ./lib else - $(E) " INSTALL " pycriu - $(Q) $(PYTHON) -m pip install --upgrade --ignore-installed --prefix=$(DESTDIR)$(PREFIX) ./lib + $(E) " SKIP INSTALL pycriu" endif .PHONY: install @@ -80,16 +71,10 @@ uninstall: $(Q) $(RM) $(addprefix $(DESTDIR)$(INCLUDEDIR)/criu/,$(notdir $(UAPI_HEADERS))) $(E) " UNINSTALL" pkgconfig/criu.pc $(Q) $(RM) $(addprefix 
$(DESTDIR)$(LIBDIR)/pkgconfig/,criu.pc) -ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) -ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) - $(E) " SKIP UNINSTALL pycriu: Externally managed python environment (See PEP 668 for more information)" - $(E) " Consider using PIP_BREAK_SYSTEM_PACKAGES=1 make uninstall" -else +ifeq ($(SKIP_PIP_INSTALL),0) $(E) " UNINSTALL" pycriu $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) pycriu -endif else - $(E) " UNINSTALL" pycriu - $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) pycriu + $(E) " SKIP UNINSTALL pycriu" endif .PHONY: uninstall From 7ac45370698ed8c28312cfe6ebc30b423eb8bb4a Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 22 Jun 2024 18:34:47 +0100 Subject: [PATCH 438/775] readme: update link to FAQ page The current link opens a page with the following text: The MediaWiki FAQ can be found at: https://www.mediawiki.org/wiki/Special:MyLanguage/Manual:FAQ Signed-off-by: Radostin Stoyanov --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 11d1c490b..f578e745c 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Pages worth starting with are: - [Installation instructions](http://criu.org/Installation) - [A simple example of usage](http://criu.org/Simple_loop) - [Examples of more advanced usage](https://criu.org/Category:HOWTO) -- Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/FAQ) +- Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/index.php?title=FAQ) ### Checkpoint and restore of simple loop process

From 1012e542e5e0132c7d7ad04c6b126e465f397968 Mon Sep 17 00:00:00 2001 From: Jesus Ramos Date: Thu, 6 Jun 2024 11:12:39 -0700 Subject: [PATCH 439/775] criu: Restore rseq_cs state slightly earlier in the restore sequence and run the plugin finalizer later in the dump sequence Restore rseq_cs state before calling RESUME_DEVICES_LATE as the CUDA plugin will temporarily unfreeze a thread during the plugin hook to assist with device restore Run the plugin finalizer later in the dump sequence since the finalizer is used by the CUDA plugin to handle some process cleanup Signed-off-by: Jesus Ramos --- criu/cr-dump.c | 4 +++- criu/cr-restore.c | 9 +++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index ef3b5480f..1bc5d934f 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2035,7 +2035,6 @@ static int cr_dump_finish(int ret) if (bfd_flush_images()) ret = -1; - cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); cgp_fini(); if (!ret) { @@ -2089,6 +2088,9 @@ static int cr_dump_finish(int ret) if (arch_set_thread_regs(root_item, true) < 0) return -1; + + cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); + pstree_switch_state(root_item, (ret || post_dump_ret) ? 
TASK_ALIVE : opts.final_state); timing_stop(TIME_FROZEN); free_pstree(root_item); diff --git a/criu/cr-restore.c b/criu/cr-restore.c index deecb1294..4db2f4ecf 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2224,6 +2224,11 @@ skip_ns_bouncing: } finalize_restore(); + + /* just before releasing threads we have to restore rseq_cs */ + if (restore_rseq_cs()) + pr_err("Unable to restore rseq_cs state\n"); + /* * Some external devices such as GPUs might need a very late * trigger to kick-off some events, memory notifiers and for @@ -2255,10 +2260,6 @@ skip_ns_bouncing: if (restore_freezer_state()) pr_err("Unable to restore freezer state\n"); - /* just before releasing threads we have to restore rseq_cs */ - if (restore_rseq_cs()) - pr_err("Unable to restore rseq_cs state\n"); - /* Detaches from processes and they continue run through sigreturn. */ if (finalize_restore_detach()) goto out_kill_network_unlocked; From 5f486d5aeee6539bb0d666f0a97572fc52072c3a Mon Sep 17 00:00:00 2001 From: Jesus Ramos Date: Thu, 6 Jun 2024 11:16:07 -0700 Subject: [PATCH 440/775] criu/plugin: Introduce new plugin hooks PAUSE_DEVICES and CHECKPOINT_DEVICES to be used during pstree collection PAUSE_DEVICES is called before a process is frozen and is used by the CUDA plugin to place the process in a state that's ready to be checkpointed and quiesce any pending work CHECKPOINT_DEVICES is called after all processes in the tree have been frozen and PAUSE'd and performs the actual checkpointing operation for CUDA applications Signed-off-by: Jesus Ramos --- criu/include/criu-plugin.h | 6 ++++++ criu/plugin.c | 2 ++ criu/seize.c | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index 886832eaa..392ea9f53 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -56,6 +56,10 @@ enum { CR_PLUGIN_HOOK__RESUME_DEVICES_LATE = 9, + CR_PLUGIN_HOOK__PAUSE_DEVICES = 10, + + 
CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11, + CR_PLUGIN_HOOK__MAX }; @@ -72,6 +76,8 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, int fd, const struct DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const uint64_t addr, const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid); enum { CR_PLUGIN_STAGE__DUMP, diff --git a/criu/plugin.c b/criu/plugin.c index f3fea2856..58b5ea5bf 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -57,6 +57,8 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) __assign_hook(HANDLE_DEVICE_VMA, "cr_plugin_handle_device_vma"); __assign_hook(UPDATE_VMA_MAP, "cr_plugin_update_vma_map"); __assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late"); + __assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices"); + __assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices"); #undef __assign_hook diff --git a/criu/seize.c b/criu/seize.c index 91090ae1a..d392259bc 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -16,6 +16,7 @@ #include "pstree.h" #include "criu-log.h" #include +#include "plugin.h" #include "proc_parse.h" #include "seccomp.h" #include "seize.h" @@ -637,6 +638,11 @@ static int collect_children(struct pstree_item *item) goto free; } + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto free; + } + if (!opts.freeze_cgroup) /* fails when meets a zombie */ __ignore_value(compel_interrupt_task(pid)); @@ -966,6 +972,7 @@ int collect_pstree(void) pid_t pid = root_item->pid->real; int ret = -1; struct proc_status_creds creds; + struct pstree_item *iter; timing_start(TIME_FREEZING); @@ -984,6 +991,11 @@ int collect_pstree(void) if (opts.freeze_cgroup && freeze_processes()) goto err; + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 
&& ret != -ENOTSUP) { + goto err; + } + if (!opts.freeze_cgroup && compel_interrupt_task(pid)) { set_cr_errno(ESRCH); goto err; @@ -1017,6 +1029,12 @@ int collect_pstree(void) goto err; } + for_each_pstree_item(iter) { + ret = run_plugins(CHECKPOINT_DEVICES, iter->pid->real); + if (ret < 0 && ret != -ENOTSUP) + goto err; + } + ret = 0; timing_stop(TIME_FREEZING); timing_start(TIME_FROZEN); From bf417dd0509a66afd43558122d535034e528df09 Mon Sep 17 00:00:00 2001 From: Jesus Ramos Date: Fri, 31 May 2024 13:38:54 -0700 Subject: [PATCH 441/775] criu/plugin: Add NVIDIA CUDA plugin Adding support for the NVIDIA cuda-checkpoint utility, requires the use of an r555 or higher driver along with the cuda-checkpoint binary. Signed-off-by: Jesus Ramos --- Makefile | 15 +- Makefile.install | 7 +- plugins/cuda/Makefile | 42 ++++ plugins/cuda/README.md | 59 +++++ plugins/cuda/cuda_plugin.c | 459 +++++++++++++++++++++++++++++++++++++ 5 files changed, 578 insertions(+), 4 deletions(-) create mode 100644 plugins/cuda/Makefile create mode 100644 plugins/cuda/README.md create mode 100644 plugins/cuda/cuda_plugin.c diff --git a/Makefile b/Makefile index e49dace7a..97b4dc211 100644 --- a/Makefile +++ b/Makefile @@ -165,7 +165,7 @@ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS # Default target -all: criu lib crit +all: criu lib crit cuda_plugin .PHONY: all # @@ -298,15 +298,19 @@ clean-amdgpu_plugin: $(Q) $(MAKE) -C plugins/amdgpu clean .PHONY: clean-amdgpu_plugin +clean-cuda_plugin: + $(Q) $(MAKE) -C plugins/cuda clean +.PHONY: clean-cuda_plugin + clean-top: $(Q) $(MAKE) -C Documentation clean $(Q) $(MAKE) $(build)=test/compel clean $(Q) $(RM) .gitid .PHONY: clean-top -clean: clean-top clean-amdgpu_plugin +clean: clean-top clean-amdgpu_plugin clean-cuda_plugin -mrproper-top: clean-top clean-amdgpu_plugin +mrproper-top: clean-top clean-amdgpu_plugin clean-cuda_plugin $(Q) $(RM) $(CONFIG_HEADER) $(Q) $(RM) $(VERSION_HEADER) $(Q) $(RM) 
$(COMPEL_VERSION_HEADER) @@ -338,6 +342,10 @@ amdgpu_plugin: criu $(Q) $(MAKE) -C plugins/amdgpu all .PHONY: amdgpu_plugin +cuda_plugin: criu + $(Q) $(MAKE) -C plugins/cuda all +.PHONY: cuda_plugin + crit: lib $(Q) $(MAKE) -C crit .PHONY: crit @@ -424,6 +432,7 @@ help: @echo ' lint - Run code linters' @echo ' indent - Indent C code' @echo ' amdgpu_plugin - Make AMD GPU plugin' + @echo ' cuda_plugin - Make NVIDIA CUDA plugin' .PHONY: help ruff: diff --git a/Makefile.install b/Makefile.install index 680b26c62..455735f3b 100644 --- a/Makefile.install +++ b/Makefile.install @@ -72,12 +72,16 @@ install-amdgpu_plugin: amdgpu_plugin $(Q) $(MAKE) -C plugins/amdgpu install .PHONY: install-amdgpu_plugin +install-cuda_plugin: cuda_plugin + $(Q) $(MAKE) -C plugins/cuda install +.PHONY: install-cuda_plugin + install-compel: $(compel-install-targets) $(Q) $(MAKE) $(build)=compel install $(Q) $(MAKE) $(build)=compel/plugins install .PHONY: install-compel -install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin ; +install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin install-cuda_plugin ; .PHONY: install uninstall: @@ -88,4 +92,5 @@ uninstall: $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ $(Q) $(MAKE) -C plugins/amdgpu $@ + $(Q) $(MAKE) -C plugins/cuda $@ .PHONY: uninstall diff --git a/plugins/cuda/Makefile b/plugins/cuda/Makefile new file mode 100644 index 000000000..2eabc0e31 --- /dev/null +++ b/plugins/cuda/Makefile @@ -0,0 +1,42 @@ +PLUGIN_NAME := cuda_plugin +PLUGIN_SOBJ := cuda_plugin.so + +DEPS_CUDA := $(PLUGIN_SOBJ) + +PLUGIN_INCLUDE := -iquote../../include +PLUGIN_INCLUDE += -iquote../../criu/include +PLUGIN_INCLUDE += -iquote../../criu/arch/$(ARCH)/include/ +PLUGIN_INCLUDE += -iquote../../ + +COMPEL := ../../compel/compel-host + +CC := gcc +PLUGIN_CFLAGS := -g -Wall -Werror -shared -nostartfiles -fPIC + +__nmk_dir ?= ../../scripts/nmk/scripts/ +include 
$(__nmk_dir)msg.mk + +all: $(DEPS_CUDA) + +cuda_plugin.so: cuda_plugin.c + $(call msg-gen, $@) + $(Q) $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) + +clean: + $(call msg-clean, $@) + $(Q) $(RM) $(PLUGIN_SOBJ) +.PHONY: clean + +mrproper: clean + +install: + $(Q) mkdir -p $(DESTDIR)$(PLUGINDIR) + $(E) " INSTALL " $(PLUGIN_NAME) + $(Q) install -m 644 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) +.PHONY: install + +uninstall: + $(E) " UNINSTALL" $(PLUGIN_NAME) + $(Q) $(RM) $(DESTDIR)$(PLUGINDIR)/$(PLUGIN_SOBJ) +.PHONY: uninstall + diff --git a/plugins/cuda/README.md b/plugins/cuda/README.md new file mode 100644 index 000000000..7b91f6998 --- /dev/null +++ b/plugins/cuda/README.md @@ -0,0 +1,59 @@ +Checkpoint and Restore for CUDA applications with CRIU +====================================================== + +# Requirements +The cuda-checkpoint utility should be placed somewhere in your $PATH and an r555 +or higher GPU driver is required for CUDA CRIU integration support. + +## cuda-checkpoint +The cuda-checkpoint utility can be found at: +https://github.com/NVIDIA/cuda-checkpoint + +cuda-checkpoint is a binary utility used to issue checkpointing commands to CUDA +applications. Updating the cuda-checkpoint utility between driver releases +should not be necessary as the utility simply exposes some extra driver behavior +so driver updates are all that's needed to get access to newer features. + +# Checkpointing Procedure +cuda-checkpoint exposes 4 actions used in the checkpointing process: lock, +checkpoint, restore, unlock. 
+ +* lock - Used with the PAUSE_DEVICES hook while a process is still running to + quiesce the application into a state where it can be checkpointed +* checkpoint - Used with the CHECKPOINT_DEVICES hook once a process has been + seized/frozen to perform the actual checkpointing operation +* restore/unlock - Used with the RESUME_DEVICES_LATE hook to restore the CUDA + state and release the process back to its running state + +These actions are facilitated by a CUDA checkpoint+restore thread that the CUDA +plugin will re-wake when needed. + +# Known Limitations +* Currently GPU memory contents are brought into main system memory and CRIU + then checkpoints that as part of the normal procedure. On systems with many + GPUs with high GPU memory usage this can cause memory thrashing. A future + CUDA release will add support for dumping the memory contents to files to + alleviate this as well as support in the CRIU plugin. +* There's currently a small race between when a PAUSE_DEVICES hook is called on + a running process and a process calls cuInit() and finishes initializing CUDA + after the PAUSE is issued but before the process is frozen to checkpoint. This + will cause cuda-checkpoint to report that the process is in an illegal state + for checkpointing and it's recommended to just attempt the CRIU procedure + again; this should be very rare. +* Applications that use NVML will leave some leftover device references as NVML + is not currently supported for checkpointing. There will be support for this + in later drivers. A possible temporary workaround is to have the + {DUMP,RESTORE}_EXT_FILE hook just ignore /dev/nvidiactl and /dev/nvidia{0..N} + remaining references for these applications as in most cases NVML is used to + get info such as GPU count and some capabilities and these values are never + accessed again and unlikely to change. 
+* CUDA applications that fork() but don't call exec() but also don't issue any + CUDA API calls will have some leftover references to /dev/nvidia* and fail to + checkpoint as a result. This can be worked around in a similar fashion to the + NVML case where the leftover references can be ignored as CUDA is not fork() + safe anyway. +* Restore currently requires that you restore on a system with similar GPU's and + same GPU count. +* NVIDIA UVM Managed Memory, MIG (Multi Instance GPU), and MPS (Multi-Process + Service) are currently not supported for checkpointing. Future CUDA releases + will add support for these. diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c new file mode 100644 index 000000000..b3f2fc8df --- /dev/null +++ b/plugins/cuda/cuda_plugin.c @@ -0,0 +1,459 @@ +#include "criu-log.h" +#include "plugin.h" +#include "util.h" +#include "cr_options.h" +#include "pid.h" +#include "proc_parse.h" + +#include +#include + +#include +#include +#include +#include +#include +#include + +/* cuda-checkpoint binary should live in your PATH */ +#define CUDA_CHECKPOINT "cuda-checkpoint" + +/* cuda-checkpoint --action flags */ +#define ACTION_LOCK "lock" +#define ACTION_CHECKPOINT "checkpoint" +#define ACTION_RESTORE "restore" +#define ACTION_UNLOCK "unlock" + +#define CUDA_CKPT_BUF_SIZE (128) + +#ifdef LOG_PREFIX +#undef LOG_PREFIX +#endif +#define LOG_PREFIX "cuda_plugin: " + +/* Disable plugin functionality if cuda-checkpoint is not in $PATH or driver + * version doesn't support --action flag + */ +bool plugin_disabled = false; + +struct pid_info { + int pid; + char checkpointed; + struct list_head list; +}; + +/* Used to track which PID's we've paused CUDA operations on so far so we can + * release them after we're done with the DUMP + */ +struct list_head cuda_pids; + +static void dealloc_pid_buffer(struct list_head *pid_buf) +{ + struct pid_info *info; + struct pid_info *n; + + list_for_each_entry_safe(info, n, pid_buf, list) { + 
list_del(&info->list); + xfree(info); + } +} + +static int add_pid_to_buf(struct list_head *pid_buf, int pid) +{ + struct pid_info *new = xmalloc(sizeof(*new)); + + if (new == NULL) { + return -1; + } + + new->pid = pid; + new->checkpointed = 0; + list_add_tail(&new->list, pid_buf); + + return 0; +} + +static int update_checkpointed_pid(struct list_head *pid_buf, int pid) +{ + struct pid_info *info; + + list_for_each_entry(info, pid_buf, list) { + if (info->pid == pid) { + info->checkpointed = 1; + return 0; + } + } + + return -1; +} + +static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) +{ +#define READ 0 +#define WRITE 1 + int fd[2]; + + if (pipe(fd) != 0) { + pr_err("Couldn't create pipes for reading cuda-checkpoint output\n"); + return -1; + } + + buf[0] = '\0'; + + int child_pid = fork(); + if (child_pid == -1) { + pr_err("Failed to fork to exec cuda-checkpoint\n"); + close(fd[READ]); + close(fd[WRITE]); + return -1; + } + + if (child_pid == 0) { // child + if (dup2(fd[WRITE], STDOUT_FILENO) == -1) { + return -1; + } + if (dup2(fd[WRITE], STDERR_FILENO) == -1) { + return -1; + } + close(fd[READ]); + return execvp(args[0], (char **)args); + } else { // parent + close(fd[WRITE]); + + int bytes_read = read(fd[READ], buf, buf_size); + if (bytes_read > 0) { + buf[bytes_read - 1] = '\0'; + } + + // Clear out any of the remaining output in the pipe in case the buffer wasn't large enough + struct pollfd read_poll = { .fd = fd[READ], .events = POLLIN | POLLHUP }; + while (true) { + int poll_status = poll(&read_poll, 1, -1); + if (poll_status == -1) { + close(fd[READ]); + pr_err("Unexpected error when clearing cuda-checkpoint output buffer\n"); + return -1; + } + if (read_poll.revents & POLLHUP) { + break; + } + // POLLIN, read into scratch buffer to flush things out + char scratch[64]; + bytes_read = read(fd[READ], scratch, sizeof(scratch)); + } + + int status; + if (waitpid(child_pid, &status, 0) == -1 || !WIFEXITED(status)) { + 
pr_err("cuda-checkpoint exited improperly, couldn't complete operation\n"); + close(fd[READ]); + return -1; + } + + close(fd[READ]); + + return WEXITSTATUS(status); + } +} + +static bool cuda_checkpoint_supports_flag(const char *flag) +{ + char msg_buf[2048]; + const char *args[] = { CUDA_CHECKPOINT, "-h", NULL }; + int ret = launch_cuda_checkpoint(args, msg_buf, sizeof(msg_buf)); + if (ret != 0) { + pr_err("Failed to launch cuda-checkpoint utility, check that the utility is present in your $PATH\n"); + return false; + } + + if (strstr(msg_buf, flag) == NULL) { + return false; + } + + return true; +} + +/* Retrieve the cuda restore thread TID from the root pid */ +static int get_cuda_restore_tid(int root_pid) +{ + char pid_buf[16]; + char pid_out[CUDA_CKPT_BUF_SIZE]; + + snprintf(pid_buf, sizeof(pid_buf), "%d", root_pid); + + const char *args[] = { CUDA_CHECKPOINT, "--get-restore-tid", "--pid", pid_buf, NULL }; + int ret = launch_cuda_checkpoint(args, pid_out, sizeof(pid_out)); + if (ret != 0) { + pr_err("Failed to launch cuda-checkpoint to retrieve restore tid: %s\n", pid_out); + return -1; + } + + return atoi(pid_out); +} + +static int cuda_process_checkpoint_action(int pid, const char *action, unsigned int timeout, char *msg_buf, + int buf_size) +{ + char pid_buf[16]; + char timeout_buf[16]; + + snprintf(pid_buf, sizeof(pid_buf), "%d", pid); + + const char *args[] = { CUDA_CHECKPOINT, "--action", action, "--pid", pid_buf, NULL /* --timeout */, + NULL /* timeout_val */, NULL }; + if (timeout > 0) { + snprintf(timeout_buf, sizeof(timeout_buf), "%d", timeout); + args[5] = "--timeout"; + args[6] = timeout_buf; + } + + return launch_cuda_checkpoint(args, msg_buf, buf_size); +} + +static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigset) +{ + /* Since we resumed a thread that CRIU previously already froze we need to + * INTERRUPT it once again, task was already SEIZE'd so we don't need to do + * a compel_interrupt_task() + */ + if 
(ptrace(PTRACE_INTERRUPT, restore_tid, NULL, 0)) { + pr_err("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state\n", + restore_tid); + return -1; + } + + struct proc_status_creds creds; + if (compel_wait_task(restore_tid, -1, parse_pid_status, NULL, &creds.s, NULL) != COMPEL_TASK_ALIVE) { + pr_err("compel_wait_task failed after interrupt\n"); + return -1; + } + + if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD)) { + pr_err("Failed to set ptrace options on interrupt for restore tid %d\n", restore_tid); + return -1; + } + + if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(*restore_sigset), restore_sigset)) { + pr_err("Unable to restore original sigmask to restore tid %d\n", restore_tid); + return -1; + } + + return 0; +} + +static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset) +{ + k_rtsigset_t block; + + if (ptrace(PTRACE_GETSIGMASK, restore_tid, sizeof(*save_sigset), save_sigset)) { + pr_err("Failed to get current sigmask for restore tid %d\n", restore_tid); + return -1; + } + + ksigfillset(&block); + ksigdelset(&block, SIGTRAP); + + if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(block), &block)) { + pr_err("Failed to block signals on restore tid %d\n", restore_tid); + return -1; + } + + // Clear out PTRACE_O_SUSPEND_SECCOMP when we resume the restore thread + if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, 0)) { + pr_err("Could not clear ptrace options on restore tid %d\n", restore_tid); + return -1; + } + + if (ptrace(PTRACE_CONT, restore_tid, NULL, 0)) { + pr_err("Could not resume cuda restore tid %d\n", restore_tid); + return -1; + } + + return 0; +} + +int cuda_plugin_checkpoint_devices(int pid) +{ + int restore_tid; + char msg_buf[CUDA_CKPT_BUF_SIZE]; + int int_ret; + int status; + k_rtsigset_t save_sigset; + + if (plugin_disabled) { + return 0; + } + + restore_tid = get_cuda_restore_tid(pid); + + /* We can possibly hit a race with cuInit() 
where we are past the point of + * locking the process but at lock time cuInit() hadn't completed in which + * case cuda-checkpoint will report that we're in an invalid state to + * checkpoint + */ + if (restore_tid == -1) { + pr_info("No need to checkpoint devices on pid %d\n", pid); + return 0; + } + + pr_info("Checkpointing CUDA devices on pid %d restore_tid %d\n", pid, restore_tid); + /* We need to resume the checkpoint thread to prepare the mappings for + * checkpointing + */ + if (resume_restore_thread(restore_tid, &save_sigset)) { + return -1; + } + status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf); + goto interrupt; + } + status = update_checkpointed_pid(&cuda_pids, pid); + if (status) { + pr_err("Failed to track checkpointed pid %d\n", pid); + status = cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid); + } + } +interrupt: + int_ret = interrupt_restore_thread(restore_tid, &save_sigset); + + return status != 0 ? 
status : int_ret; +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices); + +int cuda_plugin_pause_devices(int pid) +{ + int restore_tid; + char msg_buf[CUDA_CKPT_BUF_SIZE]; + + if (plugin_disabled) { + return 0; + } + + restore_tid = get_cuda_restore_tid(pid); + + if (restore_tid == -1) { + pr_info("no need to pause devices on pid %d\n", pid); + return 0; + } + + pr_info("pausing devices on pid %d\n", pid); + int status = cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("PAUSE_DEVICES failed with %s\n", msg_buf); + return -1; + } + if (add_pid_to_buf(&cuda_pids, pid)) { + pr_err("unable to track paused pid %d\n", pid); + status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid); + } + return -1; + } + + return 0; +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices) + +int resume_device(int pid, int checkpointed) +{ + char msg_buf[CUDA_CKPT_BUF_SIZE]; + int status; + int ret = 0; + int int_ret; + k_rtsigset_t save_sigset; + + int restore_tid = get_cuda_restore_tid(pid); + if (restore_tid == -1) { + pr_info("No need to resume devices on pid %d\n", pid); + return 0; + } + + pr_info("resuming devices on pid %d\n", pid); + /* The resuming process has to stay frozen during this time otherwise + * attempting to access a UVM pointer will crash if we haven't restored the + * underlying mappings yet + */ + pr_debug("Restore thread pid %d found for real pid %d\n", restore_tid, pid); + /* wakeup the restore thread so we can handle the restore for this pid, + * rseq_cs has to be restored before execution + */ + if (resume_restore_thread(restore_tid, &save_sigset)) { + return -1; + } + + if (checkpointed) { + status = cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf)); + if 
(status) { + pr_err("RESUME_DEVICES RESTORE failed with %s\n", msg_buf); + ret = -1; + goto interrupt; + } + } + + status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("RESUME_DEVICES UNLOCK failed with %s\n", msg_buf); + ret = -1; + } + +interrupt: + int_ret = interrupt_restore_thread(restore_tid, &save_sigset); + + return ret != 0 ? ret : int_ret; +} + +int cuda_plugin_resume_devices_late(int pid) +{ + if (plugin_disabled) { + return 0; + } + + return resume_device(pid, 1); +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late) + +int cuda_plugin_init(int stage) +{ + if (!cuda_checkpoint_supports_flag("--action")) { + pr_warn("cuda-checkpoint --action flag not supported, an r555 or higher version driver is required. Disabling CUDA plugin\n"); + plugin_disabled = true; + return 0; + } + + pr_info("initialized: %s stage %d\n", CR_PLUGIN_DESC.name, stage); + + /* In the DUMP stage track all the PID's we've paused CUDA operations on to + * release them when we're done if the user requested the leave-running option + */ + if (stage == CR_PLUGIN_STAGE__DUMP) { + INIT_LIST_HEAD(&cuda_pids); + } + + return 0; +} + +void cuda_plugin_fini(int stage, int ret) +{ + if (plugin_disabled) { + return; + } + + pr_info("finished %s stage %d err %d\n", CR_PLUGIN_DESC.name, stage, ret); + + /* Release all the paused PID's at the end of the DUMP stage in case the + * user provides the -R (leave-running) flag or an error occurred + */ + if (stage == CR_PLUGIN_STAGE__DUMP && (opts.final_state == TASK_ALIVE || ret != 0)) { + struct pid_info *info; + list_for_each_entry(info, &cuda_pids, list) { + resume_device(info->pid, info->checkpointed); + } + } + if (stage == CR_PLUGIN_STAGE__DUMP) { + dealloc_pid_buffer(&cuda_pids); + } +} +CR_PLUGIN_REGISTER("cuda_plugin", cuda_plugin_init, cuda_plugin_fini) From ca971b7f8b8c6db802127a82b53fab0b97b33bcd Mon Sep 17 00:00:00 2001 From: Pratyush 
Yadav Date: Tue, 25 Jun 2024 14:56:25 +0200 Subject: [PATCH 442/775] compel: fix build on Amazon Linux 2 due to missing PTRACE_ARCH_PRCTL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit fc683cb01 ("compel: shstk: save CET state when CPU supports it") started using PTRACE_ARCH_PRCTL to query shadow stack status. While PTRACE_ARCH_PRCTL has existed in the kernel for a long time, it was only added to glibc in version 2.27. Amazon Linux 2 (AL2) has glibc 2.26, which does not have this definition. As a result, build on AL2 fails with the below error: compel/arch/x86/src/lib/infect.c: In function ‘get_task_xsave’: compel/arch/x86/src/lib/infect.c:276:14: error: ‘PTRACE_ARCH_PRCTL’ undeclared (first use in this function) 276 | if (ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long)&features, ARCH_SHSTK_STATUS)) { | ^~~~~~~~~~~~~~~~~ While the definition is present on the system via the kernel headers (in asm/ptrace-abi.h) which can be reached by including linux/ptrace.h, the comment in compel/include/uapi/ptrace.h says: We'd want to include both sys/ptrace.h and linux/ptrace.h, hoping that most definitions come from either one or another. Alas, on Alpine/musl both files declare struct ptrace_peeksiginfo_args, so there is no way they can be used together. Let's rely on libc one. Since including linux/ptrace.h is not an option, define PTRACE_ARCH_PRCTL if it doesn't already exist. An interesting point to note is that in sys/ptrace.h, PTRACE_ARCH_PRCTL is an enum value so the preprocessor doesn't know about it. PT_ARCH_PRCTL is the preprocessor symbol that matches the value of PTRACE_ARCH_PRCTL. So look for PT_ARCH_PRCTL to decide if PTRACE_ARCH_PRCTL is available or not. Another interesting point to note is that AL2 ships with GCC 7 by default, which does not support the -mshstk option, causing other build failures. Luckily, it also ships GCC 10 which does have the option. Using GCC 10 lets the build succeed. 
Fixes: fc683cb01 ("compel: shstk: save CET state when CPU supports it") Signed-off-by: Pratyush Yadav --- compel/include/uapi/ptrace.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h index 63dfee97f..558124fbd 100644 --- a/compel/include/uapi/ptrace.h +++ b/compel/include/uapi/ptrace.h @@ -86,6 +86,19 @@ struct __ptrace_rseq_configuration { #define PTRACE_EVENT_STOP 128 #endif +/* + * Amazon Linux 2 uses glibc 2.26. PTRACE_ARCH_PRCTL was added in glibc 2.27. + * This allows CRIU to build on Amazon Linux 2. + * + * Note that in sys/ptrace.h, PTRACE_ARCH_PRCTL is an enum value so the + * preprocessor doesn't know about it. PT_ARCH_PRCTL is the preprocessor symbol + * that matches the value of PTRACE_ARCH_PRCTL. So look for PT_ARCH_PRCTL to + * decide if PTRACE_ARCH_PRCTL is available or not. + */ +#if defined(__x86_64__) && !defined(PT_ARCH_PRCTL) +#define PTRACE_ARCH_PRCTL 30 /* From asm/ptrace-abi.h. */ +#endif + extern int ptrace_suspend_seccomp(pid_t pid); extern int __must_check ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); From b169e3b63d36770d6ce887d5f2a0d5ec6cfd4256 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 2 Jul 2024 00:39:53 -0700 Subject: [PATCH 443/775] plugins/cuda: fix crosscompilation Signed-off-by: Andrei Vagin --- plugins/cuda/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/plugins/cuda/Makefile b/plugins/cuda/Makefile index 2eabc0e31..e337056dc 100644 --- a/plugins/cuda/Makefile +++ b/plugins/cuda/Makefile @@ -10,7 +10,6 @@ PLUGIN_INCLUDE += -iquote../../ COMPEL := ../../compel/compel-host -CC := gcc PLUGIN_CFLAGS := -g -Wall -Werror -shared -nostartfiles -fPIC __nmk_dir ?= ../../scripts/nmk/scripts/ From daed6c3535d969eea388d9c924aba9b5fe2cecc8 Mon Sep 17 00:00:00 2001 From: Liu Hua Date: Thu, 30 May 2024 19:52:54 +0800 Subject: [PATCH 444/775] irmap: duplicate string in irmap_scan_path_add Duplicate string in irmap_scan_path_add, 
otherwise it will be freed before parsing the next configuration input. [ avagin: handle errors of xstrdup ] Signed-off-by: Liu Hua Signed-off-by: Andrei Vagin --- criu/irmap.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/criu/irmap.c b/criu/irmap.c index 37d098db1..d2c5d588a 100644 --- a/criu/irmap.c +++ b/criu/irmap.c @@ -500,7 +500,12 @@ int irmap_scan_path_add(char *path) return -1; } - o->ir->path = path; + o->ir->path = xstrdup(path); + if (!o->ir->path) { + xfree(o->ir); + xfree(o); + return -1; + } o->ir->nr_kids = -1; list_add_tail(&o->node, &opts.irmap_scan_paths); return 0; From 71999d8883fe1508453496ac12094f66ca2cda0b Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 2 Jul 2024 11:56:41 +0800 Subject: [PATCH 445/775] cgroupd: unblock SIGTERM to make stop_cgroupd actually work Sometimes due to sigblockmask inheritance cgroupd can inherit SIGTERM blocked. That will lead to cgroupd ignoring SIGTERM from stop_cgroupd() and CRIU will get stuck due to waiting for never-stopping cgroupd. I see this happening in lxc-checkpoint, also saw this in OpenVZ jenkins on cgroup_inotify00 test. Signed-off-by: Pavel Tikhomirov --- criu/cgroup.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/criu/cgroup.c b/criu/cgroup.c index 6d1f74457..d90b70bb7 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -1947,6 +1947,21 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) return 0; } +static int cgroupd_unblock_sigterm(void) +{ + sigset_t unblockmask; + + sigemptyset(&unblockmask); + sigaddset(&unblockmask, SIGTERM); + + if (sigprocmask(SIG_UNBLOCK, &unblockmask, NULL)) { + pr_perror("cgroupd: can't unblock SIGTERM"); + return -1; + } + + return 0; +} + /* * If a thread is a different cgroup set than the main thread in process, * it means it is in a threaded controller.
This daemon receives the cg_set @@ -1955,6 +1970,14 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) */ static int cgroupd(int sk) { + /* + * This pairs with SIGTERM in stop_cgroupd(), and ensures that cgroupd + * will receive termination signal, regardless of which signal block + * mask was inherited. + */ + if (cgroupd_unblock_sigterm()) + return -1; + pr_info("cgroud: Daemon started\n"); while (1) { From ac22aaf5760ae670ae0a741c22173dfa22ef3baf Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 2 Jul 2024 23:30:31 -0700 Subject: [PATCH 446/775] apparmor: get_suspend_policy must return NULL in error cases Before this fix, it could return MAP_FAILED which is ((void *) -1). Signed-off-by: Andrei Vagin --- criu/apparmor.c | 1 + 1 file changed, 1 insertion(+) diff --git a/criu/apparmor.c b/criu/apparmor.c index e46e239f5..48b639216 100644 --- a/criu/apparmor.c +++ b/criu/apparmor.c @@ -469,6 +469,7 @@ static void *get_suspend_policy(char *name, off_t *len) ret = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (ret == MAP_FAILED) { pr_perror("mmap of %s failed", file); + ret = NULL; goto out; } From 18158381914031b8e72a7fd142ab4383c682f2f9 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 3 Jul 2024 09:26:28 -0700 Subject: [PATCH 447/775] vdso: proxify the __vdso_clock_gettime64 function It was added in v5.3-rc1~211^2~4^2~10. Fixes #2390 Signed-off-by: Andrei Vagin --- criu/arch/x86/include/asm/vdso.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/criu/arch/x86/include/asm/vdso.h b/criu/arch/x86/include/asm/vdso.h index 3b3f292bd..ca46374a5 100644 --- a/criu/arch/x86/include/asm/vdso.h +++ b/criu/arch/x86/include/asm/vdso.h @@ -12,7 +12,7 @@ * This is a minimal amount of symbols * we should support at the moment. 
*/ -#define VDSO_SYMBOL_MAX 6 +#define VDSO_SYMBOL_MAX 7 #define VDSO_SYMBOL_GTOD 2 /* @@ -42,11 +42,12 @@ const char *aarch_vdso_symbol3 = "__vdso_gettimeofday"; \ const char *aarch_vdso_symbol4 = "__vdso_time"; \ const char *aarch_vdso_symbol5 = "__kernel_sigreturn"; \ - const char *aarch_vdso_symbol6 = "__kernel_rt_sigreturn"; + const char *aarch_vdso_symbol6 = "__kernel_rt_sigreturn"; \ + const char *aarch_vdso_symbol7 = "__vdso_clock_gettime64"; \ #define ARCH_VDSO_SYMBOLS \ aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4, aarch_vdso_symbol5, \ - aarch_vdso_symbol6 + aarch_vdso_symbol6, aarch_vdso_symbol7 /* "__kernel_vsyscall", */ From 42b177da62838fefd0ea75d92e396636ce040fff Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 7 Jul 2024 13:24:19 +0100 Subject: [PATCH 448/775] scripts/build: drop centos 7 targets The CI tests with CentOS 7 have been disabled and removed [1,2]. This patch removes the obsolete Makefile targets for these tests. [1] https://github.com/checkpoint-restore/criu/commit/24bc083653f7d2b984653194e921b1ff32292b3b [2] https://github.com/checkpoint-restore/criu/commit/f8466ca798acd124eebbba2655894ebd2f777879 Signed-off-by: Radostin Stoyanov --- scripts/build/Makefile | 2 +- scripts/ci/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/build/Makefile b/scripts/build/Makefile index 2c006ad87..bc4a59db1 100644 --- a/scripts/build/Makefile +++ b/scripts/build/Makefile @@ -1,4 +1,4 @@ -ARCHES := x86_64 fedora-asan fedora-rawhide centos7 armv7hf centos8 +ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf centos8 STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross UNSTABLE_CROSS_ARCHES := armv7-unstable-cross aarch64-unstable-cross ppc64-unstable-cross mips64el-unstable-cross NON_CLANG := $(UNSTABLE_CROSS_ARCHES) $(STABLE_CROSS_ARCHES) diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 1caa1e423..9dc0190b3 
100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -11,7 +11,7 @@ ifdef CLANG target-suffix = -clang endif -TARGETS := alpine fedora-rawhide centos7 centos8 archlinux +TARGETS := alpine fedora-rawhide centos8 archlinux ZDTM_OPTS := UNAME := $(shell uname -m) export UNAME From 4f45572fde61c12b54c73a12d4c584f57dd6492c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 8 Jul 2024 19:31:38 -0700 Subject: [PATCH 449/775] util: use close_range when it's supported close_range is faster than reading /proc/self/fd and closing descriptors one by one. Signed-off-by: Andrei Vagin --- .../arch/arm/plugins/std/syscalls/syscall.def | 1 + .../mips/plugins/std/syscalls/syscall_64.tbl | 1 + .../plugins/std/syscalls/syscall-ppc64.tbl | 1 + .../plugins/std/syscalls/syscall-s390.tbl | 1 + .../x86/plugins/std/syscalls/syscall_32.tbl | 1 + .../x86/plugins/std/syscalls/syscall_64.tbl | 1 + criu/include/kerndat.h | 1 + criu/include/util.h | 2 ++ criu/kerndat.c | 25 +++++++++++++++++++ criu/util.c | 14 +++++++++++ 10 files changed, 48 insertions(+) diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index 217e346a3..9a33009eb 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -118,6 +118,7 @@ fsopen 430 430 (char *fsname, unsigned int flags) fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) clone3 435 435 (struct clone_args *uargs, size_t size) +close_range 436 436 (unsigned int fd, unsigned int max_fd, unsigned int flags) pidfd_open 434 434 (pid_t pid, unsigned int flags) openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size) pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) diff --git a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl index 
9f50d5e8a..85faca5a9 100644 --- a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl @@ -115,6 +115,7 @@ __NR_fsopen 5430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 5431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 5432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 5435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 5436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 5434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 5437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 5438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl index 4c9b75cf1..c56b4e6de 100644 --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl @@ -114,6 +114,7 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl index af7d550e2..018d58a59 100644 --- 
a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl @@ -114,6 +114,7 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl index ab36a5cd6..cc23dc3f3 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl @@ -102,6 +102,7 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl index 4e843bee9..7fbfd69ad 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ 
b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -113,6 +113,7 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 41524ed66..e03a57341 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -88,6 +88,7 @@ struct kerndat_s { bool has_membarrier_get_registrations; bool has_pagemap_scan; bool has_shstk; + bool has_close_range; }; extern struct kerndat_s kdat; diff --git a/criu/include/util.h b/criu/include/util.h index 4334e69c2..9037dc9e6 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -411,4 +411,6 @@ extern void util_init(void); extern char *resolve_mountpoint(char *path); +extern int cr_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); + #endif /* __CR_UTIL_H__ */ diff --git a/criu/kerndat.c b/criu/kerndat.c index f899ef642..1a584fe92 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1699,6 +1699,27 @@ static int kerndat_has_membarrier_get_registrations(void) return 0; } +static int kerndat_has_close_range(void) +{ + /* fd is greater than max_fd, so close_range should return EINVAL. 
*/ + if (cr_close_range(2, 1, 0) == 0) { + pr_err("close_range succeeded unexpectedly\n"); + return -1; + } + + if (errno == ENOSYS) { + pr_debug("close_range isn't supported\n"); + return 0; + } + if (errno != EINVAL) { + pr_perror("close_range returned unexpected error code"); + return -1; + } + + kdat.has_close_range = true; + return 0; +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. There are cases that we cannot determine the @@ -1956,6 +1977,10 @@ int kerndat_init(void) pr_err("kerndat_has_shstk failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_has_close_range()) { + pr_err("kerndat_has_close_range has failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); diff --git a/criu/util.c b/criu/util.c index 95ba0feda..d74c2aeef 100644 --- a/criu/util.c +++ b/criu/util.c @@ -54,6 +54,7 @@ #include "action-scripts.h" #include "compel/infect-util.h" +#include #define VMA_OPT_LEN 128 @@ -518,12 +519,25 @@ int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned return cr_system_userns(in, out, err, cmd, argv, flags, -1); } +int cr_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags) +{ + return syscall(__NR_close_range, fd, max_fd, flags); +} + static int close_fds(int minfd) { DIR *dir; struct dirent *de; int fd, ret, dfd; + if (kdat.has_close_range) { + if (cr_close_range(minfd, ~0, 0)) { + pr_perror("close_range failed"); + return -1; + } + return 0; + } + dir = opendir("/proc/self/fd"); if (dir == NULL) { pr_perror("Can't open /proc/self/fd"); From b9081ca56bd7b56f59231fcc962c64fedca4d4ee Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Fri, 12 Jul 2024 23:30:35 +0700 Subject: [PATCH 450/775] zdtm: make cgroup testcases run non-parallel cgroup testcases live in the same cgroup root zdtmtst and zdtmtst.defaultroot controller then create child subgroup for testing. 
This can cause problems when cgroup testcases run in parallel. For example, testcase A dumps the child subgroup of testcase B since it's in the cgroup root but in the middle of restoring of testcase A, testcase B completes and cleans up the subgroup directory. This causes error in testcase A restore. This commit adds excl flag to all cgroup testcases description so that these don't run parallel. Signed-off-by: Bui Quang Minh --- test/zdtm/static/cgroup00.desc | 2 +- test/zdtm/static/cgroup01.desc | 2 +- test/zdtm/static/cgroup02.desc | 2 +- test/zdtm/static/cgroup_threads.desc | 2 +- test/zdtm/static/cgroup_yard.desc | 2 +- test/zdtm/static/cgroupns.desc | 2 +- test/zdtm/static/cgroupv2_00.desc | 2 +- test/zdtm/static/cgroupv2_01.desc | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/zdtm/static/cgroup00.desc b/test/zdtm/static/cgroup00.desc index 3c6c4a7e2..42a3f2b73 100644 --- a/test/zdtm/static/cgroup00.desc +++ b/test/zdtm/static/cgroup00.desc @@ -1 +1 @@ -{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} +{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroup01.desc b/test/zdtm/static/cgroup01.desc index 3c6c4a7e2..42a3f2b73 100644 --- a/test/zdtm/static/cgroup01.desc +++ b/test/zdtm/static/cgroup01.desc @@ -1 +1 @@ -{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} +{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroup02.desc b/test/zdtm/static/cgroup02.desc index df17a5789..eb5a9dd37 100644 --- a/test/zdtm/static/cgroup02.desc +++ b/test/zdtm/static/cgroup02.desc @@ -1,4 +1,4 @@ { 'dopts': '--manage-cgroups --cgroup-root name=zdtmtst:/prefix', - 'flags': 'suid', + 'flags': 'suid excl', 'flavor': 'h', 'ropts': '--manage-cgroups --cgroup-root /newroot --cgroup-root name=zdtmtst:/prefix'} diff --git a/test/zdtm/static/cgroup_threads.desc b/test/zdtm/static/cgroup_threads.desc index 3c6c4a7e2..42a3f2b73 100644 --- 
a/test/zdtm/static/cgroup_threads.desc +++ b/test/zdtm/static/cgroup_threads.desc @@ -1 +1 @@ -{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} +{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroup_yard.desc b/test/zdtm/static/cgroup_yard.desc index 8736d6780..9ad4a9b57 100644 --- a/test/zdtm/static/cgroup_yard.desc +++ b/test/zdtm/static/cgroup_yard.desc @@ -1,6 +1,6 @@ { 'flavor': 'h', -'flags': 'suid', +'flags': 'suid excl', # We create the external cgroup yard in working directory during --pre-dump # hook. We have to go up a few directories to find the yard. 'opts': '--manage-cgroups --cgroup-yard ../../../../../../external_yard' diff --git a/test/zdtm/static/cgroupns.desc b/test/zdtm/static/cgroupns.desc index 80dd710e1..dc61e36cf 100644 --- a/test/zdtm/static/cgroupns.desc +++ b/test/zdtm/static/cgroupns.desc @@ -1,4 +1,4 @@ { 'feature': 'cgroupns', - 'flags': 'suid', + 'flags': 'suid excl', 'flavor': 'h', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroupv2_00.desc b/test/zdtm/static/cgroupv2_00.desc index 4bfd4b265..e70c84df8 100644 --- a/test/zdtm/static/cgroupv2_00.desc +++ b/test/zdtm/static/cgroupv2_00.desc @@ -1 +1 @@ -{'flavor': 'h ns', 'flags': 'suid', 'opts': '--manage-cgroups=full'} +{'flavor': 'h ns', 'flags': 'suid excl', 'opts': '--manage-cgroups=full'} diff --git a/test/zdtm/static/cgroupv2_01.desc b/test/zdtm/static/cgroupv2_01.desc index 4bfd4b265..e70c84df8 100644 --- a/test/zdtm/static/cgroupv2_01.desc +++ b/test/zdtm/static/cgroupv2_01.desc @@ -1 +1 @@ -{'flavor': 'h ns', 'flags': 'suid', 'opts': '--manage-cgroups=full'} +{'flavor': 'h ns', 'flags': 'suid excl', 'opts': '--manage-cgroups=full'} From 089345f77a34d1bc7ef146d650636afcd3cdda21 Mon Sep 17 00:00:00 2001 From: Florian Weimer Date: Wed, 10 Jul 2024 18:34:50 +0200 Subject: [PATCH 451/775] Adjust to glibc __rseq_size semantic change In commit 2e456ccf0c34a056e3ccafac4a0c7effef14d918 ("Linux: Make __rseq_size 
useful for feature detection (bug 31965)") glibc 2.40 changed the meaning of __rseq_size slightly: it is now the size of the active/feature area (20 bytes initially), and not the size of the entire initially defined struct (32 bytes including padding). The reason for the change is that the size including padding does not allow detection of newly added features while previously unused padding is consumed. The prep_libc_rseq_info change in criu/cr-restore.c is not necessary on kernels which have full ptrace support for obtaining rseq information because the code is not used. On older kernels, it is a correctness fix because with size 20 (the new value), rseq registration would fail. The two other changes are required to make rseq unregistration work in tests. Signed-off-by: Florian Weimer --- criu/cr-restore.c | 8 ++++++++ test/zdtm/static/rseq00.c | 5 ++++- test/zdtm/transition/rseq01.c | 5 ++++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 4db2f4ecf..b95d4f134 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2618,7 +2618,15 @@ static void prep_libc_rseq_info(struct rst_rseq_param *rseq) if (!kdat.has_ptrace_get_rseq_conf) { #if defined(__GLIBC__) && defined(RSEQ_SIG) rseq->rseq_abi_pointer = encode_pointer(__criu_thread_pointer() + __rseq_offset); + /* + * Current glibc reports the feature/active size in + * __rseq_size, not the size passed to the kernel. + * This could be 20, but older kernels expect 32 for + * the size argument even if only 20 bytes are used.
+ */ rseq->rseq_abi_size = __rseq_size; + if (rseq->rseq_abi_size < 32) + rseq->rseq_abi_size = 32; rseq->signature = RSEQ_SIG; #else rseq->rseq_abi_pointer = 0; diff --git a/test/zdtm/static/rseq00.c b/test/zdtm/static/rseq00.c index 471ad6a43..7add7801e 100644 --- a/test/zdtm/static/rseq00.c +++ b/test/zdtm/static/rseq00.c @@ -46,12 +46,15 @@ static inline void *__criu_thread_pointer(void) static inline void unregister_glibc_rseq(void) { struct rseq *rseq = (struct rseq *)((char *)__criu_thread_pointer() + __rseq_offset); + unsigned int size = __rseq_size; /* hack: mark glibc rseq structure as failed to register */ rseq->cpu_id = RSEQ_CPU_ID_REGISTRATION_FAILED; /* unregister rseq */ - syscall(__NR_rseq, (void *)rseq, __rseq_size, 1, RSEQ_SIG); + if (__rseq_size < 32) + size = 32; + syscall(__NR_rseq, (void *)rseq, size, 1, RSEQ_SIG); } #else static inline void unregister_glibc_rseq(void) diff --git a/test/zdtm/transition/rseq01.c b/test/zdtm/transition/rseq01.c index 0fbcc2dca..08a7a8e1a 100644 --- a/test/zdtm/transition/rseq01.c +++ b/test/zdtm/transition/rseq01.c @@ -33,7 +33,10 @@ static inline void *thread_pointer(void) static inline void unregister_old_rseq(void) { /* unregister rseq */ - syscall(__NR_rseq, (void *)((char *)thread_pointer() + __rseq_offset), __rseq_size, 1, RSEQ_SIG); + unsigned int size = __rseq_size; + if (__rseq_size < 32) + size = 32; + syscall(__NR_rseq, (void *)((char *)thread_pointer() + __rseq_offset), size, 1, RSEQ_SIG); } #else static inline void unregister_old_rseq(void) From 5783706d57cb2121cd96c378eaa9b448d9ee38aa Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 9 Jul 2024 12:22:25 +0100 Subject: [PATCH 452/775] docs: update amdgpu-plugin man page This patch updates the dependencies section of the AMDGPU plugin man page to reflect that the plugin has been merged upstream and to fix a formatting issue. 
Signed-off-by: Radostin Stoyanov --- Documentation/criu-amdgpu-plugin.txt | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/Documentation/criu-amdgpu-plugin.txt b/Documentation/criu-amdgpu-plugin.txt index 35321a915..68803f3db 100644 --- a/Documentation/criu-amdgpu-plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -27,14 +27,10 @@ to criu to allow Checkpoint / Restore with ROCm. Dependencies -~~~~~~~~~~~~~~ +------------ *amdkfd support*:: In order to snapshot the *VRAM* and other *GPU* device states, we require - an updated version of amdkfd(amdgpu) driver. The kernel patches are under - review currently. - -*criu 3.16*:: - This work is rebased on latest criu release available at this time. + an updated version of amdkfd(amdgpu) driver. OPTIONS ------- From fcbadfbdbf3ffcefeae7c7b92d96761c23fdc8e5 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 10 Jul 2024 03:30:34 +0100 Subject: [PATCH 453/775] plugins: set executable bit on .so files For historical reasons, some tools like rpm [1] or ldd [2,3] may expect the executable bit to be present for the correct identification of shared libraries. The executable bit on .so files is set by default by compilers (e.g., GCC). It is not strictly necessary but primarily a convention. 
[1] https://docs.fedoraproject.org/en-US/package-maintainers/CommonRpmlintIssues/#unstripped_binary_or_object [2] https://sourceware.org/git/?p=glibc.git;a=blob;f=elf/ldd.bash.in;h=d6b640df;hb=HEAD#l154 [3] $ sudo ldd /usr/lib/criu/*.so /usr/lib/criu/amdgpu_plugin.so: ldd: warning: you do not have execution permission for `/usr/lib/criu/amdgpu_plugin.so' linux-vdso.so.1 (0x00007fd0a2a3e000) libdrm.so.2 => /lib64/libdrm.so.2 (0x00007fd0a29eb000) libdrm_amdgpu.so.1 => /lib64/libdrm_amdgpu.so.1 (0x00007fd0a29de000) libc.so.6 => /lib64/libc.so.6 (0x00007fd0a27fc000) /lib64/ld-linux-x86-64.so.2 (0x00007fd0a2a40000) /usr/lib/criu/cuda_plugin.so: ldd: warning: you do not have execution permission for `/usr/lib/criu/cuda_plugin.so' linux-vdso.so.1 (0x00007f1806e13000) libc.so.6 => /lib64/libc.so.6 (0x00007f1806c08000) /lib64/ld-linux-x86-64.so.2 (0x00007f1806e15000) Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/Makefile | 2 +- plugins/cuda/Makefile | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 5efa8fb0b..6dad00122 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -54,7 +54,7 @@ install: ifeq ($(CONFIG_AMDGPU),y) $(Q) mkdir -p $(DESTDIR)$(PLUGINDIR) $(E) " INSTALL " $(PLUGIN_NAME) - $(Q) install -m 644 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) + $(Q) install -m 755 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) endif .PHONY: install diff --git a/plugins/cuda/Makefile b/plugins/cuda/Makefile index e337056dc..cc3d98ac9 100644 --- a/plugins/cuda/Makefile +++ b/plugins/cuda/Makefile @@ -31,11 +31,10 @@ mrproper: clean install: $(Q) mkdir -p $(DESTDIR)$(PLUGINDIR) $(E) " INSTALL " $(PLUGIN_NAME) - $(Q) install -m 644 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) + $(Q) install -m 755 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) .PHONY: install uninstall: $(E) " UNINSTALL" $(PLUGIN_NAME) $(Q) $(RM) $(DESTDIR)$(PLUGINDIR)/$(PLUGIN_SOBJ) .PHONY: uninstall - From 21108b40de1f0ba62c13dca693897c6ef6ac3c62 
Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 20 Jul 2024 21:14:23 -0700 Subject: [PATCH 454/775] test/zdtm: mount a new tmpfs to the zdtm root /dev The current file system can be mounted with nodev. Fixes #2441 Signed-off-by: Andrei Vagin --- test/others/mnt-ext-dev/run.sh | 2 -- test/zdtm.py | 62 ++++++++++++++++++++++++++-------- test/zdtm/lib/ns.c | 15 +++++++- 3 files changed, 62 insertions(+), 17 deletions(-) diff --git a/test/others/mnt-ext-dev/run.sh b/test/others/mnt-ext-dev/run.sh index 3f6163e08..5cdbc45a8 100755 --- a/test/others/mnt-ext-dev/run.sh +++ b/test/others/mnt-ext-dev/run.sh @@ -8,8 +8,6 @@ truncate -s 0 zdtm.loop truncate -s 50M zdtm.loop mkfs.ext4 -F zdtm.loop dev=`losetup --find --show zdtm.loop` -mkdir -p ../../dev -cp -ap $dev ../../dev export ZDTM_MNT_EXT_DEV=$dev python3 ../../zdtm.py run $EXTRA_OPTS -t zdtm/static/mnt_ext_dev || ret=$? losetup -d $dev diff --git a/test/zdtm.py b/test/zdtm.py index df23ea03d..102f384c0 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -74,7 +74,10 @@ tests_root = None def clean_tests_root(): global tests_root if tests_root and tests_root[0] == os.getpid(): + subprocess.call(["./umount2", os.path.join(tests_root[1], "dev")]) + os.rmdir(os.path.join(tests_root[1], "root/root")) os.rmdir(os.path.join(tests_root[1], "root")) + os.rmdir(os.path.join(tests_root[1], "dev")) os.rmdir(tests_root[1]) @@ -85,8 +88,18 @@ def make_tests_root(): tests_root = (os.getpid(), tempfile.mkdtemp("", "criu-root-", tmpdir)) atexit.register(clean_tests_root) os.mkdir(os.path.join(tests_root[1], "root")) - os.chmod(tests_root[1], 0o777) - return os.path.join(tests_root[1], "root") + os.mkdir(os.path.join(tests_root[1], "root", "root")) + # The current file system can be mounted with nodev, so let's create a + # new tmpfs mount for /dev. + devpath = os.path.join(tests_root[1], "dev") + os.mkdir(devpath) + # zdtm wants to create files on this mount. User namespace tests are + # running with custom user and group mappings. 
+ subprocess.check_call(["mount", "-t", "tmpfs", "criu-test-dev", devpath]) + os.chmod(devpath, 0o777) + os.chmod(tests_root[1], 0o755) + os.chmod(os.path.join(tests_root[1], "root"), 0o755) + return os.path.join(tests_root[1], "root", "root"), os.path.join(tests_root[1], "dev") # Report generation @@ -182,15 +195,16 @@ class host_flavor: class ns_flavor: __root_dirs = [ - "/bin", "/sbin", "/etc", "/lib", "/lib64", "/dev", "/dev/pts", - "/dev/net", "/tmp", "/usr", "/proc", "/run" + "/bin", "/sbin", "/etc", "/lib", "/lib64", "/dev", + "/tmp", "/usr", "/proc", "/run" ] + __dev_dirs = ["pts", "net"] def __init__(self, opts): self.name = "ns" self.ns = True self.uns = False - self.root = make_tests_root() + self.root, self.devpath = make_tests_root() self.root_mounted = False def __copy_one(self, fname): @@ -236,16 +250,19 @@ class ns_flavor: self.__copy_one(lib) def __mknod(self, name, rdev=None): - name = "/dev/" + name + tdev = stat.S_IFCHR if not rdev: - if not os.access(name, os.F_OK): + if not os.access(os.path.join("/dev", name), os.F_OK): print("Skipping %s at root" % name) return else: - rdev = os.stat(name).st_rdev + s = os.stat(os.path.join("/dev", name)) + rdev = s.st_rdev + if stat.S_ISBLK(s.st_mode): + tdev = stat.S_IFBLK - name = self.root + name - os.mknod(name, stat.S_IFCHR, rdev) + name = os.path.join(self.devpath, name) + os.mknod(name, tdev, rdev) os.chmod(name, 0o666) def __construct_root(self): @@ -256,11 +273,18 @@ class ns_flavor: for ldir in ["/bin", "/sbin", "/lib", "/lib64"]: os.symlink(".." 
+ ldir, self.root + "/usr" + ldir) + def __construct_dev(self): + for dir in self.__dev_dirs: + os.mkdir(os.path.join(self.devpath, dir)) + os.chmod(os.path.join(self.devpath, dir), 0o755) self.__mknod("tty", os.makedev(5, 0)) self.__mknod("null", os.makedev(1, 3)) self.__mknod("net/tun") self.__mknod("rtc") self.__mknod("autofs", os.makedev(10, 235)) + ext_dev = os.getenv("ZDTM_MNT_EXT_DEV") + if ext_dev: + self.__mknod(os.path.basename(ext_dev)) def __copy_deps(self, deps): for d in deps.split('|'): @@ -283,6 +307,9 @@ class ns_flavor: self.__construct_root() os.mknod(self.root + "/.constructed", stat.S_IFREG | 0o600) + if not os.access(self.devpath + "/.constructed", os.F_OK): + self.__construct_dev() + os.mknod(self.devpath + "/.constructed", stat.S_IFREG | 0o600) for b in l_bins: self.__copy_libs(b) for b in x_bins: @@ -480,6 +507,7 @@ class zdtm_test: if self.__flavor.ns: env['ZDTM_NEWNS'] = "1" env['ZDTM_ROOT'] = self.__flavor.root + env['ZDTM_DEV'] = self.__flavor.devpath env['PATH'] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" if self.__flavor.uns: @@ -587,12 +615,18 @@ class zdtm_test: return opts def getdopts(self): - return self.__getcropts() + self.__freezer.getdopts( - ) + self.__desc.get('dopts', '').split() + opts = self.__getcropts() + self.__freezer.getdopts() + \ + self.__desc.get('dopts', '').split() + if self.__flavor.ns: + opts += ["--external", "mnt[/dev]:ZDTM_DEV"] + return opts def getropts(self): - return self.__getcropts() + self.__freezer.getropts( - ) + self.__desc.get('ropts', '').split() + opts = self.__getcropts() + self.__freezer.getropts() + \ + self.__desc.get('ropts', '').split() + if self.__flavor.ns: + opts += ["--external", "mnt[ZDTM_DEV]:%s" % self.__flavor.devpath] + return opts def unlink_pidfile(self): self.__pid = 0 diff --git a/test/zdtm/lib/ns.c b/test/zdtm/lib/ns.c index 6f6cccc99..205938d20 100644 --- a/test/zdtm/lib/ns.c +++ b/test/zdtm/lib/ns.c @@ -27,7 +27,7 @@ extern int pivot_root(const char 
*new_root, const char *put_old); static int prepare_mntns(void) { int dfd, ret; - char *root, *criu_path; + char *root, *criu_path, *dev_path; char path[PATH_MAX]; root = getenv("ZDTM_ROOT"); @@ -51,6 +51,19 @@ static int prepare_mntns(void) return -1; } + dev_path = getenv("ZDTM_DEV"); + if (dev_path) { + snprintf(path, sizeof(path), "%s/dev", root); + if (mount(dev_path, path, NULL, MS_BIND, NULL)) { + pr_perror("Unable to mount %s", path); + return -1; + } + if (mount(NULL, path, NULL, MS_PRIVATE, NULL)) { + pr_perror("Unable to mount %s", path); + return -1; + } + } + criu_path = getenv("ZDTM_CRIU"); if (criu_path) { snprintf(path, sizeof(path), "%s%s", root, criu_path); From 85050be66b4ea78f46a069dbd7981384728d0456 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 31 Jul 2024 23:32:30 +0200 Subject: [PATCH 455/775] seize: fix pause-devices plugin hook The plugin hook "PAUSE_DEVICES" was recently introduced in the following commit. This hook was intended to execute the cuda-checkpoint tool before the process tree is frozen. However, the run_plugins() call has been placed immediately *after* freeze_processes(). This causes the cuda-checkpoint tool to hang indefinitely during the checkpointing of CUDA applications running in containers, eventually leading to its termination by the timeout alarm. 
a85f488595e0a3a6e6cc6ca7c94d4a00b1341aaf criu/plugin: Introduce new plugin hooks PAUSE_DEVICES and CHECKPOINT_DEVICES to be used during pstree collection This problem can be reproduced with the following example: sudo podman run -d --rm \ --device nvidia.com/gpu=all --security-opt=label=disable \ quay.io/radostin/cuda-counter sudo podman container checkpoint -l -e /tmp/checkpoint.tar Signed-off-by: Radostin Stoyanov --- criu/seize.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index d392259bc..ae270022f 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -983,6 +983,11 @@ int collect_pstree(void) */ alarm(opts.timeout); + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto err; + } + if (opts.freeze_cgroup && cgroup_version()) goto err; @@ -991,11 +996,6 @@ int collect_pstree(void) if (opts.freeze_cgroup && freeze_processes()) goto err; - ret = run_plugins(PAUSE_DEVICES, pid); - if (ret < 0 && ret != -ENOTSUP) { - goto err; - } - if (!opts.freeze_cgroup && compel_interrupt_task(pid)) { set_cr_errno(ESRCH); goto err; From c42b58f4fb2e05feaf0228f63d0fcf828cc2875c Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 24 Jul 2024 11:30:59 +0100 Subject: [PATCH 456/775] plugin: enable multiple plugins for the same hook CRIU provides two plugins for checkpoint/restore of GPU applications: amdgpu and cuda. Both plugins use the `RESUME_DEVICES_LATE` hook to enable restore: CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late) However, CRIU currently does not support running more than one plugin for the same hook. As a result, when both plugins are installed, the resume function for CUDA applications is not executed. To fix this, we need to make sure that both `plugin_resume_devices_late()` functions return `-ENOTSUP` when restore is not supported. 
Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 3 ++- plugins/cuda/cuda_plugin.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index a41469a50..b73b5101d 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1809,7 +1809,7 @@ int amdgpu_plugin_resume_devices_late(int target_pid) fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); if (fd < 0) { pr_perror("failed to open kfd in plugin"); - return -1; + return -ENOTSUP; } args.pid = target_pid; @@ -1818,6 +1818,7 @@ int amdgpu_plugin_resume_devices_late(int target_pid) if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { if (errno == ESRCH) { pr_info("Pid %d has no kfd process info\n", target_pid); + exit_code = -ENOTSUP; } else { pr_perror("restore late ioctl failed"); exit_code = -1; diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index b3f2fc8df..f16c4c505 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -408,7 +408,7 @@ interrupt: int cuda_plugin_resume_devices_late(int pid) { if (plugin_disabled) { - return 0; + return -ENOTSUP; } return resume_device(pid, 1); From 8437663cc644af6dde39e371a24c45ea6b681c3c Mon Sep 17 00:00:00 2001 From: liuchao173 Date: Thu, 8 Aug 2024 20:55:42 +0800 Subject: [PATCH 457/775] delete redundant include header files restorer.h has been included in line 43. Fixes: 22963d282729 ("Hide asm/restorer.h from sources") Signed-off-by: liuchao173 --- criu/pie/restorer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 7c34c06d4..51ed6ed4c 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -49,7 +49,6 @@ #include "images/inventory.pb-c.h" #include "shmem.h" -#include "restorer.h" /* * sys_getgroups() buffer size. Not too much, to avoid stack overflow. 
From 9a85fb6382acb219eed7f72745ad136425ba60a5 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 7 Jul 2024 16:04:36 +0100 Subject: [PATCH 458/775] ci/podman: show criu logs in case of error Signed-off-by: Radostin Stoyanov --- scripts/ci/podman-test.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/ci/podman-test.sh b/scripts/ci/podman-test.sh index 72ad59a50..3198589f5 100755 --- a/scripts/ci/podman-test.sh +++ b/scripts/ci/podman-test.sh @@ -24,6 +24,9 @@ podman info podman run --name cr -d docker.io/library/alpine /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done' +# Show criu logs in case of error +trap 'cat /var/lib/containers/storage/overlay-containers/*/userdata/*.log' EXIT + sleep 1 for i in $(seq 20); do echo "Test $i for podman container checkpoint" @@ -64,3 +67,5 @@ for i in $(seq 20); do podman ps -a rm -f /tmp/chkpt.tar.gz done + +trap 'echo PASS' EXIT \ No newline at end of file From 4dde52a308057a7e392173d6f0c73141b0536989 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 7 Jul 2024 16:07:59 +0100 Subject: [PATCH 459/775] ci/podman: show mounts Show information about mounts available on the host filesystem. This is useful for debugging. Signed-off-by: Radostin Stoyanov --- scripts/ci/podman-test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ci/podman-test.sh b/scripts/ci/podman-test.sh index 3198589f5..185783011 100755 --- a/scripts/ci/podman-test.sh +++ b/scripts/ci/podman-test.sh @@ -20,6 +20,7 @@ sed -i 's/#runtime\s*=\s*.*/runtime = "runc"/' /usr/share/containers/containers. 
# Test checkpoint/restore with action script echo "action-script /usr/bin/true" | sudo tee /etc/criu/default.conf +cat /proc/self/mountinfo podman info podman run --name cr -d docker.io/library/alpine /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done' From fde0b7ac69c5b604fc8745341d16d2a494674cbc Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 8 Jul 2024 16:53:39 +0100 Subject: [PATCH 460/775] cuda: don't leak fds to cuda-checkpoint Leaking open file descriptors to third-party tools can lead to security risks. Signed-off-by: Radostin Stoyanov --- criu/include/util.h | 1 + criu/util.c | 2 +- plugins/cuda/cuda_plugin.c | 4 +++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/criu/include/util.h b/criu/include/util.h index 9037dc9e6..435469e1e 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -170,6 +170,7 @@ extern pid_t fork_and_ptrace_attach(int (*child_setup)(void)); extern int cr_daemon(int nochdir, int noclose, int close_fd); extern int status_ready(void); extern int is_root_user(void); +extern int close_fds(int minfd); extern int set_proc_self_fd(int fd); diff --git a/criu/util.c b/criu/util.c index d74c2aeef..7dfa1fe42 100644 --- a/criu/util.c +++ b/criu/util.c @@ -524,7 +524,7 @@ int cr_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags) return syscall(__NR_close_range, fd, max_fd, flags); } -static int close_fds(int minfd) +int close_fds(int minfd) { DIR *dir; struct dirent *de; diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index f16c4c505..e44b4d007 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -115,7 +115,9 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) if (dup2(fd[WRITE], STDERR_FILENO) == -1) { return -1; } - close(fd[READ]); + + close_fds(STDERR_FILENO + 1); + return execvp(args[0], (char **)args); } else { // parent close(fd[WRITE]); From ad66c27a113b9a37610f8aba686387f5734f5366 Mon Sep 17 
00:00:00 2001 From: Radostin Stoyanov Date: Sun, 7 Jul 2024 18:48:30 +0100 Subject: [PATCH 461/775] cuda: fix launch cuda-checkpoint When the cuda-checkpoint tool is not installed, execvp() is expected to fail and return -1. In this case, we need to call exit() to terminate the child process that was created earlier with fork(). Since CRIU can be used with applications that do not use CUDA, even when the CUDA plugin is installed, this patch also updates the log messages to show debug and warning (instead of error) when the cuda-checkpoint tool is not found in $PATH. Signed-off-by: Radostin Stoyanov Signed-off-by: Andrei Vagin --- plugins/cuda/cuda_plugin.c | 141 ++++++++++++++++++++++++------------- 1 file changed, 92 insertions(+), 49 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index e44b4d007..39c78e370 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -45,7 +45,7 @@ struct pid_info { /* Used to track which PID's we've paused CUDA operations on so far so we can * release them after we're done with the DUMP */ -struct list_head cuda_pids; +static LIST_HEAD(cuda_pids); static void dealloc_pid_buffer(struct list_head *pid_buf) { @@ -91,7 +91,7 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) { #define READ 0 #define WRITE 1 - int fd[2]; + int fd[2], buf_off; if (pipe(fd) != 0) { pr_err("Couldn't create pipes for reading cuda-checkpoint output\n"); @@ -110,68 +110,103 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) if (child_pid == 0) { // child if (dup2(fd[WRITE], STDOUT_FILENO) == -1) { - return -1; + pr_perror("unable to clone fd %d->%d", fd[WRITE], STDOUT_FILENO); + _exit(EXIT_FAILURE); } if (dup2(fd[WRITE], STDERR_FILENO) == -1) { - return -1; + pr_perror("unable to clone fd %d->%d", fd[WRITE], STDERR_FILENO); + _exit(EXIT_FAILURE); } + close(fd[READ]); close_fds(STDERR_FILENO + 1); - return execvp(args[0], (char **)args); - } else { 
// parent - close(fd[WRITE]); + execvp(args[0], (char **)args); - int bytes_read = read(fd[READ], buf, buf_size); - if (bytes_read > 0) { - buf[bytes_read - 1] = '\0'; - } + /* We can't use pr_error() as log file fd is closed. */ + fprintf(stderr, "execvp(\"%s\") failed: %s\n", args[0], strerror(errno)); - // Clear out any of the remaining output in the pipe in case the buffer wasn't large enough - struct pollfd read_poll = { .fd = fd[READ], .events = POLLIN | POLLHUP }; - while (true) { - int poll_status = poll(&read_poll, 1, -1); - if (poll_status == -1) { - close(fd[READ]); - pr_err("Unexpected error when clearing cuda-checkpoint output buffer\n"); - return -1; - } - if (read_poll.revents & POLLHUP) { - break; - } - // POLLIN, read into scratch buffer to flush things out - char scratch[64]; - bytes_read = read(fd[READ], scratch, sizeof(scratch)); - } - - int status; - if (waitpid(child_pid, &status, 0) == -1 || !WIFEXITED(status)) { - pr_err("cuda-checkpoint exited improperly, couldn't complete operation\n"); - close(fd[READ]); - return -1; - } - - close(fd[READ]); - - return WEXITSTATUS(status); + _exit(EXIT_FAILURE); } + + close(fd[WRITE]); + buf_off = 0; + /* Reserve one byte for the null charracter. 
*/ + buf_size--; + while (buf_off < buf_size) { + int bytes_read; + bytes_read = read(fd[READ], buf + buf_off, buf_size - buf_off); + if (bytes_read == -1) { + pr_perror("Unable to read output of cuda-checkpoint"); + goto err; + } + if (bytes_read == 0) + break; + buf_off += bytes_read; + } + buf[buf_off] = '\0'; + + /* Clear out any of the remaining output in the pipe in case the buffer wasn't large enough */ + while (true) { + char scratch[1024]; + int bytes_read; + bytes_read = read(fd[READ], scratch, sizeof(scratch)); + if (bytes_read == -1) { + pr_perror("Unable to read output of cuda-checkpoint"); + goto err; + } + if (bytes_read == 0) + break; + } + close(fd[READ]); + + int status, exit_code = -1; + if (waitpid(child_pid, &status, 0) == -1) { + pr_perror("Unable to wait for the cuda-checkpoint process %d", child_pid); + goto err; + } + if (WIFSIGNALED(status)) { + int sig = WTERMSIG(status); + + pr_err("cuda-checkpoint unexpectedly signaled with %d: %s\n", sig, strsignal(sig)); + } else if (WIFEXITED(status)) { + exit_code = WEXITSTATUS(status); + } else { + pr_err("cuda-checkpoint exited improperly: %u\n", status); + } + + if (exit_code != EXIT_SUCCESS) + pr_debug("cuda-checkpoint output ===>\n%s\n" + "<=== cuda-checkpoint output\n", + buf); + + return exit_code; +err: + kill(child_pid, SIGKILL); + waitpid(child_pid, NULL, 0); + return -1; } -static bool cuda_checkpoint_supports_flag(const char *flag) +/** + * Checks if a given flag is supported by the cuda-checkpoint utility + * + * Returns: + * 1 if the flag is supported, + * 0 if the flag is not supported, + * -1 if there was an error launching the cuda-checkpoint utility. 
+ */ +static int cuda_checkpoint_supports_flag(const char *flag) { char msg_buf[2048]; const char *args[] = { CUDA_CHECKPOINT, "-h", NULL }; - int ret = launch_cuda_checkpoint(args, msg_buf, sizeof(msg_buf)); - if (ret != 0) { - pr_err("Failed to launch cuda-checkpoint utility, check that the utility is present in your $PATH\n"); - return false; - } - if (strstr(msg_buf, flag) == NULL) { - return false; - } + if (launch_cuda_checkpoint(args, msg_buf, sizeof(msg_buf)) != 0) + return -1; - return true; + if (strstr(msg_buf, flag) == NULL) + return 0; + + return 1; } /* Retrieve the cuda restore thread TID from the root pid */ @@ -419,7 +454,15 @@ CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_ int cuda_plugin_init(int stage) { - if (!cuda_checkpoint_supports_flag("--action")) { + int ret = cuda_checkpoint_supports_flag("--action"); + + if (ret == -1) { + pr_warn("check that %s is present in $PATH\n", CUDA_CHECKPOINT); + plugin_disabled = true; + return 0; + } + + if (ret == 0) { pr_warn("cuda-checkpoint --action flag not supported, an r555 or higher version driver is required. Disabling CUDA plugin\n"); plugin_disabled = true; return 0; From 2453ed69a2fd01290200b861fb230b670ca4e19c Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 7 Jul 2024 23:52:42 +0100 Subject: [PATCH 462/775] zdtm: add option to run tests with criu plugins By default, if the "CRIU_LIBS_DIR" environment variable is not set, CRIU will load all plugins installed in `/usr/lib/criu`. This may result in running the ZDTM tests with plugins for a different version of CRIU (e.g., installed from a package). This patch updates ZDTM to always set the "CRIU_LIBS_DIR" environment variable and use a local "plugins" directory. This directory contains copies of the plugin files built from source. In addition, this patch adds the `--criu-plugin` option to the `zdtm.py run` command, allowing tests to be run with specified CRIU plugins. 
Example: - Run test only with AMDGPU plugin ./zdtm.py run -t zdtm/static/busyloop00 --criu-plugin amdgpu - Run test only with CUDA plugin ./zdtm.py run -t zdtm/static/busyloop00 --criu-plugin cuda - Run test with both AMDGPU and CUDA plugins ./zdtm.py run -t zdtm/static/busyloop00 --criu-plugin amdgpu cuda Signed-off-by: Radostin Stoyanov --- test/plugins/.gitignore | 1 + test/plugins/Makefile | 18 ++++++++++++++++++ test/zdtm.py | 21 ++++++++++++++++++++- 3 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 test/plugins/.gitignore create mode 100644 test/plugins/Makefile diff --git a/test/plugins/.gitignore b/test/plugins/.gitignore new file mode 100644 index 000000000..140f8cf80 --- /dev/null +++ b/test/plugins/.gitignore @@ -0,0 +1 @@ +*.so diff --git a/test/plugins/Makefile b/test/plugins/Makefile new file mode 100644 index 000000000..7827b655c --- /dev/null +++ b/test/plugins/Makefile @@ -0,0 +1,18 @@ +SRC_DIR := ../../plugins +PLUGIN_TARGETS := amdgpu_plugin.so cuda_plugin.so + +# Silent make rules. +Q := @ + +all: $(PLUGIN_TARGETS) + +amdgpu_plugin.so: $(SRC_DIR)/amdgpu/amdgpu_plugin.so + $(Q) cp $< $@ + +cuda_plugin.so: $(SRC_DIR)/cuda/cuda_plugin.so + $(Q) cp $< $@ + +clean: + $(Q) $(RM) $(PLUGIN_TARGETS) + +.PHONY: all clean diff --git a/test/zdtm.py b/test/zdtm.py index 102f384c0..87914f740 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -40,6 +40,12 @@ LIBFAULT_PATH = os.path.join( "libfault.so" ) +# A directory that contains the CRIU plugins. 
+PLUGINS_DIR = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "plugins" +) + prev_line = None uuid = uuid.uuid4() @@ -672,6 +678,12 @@ class zdtm_test: subprocess.check_call(["make", "-C", "zdtm/"]) if 'preload_libfault' in opts and opts['preload_libfault']: subprocess.check_call(["make", "-C", "libfault/"]) + + subprocess.check_call(["make", '--no-print-directory', "-C", "plugins/", "clean"]) + if 'criu_plugin' in opts and opts['criu_plugin']: + for name in opts['criu_plugin']: + subprocess.check_call(["make", '--no-print-directory', "-C", "plugins/", f"{name}_plugin.so"]) + if 'rootless' in opts and opts['rootless']: return subprocess.check_call( @@ -929,7 +941,9 @@ class criu_cli: timeout=60): env = dict( os.environ, - ASAN_OPTIONS="log_path=asan.log:disable_coredump=0:detect_leaks=0") + ASAN_OPTIONS="log_path=asan.log:disable_coredump=0:detect_leaks=0", + CRIU_LIBS_DIR=PLUGINS_DIR + ) if fault: print("Forcing %s fault" % fault) @@ -2852,6 +2866,11 @@ def get_cli_args(): rp.add_argument("--test-shard-count", type=int, default=0, help="Specify how many shards are being run (0=sharding disabled; must be the same for all shards)") rp.add_argument("--preload-libfault", action="store_true", help="Run criu with library preload to simulate special cases") + rp.add_argument("--criu-plugin", + help="Run tests with CRIU plugin", + choices=['amdgpu', 'cuda'], + nargs='+', + default=None) lp = sp.add_parser("list", help="List tests") lp.set_defaults(action=list_tests) From a045c874cb4b2c1d3ec7be76cd22e5f675035831 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 20 Jul 2024 13:17:09 +0100 Subject: [PATCH 463/775] ci: run tests with amdgpu and cuda plugins Signed-off-by: Radostin Stoyanov --- scripts/ci/run-ci-tests.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index ef2dffb1a..950453c0d 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ 
-354,7 +354,15 @@ make -C test/others/action-script run # compel testing make -C compel/test -# amdgpu_plugin testing +# amdgpu and cuda plugin testing make amdgpu_plugin make -C plugins/amdgpu/ test_topology_remap ./plugins/amdgpu/test_topology_remap + +./test/zdtm.py run -t zdtm/static/maps00 --criu-plugin cuda +./test/zdtm.py run -t zdtm/static/maps00 --criu-plugin amdgpu +./test/zdtm.py run -t zdtm/static/maps00 --criu-plugin amdgpu cuda + +./test/zdtm.py run -t zdtm/static/maps02 --criu-plugin cuda +./test/zdtm.py run -t zdtm/static/maps02 --criu-plugin amdgpu +./test/zdtm.py run -t zdtm/static/maps02 --criu-plugin amdgpu cuda From 551cd924476021a9e69175b679c4a1db276fc64d Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 25 Jul 2024 09:50:10 +0100 Subject: [PATCH 464/775] timer: fix printf specifiers for __suseconds64_t New internal glibc types __timeval64 [1] and __suseconds64_t [2] have been introduced as a solution for the Y2038 problem [3]. These 64-bit types are used across all architectures. 
However, this change causes the following build errors when cross-compiling on ARMv7 (armhf): criu/timer.c:49:17: error: format '%ld' expects argument of type 'long int', but argument 5 has type '__suseconds64_t' {aka 'long long int'} [-Werror=format=] 49 | pr_info("Restored %s timer to %" PRId64 ".%ld -> %" PRId64 ".%ld\n", n, | ^~~~~~~~~~~~~~~~~~~~~~~~ 50 | (int64_t)val->it_value.tv_sec, val->it_value.tv_usec, | ~~~~~~~~~~~~~~~~~~~~~ | | | __suseconds64_t {aka long long int} criu/timer.c:49:17: error: format '%ld' expects argument of type 'long int', but argument 7 has type '__suseconds64_t' {aka 'long long int'} [-Werror=format=] 49 | pr_info("Restored %s timer to %" PRId64 ".%ld -> %" PRId64 ".%ld\n", n, | ^~~~~~~~~~~~~~~~~~~~~~~~ 50 | (int64_t)val->it_value.tv_sec, val->it_value.tv_usec, 51 | (int64_t)val->it_interval.tv_sec, val->it_interval.tv_usec); | ~~~~~~~~~~~~~~~~~~~~~~~~ | | | __suseconds64_t {aka long long int} ns.c:234:48: error: format '%ld' expects argument of type 'long int', but argument 5 has type 'time_t' {aka 'long long int'} [-Werror=format=] 234 | len = snprintf(buf, sizeof(buf), "%d %ld 0", clk_id, offset); | ~~^ ~~~~~~ | | | | long int time_t {aka long long int} | %lld msg.c:58:41: error: format '%ld' expects argument of type 'long int', but argument 3 has type '__suseconds64_t' {aka 'long long int'} [-Werror=format=] 58 | off += sprintf(buf + off, ".%.3ld: ", tv.tv_usec / 1000); | ~~~~^ ~~~~~~~~~~~~~~~~~ | | | | long int __suseconds64_t {aka long long int} | %.3lld ../lib/zdtmtst.h:137:26: error: format '%ld' expects argument of type 'long int', but argument 4 has type '__time64_t' {aka 'long long int'} [-Werror=format=] 137 | test_msg("ERR: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, \ | ^~~~~~~~~~~~~~ pthread_timers_h.c:72:17: note: in expansion of macro 'pr_perror' 72 | pr_perror("wrong interval: %ld:%ld", itimerspec.it_interval.tv_sec, itimerspec.it_interval.tv_nsec); | ^~~~~~~~~ vdso00.c:22:32: error: 
format '%li' expects argument of type 'long int', but argument 3 has type '__time64_t' {aka 'long long int'} [-Werror=format=] 22 | test_msg("%d time: %10li\n", getpid(), tv.tv_sec); | ~~~~^ ~~~~~~~~~ | | | | long int __time64_t {aka long long int} | %10lli vdso00.c:29:32: error: format '%li' expects argument of type 'long int', but argument 3 has type '__time64_t' {aka 'long long int'} [-Werror=format=] 29 | test_msg("%d time: %10li\n", getpid(), tv.tv_sec); | ~~~~^ ~~~~~~~~~ | | | | long int __time64_t {aka long long int} | %10lli vdso01.c:357:42: error: format '%li' expects argument of type 'long int', but argument 2 has type '__time64_t' {aka 'long long int'} [-Werror=format=] 357 | test_msg("gettimeofday: tv_sec %li vdso_gettimeofday: tv_sec %li\n", tv1.tv_sec, tv2.tv_sec); | ~~^ ~~~~~~~~~~ | | | | long int __time64_t {aka long long int} | %lli vdso01.c:357:72: error: format '%li' expects argument of type 'long int', but argument 3 has type '__time64_t' {aka 'long long int'} [-Werror=format=] 357 | test_msg("gettimeofday: tv_sec %li vdso_gettimeofday: tv_sec %li\n", tv1.tv_sec, tv2.tv_sec); | ~~^ ~~~~~~~~~~ | | | | long int __time64_t {aka long long int} | vdso01.c:328:43: error: format '%li' expects argument of type 'long int', but argument 2 has type '__time64_t' {aka 'long long int'} [-Werror=format=] 328 | test_msg("clock_gettime: tv_sec %li vdso_clock_gettime: tv_sec %li\n", ts1.tv_sec, ts2.tv_sec); | ~~^ ~~~~~~~~~~ | | | | long int __time64_t {aka long long int} | %lli vdso01.c:328:74: error: format '%li' expects argument of type 'long int', but argument 3 has type '__time64_t' {aka 'long long int'} [-Werror=format=] 328 | test_msg("clock_gettime: tv_sec %li vdso_clock_gettime: tv_sec %li\n", ts1.tv_sec, ts2.tv_sec); | ~~^ ~~~~~~~~~~ | | | | long int __time64_t {aka long long int} | ../lib/zdtmtst.h:144:26: error: format '%ld' expects argument of type 'long int', but argument 4 has type 'time_t' {aka 'long long int'} [-Werror=format=] 144 | 
test_msg("FAIL: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, \ | ^~~~~~~~~~~~~~~ mtime_mmap.c:80:17: note: in expansion of macro 'fail' 80 | fail("mtime %ld wasn't updated on mmapped %s file", mtime_new, filename); | ^~~~ ../lib/zdtmtst.h:144:26: error: format '%ld' expects argument of type 'long int', but argument 4 has type '__time64_t' {aka 'long long int'} [-Werror=format=] 144 | test_msg("FAIL: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, \ | ^~~~~~~~~~~~~~~ mtime_mmap.c:101:17: note: in expansion of macro 'fail' 101 | fail("After migration, mtime changed to %ld", fst.st_mtime); | ^~~~ [1] https://sourceware.org/git/?p=glibc.git;h=504c98717062cb9bcbd4b3e59e932d04331ddca5 [2] https://sourceware.org/git/?p=glibc.git;h=3fced064f23562ec24f8312ffbc14950993969e6 [3] https://en.wikipedia.org/wiki/Year_2038_problem Signed-off-by: Radostin Stoyanov --- criu/timer.c | 6 +++--- test/zdtm/lib/msg.c | 3 ++- test/zdtm/lib/ns.c | 3 ++- test/zdtm/static/mtime_mmap.c | 5 +++-- test/zdtm/static/pthread_timers.c | 4 +++- test/zdtm/static/vdso00.c | 6 +++--- test/zdtm/static/vdso01.c | 7 +++++-- 7 files changed, 21 insertions(+), 13 deletions(-) diff --git a/criu/timer.c b/criu/timer.c index 4b286635d..e94cf0280 100644 --- a/criu/timer.c +++ b/criu/timer.c @@ -46,9 +46,9 @@ static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) return -1; } - pr_info("Restored %s timer to %" PRId64 ".%ld -> %" PRId64 ".%ld\n", n, - (int64_t)val->it_value.tv_sec, val->it_value.tv_usec, - (int64_t)val->it_interval.tv_sec, val->it_interval.tv_usec); + pr_info("Restored %s timer to %" PRId64 ".%" PRId64 " -> %" PRId64 ".%" PRId64 "\n", n, + (int64_t)val->it_value.tv_sec, (int64_t)val->it_value.tv_usec, + (int64_t)val->it_interval.tv_sec, (int64_t)val->it_interval.tv_usec); return 0; } diff --git a/test/zdtm/lib/msg.c b/test/zdtm/lib/msg.c index 1cf92e3e0..9ba1c47a4 100644 --- a/test/zdtm/lib/msg.c +++ 
b/test/zdtm/lib/msg.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -55,7 +56,7 @@ void test_msg(const char *format, ...) off += strftime(buf, sizeof(buf), "%H:%M:%S", tm); } - off += sprintf(buf + off, ".%.3ld: ", tv.tv_usec / 1000); + off += sprintf(buf + off, ".%.3" PRId64 ": ", (int64_t)(tv.tv_usec / 1000)); off += sprintf(buf + off, "%5d: ", getpid()); skip: diff --git a/test/zdtm/lib/ns.c b/test/zdtm/lib/ns.c index 205938d20..3c0dbdeb8 100644 --- a/test/zdtm/lib/ns.c +++ b/test/zdtm/lib/ns.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -231,7 +232,7 @@ static inline int _settime(clockid_t clk_id, time_t offset) if (clk_id == CLOCK_MONOTONIC_COARSE || clk_id == CLOCK_MONOTONIC_RAW) clk_id = CLOCK_MONOTONIC; - len = snprintf(buf, sizeof(buf), "%d %ld 0", clk_id, offset); + len = snprintf(buf, sizeof(buf), "%d %" PRId64 " 0", clk_id, (int64_t)offset); fd = open("/proc/self/timens_offsets", O_WRONLY); if (fd < 0) { diff --git a/test/zdtm/static/mtime_mmap.c b/test/zdtm/static/mtime_mmap.c index faa2d6fad..4de8438ee 100644 --- a/test/zdtm/static/mtime_mmap.c +++ b/test/zdtm/static/mtime_mmap.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -77,7 +78,7 @@ int main(int argc, char **argv) mtime_new = fst.st_mtime; /* time of last modification */ if (mtime_new <= mtime_old) { - fail("mtime %ld wasn't updated on mmapped %s file", mtime_new, filename); + fail("mtime %" PRId64 " wasn't updated on mmapped %s file", (int64_t)mtime_new, filename); goto failed; } @@ -98,7 +99,7 @@ int main(int argc, char **argv) /* time of last modification */ if (fst.st_mtime != mtime_new) { - fail("After migration, mtime changed to %ld", fst.st_mtime); + fail("After migration, mtime changed to %" PRId64, (int64_t)fst.st_mtime); goto failed; } diff --git a/test/zdtm/static/pthread_timers.c b/test/zdtm/static/pthread_timers.c index 5246a985f..b1b2a9a23 100644 --- a/test/zdtm/static/pthread_timers.c +++ 
b/test/zdtm/static/pthread_timers.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -69,7 +70,8 @@ int main(int argc, char **argv) } if (itimerspec.it_interval.tv_nsec != TEST_INTERVAL_NSEC || itimerspec.it_interval.tv_sec) { - pr_perror("wrong interval: %ld:%ld", itimerspec.it_interval.tv_sec, itimerspec.it_interval.tv_nsec); + pr_perror("wrong interval: %" PRId64 ":%" PRId64, + (int64_t)itimerspec.it_interval.tv_sec, (int64_t)itimerspec.it_interval.tv_nsec); return 1; } diff --git a/test/zdtm/static/vdso00.c b/test/zdtm/static/vdso00.c index a9bef4dbd..69123a203 100644 --- a/test/zdtm/static/vdso00.c +++ b/test/zdtm/static/vdso00.c @@ -1,6 +1,6 @@ #include #include - +#include #include #include @@ -19,14 +19,14 @@ int main(int argc, char *argv[]) test_msg("%s pid %d\n", argv[0], getpid()); gettimeofday(&tv, &tz); - test_msg("%d time: %10li\n", getpid(), tv.tv_sec); + test_msg("%d time: %10" PRId64 "\n", getpid(), (int64_t)tv.tv_sec); test_daemon(); test_waitsig(); /* this call will fail if vDSO is corrupted */ gettimeofday(&tv, &tz); - test_msg("%d time: %10li\n", getpid(), tv.tv_sec); + test_msg("%d time: %10" PRId64 "\n", getpid(), (int64_t)tv.tv_sec); pass(); diff --git a/test/zdtm/static/vdso01.c b/test/zdtm/static/vdso01.c index 4e33d30a8..d8b3c94d5 100644 --- a/test/zdtm/static/vdso01.c +++ b/test/zdtm/static/vdso01.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -324,7 +325,8 @@ static int vdso_clock_gettime_handler(void *func) clock_gettime(CLOCK_REALTIME, &ts1); vdso_clock_gettime(CLOCK_REALTIME, &ts2); - test_msg("clock_gettime: tv_sec %li vdso_clock_gettime: tv_sec %li\n", ts1.tv_sec, ts2.tv_sec); + test_msg("clock_gettime: tv_sec %" PRId64 " vdso_clock_gettime: tv_sec %" PRId64 "\n", + (int64_t)ts1.tv_sec, (int64_t)ts2.tv_sec); if (labs(ts1.tv_sec - ts2.tv_sec) > TIME_DELTA_SEC) { pr_perror("Delta is too big"); @@ -354,7 +356,8 @@ static int vdso_gettimeofday_handler(void *func) gettimeofday(&tv1, 
&tz); vdso_gettimeofday(&tv2, &tz); - test_msg("gettimeofday: tv_sec %li vdso_gettimeofday: tv_sec %li\n", tv1.tv_sec, tv2.tv_sec); + test_msg("gettimeofday: tv_sec %" PRId64 " vdso_gettimeofday: tv_sec %" PRId64 "\n", + (int64_t)tv1.tv_sec, (int64_t)tv2.tv_sec); if (labs(tv1.tv_sec - tv2.tv_sec) > TIME_DELTA_SEC) { pr_perror("Delta is too big"); From 67fe44e981e8ec3e351b180cd9d9b95c0c153131 Mon Sep 17 00:00:00 2001 From: haozi007 Date: Tue, 6 Aug 2024 10:57:21 +0800 Subject: [PATCH 465/775] support user set remote mmap vma address 1. os auto assignment vma addr maybe conflict with vma in gpu living migrate scene; 2. so, we should give choice to user; Signed-off-by: haozi007 --- compel/include/uapi/infect.h | 1 + compel/src/lib/infect.c | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index cd6255909..7e6134f4b 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -120,6 +120,7 @@ struct infect_ctx { open_proc_fn open_proc; int log_fd; /* fd for parasite code to send messages to */ + unsigned long remote_map_addr; /* User-specified address where to mmap parasitic code, default not set */ }; extern struct infect_ctx *compel_infect_ctx(struct parasite_ctl *); diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 79d00c9a1..1e3ffb967 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -816,7 +816,7 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size, uint8_t orig_code[MEMFD_FNAME_SZ] = MEMFD_FNAME; pid_t pid = ctl->rpid; long sret = -ENOSYS; - int ret, fd, lfd; + int ret, fd, lfd, remote_flags; if (ctl->ictx.flags & INFECT_NO_MEMFD) return 1; @@ -860,7 +860,11 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size, goto err_cure; } - ctl->remote_map = remote_mmap(ctl, NULL, size, remote_prot, MAP_FILE | MAP_SHARED, fd, 0); + remote_flags = MAP_FILE | MAP_SHARED; + if 
(ctl->ictx.remote_map_addr){ + remote_flags |= MAP_FIXED_NOREPLACE; + } + ctl->remote_map = remote_mmap(ctl, (void *)ctl->ictx.remote_map_addr, size, remote_prot, remote_flags, fd, 0); if (!ctl->remote_map) { pr_err("Can't rmap memfd for parasite blob\n"); goto err_curef; From dea63059143e7b72b9af846ca969f1ba2abb3c8e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 16 Aug 2024 07:49:32 -0700 Subject: [PATCH 466/775] test/zdtm: allow to run tests with the mocked cuda-checkpoint tool Here is an example how to run one test: $ python test/zdtm.py run -t zdtm/static/env00 --ignore-taint --mocked-cuda-checkpoint Signed-off-by: Andrei Vagin --- test/cuda-checkpoint/.gitignore | 1 + test/cuda-checkpoint/Makefile | 17 +++++++++ test/cuda-checkpoint/cuda-checkpoint.c | 53 ++++++++++++++++++++++++++ test/zdtm.py | 16 +++++++- 4 files changed, 85 insertions(+), 2 deletions(-) create mode 100644 test/cuda-checkpoint/.gitignore create mode 100644 test/cuda-checkpoint/Makefile create mode 100644 test/cuda-checkpoint/cuda-checkpoint.c diff --git a/test/cuda-checkpoint/.gitignore b/test/cuda-checkpoint/.gitignore new file mode 100644 index 000000000..717fb7028 --- /dev/null +++ b/test/cuda-checkpoint/.gitignore @@ -0,0 +1 @@ +cuda-checkpoint diff --git a/test/cuda-checkpoint/Makefile b/test/cuda-checkpoint/Makefile new file mode 100644 index 000000000..c59dadddc --- /dev/null +++ b/test/cuda-checkpoint/Makefile @@ -0,0 +1,17 @@ +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) + +BIN := cuda-checkpoint +SRC := cuda-checkpoint.c +DEP := $(SRC:%.c=%.d) +OBJ := $(SRC:%.c=%.o) +TARGETS := $(BIN) + +include ../zdtm/Makefile.inc + +all: $(TARGETS) +.PHONY: all + +clean-more: + $(RM) $(TARGETS) +.PHONY: clean-more +clean: clean-more diff --git a/test/cuda-checkpoint/cuda-checkpoint.c b/test/cuda-checkpoint/cuda-checkpoint.c new file mode 100644 index 000000000..f35a4b41d --- /dev/null +++ b/test/cuda-checkpoint/cuda-checkpoint.c @@ -0,0 +1,53 @@ +/* The mocked version of cuda-checkpoint. 
*/ +#include +#include +#include + +int main(int argc, char *argv[]) +{ + int c; + + while (1) { + int option_index = 0; + static struct option long_options[] = { + { "pid", required_argument, 0, 'p' }, + { "get-restore-tid", no_argument, 0, 'g' }, + { "action", required_argument, 0, 'a' }, + { "timeout", required_argument, 0, 't' }, + { "help", no_argument, 0, 'h' }, + { 0, 0, 0, 0 } + }; + + c = getopt_long(argc, argv, "p:ga:ht:", + long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'p': + printf("%s\n", optarg); + break; + case 'g': + case 'a': + case 't': + break; + case 'h': + printf("--action - execute an action"); + break; + + default: + fprintf(stderr, "getopt returned character code 0%o ??\n", c); + return 1; + } + } + + if (optind < argc) { + fprintf(stderr, "non-option ARGV-elements: "); + while (optind < argc) + fprintf(stderr, "%s ", argv[optind++]); + fprintf(stderr, "\n"); + return 1; + } + + return 0; +} diff --git a/test/zdtm.py b/test/zdtm.py index 87914f740..6b2132cc3 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -684,6 +684,8 @@ class zdtm_test: for name in opts['criu_plugin']: subprocess.check_call(["make", '--no-print-directory', "-C", "plugins/", f"{name}_plugin.so"]) + if 'mocked_cuda_checkpoint' in opts and opts['mocked_cuda_checkpoint']: + subprocess.check_call(["make", "-C", "cuda-checkpoint/"]) if 'rootless' in opts and opts['rootless']: return subprocess.check_call( @@ -1141,6 +1143,7 @@ class criu: self.__pre_dump_mode = opts['pre_dump_mode'] self.__preload_libfault = bool(opts['preload_libfault']) self.__mntns_compat_mode = bool(opts['mntns_compat_mode']) + self.__cuda_checkpoint = bool(opts['mocked_cuda_checkpoint']) if opts['rpc']: self.__criu = criu_rpc @@ -1223,6 +1226,9 @@ class criu: s_args = ["--log-file", log, "--images-dir", self.__ddir(), "--verbosity=4"] + opts + if self.__cuda_checkpoint: + s_args += [ "--libdir" , os.path.join(os.getcwd(), "..", "plugins", "cuda") ] + with 
open(os.path.join(self.__ddir(), action + '.cropt'), 'w') as f: f.write(' '.join(s_args) + '\n') @@ -2160,7 +2166,7 @@ class Launcher: 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', 'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode', - 'rootless', 'preload_libfault') + 'rootless', 'preload_libfault', 'mocked_cuda_checkpoint') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2173,8 +2179,11 @@ class Launcher: if opts['rootless'] and os.getuid() == 0: os.setgid(NON_ROOT_UID) os.setuid(NON_ROOT_UID) + env = dict(os.environ, CR_CT_TEST_INFO=arg) + if opts['mocked_cuda_checkpoint']: + env['PATH'] = os.path.join(os.getcwd(), "cuda-checkpoint") + ":" + env["PATH"] sub = subprocess.Popen(["./zdtm_ct", "zdtm.py"], - env=dict(os.environ, CR_CT_TEST_INFO=arg), + env=env, stdout=log, stderr=subprocess.STDOUT, close_fds=True) @@ -2871,6 +2880,9 @@ def get_cli_args(): choices=['amdgpu', 'cuda'], nargs='+', default=None) + rp.add_argument("--mocked-cuda-checkpoint", + action="store_true", + help="Run criu with the cuda plugin and the mocked cuda-checkpoint tool") lp = sp.add_parser("list", help="List tests") lp.set_defaults(action=list_tests) From de31abb9700931383da4dd820b0c5881586949ea Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 12 Aug 2024 07:56:28 -0700 Subject: [PATCH 467/775] criu/plugin: don't call plugin device hooks for non-alive tasks Dead tasks don't hold any resources. Fixes: 2465 Signed-off-by: Andrei Vagin --- criu/cr-restore.c | 4 +++- criu/seize.c | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index b95d4f134..4d4dfbe6f 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2238,8 +2238,10 @@ skip_ns_bouncing: * mapped memory) could be done sanely once the pie code hands * over the control to master process. 
*/ + pr_info("Run late stage hook from criu master for external devices\n"); for_each_pstree_item(item) { - pr_info("Run late stage hook from criu master for external devices\n"); + if (!task_alive(item)) + continue; ret = run_plugins(RESUME_DEVICES_LATE, item->pid->real); /* * This may not really be an error. Only certain plugin hooks diff --git a/criu/seize.c b/criu/seize.c index ae270022f..ba26072e6 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1030,6 +1030,8 @@ int collect_pstree(void) } for_each_pstree_item(iter) { + if (!task_alive(iter)) + continue; ret = run_plugins(CHECKPOINT_DEVICES, iter->pid->real); if (ret < 0 && ret != -ENOTSUP) goto err; From 9a19cf34de764e2ca77b8d5944a0ee26af79cd13 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 12 Aug 2024 08:08:10 -0700 Subject: [PATCH 468/775] scripts/ci: run tests with the mocked cuda-checkpoint tool Signed-off-by: Andrei Vagin --- scripts/ci/run-ci-tests.sh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 950453c0d..26ea00c53 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -359,10 +359,8 @@ make amdgpu_plugin make -C plugins/amdgpu/ test_topology_remap ./plugins/amdgpu/test_topology_remap -./test/zdtm.py run -t zdtm/static/maps00 --criu-plugin cuda -./test/zdtm.py run -t zdtm/static/maps00 --criu-plugin amdgpu -./test/zdtm.py run -t zdtm/static/maps00 --criu-plugin amdgpu cuda +./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin cuda +./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu +./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu cuda -./test/zdtm.py run -t zdtm/static/maps02 --criu-plugin cuda -./test/zdtm.py run -t zdtm/static/maps02 --criu-plugin amdgpu -./test/zdtm.py run -t zdtm/static/maps02 --criu-plugin amdgpu cuda +./test/zdtm.py run -t zdtm/static/sigpending -t 
zdtm/static/pthread00 --mocked-cuda-checkpoint From 2ee58444117f2c992d446594b10d6fca7029d3af Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 9 Jul 2024 13:07:03 +0100 Subject: [PATCH 469/775] plugins/amdgpu: fix cross-compilation To enable cross-compile we need to use the CC definition from criu/scripts/nmk/scripts/tools.mk: CC := $(CROSS_COMPILE)$(HOSTCC) Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 6dad00122..7d3388b80 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -15,7 +15,6 @@ DEPS_NOK := ; __nmk_dir ?= ../../scripts/nmk/scripts/ include $(__nmk_dir)msg.mk -CC := gcc PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC -DCR_PLUGIN_DEFAULT="$(PLUGINDIR)" PLUGIN_LDFLAGS := -lpthread -lrt -ldrm -ldrm_amdgpu From d68205e919fe84c2121ee508059dcf910020a1f6 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 9 Jul 2024 13:15:10 +0100 Subject: [PATCH 470/775] ci: enable cross compile testing for amdgpu-plugin Skip cross-compilation on armv7 because, among many other errors, it fails with the following: In file included from ../../include/common/lock.h:9, from ../../criu/include/files.h:9, from amdgpu_plugin.c:30: ../../include/common/asm/atomic.h:60:2: error: #error ARM architecture version (CONFIG_ARMV*) not set or unsupported. 60 | #error ARM architecture version (CONFIG_ARMV*) not set or unsupported. 
| ^~~~~ ../../include/common/asm/atomic.h: In function 'atomic_add_return': ../../include/common/asm/atomic.h:81:9: error: implicit declaration of function 'smp_mb' [-Werror=implicit-function-declaration] 81 | smp_mb(); | ^~~~~~ Signed-off-by: Radostin Stoyanov --- scripts/build/Dockerfile.stable-cross.tmpl | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/build/Dockerfile.stable-cross.tmpl b/scripts/build/Dockerfile.stable-cross.tmpl index 6a68cd1ca..078372c38 100644 --- a/scripts/build/Dockerfile.stable-cross.tmpl +++ b/scripts/build/Dockerfile.stable-cross.tmpl @@ -21,7 +21,8 @@ RUN apt-install \ libprotobuf-c-dev:${DEBIAN_ARCH} \ libcap-dev:${DEBIAN_ARCH} \ libaio-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} + libnl-route-3-dev:${DEBIAN_ARCH} \ + libdrm-dev:${DEBIAN_ARCH} ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ @@ -39,4 +40,10 @@ ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . /criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) zdtm && date +# amdgpu_plugin with armv7 is not supported +RUN make mrproper && date && \ + make -j $(nproc) && \ + if [ "$SUBARCH" != "armv7" ]; then \ + make -j $(nproc) amdgpu_plugin; \ + fi && \ + make -j $(nproc) zdtm && date From 3e2ed18790c0d5812f9bdedbfe94812947894f6f Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 21 Jul 2024 12:02:40 +0100 Subject: [PATCH 471/775] plugins/amdgpu: use C99-standard types Co-developed-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/kfd_ioctl.h | 350 ++++++++++++++++++------------------- 1 file changed, 175 insertions(+), 175 deletions(-) diff --git a/plugins/amdgpu/kfd_ioctl.h b/plugins/amdgpu/kfd_ioctl.h index e1ebb75a3..1a3bcea95 100644 --- a/plugins/amdgpu/kfd_ioctl.h +++ b/plugins/amdgpu/kfd_ioctl.h @@ -39,8 +39,8 @@ #define KFD_IOCTL_MINOR_VERSION 8 struct kfd_ioctl_get_version_args { - __u32 major_version; /* from KFD */ - __u32 minor_version; /* from KFD */ + 
uint32_t major_version; /* from KFD */ + uint32_t minor_version; /* from KFD */ }; /* For kfd_ioctl_create_queue_args.queue_type. */ @@ -53,51 +53,51 @@ struct kfd_ioctl_get_version_args { #define KFD_MAX_QUEUE_PRIORITY 15 struct kfd_ioctl_create_queue_args { - __u64 ring_base_address; /* to KFD */ - __u64 write_pointer_address; /* from KFD */ - __u64 read_pointer_address; /* from KFD */ - __u64 doorbell_offset; /* from KFD */ + uint64_t ring_base_address; /* to KFD */ + uint64_t write_pointer_address; /* from KFD */ + uint64_t read_pointer_address; /* from KFD */ + uint64_t doorbell_offset; /* from KFD */ - __u32 ring_size; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 queue_type; /* to KFD */ - __u32 queue_percentage; /* to KFD */ - __u32 queue_priority; /* to KFD */ - __u32 queue_id; /* from KFD */ + uint32_t ring_size; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t queue_type; /* to KFD */ + uint32_t queue_percentage; /* to KFD */ + uint32_t queue_priority; /* to KFD */ + uint32_t queue_id; /* from KFD */ - __u64 eop_buffer_address; /* to KFD */ - __u64 eop_buffer_size; /* to KFD */ - __u64 ctx_save_restore_address; /* to KFD */ - __u32 ctx_save_restore_size; /* to KFD */ - __u32 ctl_stack_size; /* to KFD */ + uint64_t eop_buffer_address; /* to KFD */ + uint64_t eop_buffer_size; /* to KFD */ + uint64_t ctx_save_restore_address; /* to KFD */ + uint32_t ctx_save_restore_size; /* to KFD */ + uint32_t ctl_stack_size; /* to KFD */ }; struct kfd_ioctl_destroy_queue_args { - __u32 queue_id; /* to KFD */ - __u32 pad; + uint32_t queue_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_update_queue_args { - __u64 ring_base_address; /* to KFD */ + uint64_t ring_base_address; /* to KFD */ - __u32 queue_id; /* to KFD */ - __u32 ring_size; /* to KFD */ - __u32 queue_percentage; /* to KFD */ - __u32 queue_priority; /* to KFD */ + uint32_t queue_id; /* to KFD */ + uint32_t ring_size; /* to KFD */ + uint32_t queue_percentage; /* to KFD */ + uint32_t 
queue_priority; /* to KFD */ }; struct kfd_ioctl_set_cu_mask_args { - __u32 queue_id; /* to KFD */ - __u32 num_cu_mask; /* to KFD */ - __u64 cu_mask_ptr; /* to KFD */ + uint32_t queue_id; /* to KFD */ + uint32_t num_cu_mask; /* to KFD */ + uint64_t cu_mask_ptr; /* to KFD */ }; struct kfd_ioctl_get_queue_wave_state_args { - __u64 ctl_stack_address; /* to KFD */ - __u32 ctl_stack_used_size; /* from KFD */ - __u32 save_area_used_size; /* from KFD */ - __u32 queue_id; /* to KFD */ - __u32 pad; + uint64_t ctl_stack_address; /* to KFD */ + uint32_t ctl_stack_used_size; /* from KFD */ + uint32_t save_area_used_size; /* from KFD */ + uint32_t queue_id; /* to KFD */ + uint32_t pad; }; /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ @@ -105,13 +105,13 @@ struct kfd_ioctl_get_queue_wave_state_args { #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 struct kfd_ioctl_set_memory_policy_args { - __u64 alternate_aperture_base; /* to KFD */ - __u64 alternate_aperture_size; /* to KFD */ + uint64_t alternate_aperture_base; /* to KFD */ + uint64_t alternate_aperture_size; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 default_policy; /* to KFD */ - __u32 alternate_policy; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t default_policy; /* to KFD */ + uint32_t alternate_policy; /* to KFD */ + uint32_t pad; }; /* @@ -122,24 +122,24 @@ struct kfd_ioctl_set_memory_policy_args { */ struct kfd_ioctl_get_clock_counters_args { - __u64 gpu_clock_counter; /* from KFD */ - __u64 cpu_clock_counter; /* from KFD */ - __u64 system_clock_counter; /* from KFD */ - __u64 system_clock_freq; /* from KFD */ + uint64_t gpu_clock_counter; /* from KFD */ + uint64_t cpu_clock_counter; /* from KFD */ + uint64_t system_clock_counter; /* from KFD */ + uint64_t system_clock_freq; /* from KFD */ - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_process_device_apertures { - __u64 lds_base; /* from KFD */ - __u64 
lds_limit; /* from KFD */ - __u64 scratch_base; /* from KFD */ - __u64 scratch_limit; /* from KFD */ - __u64 gpuvm_base; /* from KFD */ - __u64 gpuvm_limit; /* from KFD */ - __u32 gpu_id; /* from KFD */ - __u32 pad; + uint64_t lds_base; /* from KFD */ + uint64_t lds_limit; /* from KFD */ + uint64_t scratch_base; /* from KFD */ + uint64_t scratch_limit; /* from KFD */ + uint64_t gpuvm_base; /* from KFD */ + uint64_t gpuvm_limit; /* from KFD */ + uint32_t gpu_id; /* from KFD */ + uint32_t pad; }; /* @@ -152,20 +152,20 @@ struct kfd_ioctl_get_process_apertures_args { struct kfd_process_device_apertures process_apertures[NUM_OF_SUPPORTED_GPUS]; /* from KFD */ /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */ - __u32 num_of_nodes; - __u32 pad; + uint32_t num_of_nodes; + uint32_t pad; }; struct kfd_ioctl_get_process_apertures_new_args { /* User allocated. Pointer to struct kfd_process_device_apertures * filled in by Kernel */ - __u64 kfd_process_device_apertures_ptr; + uint64_t kfd_process_device_apertures_ptr; /* to KFD - indicates amount of memory present in kfd_process_device_apertures_ptr * from KFD - Number of entries filled by KFD. 
*/ - __u32 num_of_nodes; - __u32 pad; + uint32_t num_of_nodes; + uint32_t pad; }; #define MAX_ALLOWED_NUM_POINTS 100 @@ -173,25 +173,25 @@ struct kfd_ioctl_get_process_apertures_new_args { #define MAX_ALLOWED_WAC_BUFF_SIZE 128 struct kfd_ioctl_dbg_register_args { - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_dbg_unregister_args { - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_dbg_address_watch_args { - __u64 content_ptr; /* a pointer to the actual content */ - __u32 gpu_id; /* to KFD */ - __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ + uint64_t content_ptr; /* a pointer to the actual content */ + uint32_t gpu_id; /* to KFD */ + uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ }; struct kfd_ioctl_dbg_wave_control_args { - __u64 content_ptr; /* a pointer to the actual content */ - __u32 gpu_id; /* to KFD */ - __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ + uint64_t content_ptr; /* a pointer to the actual content */ + uint32_t gpu_id; /* to KFD */ + uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ }; #define KFD_INVALID_FD 0xffffffff @@ -228,43 +228,43 @@ struct kfd_ioctl_dbg_wave_control_args { #define KFD_MEM_ERR_GPU_HANG 3 struct kfd_ioctl_create_event_args { - __u64 event_page_offset; /* from KFD */ - __u32 event_trigger_data; /* from KFD - signal events only */ - __u32 event_type; /* to KFD */ - __u32 auto_reset; /* to KFD */ - __u32 node_id; /* to KFD - only valid for certain event types */ - __u32 event_id; /* from KFD */ - __u32 event_slot_index; /* from KFD */ + uint64_t event_page_offset; /* from KFD */ + uint32_t event_trigger_data; /* from KFD - signal events only */ + uint32_t event_type; /* to KFD */ + uint32_t auto_reset; /* to KFD */ + uint32_t node_id; /* to KFD - only valid for certain event types */ + uint32_t event_id; /* from KFD */ + uint32_t event_slot_index; /* 
from KFD */ }; struct kfd_ioctl_destroy_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_set_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_reset_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_memory_exception_failure { - __u32 NotPresent; /* Page not present or supervisor privilege */ - __u32 ReadOnly; /* Write access to a read-only page */ - __u32 NoExecute; /* Execute access to a page marked NX */ - __u32 imprecise; /* Can't determine the exact fault address */ + uint32_t NotPresent; /* Page not present or supervisor privilege */ + uint32_t ReadOnly; /* Write access to a read-only page */ + uint32_t NoExecute; /* Execute access to a page marked NX */ + uint32_t imprecise; /* Can't determine the exact fault address */ }; /* memory exception data */ struct kfd_hsa_memory_exception_data { struct kfd_memory_exception_failure failure; - __u64 va; - __u32 gpu_id; - __u32 ErrorType; /* 0 = no RAS error, + uint64_t va; + uint32_t gpu_id; + uint32_t ErrorType; /* 0 = no RAS error, * 1 = ECC_SRAM, * 2 = Link_SYNFLOOD (poison), * 3 = GPU hang (not attributable to a specific cause), @@ -274,10 +274,10 @@ struct kfd_hsa_memory_exception_data { /* hw exception data */ struct kfd_hsa_hw_exception_data { - __u32 reset_type; - __u32 reset_cause; - __u32 memory_lost; - __u32 gpu_id; + uint32_t reset_type; + uint32_t reset_cause; + uint32_t memory_lost; + uint32_t gpu_id; }; /* Event data */ @@ -286,57 +286,57 @@ struct kfd_event_data { struct kfd_hsa_memory_exception_data memory_exception_data; struct kfd_hsa_hw_exception_data hw_exception_data; }; /* From KFD */ - __u64 kfd_event_data_ext; /* pointer to an extension structure for future exception types */ - __u32 event_id; /* to KFD */ - __u32 pad; + uint64_t kfd_event_data_ext; /* pointer to an 
extension structure for future exception types */ + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_wait_events_args { - __u64 events_ptr; /* pointed to struct kfd_event_data array, to KFD */ - __u32 num_events; /* to KFD */ - __u32 wait_for_all; /* to KFD */ - __u32 timeout; /* to KFD */ - __u32 wait_result; /* from KFD */ + uint64_t events_ptr; /* pointed to struct kfd_event_data array, to KFD */ + uint32_t num_events; /* to KFD */ + uint32_t wait_for_all; /* to KFD */ + uint32_t timeout; /* to KFD */ + uint32_t wait_result; /* from KFD */ }; struct kfd_ioctl_set_scratch_backing_va_args { - __u64 va_addr; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint64_t va_addr; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_get_tile_config_args { /* to KFD: pointer to tile array */ - __u64 tile_config_ptr; + uint64_t tile_config_ptr; /* to KFD: pointer to macro tile array */ - __u64 macro_tile_config_ptr; + uint64_t macro_tile_config_ptr; /* to KFD: array size allocated by user mode * from KFD: array size filled by kernel */ - __u32 num_tile_configs; + uint32_t num_tile_configs; /* to KFD: array size allocated by user mode * from KFD: array size filled by kernel */ - __u32 num_macro_tile_configs; + uint32_t num_macro_tile_configs; - __u32 gpu_id; /* to KFD */ - __u32 gb_addr_config; /* from KFD */ - __u32 num_banks; /* from KFD */ - __u32 num_ranks; /* from KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t gb_addr_config; /* from KFD */ + uint32_t num_banks; /* from KFD */ + uint32_t num_ranks; /* from KFD */ /* struct size can be extended later if needed without breaking ABI compatibility */ }; struct kfd_ioctl_set_trap_handler_args { - __u64 tba_addr; /* to KFD */ - __u64 tma_addr; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint64_t tba_addr; /* to KFD */ + uint64_t tma_addr; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_acquire_vm_args { - __u32 drm_fd; /* 
to KFD */ - __u32 gpu_id; /* to KFD */ + uint32_t drm_fd; /* to KFD */ + uint32_t gpu_id; /* to KFD */ }; /* Allocation flags: memory types */ @@ -367,12 +367,12 @@ struct kfd_ioctl_acquire_vm_args { * @flags: memory type and attributes. See KFD_IOC_ALLOC_MEM_FLAGS above */ struct kfd_ioctl_alloc_memory_of_gpu_args { - __u64 va_addr; /* to KFD */ - __u64 size; /* to KFD */ - __u64 handle; /* from KFD */ - __u64 mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */ - __u32 gpu_id; /* to KFD */ - __u32 flags; + uint64_t va_addr; /* to KFD */ + uint64_t size; /* to KFD */ + uint64_t handle; /* from KFD */ + uint64_t mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */ + uint32_t gpu_id; /* to KFD */ + uint32_t flags; }; /* Free memory allocated with kfd_ioctl_alloc_memory_of_gpu @@ -380,13 +380,13 @@ struct kfd_ioctl_alloc_memory_of_gpu_args { * @handle: memory handle returned by alloc */ struct kfd_ioctl_free_memory_of_gpu_args { - __u64 handle; /* to KFD */ + uint64_t handle; /* to KFD */ }; /* Map memory to one or more GPUs * * @handle: memory handle returned by alloc - * @device_ids_array_ptr: array of gpu_ids (__u32 per device) + * @device_ids_array_ptr: array of gpu_ids (uint32_t per device) * @n_devices: number of devices in the array * @n_success: number of devices mapped successfully * @@ -399,10 +399,10 @@ struct kfd_ioctl_free_memory_of_gpu_args { * n_devices. 
*/ struct kfd_ioctl_map_memory_to_gpu_args { - __u64 handle; /* to KFD */ - __u64 device_ids_array_ptr; /* to KFD */ - __u32 n_devices; /* to KFD */ - __u32 n_success; /* to/from KFD */ + uint64_t handle; /* to KFD */ + uint64_t device_ids_array_ptr; /* to KFD */ + uint32_t n_devices; /* to KFD */ + uint32_t n_success; /* to/from KFD */ }; /* Unmap memory from one or more GPUs @@ -410,10 +410,10 @@ struct kfd_ioctl_map_memory_to_gpu_args { * same arguments as for mapping */ struct kfd_ioctl_unmap_memory_from_gpu_args { - __u64 handle; /* to KFD */ - __u64 device_ids_array_ptr; /* to KFD */ - __u32 n_devices; /* to KFD */ - __u32 n_success; /* to/from KFD */ + uint64_t handle; /* to KFD */ + uint64_t device_ids_array_ptr; /* to KFD */ + uint32_t n_devices; /* to KFD */ + uint32_t n_success; /* to/from KFD */ }; /* Allocate GWS for specific queue @@ -424,28 +424,28 @@ struct kfd_ioctl_unmap_memory_from_gpu_args { * only support contiguous GWS allocation */ struct kfd_ioctl_alloc_queue_gws_args { - __u32 queue_id; /* to KFD */ - __u32 num_gws; /* to KFD */ - __u32 first_gws; /* from KFD */ - __u32 pad; + uint32_t queue_id; /* to KFD */ + uint32_t num_gws; /* to KFD */ + uint32_t first_gws; /* from KFD */ + uint32_t pad; }; struct kfd_ioctl_get_dmabuf_info_args { - __u64 size; /* from KFD */ - __u64 metadata_ptr; /* to KFD */ - __u32 metadata_size; /* to KFD (space allocated by user) + uint64_t size; /* from KFD */ + uint64_t metadata_ptr; /* to KFD */ + uint32_t metadata_size; /* to KFD (space allocated by user) * from KFD (actual metadata size) */ - __u32 gpu_id; /* from KFD */ - __u32 flags; /* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */ - __u32 dmabuf_fd; /* to KFD */ + uint32_t gpu_id; /* from KFD */ + uint32_t flags; /* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */ + uint32_t dmabuf_fd; /* to KFD */ }; struct kfd_ioctl_import_dmabuf_args { - __u64 va_addr; /* to KFD */ - __u64 handle; /* from KFD */ - __u32 gpu_id; /* to KFD */ - __u32 dmabuf_fd; /* to KFD */ + uint64_t 
va_addr; /* to KFD */ + uint64_t handle; /* from KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t dmabuf_fd; /* to KFD */ }; /* @@ -463,8 +463,8 @@ enum kfd_smi_event { #define KFD_SMI_EVENT_MSG_SIZE 96 struct kfd_ioctl_smi_events_args { - __u32 gpuid; /* to KFD */ - __u32 anon_fd; /* from KFD */ + uint32_t gpuid; /* to KFD */ + uint32_t anon_fd; /* from KFD */ }; /************************************************************************************************** @@ -510,33 +510,33 @@ enum kfd_criu_op { * Return: 0 on success, -errno on failure */ struct kfd_ioctl_criu_args { - __u64 devices; /* Used during ops: CHECKPOINT, RESTORE */ - __u64 bos; /* Used during ops: CHECKPOINT, RESTORE */ - __u64 priv_data; /* Used during ops: CHECKPOINT, RESTORE */ - __u64 priv_data_size; /* Used during ops: PROCESS_INFO, RESTORE */ - __u32 num_devices; /* Used during ops: PROCESS_INFO, RESTORE */ - __u32 num_bos; /* Used during ops: PROCESS_INFO, RESTORE */ - __u32 num_objects; /* Used during ops: PROCESS_INFO, RESTORE */ - __u32 pid; /* Used during ops: PROCESS_INFO, RESUME */ - __u32 op; + uint64_t devices; /* Used during ops: CHECKPOINT, RESTORE */ + uint64_t bos; /* Used during ops: CHECKPOINT, RESTORE */ + uint64_t priv_data; /* Used during ops: CHECKPOINT, RESTORE */ + uint64_t priv_data_size; /* Used during ops: PROCESS_INFO, RESTORE */ + uint32_t num_devices; /* Used during ops: PROCESS_INFO, RESTORE */ + uint32_t num_bos; /* Used during ops: PROCESS_INFO, RESTORE */ + uint32_t num_objects; /* Used during ops: PROCESS_INFO, RESTORE */ + uint32_t pid; /* Used during ops: PROCESS_INFO, RESUME */ + uint32_t op; }; struct kfd_criu_device_bucket { - __u32 user_gpu_id; - __u32 actual_gpu_id; - __u32 drm_fd; - __u32 pad; + uint32_t user_gpu_id; + uint32_t actual_gpu_id; + uint32_t drm_fd; + uint32_t pad; }; struct kfd_criu_bo_bucket { - __u64 addr; - __u64 size; - __u64 offset; - __u64 restored_offset; /* During restore, updated offset for BO */ - __u32 gpu_id; /* This is the 
user_gpu_id */ - __u32 alloc_flags; - __u32 dmabuf_fd; - __u32 pad; + uint64_t addr; + uint64_t size; + uint64_t offset; + uint64_t restored_offset; /* During restore, updated offset for BO */ + uint32_t gpu_id; /* This is the user_gpu_id */ + uint32_t alloc_flags; + uint32_t dmabuf_fd; + uint32_t pad; }; /* CRIU IOCTLs - END */ @@ -616,8 +616,8 @@ enum kfd_ioctl_svm_attr_type { * @value: attribute value */ struct kfd_ioctl_svm_attribute { - __u32 type; - __u32 value; + uint32_t type; + uint32_t value; }; /** @@ -659,10 +659,10 @@ struct kfd_ioctl_svm_attribute { * attribute type to indicate the access for the specified GPU. */ struct kfd_ioctl_svm_args { - __u64 start_addr; - __u64 size; - __u32 op; - __u32 nattr; + uint64_t start_addr; + uint64_t size; + uint32_t op; + uint32_t nattr; /* Variable length array of attributes */ struct kfd_ioctl_svm_attribute attrs[0]; }; From 21ea718f9ffe70fa5fe2089a8fa5b39f101d740a Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 10 Jul 2024 04:36:18 +0100 Subject: [PATCH 472/775] plugins/amdgpu: fix printf format specifiers Errors on aarch64: In file included from amdgpu_plugin_drm.h:10, from amdgpu_plugin.c:33: amdgpu_plugin.c: In function 'amdgpu_plugin_dump_file': amdgpu_plugin_util.h:24:20: error: format '%lld' expects argument of type 'long long int', but argument 6 has type '__u64' {aka 'long unsigned int'} [-Werror=format=] 24 | #define LOG_PREFIX "amdgpu_plugin: " | ^~~~~~~~~~~~~~~~~ ../../criu/include/log.h:47:52: note: in expansion of macro 'LOG_PREFIX' 47 | #define pr_info(fmt, ...) 
print_on_level(LOG_INFO, LOG_PREFIX fmt, ##__VA_ARGS__) | ^~~~~~~~~~ amdgpu_plugin.c:1236:9: note: in expansion of macro 'pr_info' 1236 | pr_info("devices:%d bos:%d objects:%d priv_data:%lld\n", args.num_devices, args.num_bos, args.num_objects, | ^~~~~~~ cc1: all warnings being treated as errors Errors on ppc64: In file included from amdgpu_plugin_drm.h:10, from amdgpu_plugin.c:33: amdgpu_plugin.c: In function 'amdgpu_plugin_dump_file': amdgpu_plugin_util.h:24:20: error: format '%llu' expects argument of type 'long long unsigned int', but argument 6 has type '__u64' {aka 'long unsigned int'} [-Werror=format=] 24 | #define LOG_PREFIX "amdgpu_plugin: " | ^~~~~~~~~~~~~~~~~ ../../criu/include/log.h:47:52: note: in expansion of macro 'LOG_PREFIX' 47 | #define pr_info(fmt, ...) print_on_level(LOG_INFO, LOG_PREFIX fmt, ##__VA_ARGS__) | ^~~~~~~~~~ amdgpu_plugin.c:1236:9: note: in expansion of macro 'pr_info' 1236 | pr_info("devices:%u bos:%u objects:%u priv_data:%llu\n", | ^~~~~~~ cc1: all warnings being treated as errors In file included from amdgpu_plugin_util.c:38: amdgpu_plugin_util.c: In function 'print_kfd_bo_stat': amdgpu_plugin_util.h:24:20: error: format '%llx' expects argument of type 'long long unsigned int', but argument 5 has type '__u64' {aka 'long unsigned int'} [-Werror=format=] 24 | #define LOG_PREFIX "amdgpu_plugin: " | ^~~~~~~~~~~~~~~~~ ../../criu/include/log.h:47:52: note: in expansion of macro 'LOG_PREFIX' 47 | #define pr_info(fmt, ...) print_on_level(LOG_INFO, LOG_PREFIX fmt, ##__VA_ARGS__) | ^~~~~~~~~~ amdgpu_plugin_util.c:196:17: note: in expansion of macro 'pr_info' 196 | pr_info("%s(), %d. 
KFD BO Addr: %llx \n", __func__, idx, bo->addr); | ^~~~~~~ amdgpu_plugin_util.h:24:20: error: format '%llx' expects argument of type 'long long unsigned int', but argument 5 has type '__u64' {aka 'long unsigned int'} [-Werror=format=] 24 | #define LOG_PREFIX "amdgpu_plugin: " | ^~~~~~~~~~~~~~~~~ ../../criu/include/log.h:47:52: note: in expansion of macro 'LOG_PREFIX' 47 | #define pr_info(fmt, ...) print_on_level(LOG_INFO, LOG_PREFIX fmt, ##__VA_ARGS__) | ^~~~~~~~~~ amdgpu_plugin_util.c:197:17: note: in expansion of macro 'pr_info' 197 | pr_info("%s(), %d. KFD BO Size: %llx \n", __func__, idx, bo->size); | ^~~~~~~ amdgpu_plugin_util.h:24:20: error: format '%llx' expects argument of type 'long long unsigned int', but argument 5 has type '__u64' {aka 'long unsigned int'} [-Werror=format=] 24 | #define LOG_PREFIX "amdgpu_plugin: " | ^~~~~~~~~~~~~~~~~ ../../criu/include/log.h:47:52: note: in expansion of macro 'LOG_PREFIX' 47 | #define pr_info(fmt, ...) print_on_level(LOG_INFO, LOG_PREFIX fmt, ##__VA_ARGS__) | ^~~~~~~~~~ amdgpu_plugin_util.c:198:17: note: in expansion of macro 'pr_info' 198 | pr_info("%s(), %d. KFD BO Offset: %llx \n", __func__, idx, bo->offset); | ^~~~~~~ amdgpu_plugin_util.h:24:20: error: format '%llx' expects argument of type 'long long unsigned int', but argument 5 has type '__u64' {aka 'long unsigned int'} [-Werror=format=] 24 | #define LOG_PREFIX "amdgpu_plugin: " | ^~~~~~~~~~~~~~~~~ ../../criu/include/log.h:47:52: note: in expansion of macro 'LOG_PREFIX' 47 | #define pr_info(fmt, ...) print_on_level(LOG_INFO, LOG_PREFIX fmt, ##__VA_ARGS__) | ^~~~~~~~~~ amdgpu_plugin_util.c:199:17: note: in expansion of macro 'pr_info' 199 | pr_info("%s(), %d. 
KFD BO Restored Offset: %llx \n", __func__, idx, bo->restored_offset); | ^~~~~~~ cc1: all warnings being treated as errors Co-developed-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 4 ++-- plugins/amdgpu/amdgpu_plugin_util.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index b73b5101d..707aea5a9 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1233,8 +1233,8 @@ int amdgpu_plugin_dump_file(int fd, int id) goto exit; } - pr_info("devices:%d bos:%d objects:%d priv_data:%lld\n", args.num_devices, args.num_bos, args.num_objects, - args.priv_data_size); + pr_info("devices:%" PRIu32 " bos:%" PRIu32 " objects:%" PRIu32 " priv_data:%" PRIu64 "\n", + args.num_devices, args.num_bos, args.num_objects, args.priv_data_size); e = xmalloc(sizeof(*e)); if (!e) { diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c index 62e569fc8..a165fc9cd 100755 --- a/plugins/amdgpu/amdgpu_plugin_util.c +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -193,10 +193,10 @@ void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list) for (int idx = 0; idx < bo_cnt; idx++) { bo = &bo_list[idx]; pr_info("\n"); - pr_info("%s(), %d. KFD BO Addr: %llx \n", __func__, idx, bo->addr); - pr_info("%s(), %d. KFD BO Size: %llx \n", __func__, idx, bo->size); - pr_info("%s(), %d. KFD BO Offset: %llx \n", __func__, idx, bo->offset); - pr_info("%s(), %d. KFD BO Restored Offset: %llx \n", __func__, idx, bo->restored_offset); + pr_info("%s(), %d. KFD BO Addr: %" PRIx64 " \n", __func__, idx, bo->addr); + pr_info("%s(), %d. KFD BO Size: %" PRIx64 " \n", __func__, idx, bo->size); + pr_info("%s(), %d. KFD BO Offset: %" PRIx64 " \n", __func__, idx, bo->offset); + pr_info("%s(), %d. KFD BO Restored Offset: %" PRIx64 " \n", __func__, idx, bo->restored_offset); pr_info("%s(), %d. 
KFD BO Alloc Flags: %x \n", __func__, idx, bo->alloc_flags); pr_info("%s(), %d. KFD BO Gpu ID: %x \n", __func__, idx, bo->gpu_id); pr_info("%s(), %d. KFD BO Dmabuf FD: %x \n", __func__, idx, bo->dmabuf_fd); From 615ccf98cf82f649851a37d37eb5e58d6d17bb53 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 5 Jul 2024 08:22:55 +0000 Subject: [PATCH 473/775] crit: do not crash on aarch64 doing 'crit x ./ rss' Running 'crit x ./ rss' on aarch64 crashes with: File "/home/criu/crit/crit/__main__.py", line 331, in explore_rss while vmas[vmi]['start'] < pme: ~~~~^^^^^ IndexError: list index out of range This adds an additional check to the while loop to do access indexes out of range. Signed-off-by: Adrian Reber --- crit/crit/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crit/crit/__main__.py b/crit/crit/__main__.py index e15327f50..bce523445 100755 --- a/crit/crit/__main__.py +++ b/crit/crit/__main__.py @@ -323,12 +323,12 @@ def explore_rss(opts): pvmi = -1 for pm in pms[1:]: pstr = '\t%lx / %-8d' % (pm['vaddr'], pm['nr_pages']) - while vmas[vmi]['end'] <= pm['vaddr']: + while vmi < len(vmas) and vmas[vmi]['end'] <= pm['vaddr']: vmi += 1 pme = pm['vaddr'] + (pm['nr_pages'] << 12) vstr = '' - while vmas[vmi]['start'] < pme: + while vmi < len(vmas) and vmas[vmi]['start'] < pme: vma = vmas[vmi] if vmi == pvmi: vstr += ' ~' From 01c65732b631fa0c61dff8d59d56bb13ab252e3d Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 8 Jul 2024 16:50:23 +0000 Subject: [PATCH 474/775] test: better test for SELinux tools Previously the check was just if /sys/fs/selinux is mounted. This extends the check to see if all necessary tools are installed. 
Signed-off-by: Adrian Reber --- test/zdtm/static/selinux00.checkskip | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/zdtm/static/selinux00.checkskip b/test/zdtm/static/selinux00.checkskip index 8d946a75e..4c85647d1 100755 --- a/test/zdtm/static/selinux00.checkskip +++ b/test/zdtm/static/selinux00.checkskip @@ -2,6 +2,19 @@ test -d /sys/fs/selinux || exit 1 +# check if necessary commands are installed +if ! command -v setenforce &>/dev/null; then + exit 1 +fi + +if ! command -v setsebool &>/dev/null; then + exit 1 +fi + +if ! command -v getsebool &>/dev/null; then + exit 1 +fi + # See selinux00.hook for details getsebool unconfined_dyntrans_all > /dev/null 2>&1 From d44fc0de5a314629e305507d3d09a17d372c47d8 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 8 Jul 2024 16:51:28 +0000 Subject: [PATCH 475/775] test: only run macvlan tests if macvlan devices can be created Some test environments (Actuated runners for example) do not support maclvan devices. Skip tests depending on it automatically. Signed-off-by: Adrian Reber --- test/others/ns_ext/run.sh | 5 ++++ test/zdtm/static/macvlan.checkskip | 38 ++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100755 test/zdtm/static/macvlan.checkskip diff --git a/test/others/ns_ext/run.sh b/test/others/ns_ext/run.sh index 4ebe3e280..5d1e139d7 100755 --- a/test/others/ns_ext/run.sh +++ b/test/others/ns_ext/run.sh @@ -2,6 +2,11 @@ set -x +if ! ../../zdtm/static/macvlan.checkskip; then + echo "No macvlan support. Skipping" + exit 0 +fi + if [[ "$1" == "pid" ]]; then NS=pid else diff --git a/test/zdtm/static/macvlan.checkskip b/test/zdtm/static/macvlan.checkskip new file mode 100755 index 000000000..f4e060953 --- /dev/null +++ b/test/zdtm/static/macvlan.checkskip @@ -0,0 +1,38 @@ +#!/bin/bash + +FAIL=0 + +create_macvlan_device() { + if ! ip link add test_mvlan1 type veth >/dev/null 2>&1; then + FAIL=1 + fi + if ! 
ip link add mymacvlan1 link test_mvlan1 type macvlan >/dev/null 2>&1; then + FAIL=1 + fi + + return "${FAIL}" +} + +cleanup() { + ip link del test_mvlan1 >/dev/null 2>&1 + ip link del mymacvlan1 >/dev/null 2>&1 +} + +trap "cleanup" QUIT TERM INT HUP EXIT + +# Test once without loading the module +if create_macvlan_device; then + exit 0 +fi + +# Test once more with explicitly loading the module +if ! modprobe macvlan >/dev/null 2>&1; then + exit 1 +fi +create_macvlan_device + +if [ "${FAIL}" == "1" ]; then + exit 1 +fi + +exit 0 From 8beac656fccac38d779d672ed9510d57c6fc73e2 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 8 Jul 2024 17:02:09 +0000 Subject: [PATCH 476/775] coredump: fail on unsupported architectures early Currently coredump only works on x86_64. Fail early on any other architecture. Signed-off-by: Adrian Reber --- coredump/coredump | 5 +++++ test/others/criu-coredump/test.sh | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/coredump/coredump b/coredump/coredump index f70d37c13..3fbdafe81 100755 --- a/coredump/coredump +++ b/coredump/coredump @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import platform import argparse import os import sys @@ -36,6 +37,10 @@ def main(): opts = vars(parser.parse_args()) + if platform.machine() != 'x86_64': + print('ERROR: %s only supported on x86_64' % sys.argv[0]) + sys.exit(1) + try: coredump(opts) except SystemExit as error: diff --git a/test/others/criu-coredump/test.sh b/test/others/criu-coredump/test.sh index eec2b817f..4399044d7 100755 --- a/test/others/criu-coredump/test.sh +++ b/test/others/criu-coredump/test.sh @@ -43,5 +43,13 @@ function run_test { echo "= done" } +UNAME_M=$(uname -m) + +if [ "$UNAME_M" != "x86_64" ]; then + # the criu-coredump script is only x86_64 aware + echo "criu-coredump only support x86_64. skipping." 
+ exit 0 +fi + gen_imgs run_test From dbfa450246dd39f3b80aa1a5ef15c077809abe68 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 4 Jul 2024 12:09:06 +0000 Subject: [PATCH 477/775] ci: run aarch64 tests native via actuated Signed-off-by: Adrian Reber --- .github/workflows/actuated-aarch64-test.yaml | 52 ++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 .github/workflows/actuated-aarch64-test.yaml diff --git a/.github/workflows/actuated-aarch64-test.yaml b/.github/workflows/actuated-aarch64-test.yaml new file mode 100644 index 000000000..8b0a63fc7 --- /dev/null +++ b/.github/workflows/actuated-aarch64-test.yaml @@ -0,0 +1,52 @@ +name: Actuated aarch64 test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: actuated-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + # Actuated runners are not available in all repositories. + if: ${{ github.repository == 'checkpoint-restore/criu' }} + # The memory size and the number of CPUs can be freely selected. + # 3GB and 4 CPUs seems to be enough according to the result from 'vmmeter'. 
+ runs-on: actuated-arm64-4cpu-3gb + strategy: + matrix: + target: [GCC=1, CLANG=1] + + steps: + # https://gist.github.com/alexellis/1f33e581c75e11e161fe613c46180771#file-metering-gha-md + # vmmeter start + - name: Prepare arkade + uses: alexellis/arkade-get@master + with: + crane: latest + print-summary: false + + - name: Install vmmeter + run: | + crane export --platform linux/arm64 ghcr.io/openfaasltd/vmmeter:latest | sudo tar -xvf - -C /usr/local/bin + + - name: Run vmmeter + uses: self-actuated/vmmeter-action@master + # vmmeter end + + - uses: actions/checkout@v4 + - name: Run Tests ${{ matrix.target }} + # Following tests are failing on the actuated VMs: + # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out + # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) + # + # In combination with '--remote-lazy-pages' following error occurs: + # 138: FAIL: maps05.c:84: Data corrupted at page 1639 (errno = 11 (Resource temporarily unavailable)) + run: | + # The 'sched_policy00' needs the following: + sudo sysctl -w kernel.sched_rt_runtime_us=-1 + # etc/hosts entry is needed for netns_lock_iptables + echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts + sudo -E make -C scripts/ci local ${{ matrix.target }} RUN_TESTS=1 \ + ZDTM_OPTS="-x zdtm/static/change_mnt_context -x zdtm/static/maps05" From b1b3c14b179053a7c56b7d3449fc104ea7cca274 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 16 Aug 2024 22:15:20 +0100 Subject: [PATCH 478/775] cuda: unlock on timeout error When attempting to checkpoint a container with CUDA processes, CRIU could fail with the following error: Error (criu/cr-dump.c:1791): Timeout reached. 
Try to interrupt: 1 Error (cuda_plugin.c:143): cuda_plugin: Unable to read output of cuda-checkpoint: Interrupted system call Error (cuda_plugin.c:384): cuda_plugin: PAUSE_DEVICES failed with In this situation, the target process is locked, but CRIU fails due to a timeout and exits with an error. We need to make sure that the target PID is unlocked in such case. Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 39c78e370..174545476 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -4,6 +4,7 @@ #include "cr_options.h" #include "pid.h" #include "proc_parse.h" +#include "seize.h" #include #include @@ -379,18 +380,23 @@ int cuda_plugin_pause_devices(int pid) int status = cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf)); if (status) { pr_err("PAUSE_DEVICES failed with %s\n", msg_buf); - return -1; - } - if (add_pid_to_buf(&cuda_pids, pid)) { - pr_err("unable to track paused pid %d\n", pid); - status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); - if (status) { - pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid); - } + if (alarm_timeouted()) + goto unlock; return -1; } + if (add_pid_to_buf(&cuda_pids, pid)) { + pr_err("unable to track paused pid %d\n", pid); + goto unlock; + } + return 0; +unlock: + status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid); + } + return -1; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices) From edb6fbb820ca1afcf6264edeacd3207dfc2d721c Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 17 Aug 2024 14:30:21 +0100 Subject: [PATCH 479/775] scripts/uninstall_module: fix 
package discovery The `uninstall_module.py` script is a wrapper for the `pip uninstall` command that enables support for specifying installation prefix (i.e., `--prefix`). When this functionality is used, we intentionally set `sys.path` to include only search paths for the specified prefix to avoid unintentional uninstallation of packages in system paths. Since `importlib_metadata` version 8.1.0, the `Distribution.from_name()` method has been modified [1] to perform additional pre-processing of Distribution objects [2] that requires loading distribution metadata and results in the following error: File "/usr/local/lib/python3.12/site-packages/importlib_metadata/__init__.py", line 422, in buckets = bucket(dists, lambda dist: bool(dist.metadata)) ^^^^^^^^^^^^^ File "/usr/local/lib/python3.12/site-packages/importlib_metadata/__init__.py", line 454, in metadata from . import _adapters File "/usr/local/lib/python3.12/site-packages/importlib_metadata/_adapters.py", line 3, in import email.message File "/usr/lib64/python3.12/email/message.py", line 11, in import quopri ModuleNotFoundError: No module named 'quopri' This error occurs because we have excluded system paths from the list of search paths (`sys.path`). However, this pre-processing is not required for our use case, as we only use the discovery mechanism of importlib_metadata to resolve the metadata directory path of the module being uninstalled. To fix this problem, this patch updates `uninstall_module` to avoid the `from_name()` method and use `discover(name=package_name)` directly. 
[1] https://github.com/python/importlib_metadata/commit/a65c29adc027b3615154cab73aaedd58a6aa23da [2] https://github.com/python/importlib_metadata/blob/a65c29ad/importlib_metadata/__init__.py#L391 Fixes: #2468 Signed-off-by: Radostin Stoyanov --- scripts/uninstall_module.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/uninstall_module.py b/scripts/uninstall_module.py index 439fca18a..8a9b70892 100755 --- a/scripts/uninstall_module.py +++ b/scripts/uninstall_module.py @@ -38,8 +38,9 @@ def uninstall_module(package_name: str, prefix=None): if prefix: add_site_dir(prefix) try: - dist_info_path = str(importlib_metadata.distribution(package_name)._path) - except importlib_metadata.PackageNotFoundError: + distribution = next(importlib_metadata.Distribution.discover(name=package_name)) + dist_info_path = str(distribution._path) + except StopIteration: print(f"Skipping {package_name} as it is not installed.") sys.exit(0) From 59f49c6276090ff7b8de19b2fcabbd968fc77f0d Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 12 Sep 2024 14:30:30 +0100 Subject: [PATCH 480/775] codespell: fix typos This patch fixes the following typos reported by codespell: ./test/others/bers/bers.c:394: dependin ==> depending, depend in ./criu/kerndat.c:837: hitted ==> hit Signed-off-by: Radostin Stoyanov --- criu/kerndat.c | 2 +- test/others/bers/bers.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index 1a584fe92..fa1ed21fa 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -834,7 +834,7 @@ static int kerndat_detect_stack_guard_gap(void) * (see kernel commit 1be7107fbe18ee). * * Same time there was semi-complete - * patch released which hitted a number + * patch released which hit a number * of repos (Ubuntu, Fedora) where instead * of PAGE_SIZE the 1M gap is cut off. 
*/ diff --git a/test/others/bers/bers.c b/test/others/bers/bers.c index 37cf84dd3..b291e3bcb 100644 --- a/test/others/bers/bers.c +++ b/test/others/bers/bers.c @@ -391,7 +391,7 @@ usage: pr_msg(" -f|--files create files for each task\n"); pr_msg(" -m|--memory allocate megabytes for each task\n"); pr_msg(" --memory-chunks split memory to equal parts\n"); - pr_msg(" --mem-fill fill memory with data dependin on :\n"); + pr_msg(" --mem-fill fill memory with data depending on :\n"); pr_msg(" all fill every byte of memory\n"); pr_msg(" light fill first bytes of every page\n"); pr_msg(" dirtify fill every page\n"); From 651df375bd3cbacd27bde44546f19f29546db576 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 11 Sep 2024 17:37:45 -0700 Subject: [PATCH 481/775] criu: Allow disabling freeze cgroups Some plugins (e.g., CUDA) may not function correctly when processes are frozen using cgroups. This change introduces a mechanism to disable the use of freeze cgroups during process seizing, even if explicitly requested via the --freeze-cgroup option. The CUDA plugin is updated to utilize this new mechanism to ensure compatibility. 
Signed-off-by: Andrei Vagin --- criu/include/seize.h | 1 + criu/seize.c | 66 +++++++++++++++++++++++++++++++------- plugins/cuda/cuda_plugin.c | 2 ++ 3 files changed, 58 insertions(+), 11 deletions(-) diff --git a/criu/include/seize.h b/criu/include/seize.h index 4545bf262..3225029dd 100644 --- a/criu/include/seize.h +++ b/criu/include/seize.h @@ -8,5 +8,6 @@ extern bool alarm_timeouted(void); extern char *task_comm_info(pid_t pid, char *comm, size_t size); extern char *__task_comm_info(pid_t pid); +extern void dont_use_freeze_cgroup(void); #endif diff --git a/criu/seize.c b/criu/seize.c index ba26072e6..edeb57cc8 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -25,6 +25,19 @@ #include "xmalloc.h" #include "util.h" +static bool freeze_cgroup_disabled; + +/* + * Disables the use of freeze cgroups for process seizing, even if explicitly + * requested via the --freeze-cgroup option. This is necessary for plugins + * (e.g., CUDA) that do not function correctly when processes are frozen using + * cgroups. 
+ */ +void __attribute__((used)) dont_use_freeze_cgroup(void) +{ + freeze_cgroup_disabled = true; +} + char *task_comm_info(pid_t pid, char *comm, size_t size) { bool is_read = false; @@ -397,7 +410,7 @@ static int freezer_detach(void) { int i; - if (!opts.freeze_cgroup) + if (!opts.freeze_cgroup || freeze_cgroup_disabled) return 0; for (i = 0; i < processes_to_wait && processes_to_wait_pids; i++) { @@ -492,6 +505,31 @@ static int log_unfrozen_stacks(char *root) return 0; } +static int check_freezer_cgroup(void) +{ + enum freezer_state state = THAWED; + int fd; + + BUG_ON(!freeze_cgroup_disabled); + + fd = freezer_open(); + if (fd < 0) + return -1; + + state = get_freezer_state(fd); + close(fd); + if (state == FREEZER_ERROR) { + return -1; + } + + if (state != THAWED) { + pr_err("One or more plugins are incompatible with the freezer cgroup in the FROZEN state.\n"); + return -1; + } + + return 0; +} + static int freeze_processes(void) { int fd, exit_code = -1; @@ -643,7 +681,7 @@ static int collect_children(struct pstree_item *item) goto free; } - if (!opts.freeze_cgroup) + if (!opts.freeze_cgroup || freeze_cgroup_disabled) /* fails when meets a zombie */ __ignore_value(compel_interrupt_task(pid)); @@ -831,7 +869,8 @@ static int collect_threads(struct pstree_item *item) pr_info("\tSeizing %d's %d thread\n", item->pid->real, pid); - if (!opts.freeze_cgroup && compel_interrupt_task(pid)) + if ((!opts.freeze_cgroup || freeze_cgroup_disabled) && + compel_interrupt_task(pid)) continue; ret = compel_wait_task(pid, item_ppid(item), parse_pid_status, NULL, &t_creds.s, NULL); @@ -887,7 +926,7 @@ static int collect_loop(struct pstree_item *item, int (*collect)(struct pstree_i { int attempts = NR_ATTEMPTS, nr_inprogress = 1; - if (opts.freeze_cgroup) + if (opts.freeze_cgroup && !freeze_cgroup_disabled) attempts = 1; /* @@ -993,12 +1032,16 @@ int collect_pstree(void) pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 
2 : 1); - if (opts.freeze_cgroup && freeze_processes()) - goto err; - - if (!opts.freeze_cgroup && compel_interrupt_task(pid)) { - set_cr_errno(ESRCH); - goto err; + if (opts.freeze_cgroup && !freeze_cgroup_disabled) { + if (freeze_processes()) + goto err; + } else { + if (opts.freeze_cgroup && check_freezer_cgroup()) + goto err; + if (compel_interrupt_task(pid)) { + set_cr_errno(ESRCH); + goto err; + } } ret = compel_wait_task(pid, -1, parse_pid_status, NULL, &creds.s, NULL); @@ -1024,7 +1067,8 @@ int collect_pstree(void) if (ret < 0) goto err; - if (opts.freeze_cgroup && freezer_wait_processes()) { + if (opts.freeze_cgroup && !freeze_cgroup_disabled && + freezer_wait_processes()) { ret = -1; goto err; } diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 174545476..04d70b114 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -483,6 +483,8 @@ int cuda_plugin_init(int stage) INIT_LIST_HEAD(&cuda_pids); } + dont_use_freeze_cgroup(); + return 0; } From e1331a4b60e49d0b70895d3d57350825ad2026d1 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 12 Sep 2024 11:17:56 -0700 Subject: [PATCH 482/775] fault: allow to check dont_use_freeze_cgroup Adds a new "fault" to call dont_use_freeze_cgroup. 
Signed-off-by: Andrei Vagin --- criu/fault-injection.c | 9 +++++++++ criu/include/fault-injection.h | 1 + criu/include/seize.h | 1 + test/jenkins/criu-fault.sh | 4 ++++ 4 files changed, 15 insertions(+) diff --git a/criu/fault-injection.c b/criu/fault-injection.c index 83dc1fc8d..2272e6d84 100644 --- a/criu/fault-injection.c +++ b/criu/fault-injection.c @@ -1,6 +1,7 @@ #include #include "criu-log.h" #include "fault-injection.h" +#include "seize.h" enum faults fi_strategy; @@ -21,5 +22,13 @@ int fault_injection_init(void) } fi_strategy = start; + + switch (fi_strategy) { + case FI_DISABLE_FREEZE_CGROUP: + dont_use_freeze_cgroup(); + break; + default: + break; + }; return 0; } diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index 552ee4338..82c3a1f7f 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -21,6 +21,7 @@ enum faults { FI_CORRUPT_EXTREGS = 134, FI_DONT_USE_PAGEMAP_SCAN = 135, FI_DUMP_CRASH = 136, + FI_DISABLE_FREEZE_CGROUP = 137, FI_MAX, }; diff --git a/criu/include/seize.h b/criu/include/seize.h index 3225029dd..f5ea76b16 100644 --- a/criu/include/seize.h +++ b/criu/include/seize.h @@ -2,6 +2,7 @@ #define __CR_SEIZE_H__ extern int collect_pstree(void); +struct pstree_item; extern void pstree_switch_state(struct pstree_item *root_item, int st); extern const char *get_real_freezer_state(void); extern bool alarm_timeouted(void); diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index 1fda40a96..fc0eddc2b 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -40,6 +40,10 @@ fi # also check for the main thread corruption ./test/zdtm.py run -t zdtm/static/fpu00 --fault 134 -f h --norst || fail +# check dont_use_freeze_cgroup +./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 +./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 --norst + if ./test/zdtm.py run -t zdtm/static/vfork00 --fault 136 --report report -f h ; then 
fail fi From 6918998897f1c75cd82e042df6c113c42e30c57e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 14 Sep 2024 09:08:12 -0700 Subject: [PATCH 483/775] plugin/cuda: disable CUDA plugin if /dev/nvidiactl isn't present The presence of /dev/nvidiactl indicates that the system has a compatible NVIDIA GPU driver installed and that the GPU is accessible to the operating system. Signed-off-by: Andrei Vagin --- criu/include/fault-injection.h | 1 + plugins/cuda/cuda_plugin.c | 10 +++++++++- scripts/ci/run-ci-tests.sh | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index 82c3a1f7f..59adf05b9 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -22,6 +22,7 @@ enum faults { FI_DONT_USE_PAGEMAP_SCAN = 135, FI_DUMP_CRASH = 136, FI_DISABLE_FREEZE_CGROUP = 137, + FI_PLUGIN_CUDA_FORCE_ENABLE = 138, FI_MAX, }; diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 04d70b114..23c3f4b1a 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -5,6 +5,7 @@ #include "pid.h" #include "proc_parse.h" #include "seize.h" +#include "fault-injection.h" #include #include @@ -460,8 +461,15 @@ CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_ int cuda_plugin_init(int stage) { - int ret = cuda_checkpoint_supports_flag("--action"); + int ret; + if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) { + pr_info("/dev/nvidiactl doesn't exist. 
The CUDA plugin is disabled.\n"); + plugin_disabled = true; + return 0; + } + + ret = cuda_checkpoint_supports_flag("--action"); if (ret == -1) { pr_warn("check that %s is present in $PATH\n", CUDA_CHECKPOINT); plugin_disabled = true; diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 26ea00c53..38b7b5097 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -363,4 +363,4 @@ make -C plugins/amdgpu/ test_topology_remap ./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu ./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu cuda -./test/zdtm.py run -t zdtm/static/sigpending -t zdtm/static/pthread00 --mocked-cuda-checkpoint +./test/zdtm.py run -t zdtm/static/sigpending -t zdtm/static/pthread00 --mocked-cuda-checkpoint --fault 138 From 60ee5ebd9d20ca87ad102449676d0230b8217173 Mon Sep 17 00:00:00 2001 From: David Francis Date: Mon, 16 Sep 2024 09:36:25 -0400 Subject: [PATCH 484/775] plugins/amdgpu: Zero ib_info on initialization This struct was being used un-initialized, meaning it was filled with random garbage. Mea culpa. 
Signed-off-by: David Francis --- plugins/amdgpu/amdgpu_plugin.c | 1 + 1 file changed, 1 insertion(+) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 707aea5a9..b56ba6d14 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -608,6 +608,7 @@ static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp, while (bytes_remain > 0) { memset(&cs_req, 0, sizeof(cs_req)); memset(&fence, 0, sizeof(fence)); + memset(&ib_info, 0, sizeof(ib_info)); memset(ib, 0, packets_per_buffer * 28); if (type == SDMA_OP_VRAM_WRITE) { From 096c1f7a4dee8bbf6fbcd82726961e1ea0249086 Mon Sep 17 00:00:00 2001 From: David Francis Date: Mon, 16 Sep 2024 09:43:12 -0400 Subject: [PATCH 485/775] plugins/amdgpu - Increase maximum parameter length The topology parsing assumed that all parameter names were 30 characters or fewer, but recommended_sdma_engine_id_mask is 31 characters. Make the maximum length a macro, and set it to 64. Signed-off-by: David Francis --- plugins/amdgpu/amdgpu_plugin_topology.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index c5fa51fda..5b4396a0c 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -20,6 +20,7 @@ #include "amdgpu_plugin_topology.h" #define TOPOLOGY_PATH "/sys/class/kfd/kfd/topology/nodes/" +#define MAX_PARAMETER_LEN 64 /* User override options */ /* Skip firmware version check */ @@ -417,7 +418,9 @@ struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id static bool get_prop(char *line, char *name, uint64_t *value) { - if (sscanf(line, " %29s %lu", name, value) != 2) + char format[16]; + sprintf(format, " %%%ds %%lu", MAX_PARAMETER_LEN); + if (sscanf(line, format, name, value) != 2) return false; return true; } @@ -437,7 +440,7 @@ static int parse_topo_node_properties(struct tp_node *dev, 
const char *dir_path) } while (fgets(line, sizeof(line), file)) { - char name[30]; + char name[MAX_PARAMETER_LEN + 1]; uint64_t value; memset(name, 0, sizeof(name)); @@ -565,7 +568,7 @@ static int parse_topo_node_mem_banks(struct tp_node *node, const char *dir_path) } while (fgets(line, sizeof(line), file)) { - char name[30]; + char name[MAX_PARAMETER_LEN + 1]; uint64_t value; memset(name, 0, sizeof(name)); @@ -654,7 +657,7 @@ static int parse_topo_node_iolinks(struct tp_node *node, const char *dir_path) } while (fgets(line, sizeof(line), file)) { - char name[30]; + char name[MAX_PARAMETER_LEN + 1]; uint64_t value; memset(name, 0, sizeof(name)); From a8cbe76d4f230273bb3cd51174d87b73bb5c1c26 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 12 Sep 2024 11:19:35 -0700 Subject: [PATCH 486/775] util: dump fsfd log messages It should help to investigate errors of fsconfig, fsmount and etc. Signed-off-by: Andrei Vagin --- criu/cgroup.c | 18 +++++++----- criu/cr-check.c | 21 +++++++------- criu/include/syscall.h | 17 ------------ criu/include/util.h | 5 ++++ criu/util.c | 62 +++++++++++++++++++++++++++++++++++++++--- 5 files changed, 84 insertions(+), 39 deletions(-) delete mode 100644 criu/include/syscall.h diff --git a/criu/cgroup.c b/criu/cgroup.c index d90b70bb7..fcaed0708 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -28,7 +28,6 @@ #include "images/cgroup.pb-c.h" #include "kerndat.h" #include "linux/mount.h" -#include "syscall.h" /* * This structure describes set of controller groups @@ -581,14 +580,15 @@ static int __new_open_cgroupfs(struct cg_ctl *cc) int fsfd, fd; char *name; - fsfd = sys_fsopen(fstype, 0); + fsfd = cr_fsopen(fstype, 0); if (fsfd < 0) { pr_perror("Unable to open the cgroup file system"); return -1; } if (strstartswith(cc->name, namestr)) { - if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "name", cc->name + strlen(namestr), 0)) { + if (cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "name", cc->name + strlen(namestr), 0)) { + 
fsfd_dump_messages(fsfd); pr_perror("Unable to configure the cgroup (%s) file system", cc->name); goto err; } @@ -596,7 +596,8 @@ static int __new_open_cgroupfs(struct cg_ctl *cc) char *saveptr = NULL, *buf = strdupa(cc->name); name = strtok_r(buf, ",", &saveptr); while (name) { - if (sys_fsconfig(fsfd, FSCONFIG_SET_FLAG, name, NULL, 0)) { + if (cr_fsconfig(fsfd, FSCONFIG_SET_FLAG, name, NULL, 0)) { + fsfd_dump_messages(fsfd); pr_perror("Unable to configure the cgroup (%s) file system", name); goto err; } @@ -604,14 +605,17 @@ static int __new_open_cgroupfs(struct cg_ctl *cc) } } - if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) { + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) { + fsfd_dump_messages(fsfd); pr_perror("Unable to create the cgroup (%s) file system", cc->name); goto err; } - fd = sys_fsmount(fsfd, 0, 0); - if (fd < 0) + fd = cr_fsmount(fsfd, 0, 0); + if (fd < 0) { + fsfd_dump_messages(fsfd); pr_perror("Unable to mount the cgroup (%s) file system", cc->name); + } close(fsfd); return fd; diff --git a/criu/cr-check.c b/criu/cr-check.c index 507f9915c..0388cbe7f 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -54,7 +54,6 @@ #include "restorer.h" #include "uffd.h" #include "linux/aio_abi.h" -#include "syscall.h" #include "mount-v2.h" #include "images/inventory.pb-c.h" @@ -1437,18 +1436,18 @@ static int ovl_mount(void) { int tmpfs, fsfd, ovl; - fsfd = sys_fsopen("tmpfs", 0); + fsfd = cr_fsopen("tmpfs", 0); if (fsfd == -1) { pr_perror("Unable to fsopen tmpfs"); return -1; } - if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { pr_perror("Unable to create tmpfs mount"); return -1; } - tmpfs = sys_fsmount(fsfd, 0, 0); + tmpfs = cr_fsmount(fsfd, 0, 0); if (tmpfs == -1) { pr_perror("Unable to mount tmpfs"); return -1; @@ -1475,23 +1474,23 @@ static int ovl_mount(void) return -1; } - fsfd = sys_fsopen("overlay", 0); + fsfd = cr_fsopen("overlay", 0); 
if (fsfd == -1) { pr_perror("Unable to fsopen overlayfs"); return -1; } - if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "test", 0) == -1 || - sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir", "/tmp/l", 0) == -1 || - sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "upperdir", "/tmp/u", 0) == -1 || - sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "workdir", "/tmp/w", 0) == -1) { + if (cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "test", 0) == -1 || + cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir", "/tmp/l", 0) == -1 || + cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "upperdir", "/tmp/u", 0) == -1 || + cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "workdir", "/tmp/w", 0) == -1) { pr_perror("Unable to configure overlayfs"); return -1; } - if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { pr_perror("Unable to create overlayfs"); return -1; } - ovl = sys_fsmount(fsfd, 0, 0); + ovl = cr_fsmount(fsfd, 0, 0); if (ovl == -1) { pr_perror("Unable to mount overlayfs"); return -1; diff --git a/criu/include/syscall.h b/criu/include/syscall.h deleted file mode 100644 index c38d6d971..000000000 --- a/criu/include/syscall.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef __CR_SYSCALL_H__ -#define __CR_SYSCALL_H__ - -static inline int sys_fsopen(const char *fsname, unsigned int flags) -{ - return syscall(__NR_fsopen, fsname, flags); -} -static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) -{ - return syscall(__NR_fsconfig, fd, cmd, key, value, aux); -} -static inline int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) -{ - return syscall(__NR_fsmount, fd, flags, attr_flags); -} - -#endif /* __CR_SYSCALL_H__ */ \ No newline at end of file diff --git a/criu/include/util.h b/criu/include/util.h index 435469e1e..ae293a68c 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -387,6 +387,11 @@ static inline void print_stack_trace(pid_t pid) extern 
int mount_detached_fs(const char *fsname); +extern int cr_fsopen(const char *fsname, unsigned int flags); +extern int cr_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux); +extern int cr_fsmount(int fd, unsigned int flags, unsigned int attr_flags); +extern void fsfd_dump_messages(int fd); + extern char *get_legacy_iptables_bin(bool ipv6, bool restore); extern int set_opts_cap_eff(void); diff --git a/criu/util.c b/criu/util.c index 7dfa1fe42..d2bc9a865 100644 --- a/criu/util.c +++ b/criu/util.c @@ -39,7 +39,6 @@ #include "mem.h" #include "namespaces.h" #include "criu-log.h" -#include "syscall.h" #include "util-caps.h" #include "clone-noasan.h" @@ -1556,23 +1555,78 @@ void print_stack_trace(pid_t pid) } #endif +int cr_fsopen(const char *fsname, unsigned int flags) +{ + return syscall(__NR_fsopen, fsname, flags); +} + +int cr_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) +{ + int ret = syscall(__NR_fsconfig, fd, cmd, key, value, aux); + if (ret) + fsfd_dump_messages(fd); + return ret; +} + +int cr_fsmount(int fd, unsigned int flags, unsigned int attr_flags) +{ + int ret = syscall(__NR_fsmount, fd, flags, attr_flags); + if (ret) + fsfd_dump_messages(fd); + return ret; +} + +void fsfd_dump_messages(int fd) +{ + char buf[4096]; + int err, n; + + err = errno; + + for (;;) { + n = read(fd, buf, sizeof(buf) - 1); + if (n < 0) { + if (errno != ENODATA) + pr_perror("Unable to read from fs descriptor"); + break; + } + buf[n] = 0; + + switch (buf[0]) { + case 'w': + pr_warn("%s\n", buf); + break; + case 'i': + pr_info("%s\n", buf); + break; + case 'e': + /* fallthrough */ + default: + pr_err("%s\n", buf); + break; + } + } + + errno = err; +} + int mount_detached_fs(const char *fsname) { int fsfd, fd; - fsfd = sys_fsopen(fsname, 0); + fsfd = cr_fsopen(fsname, 0); if (fsfd < 0) { pr_perror("Unable to open the %s file system", fsname); return -1; } - if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { + 
if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { pr_perror("Unable to create the %s file system", fsname); close(fsfd); return -1; } - fd = sys_fsmount(fsfd, 0, 0); + fd = cr_fsmount(fsfd, 0, 0); if (fd < 0) pr_perror("Unable to mount the %s file system", fsname); close(fsfd); From c2b48ff423aa663b3534a5ba96907366e4c1b408 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 20 Sep 2024 21:40:17 -0700 Subject: [PATCH 487/775] criu: Version 4.0 (CRIUDA) Major changes: * CUDA plugin to support checkpointing and restoring NVIDIA CUDA applications. * Shadow stack support * Pagemap cache: Added support for PAGEMAP_SCAN ioctl The full changelog can be found here: https://criu.org/Download/criu/4.0. Signed-off-by: Andrei Vagin --- Makefile.versions | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.versions b/Makefile.versions index 5f21c11c2..c5859801a 100644 --- a/Makefile.versions +++ b/Makefile.versions @@ -1,10 +1,10 @@ # # CRIU version. -CRIU_VERSION_MAJOR := 3 -CRIU_VERSION_MINOR := 19 +CRIU_VERSION_MAJOR := 4 +CRIU_VERSION_MINOR := 0 CRIU_VERSION_SUBLEVEL := CRIU_VERSION_EXTRA := -CRIU_VERSION_NAME := Bronze Peacock +CRIU_VERSION_NAME := CRIUDA CRIU_VERSION := $(CRIU_VERSION_MAJOR)$(if $(CRIU_VERSION_MINOR),.$(CRIU_VERSION_MINOR))$(if $(CRIU_VERSION_SUBLEVEL),.$(CRIU_VERSION_SUBLEVEL))$(if $(CRIU_VERSION_EXTRA),.$(CRIU_VERSION_EXTRA)) export CRIU_VERSION_MAJOR CRIU_VERSION_MINOR CRIU_VERSION_SUBLEVEL From f1d465448fa3d464da6f8bc31500f5ce005b72da Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 26 Sep 2024 12:24:52 +0100 Subject: [PATCH 488/775] amdgpu: remove exec permissions on source files This patch fixes the following warnings that appear when building an RPM package: + /usr/lib/rpm/redhat/brp-mangle-shebangs *** WARNING: ./usr/src/debug/criu-4.0-1.fc42.x86_64/plugins/amdgpu/amdgpu_plugin_util.c is executable but has no shebang, removing executable bit *** WARNING: 
./usr/src/debug/criu-4.0-1.fc42.x86_64/plugins/amdgpu/amdgpu_plugin_util.h is executable but has no shebang, removing executable bit Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin_util.c | 0 plugins/amdgpu/amdgpu_plugin_util.h | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 plugins/amdgpu/amdgpu_plugin_util.c mode change 100755 => 100644 plugins/amdgpu/amdgpu_plugin_util.h diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c old mode 100755 new mode 100644 diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h old mode 100755 new mode 100644 From 4f8f6f2883689546c4f0f793ac5d5dc6bd5a937e Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 26 Sep 2024 10:59:32 +0100 Subject: [PATCH 489/775] Makefile.config: set CR_PLUGIN_DEFAULT variable By default, CRIU uses the path "/usr/lib/criu" to install and load plugins at runtime. This path is defined by the `PLUGINDIR` variable in Makefile.install and `CR_PLUGIN_DEFAULT` in `criu/include/plugin.h`. However, some distribution packages might install the CRIU plugins at "/usr/lib64/criu" instead. This patch updates the makefile to align the path defined by `CR_PLUGIN_DEFAULT` with the value of `PLUGINDIR`. 
Signed-off-by: Radostin Stoyanov --- Makefile.config | 4 ++++ plugins/amdgpu/Makefile | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile.config b/Makefile.config index 52c250b21..5ab689d41 100644 --- a/Makefile.config +++ b/Makefile.config @@ -59,6 +59,10 @@ endif export LIBS += $(LIBS_FEATURES) +ifneq ($(PLUGINDIR),) + FEATURE_DEFINES += -DCR_PLUGIN_DEFAULT="\"$(PLUGINDIR)\"" +endif + CONFIG_FILE = .config $(CONFIG_FILE): diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 7d3388b80..a20d1d163 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -15,7 +15,7 @@ DEPS_NOK := ; __nmk_dir ?= ../../scripts/nmk/scripts/ include $(__nmk_dir)msg.mk -PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC -DCR_PLUGIN_DEFAULT="$(PLUGINDIR)" +PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC PLUGIN_LDFLAGS := -lpthread -lrt -ldrm -ldrm_amdgpu ifeq ($(CONFIG_AMDGPU),y) From 3322d1e94c8a91d795a1f341de07a1c130dce254 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Thu, 13 Jun 2024 20:00:09 +0530 Subject: [PATCH 490/775] images: Add protobuf definition for pidfd We only use the last pid from the list in NSpid entry (from /proc//fdinfo/) while restoring pidfds. The last pid refers to the pid of the process in the most deeply nested pid namespace. Since CRIU does not currently support nested pid namespaces, this entry is the one we want. After Linux 6.9, inode numbers can be used to compare pidfds. pidfds referring to the same process will have the same inode numbers. We use inode numbers to restore pidfds that point to dead processes. 
Signed-off-by: Bhavik Sachdev --- images/Makefile | 1 + images/fdinfo.proto | 3 +++ images/pidfd.proto | 13 +++++++++++++ 3 files changed, 17 insertions(+) create mode 100644 images/pidfd.proto diff --git a/images/Makefile b/images/Makefile index ca85b1a21..855d894da 100644 --- a/images/Makefile +++ b/images/Makefile @@ -73,6 +73,7 @@ proto-obj-y += bpfmap-file.o proto-obj-y += bpfmap-data.o proto-obj-y += apparmor.o proto-obj-y += rseq.o +proto-obj-y += pidfd.o CFLAGS += -iquote $(obj)/ diff --git a/images/fdinfo.proto b/images/fdinfo.proto index 88f1c1186..32ec13cf4 100644 --- a/images/fdinfo.proto +++ b/images/fdinfo.proto @@ -17,6 +17,7 @@ import "ext-file.proto"; import "sk-unix.proto"; import "fifo.proto"; import "pipe.proto"; +import "pidfd.proto"; import "tty.proto"; import "memfd.proto"; import "bpfmap-file.proto"; @@ -42,6 +43,7 @@ enum fd_types { TIMERFD = 17; MEMFD = 18; BPFMAP = 19; + PIDFD = 20; /* Any number above the real used. Not stored to image */ CTL_TTY = 65534; @@ -78,4 +80,5 @@ message file_entry { optional tty_file_entry tty = 19; optional memfd_file_entry memfd = 20; optional bpfmap_file_entry bpf = 21; + optional pidfd_entry pidfd = 22; } diff --git a/images/pidfd.proto b/images/pidfd.proto new file mode 100644 index 000000000..a9da3e454 --- /dev/null +++ b/images/pidfd.proto @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +import "fown.proto"; + +message pidfd_entry { + required uint32 id = 1; + required uint32 ino = 2; + required uint32 flags = 3; + required int32 nspid = 4; + required fown_entry fown = 5; +} From 1ce408ffa4a723e7110cbc0d68c68bfc5871b287 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Thu, 13 Jun 2024 21:18:51 +0530 Subject: [PATCH 491/775] criu: Support C/R of pidfds Process file descriptors (pidfds) were introduced to provide a stable handle on a process. They solve the problem of pid recycling. 
For a detailed explanation, see https://lwn.net/Articles/801319/ and http://www.corsix.org/content/what-is-a-pidfd Before Linux 6.9, anonymous inodes were used for the implementation of pidfds. So, we detect them in a fashion similar to other fd types that use anonymous inodes by calling `readlink()`. After 6.9, pidfs (a file system for pidfds) was introduced. In 6.9 `S_ISREG()` returned true for pidfds, but this again changed with 6.10. (https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/pidfs.c?h=v6.11-rc2#n285) After this change, pidfs inodes have no file type in st_mode in userspace. We use `PID_FS_MAGIC` to detect pidfds for kernel >= 6.9. Hence, the check for pidfds occurs before the check for regular files. For pidfds that refer to dead processes, we lose the pid of the process as the Pid and NSpid fields in /proc/<pid>/fdinfo/<pidfd> change to -1. So, we create a temporary process for each unique inode and open pidfds that refer to this process. After all pidfds have been opened we kill this temporary process. This commit does not include support for pidfds that point to a specific thread, i.e. pidfds opened with `PIDFD_THREAD` flag.
Fixes: #2258 Signed-off-by: Bhavik Sachdev --- criu/Makefile.crtools | 1 + criu/cr-restore.c | 3 +- criu/files.c | 17 +++ criu/image-desc.c | 1 + criu/include/fs-magic.h | 4 + criu/include/image-desc.h | 1 + criu/include/magic.h | 1 + criu/include/pidfd.h | 16 ++ criu/include/protobuf-desc.h | 1 + criu/pidfd.c | 287 +++++++++++++++++++++++++++++++++++ criu/proc_parse.c | 29 ++++ criu/protobuf-desc.c | 1 + 12 files changed, 361 insertions(+), 1 deletion(-) create mode 100644 criu/include/pidfd.h create mode 100644 criu/pidfd.c diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index 3ddf45cd7..ba6132d2f 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -101,6 +101,7 @@ obj-$(CONFIG_COMPAT) += vdso-compat.o CFLAGS_REMOVE_vdso-compat.o += $(CFLAGS-ASAN) $(CFLAGS-GCOV) obj-y += pidfd-store.o obj-y += hugetlb.o +obj-y += pidfd.o PROTOBUF_GEN := scripts/protobuf-gen.sh diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 4d4dfbe6f..d5b6c8037 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -79,6 +79,7 @@ #include "timens.h" #include "bpfmap.h" #include "apparmor.h" +#include "pidfd.h" #include "parasite-syscall.h" #include "files-reg.h" @@ -280,7 +281,7 @@ static struct collect_image_info *cinfos_files[] = { &unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo, &netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo, &tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo, - &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, + &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, &pidfd_cinfo }; /* These images are required to restore namespaces */ diff --git a/criu/files.c b/criu/files.c index 3b653e24b..a57fb860f 100644 --- a/criu/files.c +++ b/criu/files.c @@ -49,6 +49,7 @@ #include "kerndat.h" #include "fdstore.h" #include "bpfmap.h" +#include "pidfd.h" #include "protobuf.h" #include "util.h" @@ -544,6 +545,8 @@ static int dump_one_file(struct pid *pid, 
int fd, int lfd, struct fd_opts *opts, ops = &signalfd_dump_ops; else if (is_timerfd_link(link)) ops = &timerfd_dump_ops; + else if (is_pidfd_link(link)) + ops = &pidfd_dump_ops; #ifdef CONFIG_HAS_LIBBPF else if (is_bpfmap_link(link)) ops = &bpfmap_dump_ops; @@ -554,6 +557,11 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, return do_dump_gen_file(&p, lfd, ops, e); } + if (p.fs_type == PID_FS_MAGIC) { + ops = &pidfd_dump_ops; + return do_dump_gen_file(&p, lfd, ops, e); + } + if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode) || S_ISLNK(p.stat.st_mode)) { if (fill_fdlink(lfd, &p, &link)) return -1; @@ -1778,6 +1786,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) case FD_TYPES__MEMFD: ret = collect_one_file_entry(fe, fe->memfd->id, &fe->memfd->base, &memfd_cinfo); break; + case FD_TYPES__PIDFD: + ret = collect_one_file_entry(fe, fe->pidfd->id, &fe->pidfd->base, &pidfd_cinfo); + break; #ifdef CONFIG_HAS_LIBBPF case FD_TYPES__BPFMAP: ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo); @@ -1800,5 +1811,11 @@ int prepare_files(void) { init_fdesc_hash(); init_sk_info_hash(); + + if (init_dead_pidfd_hash()) { + pr_err("Could not initialise hash map for dead pidfds\n"); + return -1; + } + return collect_image(&files_cinfo); } diff --git a/criu/image-desc.c b/criu/image-desc.c index d65d9c098..2d87c7381 100644 --- a/criu/image-desc.c +++ b/criu/image-desc.c @@ -107,6 +107,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY_F(BPFMAP_FILE, "bpfmap-file", O_NOBUF), FD_ENTRY_F(BPFMAP_DATA, "bpfmap-data", O_NOBUF), FD_ENTRY(APPARMOR, "apparmor"), + FD_ENTRY(PIDFD, "pidfd"), [CR_FD_STATS] = { .fmt = "stats-%s", diff --git a/criu/include/fs-magic.h b/criu/include/fs-magic.h index ad34f4891..ffc0455d5 100644 --- a/criu/include/fs-magic.h +++ b/criu/include/fs-magic.h @@ -57,4 +57,8 @@ #define OVERLAYFS_SUPER_MAGIC 0x794c7630 #endif +#ifndef PID_FS_MAGIC +#define 
PID_FS_MAGIC 0x50494446 +#endif + #endif /* __CR_FS_MAGIC_H__ */ diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h index 9f369be64..79e1ac111 100644 --- a/criu/include/image-desc.h +++ b/criu/include/image-desc.h @@ -113,6 +113,7 @@ enum { CR_FD_PIPES, CR_FD_TTY_FILES, CR_FD_MEMFD_FILE, + CR_FD_PIDFD, CR_FD_AUTOFS, diff --git a/criu/include/magic.h b/criu/include/magic.h index 0e8c37234..6f0aff26d 100644 --- a/criu/include/magic.h +++ b/criu/include/magic.h @@ -100,6 +100,7 @@ #define BPFMAP_FILE_MAGIC 0x57506142 /* Alapayevsk */ #define BPFMAP_DATA_MAGIC 0x64324033 /* Arkhangelsk */ #define APPARMOR_MAGIC 0x59423047 /* Nikolskoye */ +#define PIDFD_MAGIC 0x54435556 /* Ufa */ #define IFADDR_MAGIC RAW_IMAGE_MAGIC #define ROUTE_MAGIC RAW_IMAGE_MAGIC diff --git a/criu/include/pidfd.h b/criu/include/pidfd.h new file mode 100644 index 000000000..4d2d71700 --- /dev/null +++ b/criu/include/pidfd.h @@ -0,0 +1,16 @@ +#ifndef __CR_PIDFD_H__ +#define __CR_PIDFD_H__ + +#include "files.h" +#include "pidfd.pb-c.h" + +extern const struct fdtype_ops pidfd_dump_ops; +extern struct collect_image_info pidfd_cinfo; +extern int is_pidfd_link(char *link); +extern int init_dead_pidfd_hash(void); +struct pidfd_dump_info { + PidfdEntry pidfe; + pid_t pid; +}; + +#endif /* __CR_PIDFD_H__ */ diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h index 3824de101..c4241be55 100644 --- a/criu/include/protobuf-desc.h +++ b/criu/include/protobuf-desc.h @@ -70,6 +70,7 @@ enum { PB_BPFMAP_FILE, PB_BPFMAP_DATA, PB_APPARMOR, + PB_PIDFD, /* PB_AUTOGEN_STOP */ diff --git a/criu/pidfd.c b/criu/pidfd.c new file mode 100644 index 000000000..fdf5dec60 --- /dev/null +++ b/criu/pidfd.c @@ -0,0 +1,287 @@ +#include "common/lock.h" +#include "imgset.h" +#include "pidfd.h" +#include "fdinfo.h" +#include "pidfd.pb-c.h" +#include "protobuf.h" +#include "pstree.h" +#include +#include +#include +#include "common/bug.h" +#include "rst-malloc.h" + +#undef LOG_PREFIX +#define 
LOG_PREFIX "pidfd: " + +#ifndef PIDFD_THREAD +#define PIDFD_THREAD O_EXCL +#endif + +struct pidfd_info { + PidfdEntry *pidfe; + struct file_desc d; +}; + +struct dead_pidfd { + unsigned int ino; + int pid; + size_t count; + mutex_t pidfd_lock; + struct hlist_node hash; +}; + +#define DEAD_PIDFD_HASH_SIZE 32 +static struct hlist_head dead_pidfd_hash[DEAD_PIDFD_HASH_SIZE]; +static mutex_t *dead_pidfd_hash_lock; + +int init_dead_pidfd_hash(void) +{ + for (int i = 0; i < DEAD_PIDFD_HASH_SIZE; i++) + INIT_HLIST_HEAD(&dead_pidfd_hash[i]); + + dead_pidfd_hash_lock = shmalloc(sizeof(*dead_pidfd_hash_lock)); + if (!dead_pidfd_hash_lock) + return -1; + + mutex_init(dead_pidfd_hash_lock); + + return 0; +} + +static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino) +{ + struct dead_pidfd *dead; + struct hlist_head *chain; + + mutex_lock(dead_pidfd_hash_lock); + chain = &dead_pidfd_hash[ino % DEAD_PIDFD_HASH_SIZE]; + hlist_for_each_entry(dead, chain, hash) { + if (dead->ino == ino) { + mutex_unlock(dead_pidfd_hash_lock); + return dead; + } + } + mutex_unlock(dead_pidfd_hash_lock); + + return NULL; +} + +int is_pidfd_link(char *link) +{ + /* + * pidfs was introduced in Linux 6.9 + * before which anonymous-inodes were used + */ + return is_anon_link_type(link, "[pidfd]"); +} + +static void pr_info_pidfd(char *action, PidfdEntry *pidfe) +{ + pr_info("%s: id %#08x flags %u NSpid %d ino %u\n", + action, pidfe->id, pidfe->flags, pidfe->nspid, pidfe->ino + ); +} + +static int dump_one_pidfd(int pidfd, u32 id, const struct fd_parms *p) +{ + struct pidfd_dump_info pidfd_info = {.pidfe = PIDFD_ENTRY__INIT}; + FileEntry fe = FILE_ENTRY__INIT; + + if (parse_fdinfo(pidfd, FD_TYPES__PIDFD, &pidfd_info)) + return -1; + + if (p->flags & PIDFD_THREAD) { + pr_err("PIDFD_THREAD flag is currently not supported\n"); + return -1; + } + + /* + * Check if the pid pidfd refers to is part of process tree + * This ensures the process will exist on restore. 
+ */ + if (pidfd_info.pid != -1 && !pstree_item_by_real(pidfd_info.pid)) { + pr_err("pidfd pid %d is not a part of process tree..\n", + pidfd_info.pid); + return -1; + } + + pidfd_info.pidfe.id = id; + pidfd_info.pidfe.flags = (p->flags & ~O_RDWR); + pidfd_info.pidfe.fown = (FownEntry *)&p->fown; + + fe.type = FD_TYPES__PIDFD; + fe.id = pidfd_info.pidfe.id; + fe.pidfd = &pidfd_info.pidfe; + + pr_info_pidfd("Dumping", &pidfd_info.pidfe); + return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); +} + +const struct fdtype_ops pidfd_dump_ops = { + .type = FD_TYPES__PIDFD, + .dump = dump_one_pidfd, +}; + +static int pidfd_open(pid_t pid, int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int create_tmp_process(void) +{ + int tmp_process; + tmp_process = fork(); + if (tmp_process < 0) { + pr_perror("Could not fork"); + return -1; + } else if (tmp_process == 0) { + while(1) + sleep(1); + } + return tmp_process; +} + +static int free_dead_pidfd(struct dead_pidfd *dead) +{ + int status; + + if (kill(dead->pid, SIGKILL) < 0) { + pr_perror("Could not kill temporary process with pid: %d", + dead->pid); + goto err; + } + + if (waitpid(dead->pid, &status, 0) != dead->pid) { + pr_perror("Could not wait on temporary process with pid: %d", + dead->pid); + goto err; + } + + if (!WIFSIGNALED(status)) { + pr_err("Expected temporary process to be terminated by a signal\n"); + goto err; + } + + if (WTERMSIG(status) != SIGKILL) { + pr_err("Expected temporary process to be terminated by SIGKILL\n"); + goto err; + } + + mutex_lock(dead_pidfd_hash_lock); + hlist_del(&dead->hash); + mutex_unlock(dead_pidfd_hash_lock); + return 0; +err: + return -1; +} + +static int open_one_pidfd(struct file_desc *d, int *new_fd) +{ + struct pidfd_info *info; + struct dead_pidfd *dead = NULL; + int pidfd; + + info = container_of(d, struct pidfd_info, d); + if (info->pidfe->nspid != -1) { + pidfd = pidfd_open(info->pidfe->nspid, info->pidfe->flags); + if (pidfd < 0) { 
+ pr_perror("Could not open pidfd for %d", info->pidfe->nspid); + goto err_close; + } + goto out; + } + + dead = lookup_dead_pidfd(info->pidfe->ino); + BUG_ON(!dead); + + mutex_lock(&dead->pidfd_lock); + BUG_ON(dead->count == 0); + dead->count--; + if (dead->pid == -1) { + dead->pid = create_tmp_process(); + if (dead->pid < 0) { + mutex_unlock(&dead->pidfd_lock); + goto err_close; + } + } + + pidfd = pidfd_open(dead->pid, info->pidfe->flags); + if (pidfd < 0) { + pr_perror("Could not open pidfd for %d", info->pidfe->nspid); + mutex_unlock(&dead->pidfd_lock); + goto err_close; + } + + if (dead->count == 0) { + if (free_dead_pidfd(dead)) { + pr_err("Failed to delete dead_pidfd struct\n"); + mutex_unlock(&dead->pidfd_lock); + close(pidfd); + goto err_close; + } + } + mutex_unlock(&dead->pidfd_lock); + +out: + if (rst_file_params(pidfd, info->pidfe->fown, info->pidfe->flags)) { + goto err_close; + } + + *new_fd = pidfd; + return 0; +err_close: + pr_err("Can't create pidfd %#08x NSpid: %d flags: %u\n", + info->pidfe->id, info->pidfe->nspid, info->pidfe->flags); + return -1; +} + +static struct file_desc_ops pidfd_desc_ops = { + .type = FD_TYPES__PIDFD, + .open = open_one_pidfd +}; + +static int collect_one_pidfd(void *obj, ProtobufCMessage *msg, struct cr_img *i) +{ + struct dead_pidfd *dead; + struct pidfd_info *info = obj; + + info->pidfe = pb_msg(msg, PidfdEntry); + pr_info_pidfd("Collected ", info->pidfe); + + if (info->pidfe->nspid != -1) + goto out; + + dead = lookup_dead_pidfd(info->pidfe->ino); + if (dead) { + mutex_lock(&dead->pidfd_lock); + dead->count++; + mutex_unlock(&dead->pidfd_lock); + goto out; + } + + dead = shmalloc(sizeof(*dead)); + if (!dead) { + pr_err("Could not allocate shared memory..\n"); + return -1; + } + + INIT_HLIST_NODE(&dead->hash); + dead->ino = info->pidfe->ino; + dead->count = 1; + dead->pid = -1; + mutex_init(&dead->pidfd_lock); + + mutex_lock(dead_pidfd_hash_lock); + hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % 
DEAD_PIDFD_HASH_SIZE]); + mutex_unlock(dead_pidfd_hash_lock); +out: + return file_desc_add(&info->d, info->pidfe->id, &pidfd_desc_ops); +} + +struct collect_image_info pidfd_cinfo = { + .fd_type = CR_FD_PIDFD, + .pb_type = PB_PIDFD, + .priv_size = sizeof(struct pidfd_info), + .collect = collect_one_pidfd, +}; diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 92655a484..eb869dbbd 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -42,10 +42,12 @@ #include "fault-injection.h" #include "memfd.h" #include "hugetlb.h" +#include "pidfd.h" #include "protobuf.h" #include "images/fdinfo.pb-c.h" #include "images/mnt.pb-c.h" +#include "pidfd.pb-c.h" #include "plugin.h" #include @@ -2165,6 +2167,33 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) if (ret) goto parse_err; + entry_met = true; + continue; + } + if (fdinfo_field(str, "ino") || fdinfo_field(str, "NSpid") || fdinfo_field(str, "Pid")) { + struct pidfd_dump_info *pidfd_info = arg; + + if (type != FD_TYPES__PIDFD) + continue; + + if (fdinfo_field(str, "ino")) { + ret = sscanf(str, "%*s %u", &pidfd_info->pidfe.ino); + if (ret != 1) + goto parse_err; + } else if (fdinfo_field(str, "Pid")) { + ret = sscanf(str, "%*s %d", &pidfd_info->pid); + if (ret != 1) + goto parse_err; + } else if (fdinfo_field(str, "NSpid")) { + char *last; + + last = strrchr(str, '\t'); + if (!last || sscanf(last, "%d", &pidfd_info->pidfe.nspid) != 1) { + pr_err("Unable to parse: %s\n", str); + goto parse_err; + } + } + entry_met = true; continue; } diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c index ff16b9f5b..e0dbfccc2 100644 --- a/criu/protobuf-desc.c +++ b/criu/protobuf-desc.c @@ -68,6 +68,7 @@ #include "images/bpfmap-file.pb-c.h" #include "images/bpfmap-data.pb-c.h" #include "images/apparmor.pb-c.h" +#include "images/pidfd.pb-c.h" struct cr_pb_message_desc cr_pb_descs[PB_MAX]; From 3096df9ea3cfd494905bf0497a31c77688a49cf6 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Tue, 9 Jul 2024 19:58:29 
+0530 Subject: [PATCH 492/775] zdtm: Check pidfd fdinfo entry is consistent Ensures that entries in /proc//fdinfo/ are same. Signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_self.c | 140 ++++++++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+) create mode 100644 test/zdtm/static/pidfd_self.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 1e891f0ba..a2e852d73 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -53,6 +53,7 @@ TST_NOFILE := \ shm \ shm-mp \ ptrace_sig \ + pidfd_self \ pipe00 \ pipe01 \ pipe02 \ diff --git a/test/zdtm/static/pidfd_self.c b/test/zdtm/static/pidfd_self.c new file mode 100644 index 000000000..2730ee123 --- /dev/null +++ b/test/zdtm/static/pidfd_self.c @@ -0,0 +1,140 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check pidfd /proc/self/fdinfo/ entry remains consistent after checkpoint/restore\n"; +const char *test_author = "Bhavik Sachdev "; + +struct pidfd_status { + unsigned int flags; + pid_t pid; +}; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static void show_pidfd(char *prefix, struct pidfd_status *s) +{ + test_msg("\n\t%s\n\tflags: 0%o\n\tpid: %d\n", prefix, s->flags, s->pid); +} + +static int parse_self_fdinfo(int pidfd, struct pidfd_status *s) +{ + char buf[256]; + int ret = -1; + FILE *f; + + sprintf(buf, "/proc/self/fdinfo/%d", pidfd); + f = fopen(buf, "r"); + if (!f) { + perror("Can't open /proc/self/fdinfo/ to parse"); + return -1; + } + + memset(s, 0, sizeof(*s)); + + /* + * flags: file access mode (octal) 02000002 => [O_RDWR | O_CLOEXEC] + * pid: the pid to which we have pidfd open + */ + while (fgets(buf, sizeof(buf), f)) { + if (!fgets(buf, sizeof(buf), f)) + 
goto parse_err; + + if (sscanf(buf, "flags: 0%o", &s->flags) != 1) { + goto parse_err; + } + + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + + if (sscanf(buf, "Pid: %d", &s->pid) != 1) + goto parse_err; + ret = 0; + break; + } + + if (ret) + goto parse_err; +err: + fclose(f); + return ret; + +parse_err: + pr_perror("Format error"); + goto err; +} + +static int check_pidfd(int fd, struct pidfd_status *old) +{ + struct pidfd_status new; + + if (parse_self_fdinfo(fd, &new)) + return -1; + + show_pidfd("restored", &new); + + if (old->flags != new.flags || old->pid != new.pid) + return -1; + + return 0; +} + +int main(int argc, char* argv[]) +{ + struct pidfd_status old; + int pidfd, ret; + + test_init(argc, argv); + + pidfd = pidfd_open(getpid(), 0); + if (pidfd < 0) { + pr_perror("pidfd_open failed"); + return 1; + } + + parse_self_fdinfo(pidfd, &old); + + show_pidfd("old", &old); + + if (pidfd_send_signal(pidfd, 0, NULL, 0)) { + pr_perror("Could not send signal"); + return 1; + } + + test_daemon(); + test_waitsig(); + + ret = check_pidfd(pidfd, &old); + if (ret) { + fail(); + goto err; + } + + if (pidfd_send_signal(pidfd, 0, NULL, 0)) { + pr_perror("Could not send signal"); + fail(); + goto err; + } + + pass(); + close(pidfd); + return 0; +err: + close(pidfd); + return 1; +} From 2899d46000be4ee85af7000068b2414400ed66be Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Tue, 9 Jul 2024 20:01:00 +0530 Subject: [PATCH 493/775] zdtm: Check pidfd can send signal after C/R Ensure `pidfd_send_signal()` syscall works as expected after C/R. 
Signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_child.c | 66 ++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 test/zdtm/static/pidfd_child.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index a2e852d73..0268ae492 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -54,6 +54,7 @@ TST_NOFILE := \ shm-mp \ ptrace_sig \ pidfd_self \ + pidfd_child \ pipe00 \ pipe01 \ pipe02 \ diff --git a/test/zdtm/static/pidfd_child.c b/test/zdtm/static/pidfd_child.c new file mode 100644 index 000000000..ec559605d --- /dev/null +++ b/test/zdtm/static/pidfd_child.c @@ -0,0 +1,66 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Checks pidfd sends signal to child process after restore\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +int main(int argc, char* argv[]) +{ + int pidfd, status; + pid_t child; + + test_init(argc, argv); + + child = fork(); + if (child < 0) { + pr_perror("Unable to fork a new process"); + return 1; + } else if (child == 0) { + test_waitsig(); + return 0; + } + + pidfd = pidfd_open(child, 0); + if (pidfd < 0) { + pr_perror("pidfd_open failed"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, 0)) { + fail("Could not send signal"); + goto err_close; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + goto err_close; + } + + if (status != 0) { + fail("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); + goto err_close; + } + + pass(); + close(pidfd); + return 0; +err_close: + close(pidfd); + return 1; +} From 
3f30ec0eda1a98ec18c8b102c49a0b7988c92c6d Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Mon, 8 Jul 2024 22:25:00 +0530 Subject: [PATCH 494/775] zdtm: Check pidfd can kill descendant processes Validate that pidfds can been used to send signals to different processes after C/R using the `pidfd_send_signal()` syscall. Signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_kill.c | 128 ++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 test/zdtm/static/pidfd_kill.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 0268ae492..ab45b580a 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -55,6 +55,7 @@ TST_NOFILE := \ ptrace_sig \ pidfd_self \ pidfd_child \ + pidfd_kill \ pipe00 \ pipe01 \ pipe02 \ diff --git a/test/zdtm/static/pidfd_kill.c b/test/zdtm/static/pidfd_kill.c new file mode 100644 index 000000000..6232d033a --- /dev/null +++ b/test/zdtm/static/pidfd_kill.c @@ -0,0 +1,128 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Kill child and grandchild process using pidfds\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int wait_for_child(int child) +{ + int status; + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + return 1; + } + + if (status != 0) { + test_msg("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), + WIFSIGNALED(status), WTERMSIG(status)); + } + + return 0; +} + +int main(int argc, char* argv[]) +{ + #define READ 0 + #define WRITE 1 + + int child, gchild, cpidfd, gpidfd, gchild_pid, ret; + int p[2]; + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + test_init(argc, argv); + + child = 
fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } + + if (child == 0) { + gchild = fork(); + if (gchild < 0) { + pr_perror("fork"); + return 1; + } + + if (gchild == 0) { + test_waitsig(); + return 0; + } + + close(p[READ]); + if (write(p[WRITE], &gchild, sizeof(gchild)) + != sizeof(gchild)) { + pr_perror("write"); + return 1; + } + close(p[WRITE]); + + test_waitsig(); + return wait_for_child(gchild); + } + + cpidfd = pidfd_open(child, 0); + if (cpidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + close(p[WRITE]); + if (read(p[READ], &gchild_pid, sizeof(gchild_pid)) + != sizeof(gchild_pid)) { + pr_perror("read"); + return 1; + } + close(p[READ]); + + gpidfd = pidfd_open(gchild_pid, 0); + if (gpidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(gpidfd, SIGKILL, NULL, 0)) { + pr_perror("Could not send signal"); + goto fail_close; + } + + if (pidfd_send_signal(cpidfd, SIGKILL, NULL, 0)) { + pr_perror("Could not send signal"); + goto fail_close; + } + + ret = wait_for_child(child); + if (ret) + goto fail_close; + + pass(); + close(cpidfd); + close(gpidfd); + return 0; + +fail_close: + fail(); + close(cpidfd); + close(gpidfd); + return 1; +} From 2e6f348458b83a5228693d31dd53611df56fd8f3 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Thu, 25 Jul 2024 01:12:36 +0530 Subject: [PATCH 495/775] zdtm: Check dead pidfd is restored correctly After, C/R of pidfds that point to dead processes their inodes might change. But if two pidfds point to same dead process they should continue to do so after C/R. This test ensures that this happens by calling `statx()` on pidfds after C/R and then comparing their inode numbers. Support for comparing pidfds by using `statx()` and inode numbers was introduced alongside pidfs. So if `f_type` of pidfd is not equal to `PID_FS_MAGIC` then we skip this test. 
signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_dead.c | 244 ++++++++++++++++++++++++++++++++++ 2 files changed, 245 insertions(+) create mode 100644 test/zdtm/static/pidfd_dead.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index ab45b580a..20e4bc272 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -54,6 +54,7 @@ TST_NOFILE := \ shm-mp \ ptrace_sig \ pidfd_self \ + pidfd_dead \ pidfd_child \ pidfd_kill \ pipe00 \ diff --git a/test/zdtm/static/pidfd_dead.c b/test/zdtm/static/pidfd_dead.c new file mode 100644 index 000000000..9c825899d --- /dev/null +++ b/test/zdtm/static/pidfd_dead.c @@ -0,0 +1,244 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check C/R of pidfds that point to dead processes\n"; +const char *test_author = "Bhavik Sachdev "; + +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + +/* + * main + * `- child + * `- grandchild + * + * main opens a pidfd for both child and grandchild. + * Before C/R we kill both child and grandchild. + * We end up with two unique dead pidfds. 
+ */ + +static long get_fs_type(int lfd) +{ + struct statfs fst; + + if (fstatfs(lfd, &fst)) { + return -1; + } + return fst.f_type; +} + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int open_pidfd_pair(int pidfd[2], int pid) +{ + pidfd[0] = pidfd_open(pid, 0); + if (pidfd[0] < 0) { + pr_perror("pidfd_open() failed"); + return 1; + } + + pidfd[1] = pidfd_open(pid, 0); + if (pidfd[1] < 0) { + close(pidfd[0]); + pr_perror("pidfd_open() failed"); + return 1; + } + return 0; +} + +static int compare_pidfds(int pidfd[2]) +{ + /* + * After linux 6.9 we can compare inode numbers + * to determine if two pidfds point to the same process. + * While the inode number may change before and after C/R + * pidfds pointing to the same pid should have the same inode number. + */ + struct statx stats[2]; + statx(pidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[0]); + statx(pidfd[1], "", AT_EMPTY_PATH, STATX_ALL, &stats[1]); + if (stats[0].stx_ino != stats[1].stx_ino) + return 1; + return 0; +} + +static int check_for_pidfs(void) +{ + long type; + int pidfd = pidfd_open(getpid(), 0); + if (pidfd < 0) { + pr_perror("pidfd open() failed"); + return -1; + } + type = get_fs_type(pidfd); + close(pidfd); + return type == PID_FS_MAGIC; +} + +int main(int argc, char* argv[]) +{ + #define READ 0 + #define WRITE 1 + + int child, ret, gchild, p[2], status; + int cpidfd[2], gpidfd[2]; + struct statx stats[2]; + + test_init(argc, argv); + + ret = check_for_pidfs(); + if (ret < 0) + return 1; + + if (ret == 0) { + test_daemon(); + test_waitsig(); + skip("Test requires pidfs. 
skipping..."); + pass(); + return 0; + } + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + child = test_fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } else if (child == 0) { + int gchild = test_fork(); + close(p[READ]); + if (gchild < 0) { + pr_perror("fork"); + return 1; + } else if (gchild == 0) { + close(p[WRITE]); + while(1) + sleep(1000); + } else { + if (write(p[WRITE], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + close(p[WRITE]); + if (waitpid(gchild, &status, 0) != gchild) { + pr_perror("waitpid"); + return 1; + } + + if (!WIFSIGNALED(status)) { + fail("Expected grandchild to be terminated by a signal"); + return 1; + } + + if (WTERMSIG(status) != SIGKILL) { + fail("Expected grandchild to be terminated by SIGKILL"); + return 1; + } + + return 0; + } + } + + ret = open_pidfd_pair(cpidfd, child); + if (ret) + return 1; + + close(p[WRITE]); + if (read(p[READ], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + close(p[READ]); + + ret = open_pidfd_pair(gpidfd, gchild); + if (ret) + return 1; + + /* + * We kill grandchild and child processes only after opening pidfds. 
+ */ + if (pidfd_send_signal(gpidfd[0], SIGKILL, NULL, 0)) { + pr_perror("pidfd_send_signal"); + goto fail_close; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid"); + goto fail_close; + } + + if (!WIFEXITED(status)) { + fail("Expected child to exit normally"); + goto fail_close; + } + + if (WEXITSTATUS(status) != 0) { + fail("Expected child to exit with 0"); + goto fail_close; + } + usleep(1000); + + if (kill(gchild, 0) != -1 && errno != ESRCH) { + fail("Expected grand child to not exist"); + goto fail_close; + } + + if (kill(child, 0) != -1 && errno != ESRCH) { + fail("Expected child to not exist"); + goto fail_close; + } + + test_daemon(); + test_waitsig(); + + ret = compare_pidfds(cpidfd); + if (ret) { + fail("inodes not same for same pid"); + goto fail_close; + } + + ret = compare_pidfds(gpidfd); + if (ret) { + fail("inodes not same for same pid"); + goto fail_close; + } + + statx(cpidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[0]); + statx(gpidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[1]); + if (stats[0].stx_ino == stats[1].stx_ino) { + fail("pidfds pointing to diff pids should have diff inodes"); + goto fail_close; + } + + pass(); + close(cpidfd[0]); + close(cpidfd[1]); + close(gpidfd[0]); + close(gpidfd[1]); + return 0; + +fail_close: + close(cpidfd[0]); + close(cpidfd[1]); + close(gpidfd[0]); + close(gpidfd[1]); + return 1; +} From 7a64004dc81d6d8fe8e5dbb8e31c787b54c96982 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Fri, 16 Aug 2024 21:20:57 +0530 Subject: [PATCH 496/775] zdtm: Check fd from pidfd_getfd is C/Red correctly We get the read end of a pipe using `pidfd_getfd` and check if we can read from it after C/R. 
signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/fd_from_pidfd.c | 108 +++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 test/zdtm/static/fd_from_pidfd.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 20e4bc272..f4dbb1d96 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -57,6 +57,7 @@ TST_NOFILE := \ pidfd_dead \ pidfd_child \ pidfd_kill \ + fd_from_pidfd \ pipe00 \ pipe01 \ pipe02 \ diff --git a/test/zdtm/static/fd_from_pidfd.c b/test/zdtm/static/fd_from_pidfd.c new file mode 100644 index 000000000..1f863d6c0 --- /dev/null +++ b/test/zdtm/static/fd_from_pidfd.c @@ -0,0 +1,108 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if fd obtained from pidfd_get_fd is C/R correctly\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_getfd(int pidfd, int targetfd, unsigned int flags) +{ + return syscall(__NR_pidfd_getfd, pidfd, targetfd, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +int main(int argc, char* argv[]) +{ + #define READ 0 + #define WRITE 1 + + int pidfd, child, p[2], child_read, read_data, status; + int data = 42; + + test_init(argc, argv); + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + child = fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } + + if (child == 0) { + close(p[WRITE]); + test_waitsig(); + return 0; + } + + pidfd = pidfd_open(child, 0); + if (pidfd < 0) { + pr_perror("pidfd_open failed"); + return 1; + } + + close(p[READ]); + if (write(p[WRITE], &data, sizeof(data)) != sizeof(data)) { + pr_perror("write"); + return 1; + } + close(p[WRITE]); + + child_read = pidfd_getfd(pidfd, p[READ], 0); + if 
(child_read < 0) { + pr_perror("pidfd_getfd"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (read(child_read, &read_data, sizeof(read_data)) != sizeof(read_data)) { + pr_perror("read"); + goto err_close; + } + + if (read_data != data) { + fail("data from fd obtained using pidfd_getfd incorrect"); + goto err_close; + } + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, 0)) { + pr_perror("Could not send signal"); + goto err_close; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + return 1; + } + + if (status != 0) { + fail("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); + return 1; + } + + pass(); + close(child_read); + close(pidfd); + return 0; +err_close: + close(child_read); + close(pidfd); + return 1; +} From f29e655df9d5320c8bcab1ec26bac2b0315af4a5 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Mon, 26 Aug 2024 20:56:14 +0530 Subject: [PATCH 497/775] zdtm: Check pidfd for thread is valid after C/R We open a pidfd to a thread using `PIDFD_THREAD` flag and after C/R ensure that we can send signals using it with `PIDFD_SIGNAL_THREAD`. 
signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_of_thread.c | 114 ++++++++++++++++++++++++++ test/zdtm/static/pidfd_of_thread.desc | 1 + 3 files changed, 116 insertions(+) create mode 100644 test/zdtm/static/pidfd_of_thread.c create mode 100644 test/zdtm/static/pidfd_of_thread.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index f4dbb1d96..44ac64fe5 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -54,6 +54,7 @@ TST_NOFILE := \ shm-mp \ ptrace_sig \ pidfd_self \ + pidfd_of_thread \ pidfd_dead \ pidfd_child \ pidfd_kill \ diff --git a/test/zdtm/static/pidfd_of_thread.c b/test/zdtm/static/pidfd_of_thread.c new file mode 100644 index 000000000..d232c7ac1 --- /dev/null +++ b/test/zdtm/static/pidfd_of_thread.c @@ -0,0 +1,114 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check C/R of pidfds that point to threads\n"; +const char *test_author = "Bhavik Sachdev "; + +/* see also: https://codebrowser.dev/glibc/glibc/sysdeps/unix/sysv/linux/tst-clone3.c.html */ + +#ifndef PIDFD_THREAD +#define PIDFD_THREAD O_EXCL +#endif + +#ifndef PIDFD_SIGNAL_THREAD +#define PIDFD_SIGNAL_THREAD (1UL << 0) +#endif + +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + +static long get_fs_type(int lfd) +{ + struct statfs fst; + + if (fstatfs(lfd, &fst)) { + return -1; + } + return fst.f_type; +} + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int thread_func(void *a) +{ + test_waitsig(); + return 0; +} + +#define CTID_INIT_VAL 1 + +int main(int argc, char* argv[]) +{ + char st[64 * 1024] __attribute__ ((aligned)); + pid_t tid; + int pidfd, test_pidfd; + futex_t exited; + + int clone_flags = 
CLONE_THREAD; + clone_flags |= CLONE_VM | CLONE_SIGHAND; + clone_flags |= CLONE_CHILD_CLEARTID; + + test_init(argc, argv); + + test_pidfd = pidfd_open(getpid(), 0); + if (test_pidfd < 0) { + pr_perror("pidfd_open() failed"); + return 1; + } + + /* PIDFD_THREAD, PIDFD_SIGNAL_THREAD are supported only with pidfs */ + if (get_fs_type(test_pidfd) != PID_FS_MAGIC) { + test_daemon(); + test_waitsig(); + skip("pidfs not supported."); + close(test_pidfd); + return 0; + } + close(test_pidfd); + + futex_set(&exited, CTID_INIT_VAL); + + tid = clone(thread_func, st + sizeof(st), clone_flags, NULL, NULL, NULL, &(exited.raw)); + if (tid == -1) { + pr_perror("clone() failed"); + return 1; + } + + test_msg("Successfully created a thread with tid: %d\n", tid); + pidfd = pidfd_open(tid, PIDFD_THREAD); + if (pidfd < 0) { + pr_perror("pidfd_open() failed"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, PIDFD_SIGNAL_THREAD)) { + pr_perror("pidfd_send_signal() failed"); + fail(); + close(pidfd); + return 1; + } + + test_msg("Waiting for thread to exit\n"); + futex_wait_until(&exited, 0); + + pass(); + close(pidfd); + return 0; +} diff --git a/test/zdtm/static/pidfd_of_thread.desc b/test/zdtm/static/pidfd_of_thread.desc new file mode 100644 index 000000000..802caed65 --- /dev/null +++ b/test/zdtm/static/pidfd_of_thread.desc @@ -0,0 +1 @@ +{'flags': 'noauto crfail'} From 88aa7e2c10a83a61226447cfe41d9e50ce001178 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 14 Oct 2024 12:39:18 +0100 Subject: [PATCH 498/775] make/lint: use 'ruff check ' The command `ruff ` has been deprecated and removed: https://astral.sh/blog/ruff-v0.5.0#removed-deprecated-features Signed-off-by: Radostin Stoyanov --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 97b4dc211..46d9adef3 100644 --- a/Makefile +++ b/Makefile @@ -437,7 +437,7 @@ help: ruff: @ruff --version - ruff ${RUFF_FLAGS} 
--config=scripts/ruff.toml \ + ruff check ${RUFF_FLAGS} --config=scripts/ruff.toml \ test/zdtm.py \ test/inhfd/*.py \ test/others/rpc/config_file.py \ From b524dab32f03f15a66b637057233ac28ef7b0091 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 14 Oct 2024 13:58:41 +0100 Subject: [PATCH 499/775] pycriu: fix lint errors This patch fixes the following errors reported by ruff: lib/pycriu/images/pb2dict.py:307:24: E721 Use `is` and `is not` for type comparisons, or `isinstance()` for isinstance checks | 305 | elif field.type in _basic_cast: 306 | cast = _basic_cast[field.type] 307 | if pretty and (cast == int): | ^^^^^^^^^^^ E721 308 | if is_hex: 309 | # Fields that have (criu).hex = true option set | lib/pycriu/images/pb2dict.py:379:13: E721 Use `is` and `is not` for type comparisons, or `isinstance()` for isinstance checks | 377 | elif field.type in _basic_cast: 378 | cast = _basic_cast[field.type] 379 | if (cast == int) and is_string(value): | ^^^^^^^^^^^ E721 380 | if _marked_as_dev(field): 381 | return encode_dev(field, value) | Signed-off-by: Radostin Stoyanov --- lib/pycriu/images/pb2dict.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index 0d1a24692..e3dd95ac0 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -304,7 +304,7 @@ def _pb2dict_cast(field, value, pretty=False, is_hex=False): return field.enum_type.values_by_number.get(value, None).name elif field.type in _basic_cast: cast = _basic_cast[field.type] - if pretty and (cast == int): + if pretty and cast is int: if is_hex: # Fields that have (criu).hex = true option set # should be stored in hex string format. 
@@ -376,7 +376,7 @@ def _dict2pb_cast(field, value): return field.enum_type.values_by_name.get(value, None).number elif field.type in _basic_cast: cast = _basic_cast[field.type] - if (cast == int) and is_string(value): + if cast is int and is_string(value): if _marked_as_dev(field): return encode_dev(field, value) From 5335b35f72da90a62ad3d771ca175d59ab1bd8b1 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 4 Oct 2024 12:14:29 +0100 Subject: [PATCH 500/775] images/inventory: add field for enabled plugins This patch extends the inventory image with a `plugins` field that contains an array of plugins which were used during checkpoint, for example, to save GPU state. In particular, the CUDA and AMDGPU plugins are added to this field only when the checkpoint contains GPU state. This allows us to disable unnecessary plugins during restore, show appropriate error messages if required CRIU plugins are missing, and migrate a process that does not use GPU from a GPU-enabled system to a CPU-only environment. We use the `optional plugins_entry` for backwards compatibility. This entry allows us to distinguish between an *unset* and a *missing* field: - When the field is missing, it indicates that the checkpoint was created with a previous version of CRIU, and all plugins should be *enabled* during restore. - When the field is empty, it indicates that no plugins were used during checkpointing. Thus, all plugins can be *disabled* during restore. 
Signed-off-by: Radostin Stoyanov --- criu/cr-restore.c | 6 +- criu/image.c | 124 +++++++++++++++++++++++++++++++++ criu/include/image.h | 4 ++ criu/plugin.c | 3 + images/inventory.proto | 8 +++ plugins/amdgpu/amdgpu_plugin.c | 31 +++++++++ plugins/cuda/cuda_plugin.c | 22 +++++- 7 files changed, 193 insertions(+), 5 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index d5b6c8037..646300bdb 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2354,12 +2354,12 @@ int cr_restore_tasks(void) if (init_service_fd()) return 1; - if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) - return -1; - if (check_img_inventory(/* restore = */ true) < 0) goto err; + if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) + return -1; + if (init_stats(RESTORE_STATS)) goto err; diff --git a/criu/image.c b/criu/image.c index 9fb390ab7..9589167fb 100644 --- a/criu/image.c +++ b/criu/image.c @@ -26,6 +26,14 @@ TaskKobjIdsEntry *root_ids; u32 root_cg_set; Lsmtype image_lsm; +struct inventory_plugin { + struct list_head node; + char *name; +}; + +struct list_head inventory_plugins_list = LIST_HEAD_INIT(inventory_plugins_list); +static int n_inventory_plugins; + int check_img_inventory(bool restore) { int ret = -1; @@ -99,6 +107,19 @@ int check_img_inventory(bool restore) } else { opts.network_lock_method = he->network_lock_method; } + + if (!he->plugins_entry) { + /* backwards compatibility: if the 'plugins_entry' field is missing, + * all plugins should be enabled during restore. + */ + n_inventory_plugins = -1; + } else { + PluginsEntry *pe = he->plugins_entry; + for (int i = 0; i < pe->n_plugins; i++) { + if (add_inventory_plugin(pe->plugins[i])) + goto out_err; + } + } } ret = 0; @@ -110,8 +131,92 @@ out_close: return ret; } +/** + * Check if the 'plugins' field in the inventory image contains + * the specified plugin name. If found, the plugin is removed + * from the linked list. 
+ */ +bool check_and_remove_inventory_plugin(const char *name, size_t n) +{ + if (n_inventory_plugins == -1) + return true; /* backwards compatibility */ + + if (n_inventory_plugins > 0) { + struct inventory_plugin *p, *tmp; + + list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) { + if (!strncmp(name, p->name, n)) { + xfree(p->name); + list_del(&p->node); + xfree(p); + n_inventory_plugins--; + return true; + } + } + } + + return false; +} + +/** + * We expect during restore all loaded plugins to be removed from + * the inventory_plugins_list. If the list is not empty, show an + * error message for each missing plugin. + */ +int check_inventory_plugins(void) +{ + struct inventory_plugin *p; + + if (n_inventory_plugins <= 0) + return 0; + + list_for_each_entry(p, &inventory_plugins_list, node) { + pr_err("Missing required plugin: %s\n", p->name); + } + + return -1; +} + +/** + * Add plugin name to the inventory image. These values + * can be used to identify required plugins during restore. 
+ */ +int add_inventory_plugin(const char *name) +{ + struct inventory_plugin *p; + + p = xmalloc(sizeof(struct inventory_plugin)); + if (p == NULL) + return -1; + + p->name = xstrdup(name); + if (!p->name) { + xfree(p); + return -1; + } + list_add(&p->node, &inventory_plugins_list); + n_inventory_plugins++; + + return 0; +} + +void free_inventory_plugins_list(void) +{ + struct inventory_plugin *p, *tmp; + + if (!list_empty(&inventory_plugins_list)) { + list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) { + xfree(p->name); + list_del(&p->node); + xfree(p); + } + } + n_inventory_plugins = 0; +} + int write_img_inventory(InventoryEntry *he) { + PluginsEntry pe = PLUGINS_ENTRY__INIT; struct cr_img *img; int ret; @@ -121,8 +226,27 @@ int write_img_inventory(InventoryEntry *he) if (!img) return -1; + if (!list_empty(&inventory_plugins_list)) { + struct inventory_plugin *p; + int i = 0; + + pe.n_plugins = n_inventory_plugins; + pe.plugins = xmalloc(n_inventory_plugins * sizeof(char *)); + if (!pe.plugins) + return -1; + + list_for_each_entry(p, &inventory_plugins_list, node) { + pe.plugins[i] = p->name; + i++; + } + } + he->plugins_entry = &pe; + ret = pb_write_one(img, he, PB_INVENTORY); + free_inventory_plugins_list(); + xfree(pe.plugins); + xfree(he->root_ids); close_image(img); if (ret < 0) diff --git a/criu/include/image.h b/criu/include/image.h index a17aae35c..afa7d5e12 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -177,4 +177,8 @@ extern int read_img_str(struct cr_img *, char **pstr, int size); extern void close_image(struct cr_img *); +extern int add_inventory_plugin(const char *name); +extern int check_inventory_plugins(void); +extern bool check_and_remove_inventory_plugin(const char *name, size_t n); + #endif /* __CR_IMAGE_H__ */ diff --git a/criu/plugin.c b/criu/plugin.c index 58b5ea5bf..65e79a069 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -256,6 +256,9 @@ int cr_plugin_init(int stage) goto err; } + if (stage == 
CR_PLUGIN_STAGE__RESTORE && check_inventory_plugins()) + goto err; + exit_code = 0; err: closedir(d); diff --git a/images/inventory.proto b/images/inventory.proto index a735bad1d..7f655031b 100644 --- a/images/inventory.proto +++ b/images/inventory.proto @@ -10,6 +10,13 @@ enum lsmtype { APPARMOR = 2; } +// It is not possible to distinguish between an empty repeated field +// and unset repeated field. To solve this problem and provide backwards +// compabibility, we use the 'plugins_entry' message. +message plugins_entry { + repeated string plugins = 12; +}; + message inventory_entry { required uint32 img_version = 1; optional bool fdinfo_per_id = 2; @@ -21,4 +28,5 @@ message inventory_entry { optional uint32 pre_dump_mode = 9; optional bool tcp_close = 10; optional uint32 network_lock_method = 11; + optional plugins_entry plugins_entry = 12; } diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index b56ba6d14..96c086162 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -60,6 +60,10 @@ static LIST_HEAD(update_vma_info_list); size_t kfd_max_buffer_size; +bool plugin_added_to_inventory = false; + +bool plugin_disabled = false; + /**************************************************************************************************/ /* Call ioctl, restarting if it is interrupted */ @@ -332,6 +336,13 @@ void getenv_size_t(const char *var, size_t *value) int amdgpu_plugin_init(int stage) { + if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) { + plugin_disabled = true; + return 0; + } + } + pr_info("initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); topology_init(&src_topology); @@ -365,6 +376,9 @@ int amdgpu_plugin_init(int stage) void amdgpu_plugin_fini(int stage, int ret) { + if (plugin_disabled) + return; + pr_info("finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); if (stage == CR_PLUGIN_STAGE__RESTORE) @@ -414,6 +428,14 @@ 
int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) if (ret) pr_perror("%s(), Can't handle VMAs of input device", __func__); + if (!ret && !plugin_added_to_inventory) { + ret = add_inventory_plugin(CR_PLUGIN_DESC.name); + if (ret) + pr_err("Failed to add AMDGPU plugin to inventory image\n"); + else + plugin_added_to_inventory = true; + } + return ret; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma) @@ -1540,6 +1562,9 @@ int amdgpu_plugin_restore_file(int id) size_t img_size; FILE *img_fp = NULL; + if (plugin_disabled) + return -ENOTSUP; + pr_info("Initialized kfd plugin restorer with ID = %d\n", id); snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id); @@ -1746,6 +1771,9 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const char *p_end; bool is_kfd = false, is_renderD = false; + if (plugin_disabled) + return -ENOTSUP; + plugin_log_msg("Enter %s\n", __func__); strncpy(path, in_path, sizeof(path)); @@ -1805,6 +1833,9 @@ int amdgpu_plugin_resume_devices_late(int target_pid) struct kfd_ioctl_criu_args args = { 0 }; int fd, exit_code = 0; + if (plugin_disabled) + return -ENOTSUP; + pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 23c3f4b1a..c4fc67fa9 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -38,6 +38,8 @@ */ bool plugin_disabled = false; +bool plugin_added_to_inventory = false; + struct pid_info { int pid; char checkpointed; @@ -319,7 +321,7 @@ int cuda_plugin_checkpoint_devices(int pid) k_rtsigset_t save_sigset; if (plugin_disabled) { - return 0; + return -ENOTSUP; } restore_tid = get_cuda_restore_tid(pid); @@ -354,6 +356,15 @@ int cuda_plugin_checkpoint_devices(int pid) pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid); } } + + if (!status && !plugin_added_to_inventory) { 
+ status = add_inventory_plugin(CR_PLUGIN_DESC.name); + if (status) + pr_err("Failed to add CUDA plugin to inventory image\n"); + else + plugin_added_to_inventory = true; + } + interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); @@ -367,7 +378,7 @@ int cuda_plugin_pause_devices(int pid) char msg_buf[CUDA_CKPT_BUF_SIZE]; if (plugin_disabled) { - return 0; + return -ENOTSUP; } restore_tid = get_cuda_restore_tid(pid); @@ -463,6 +474,13 @@ int cuda_plugin_init(int stage) { int ret; + if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) { + plugin_disabled = true; + return 0; + } + } + if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) { pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n"); plugin_disabled = true; From 5ca4400699cc50fcd6de7d994358136b502d1374 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 14 Oct 2024 13:36:22 +0100 Subject: [PATCH 501/775] zdtm: add inventory test plugins This patch adds two test plugins to verify that CRIU plugins listed in the inventory image are enabled, while those that are not listed can be disabled. 
Signed-off-by: Radostin Stoyanov --- scripts/ci/run-ci-tests.sh | 1 + test/plugins/Makefile | 16 +++++++++++++++- test/plugins/inventory_test_disabled_plugin.c | 17 +++++++++++++++++ test/plugins/inventory_test_enabled_plugin.c | 17 +++++++++++++++++ test/zdtm.py | 2 +- 5 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 test/plugins/inventory_test_disabled_plugin.c create mode 100644 test/plugins/inventory_test_enabled_plugin.c diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 38b7b5097..b472e954c 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -362,5 +362,6 @@ make -C plugins/amdgpu/ test_topology_remap ./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin cuda ./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu ./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu cuda +./test/zdtm.py run -t zdtm/static/busyloop00 --criu-plugin inventory_test_enabled inventory_test_disabled ./test/zdtm.py run -t zdtm/static/sigpending -t zdtm/static/pthread00 --mocked-cuda-checkpoint --fault 138 diff --git a/test/plugins/Makefile b/test/plugins/Makefile index 7827b655c..4f620ad50 100644 --- a/test/plugins/Makefile +++ b/test/plugins/Makefile @@ -1,5 +1,13 @@ SRC_DIR := ../../plugins -PLUGIN_TARGETS := amdgpu_plugin.so cuda_plugin.so +PLUGIN_TARGETS := inventory_test_enabled_plugin.so inventory_test_disabled_plugin.so amdgpu_plugin.so cuda_plugin.so + +ARCH := x86 + +PLUGIN_INCLUDE := -iquote../../include +PLUGIN_INCLUDE += -iquote../../criu/include +PLUGIN_INCLUDE += -iquote../../criu/arch/$(ARCH)/include/ +PLUGIN_INCLUDE += -iquote../../ +PLUGIN_CFLAGS := -g -Wall -Werror -shared -nostartfiles -fPIC # Silent make rules. 
Q := @ @@ -12,6 +20,12 @@ amdgpu_plugin.so: $(SRC_DIR)/amdgpu/amdgpu_plugin.so cuda_plugin.so: $(SRC_DIR)/cuda/cuda_plugin.so $(Q) cp $< $@ +inventory_test_enabled_plugin.so: inventory_test_enabled_plugin.c + $(Q) $(CC) $(PLUGIN_CFLAGS) $< -o $@ $(PLUGIN_INCLUDE) + +inventory_test_disabled_plugin.so: inventory_test_disabled_plugin.c + $(Q) $(CC) $(PLUGIN_CFLAGS) $< -o $@ $(PLUGIN_INCLUDE) + clean: $(Q) $(RM) $(PLUGIN_TARGETS) diff --git a/test/plugins/inventory_test_disabled_plugin.c b/test/plugins/inventory_test_disabled_plugin.c new file mode 100644 index 000000000..468fe924b --- /dev/null +++ b/test/plugins/inventory_test_disabled_plugin.c @@ -0,0 +1,17 @@ +#include "criu-plugin.h" +#include "image.h" + +int inventory_test_disabled_plugin_init(int stage) +{ + if (stage == CR_PLUGIN_STAGE__RESTORE) + return check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name)); + + return 0; +} + +void inventory_test_disabled_plugin_fini(int stage, int ret) +{ + return; +} + +CR_PLUGIN_REGISTER("inventory_test_disabled_plugin", inventory_test_disabled_plugin_init, inventory_test_disabled_plugin_fini) \ No newline at end of file diff --git a/test/plugins/inventory_test_enabled_plugin.c b/test/plugins/inventory_test_enabled_plugin.c new file mode 100644 index 000000000..89e684e2a --- /dev/null +++ b/test/plugins/inventory_test_enabled_plugin.c @@ -0,0 +1,17 @@ +#include "criu-plugin.h" +#include "image.h" + +int inventory_test_enabled_plugin_init(int stage) +{ + if (stage == CR_PLUGIN_STAGE__RESTORE) + return !check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name)); + + return add_inventory_plugin(CR_PLUGIN_DESC.name); +} + +void inventory_test_enabled_plugin_fini(int stage, int ret) +{ + return; +} + +CR_PLUGIN_REGISTER("inventory_test_enabled_plugin", inventory_test_enabled_plugin_init, inventory_test_enabled_plugin_fini) \ No newline at end of file diff --git a/test/zdtm.py b/test/zdtm.py index 6b2132cc3..37ebe63b7 100755 
--- a/test/zdtm.py +++ b/test/zdtm.py @@ -2877,7 +2877,7 @@ def get_cli_args(): rp.add_argument("--preload-libfault", action="store_true", help="Run criu with library preload to simulate special cases") rp.add_argument("--criu-plugin", help="Run tests with CRIU plugin", - choices=['amdgpu', 'cuda'], + choices=['amdgpu', 'cuda', 'inventory_test_enabled', 'inventory_test_disabled'], nargs='+', default=None) rp.add_argument("--mocked-cuda-checkpoint", From c49eb18f9f00d18162684f840a1bed4dce9c1d13 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Wed, 9 Oct 2024 09:50:28 +0100 Subject: [PATCH 502/775] pidfd: block SIGCHLD during tmp process creation This patch blocks SIGCHLD during temporary process creation to prevent a race condition between kill() and waitpid() where sigchld_handler() causes `criu restore` to fail with an error. Fixes: #2490 Signed-off-by: Bhavik Sachdev Signed-off-by: Radostin Stoyanov --- criu/pidfd.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/criu/pidfd.c b/criu/pidfd.c index fdf5dec60..3ea3c9309 100644 --- a/criu/pidfd.c +++ b/criu/pidfd.c @@ -145,6 +145,20 @@ static int create_tmp_process(void) static int free_dead_pidfd(struct dead_pidfd *dead) { int status; + sigset_t blockmask, oldmask; + + /* + * Block SIGCHLD to prevent interfering from sigchld_handler() + * and to properly handle the tmp process termination without + * a race condition. A similar approach is used in cr_system(). 
+ */ + sigemptyset(&oldmask); + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + if (sigprocmask(SIG_BLOCK, &blockmask, &oldmask) == -1) { + pr_perror("Cannot set mask of blocked signals"); + goto err; + } if (kill(dead->pid, SIGKILL) < 0) { pr_perror("Could not kill temporary process with pid: %d", @@ -158,6 +172,12 @@ static int free_dead_pidfd(struct dead_pidfd *dead) goto err; } + /* Restore the original signal mask after tmp process has terminated */ + if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) { + pr_perror("Cannot clear blocked signals"); + goto err; + } + if (!WIFSIGNALED(status)) { pr_err("Expected temporary process to be terminated by a signal\n"); goto err; From d8f93e7baccb299e2f056beeab8c110654af9325 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 11:49:50 -0700 Subject: [PATCH 503/775] include: add common header files for riscv64 Co-authored-by: Yixue Zhao Co-authored-by: stove Signed-off-by: Haorong Lu --- - rebased - imported a page_size() type fix (authored by Cryolitia PukNgae) Signed-off-by: PukNgae Cryolitia Signed-off-by: Alexander Mikhalitsyn --- include/common/arch/riscv64/asm/atomic.h | 109 ++++++++++++++++++ include/common/arch/riscv64/asm/bitops.h | 50 ++++++++ include/common/arch/riscv64/asm/bitsperlong.h | 6 + include/common/arch/riscv64/asm/linkage.h | 23 ++++ include/common/arch/riscv64/asm/page.h | 44 +++++++ 5 files changed, 232 insertions(+) create mode 100644 include/common/arch/riscv64/asm/atomic.h create mode 100644 include/common/arch/riscv64/asm/bitops.h create mode 100644 include/common/arch/riscv64/asm/bitsperlong.h create mode 100644 include/common/arch/riscv64/asm/linkage.h create mode 100644 include/common/arch/riscv64/asm/page.h diff --git a/include/common/arch/riscv64/asm/atomic.h b/include/common/arch/riscv64/asm/atomic.h new file mode 100644 index 000000000..4b08bd9fd --- /dev/null +++ b/include/common/arch/riscv64/asm/atomic.h @@ -0,0 +1,109 @@ +#ifndef __CR_ATOMIC_H__ +#define 
__CR_ATOMIC_H__ + +typedef struct { + int counter; +} atomic_t; + +/* Copied from the Linux header arch/riscv/include/asm/barrier.h */ + +#define nop() __asm__ __volatile__("nop") + +#define RISCV_FENCE(p, s) __asm__ __volatile__("fence " #p "," #s : : : "memory") + +/* These barriers need to enforce ordering on both devices or memory. */ +#define mb() RISCV_FENCE(iorw, iorw) +#define rmb() RISCV_FENCE(ir, ir) +#define wmb() RISCV_FENCE(ow, ow) + +/* These barriers do not need to enforce ordering on devices, just memory. */ +#define __smp_mb() RISCV_FENCE(rw, rw) +#define __smp_rmb() RISCV_FENCE(r, r) +#define __smp_wmb() RISCV_FENCE(w, w) + +#define __smp_store_release(p, v) \ + do { \ + compiletime_assert_atomic_type(*p); \ + RISCV_FENCE(rw, w); \ + WRITE_ONCE(*p, v); \ + } while (0) + +#define __smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + RISCV_FENCE(r, rw); \ + ___p1; \ + }) + +/* Copied from the Linux kernel header arch/riscv/include/asm/atomic.h */ + +static inline int atomic_read(const atomic_t *v) +{ + return (*(volatile int *)&(v)->counter); +} + +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} + +#define atomic_get atomic_read + +static inline int atomic_add_return(int i, atomic_t *v) +{ + int result; + + asm volatile("amoadd.w.aqrl %1, %2, %0" : "+A"(v->counter), "=r"(result) : "r"(i) : "memory"); + __smp_mb(); + return result + i; +} + +static inline int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i, v); +} + +static inline int atomic_inc(atomic_t *v) +{ + return atomic_add_return(1, v) - 1; +} + +static inline int atomic_add(int val, atomic_t *v) +{ + return atomic_add_return(val, v) - val; +} + +static inline int atomic_dec(atomic_t *v) +{ + return atomic_sub_return(1, v) + 1; +} + +/* true if the result is 0, or false for all other cases. 
*/ +#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +#define atomic_inc_return(v) (atomic_add_return(1, v)) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + unsigned long tmp; + int oldval; + + __smp_mb(); + + asm volatile("1:\n" + " lr.w %1, %2\n" + " bne %1, %3, 2f\n" + " sc.w %0, %4, %2\n" + " bnez %0, 1b\n" + "2:" + : "=&r"(tmp), "=&r"(oldval), "+A"(ptr->counter) + : "r"(old), "r"(new) + : "memory"); + + __smp_mb(); + return oldval; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/include/common/arch/riscv64/asm/bitops.h b/include/common/arch/riscv64/asm/bitops.h new file mode 100644 index 000000000..400cc3e15 --- /dev/null +++ b/include/common/arch/riscv64/asm/bitops.h @@ -0,0 +1,50 @@ +#ifndef __CR_ASM_BITOPS_H__ +#define __CR_ASM_BITOPS_H__ + +#include "common/compiler.h" +#include "common/asm-generic/bitops.h" + +#define BITS_PER_LONG 64 + +#define BIT_MASK(nr) ((1##UL) << ((nr) % BITS_PER_LONG)) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) + +#define __AMO(op) "amo" #op ".d" + +#define __test_and_op_bit_ord(op, mod, nr, addr, ord) \ + ({ \ + unsigned long __res, __mask; \ + __mask = BIT_MASK(nr); \ + __asm__ __volatile__(__AMO(op) #ord " %0, %2, %1" \ + : "=r"(__res), "+A"(addr[BIT_WORD(nr)]) \ + : "r"(mod(__mask)) \ + : "memory"); \ + ((__res & __mask) != 0); \ + }) + +#define __op_bit_ord(op, mod, nr, addr, ord) \ + __asm__ __volatile__(__AMO(op) #ord " zero, %1, %0" \ + : "+A"(addr[BIT_WORD(nr)]) \ + : "r"(mod(BIT_MASK(nr))) \ + : "memory"); + +#define __test_and_op_bit(op, mod, nr, addr) __test_and_op_bit_ord(op, mod, nr, addr, .aqrl) +#define __op_bit(op, mod, nr, addr) __op_bit_ord(op, mod, nr, addr, ) + +/* Bitmask modifiers */ +#define __NOP(x) (x) +#define __NOT(x) (~(x)) + +/** + * test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation may be reordered on other architectures 
than x86. + */ +static inline int test_and_set_bit(int nr, volatile unsigned long *addr) +{ + return __test_and_op_bit(or, __NOP, nr, addr); +} + +#endif /* __CR_ASM_BITOPS_H__ */ diff --git a/include/common/arch/riscv64/asm/bitsperlong.h b/include/common/arch/riscv64/asm/bitsperlong.h new file mode 100644 index 000000000..d95727d19 --- /dev/null +++ b/include/common/arch/riscv64/asm/bitsperlong.h @@ -0,0 +1,6 @@ +#ifndef __CR_BITSPERLONG_H__ +#define __CR_BITSPERLONG_H__ + +#define BITS_PER_LONG 64 + +#endif /* __CR_BITSPERLONG_H__ */ diff --git a/include/common/arch/riscv64/asm/linkage.h b/include/common/arch/riscv64/asm/linkage.h new file mode 100644 index 000000000..c6d40f750 --- /dev/null +++ b/include/common/arch/riscv64/asm/linkage.h @@ -0,0 +1,23 @@ +#ifndef __CR_LINKAGE_H__ +#define __CR_LINKAGE_H__ + +#ifdef __ASSEMBLY__ + +#define __ALIGN .align 4, 0x00 +#define __ALIGN_STR ".align 4, 0x00" + +#define GLOBAL(name) \ + .globl name; \ +name: + +#define ENTRY(name) \ + .globl name; \ + .type name, @function; \ + __ALIGN; \ +name: + +#define END(sym) .size sym, .- sym + +#endif /* __ASSEMBLY__ */ + +#endif /* __CR_LINKAGE_H__ */ diff --git a/include/common/arch/riscv64/asm/page.h b/include/common/arch/riscv64/asm/page.h new file mode 100644 index 000000000..5113cb6db --- /dev/null +++ b/include/common/arch/riscv64/asm/page.h @@ -0,0 +1,44 @@ +#ifndef __CR_ASM_PAGE_H__ +#define __CR_ASM_PAGE_H__ + +#define ARCH_HAS_LONG_PAGES + +#ifndef CR_NOGLIBC +#include /* ffsl() */ +#include /* _SC_PAGESIZE */ + +extern unsigned __page_size; +extern unsigned __page_shift; + +static inline unsigned page_size(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned page_shift(void) +{ + if (!__page_shift) + __page_shift = (ffsl(page_size()) - 1); + return __page_shift; +} + +/* + * Don't add ifdefs for PAGE_SIZE: if any header defines it as a constant + * on aarch64, then we need to refrain from using PAGE_SIZE in criu
and use + * page_size() across sources (as it may differ on aarch64). + */ +#define PAGE_SIZE page_size() +#define PAGE_MASK (~(PAGE_SIZE - 1)) +#define PAGE_SHIFT page_shift() + +#define PAGE_PFN(addr) ((addr) / PAGE_SIZE) + +#else /* CR_NOGLIBC */ + +extern unsigned long page_size(void); +#define PAGE_SIZE page_size() + +#endif /* CR_NOGLIBC */ +#endif /* __CR_ASM_PAGE_H__ */ From 95359a62aa4dfb613d2a2cf8f7491b3ec766d348 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 11:58:26 -0700 Subject: [PATCH 504/775] compel: add riscv64 support Co-authored-by: Yixue Zhao Co-authored-by: stove Signed-off-by: Haorong Lu --- - rebased - added a membarrier() to syscall table (fix authored by Cryolitia PukNgae) Signed-off-by: PukNgae Cryolitia Signed-off-by: Alexander Mikhalitsyn --- Makefile | 6 +- compel/Makefile | 4 +- .../riscv64/plugins/include/asm/prologue.h | 35 +++ .../plugins/include/asm/syscall-types.h | 28 +++ .../arch/riscv64/plugins/include/features.h | 4 + .../arch/riscv64/plugins/std/parasite-head.S | 7 + .../plugins/std/syscalls/Makefile.syscalls | 59 +++++ .../plugins/std/syscalls/gen-sys-exec-tbl.pl | 43 ++++ .../plugins/std/syscalls/gen-syscalls.pl | 99 ++++++++ .../plugins/std/syscalls/syscall-aux.S | 37 +++ .../plugins/std/syscalls/syscall-aux.h | 3 + .../plugins/std/syscalls/syscall-common.S | 17 ++ .../riscv64/plugins/std/syscalls/syscall.def | 125 ++++++++++ .../riscv64/plugins/std/syscalls/syscalls.S | 112 +++++++++ compel/arch/riscv64/scripts/compel-pack.lds.S | 32 +++ compel/arch/riscv64/src/lib/cpu.c | 78 ++++++ compel/arch/riscv64/src/lib/handle-elf-host.c | 1 + compel/arch/riscv64/src/lib/handle-elf.c | 32 +++ compel/arch/riscv64/src/lib/include/cpu.h | 0 .../arch/riscv64/src/lib/include/handle-elf.h | 12 + compel/arch/riscv64/src/lib/include/syscall.h | 8 + .../src/lib/include/uapi/asm/breakpoints.h | 15 ++ .../riscv64/src/lib/include/uapi/asm/cpu.h | 7 + .../riscv64/src/lib/include/uapi/asm/fpu.h | 4 + 
.../src/lib/include/uapi/asm/infect-types.h | 52 ++++ .../include/uapi/asm/instruction_formats.h | 26 ++ .../lib/include/uapi/asm/processor-flags.h | 4 + .../src/lib/include/uapi/asm/sigframe.h | 68 ++++++ compel/arch/riscv64/src/lib/infect.c | 222 ++++++++++++++++++ compel/src/main.c | 3 + scripts/nmk/scripts/include.mk | 1 + 31 files changed, 1141 insertions(+), 3 deletions(-) create mode 100644 compel/arch/riscv64/plugins/include/asm/prologue.h create mode 100644 compel/arch/riscv64/plugins/include/asm/syscall-types.h create mode 100644 compel/arch/riscv64/plugins/include/features.h create mode 100644 compel/arch/riscv64/plugins/std/parasite-head.S create mode 100644 compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls create mode 100755 compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl create mode 100755 compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl create mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S create mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h create mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscall-common.S create mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscall.def create mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscalls.S create mode 100644 compel/arch/riscv64/scripts/compel-pack.lds.S create mode 100644 compel/arch/riscv64/src/lib/cpu.c create mode 120000 compel/arch/riscv64/src/lib/handle-elf-host.c create mode 100644 compel/arch/riscv64/src/lib/handle-elf.c create mode 100644 compel/arch/riscv64/src/lib/include/cpu.h create mode 100644 compel/arch/riscv64/src/lib/include/handle-elf.h create mode 100644 compel/arch/riscv64/src/lib/include/syscall.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h 
create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h create mode 100644 compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h create mode 100644 compel/arch/riscv64/src/lib/infect.c diff --git a/Makefile b/Makefile index 46d9adef3..60b78a074 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ endif # # Supported Architectures -ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips loongarch64,$(ARCH)),) +ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips loongarch64 riscv64,$(ARCH)),) $(error "The architecture $(ARCH) isn't supported") endif @@ -84,6 +84,10 @@ ifeq ($(ARCH),loongarch64) DEFINES := -DCONFIG_LOONGARCH64 endif +ifeq ($(ARCH),riscv64) + DEFINES := -DCONFIG_RISCV64 +endif + # # CFLAGS_PIE: # diff --git a/compel/Makefile b/compel/Makefile index 78ec4826a..c0b8a82a0 100644 --- a/compel/Makefile +++ b/compel/Makefile @@ -32,8 +32,8 @@ ifeq ($(ARCH),x86) lib-y += arch/$(ARCH)/src/lib/thread_area.o endif -# handle_elf() has no support of ELF relocations on ARM (yet?) -ifneq ($(filter arm aarch64 loongarch64,$(ARCH)),) +# handle_elf() has no support of ELF relocations on ARM and RISCV64 (yet?) 
+ifneq ($(filter arm aarch64 loongarch64 riscv64,$(ARCH)),) CFLAGS += -DNO_RELOCS HOSTCFLAGS += -DNO_RELOCS endif diff --git a/compel/arch/riscv64/plugins/include/asm/prologue.h b/compel/arch/riscv64/plugins/include/asm/prologue.h new file mode 100644 index 000000000..5c22b7b06 --- /dev/null +++ b/compel/arch/riscv64/plugins/include/asm/prologue.h @@ -0,0 +1,35 @@ +#ifndef __ASM_PROLOGUE_H__ +#define __ASM_PROLOGUE_H__ + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +#include + +#define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) + +typedef struct prologue_init_args { + struct sockaddr_un ctl_sock_addr; + unsigned int ctl_sock_addr_len; + + unsigned int arg_s; + void *arg_p; + + void *sigframe; +} prologue_init_args_t; + +#endif /* __ASSEMBLY__ */ + +/* + * Reserve enough space for sigframe. + * + * FIXME: This should rather be taken from the sigframe header. + */ +#define PROLOGUE_SGFRAME_SIZE 4096 + +#define PROLOGUE_INIT_ARGS_SIZE 1024 + +#endif /* __ASM_PROLOGUE_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/include/asm/syscall-types.h b/compel/arch/riscv64/plugins/include/asm/syscall-types.h new file mode 100644 index 000000000..b9740a9ee --- /dev/null +++ b/compel/arch/riscv64/plugins/include/asm/syscall-types.h @@ -0,0 +1,28 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +#define SA_RESTORER 0x04000000 + +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +#define _KNSIG 64 // number of signals +#define _NSIG_BPW 64 // number of signals per word + +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +typedef struct { + unsigned long sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t; +
+#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/include/features.h b/compel/arch/riscv64/plugins/include/features.h new file mode 100644 index 000000000..274cee52a --- /dev/null +++ b/compel/arch/riscv64/plugins/include/features.h @@ -0,0 +1,4 @@ +#ifndef __COMPEL_ARCH_FEATURES_H +#define __COMPEL_ARCH_FEATURES_H + +#endif /* __COMPEL_ARCH_FEATURES_H */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/parasite-head.S b/compel/arch/riscv64/plugins/std/parasite-head.S new file mode 100644 index 000000000..3e9d272e3 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/parasite-head.S @@ -0,0 +1,7 @@ +#include "common/asm/linkage.h" + + .section .head.text, "ax" +ENTRY(__export_parasite_head_start) + jal parasite_service + ebreak +END(__export_parasite_head_start) \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls b/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 000000000..5af35bcb4 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1,59 @@ +ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ +asflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ + +sys-types := $(obj)/include/uapi/std/syscall-types.h +sys-codes := $(obj)/include/uapi/std/syscall-codes.h +sys-proto := $(obj)/include/uapi/std/syscall.h + +sys-def := $(PLUGIN_ARCH_DIR)/std/syscalls/syscall.def +sys-asm-common-name := std/syscalls/syscall-common.S +sys-asm-common := $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) +sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h +sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl.c + +sys-gen := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-syscalls.pl +sys-gen-tbl := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-sys-exec-tbl.pl + +sys-asm := ./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls.S +std-lib-y += $(sys-asm:.S=).o + +ifeq ($(ARCH),arm) +arch_bits := 32 +else +arch_bits := 
64 +endif + +sys-exec-tbl := sys-exec-tbl.c + +$(sys-asm) $(sys-types) $(sys-codes) $(sys-proto): $(sys-gen) $(sys-def) $(sys-asm-common) $(sys-asm-types) + $(E) " GEN " $@ + $(Q) perl \ + $(sys-gen) \ + $(sys-def) \ + $(sys-codes) \ + $(sys-proto) \ + $(sys-asm) \ + $(sys-asm-common-name) \ + $(sys-types) \ + $(arch_bits) + +$(sys-asm:.S=).o: $(sys-asm) + +$(sys-exec-tbl): $(sys-gen-tbl) $(sys-def) + $(E) " GEN " $@ + $(Q) perl \ + $(sys-gen-tbl) \ + $(sys-def) \ + $(sys-exec-tbl) \ + $(arch_bits) + +$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h + $(call msg-gen, $@) + $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) + $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.S $(obj)/include/uapi/std/syscall-aux.S + $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.h $(obj)/include/uapi/std/syscall-aux.h + +std-headers-deps += $(sys-asm) $(sys-codes) $(sys-proto) $(sys-asm-types) $(sys-codes) +mrproper-y += $(std-headers-deps) +mrproper-y += $(obj)/include/uapi/std/syscall-aux.S +mrproper-y += $(obj)/include/uapi/std/syscall-aux.h \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl b/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl new file mode 100755 index 000000000..61a807eb6 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl @@ -0,0 +1,43 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +my $in = $ARGV[0]; +my $tblout = $ARGV[1]; +my $bits = $ARGV[2]; + +my $code = "code$bits"; + +open TBLOUT, ">", $tblout or die $!; +open IN, "<", $in or die $!; + +print TBLOUT "/* Autogenerated, don't edit */\n"; +print TBLOUT "static struct syscall_exec_desc sc_exec_table[] = {\n"; + +for () { + if ($_ =~ /\#/) { + next; + } + + my $sys_name; + my $sys_num; + + if (/(?\S+)\s+(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { + $sys_name = $+{alias}; + } elsif 
(/(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { + $sys_name = $+{name}; + } else { + unlink $tblout; + die "Invalid syscall definition file: invalid entry $_\n"; + } + + $sys_num = $+{$code}; + + if ($sys_num ne "!") { + print TBLOUT "SYSCALL($sys_name, $sys_num)\n"; + } +} + +print TBLOUT " { }, /* terminator */"; +print TBLOUT "};" \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl b/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl new file mode 100755 index 000000000..a53f1962f --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl @@ -0,0 +1,99 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +my $in = $ARGV[0]; +my $codesout = $ARGV[1]; +my $codes = $ARGV[1]; +$codes =~ s/.*include\/uapi\//compel\/plugins\//g; +my $protosout = $ARGV[2]; +my $protos = $ARGV[2]; +$protos =~ s/.*include\/uapi\//compel\/plugins\//g; +my $asmout = $ARGV[3]; +my $asmcommon = $ARGV[4]; +my $prototypes = $ARGV[5]; +$prototypes =~ s/.*include\/uapi\//compel\/plugins\//g; +my $bits = $ARGV[6]; + +my $codesdef = $codes; +$codesdef =~ tr/.\-\//_/; +my $protosdef = $protos; +$protosdef =~ tr/.\-\//_/; +my $code = "code$bits"; +my $need_aux = 0; + +unlink $codesout; +unlink $protosout; +unlink $asmout; + +open CODESOUT, ">", $codesout or die $!; +open PROTOSOUT, ">", $protosout or die $!; +open ASMOUT, ">", $asmout or die $!; +open IN, "<", $in or die $!; + +print CODESOUT <<"END"; +/* Autogenerated, don't edit */ +#ifndef $codesdef +#define $codesdef +END + +print PROTOSOUT <<"END"; +/* Autogenerated, don't edit */ +#ifndef $protosdef +#define $protosdef +#include <$prototypes> +#include <$codes> +END + +print ASMOUT <<"END"; +/* Autogenerated, don't edit */ +#include <$codes> +#include "$asmcommon" +END + + +for () { + if ($_ =~ /\#/) { + next; + } + + my $code_macro; + my $sys_macro; + my $sys_name; + + if (/(?\S+)\s+(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { + $code_macro = "__NR_$+{name}"; + 
$sys_macro = "SYS_$+{name}"; + $sys_name = "sys_$+{alias}"; + } elsif (/(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { + $code_macro = "__NR_$+{name}"; + $sys_macro = "SYS_$+{name}"; + $sys_name = "sys_$+{name}"; + } else { + unlink $codesout; + unlink $protosout; + unlink $asmout; + + die "Invalid syscall definition file: invalid entry $_\n"; + } + + if ($+{$code} ne "!") { + print CODESOUT "#ifndef $code_macro\n#define $code_macro $+{$code}\n#endif\n"; + print CODESOUT "#ifndef $sys_macro\n#define $sys_macro $code_macro\n#endif\n"; + print ASMOUT "syscall $sys_name, $code_macro\n"; + + } else { + $need_aux = 1; + } + + print PROTOSOUT "extern long $sys_name($+{args});\n"; +} + +if ($need_aux == 1) { + print ASMOUT "#include \n"; + print CODESOUT "#include \n"; +} + +print CODESOUT "#endif /* $codesdef */"; +print PROTOSOUT "#endif /* $protosdef */"; \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S new file mode 100644 index 000000000..04160b7ac --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S @@ -0,0 +1,37 @@ +/** + * This source contains emulation of syscalls + * that are not implemented in the riscv64 Linux kernel + */ + +ENTRY(sys_open) + add a3, x0, a2 + add a2, x0, a1 + add a1, x0, a0 + addi a0, x0, -100 + j sys_openat +END(sys_open) + + +ENTRY(sys_mkdir) + add a3,x0, a2 + add a2, x0, a1 + add a1, x0, a0 + addi a0, x0, -100 + j sys_mkdirat +END(sys_mkdir) + + +ENTRY(sys_rmdir) + addi a2, x0, 0x200 // flags = AT_REMOVEDIR + add a1, x0, a0 + addi a0, x0, -100 + j sys_unlinkat +END(sys_rmdir) + + +ENTRY(sys_unlink) + addi a2, x0, 0 // flags = 0 + add a1, x0, a0 + addi a0, x0, -100 + j sys_unlinkat +END(sys_unlink) \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h new file mode 100644 index 000000000..881765bbb --- /dev/null +++ 
b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h @@ -0,0 +1,3 @@ +#ifndef __NR_openat +#define __NR_openat 56 +#endif \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S b/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S new file mode 100644 index 000000000..fdef3b47a --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S @@ -0,0 +1,17 @@ +#include "common/asm/linkage.h" + +syscall_common: + ecall + ret + +.macro syscall name, nr + ENTRY(\name) + li a7, \nr + j syscall_common + END(\name) +.endm + +ENTRY(__cr_restore_rt) + li a7, __NR_rt_sigreturn + ecall +END(__cr_restore_rt) \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall.def b/compel/arch/riscv64/plugins/std/syscalls/syscall.def new file mode 100644 index 000000000..17f763e90 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall.def @@ -0,0 +1,125 @@ +# +# System calls table, please make sure the table consists of only the syscalls +# really used somewhere in the project. +# +# The template is (name and arguments are optional if you need only __NR_x +# defined, but no real entry point in syscalls lib). +# +# name/alias code64 code32 arguments +# ----------------------------------------------------------------------- +# +read 63 3 (int fd, void *buf, unsigned long count) +write 64 4 (int fd, const void *buf, unsigned long count) +open ! 5 (const char *filename, unsigned long flags, unsigned long mode) +close 57 6 (int fd) +lseek 62 19 (int fd, unsigned long offset, unsigned long origin) +mmap 222 ! 
(void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +mprotect 226 125 (const void *addr, unsigned long len, unsigned long prot) +munmap 215 91 (void *addr, unsigned long len) +brk 214 45 (void *addr) +rt_sigaction sigaction 134 174 (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +rt_sigprocmask sigprocmask 135 175 (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +rt_sigreturn 139 173 (void) +ioctl 29 54 (unsigned int fd, unsigned int cmd, unsigned long arg) +pread64 67 180 (unsigned int fd, char *buf, size_t count, loff_t pos) +ptrace 117 26 (long request, pid_t pid, void *addr, void *data) +mremap 216 163 (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flag, unsigned long new_addr) +mincore 232 219 (void *addr, unsigned long size, unsigned char *vec) +madvise 233 220 (unsigned long start, size_t len, int behavior) +shmat 196 305 (int shmid, void *shmaddr, int shmflag) +pause 1061 29 (void) +nanosleep 101 162 (struct timespec *req, struct timespec *rem) +getitimer 102 105 (int which, const struct itimerval *val) +setitimer 103 104 (int which, const struct itimerval *val, struct itimerval *old) +getpid 172 20 (void) +socket 198 281 (int domain, int type, int protocol) +connect 203 283 (int sockfd, struct sockaddr *addr, int addrlen) +sendto 206 290 (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags) +recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags) +shutdown 210 293 (int sockfd, int how) +bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen) +setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +getsockopt 209 295 (int sockfd, int level, 
int optname, const void *optval, socklen_t *optlen) +clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +exit 93 1 (unsigned long error_code) +wait4 260 114 (int pid, int *status, int options, struct rusage *ru) +waitid 95 280 (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +kill 129 37 (long pid, int sig) +fcntl 25 55 (int fd, int type, long arg) +flock 32 143 (int fd, unsigned long cmd) +mkdir ! 39 (const char *name, int mode) +rmdir ! 40 (const char *name) +unlink ! 10 (char *pathname) +readlinkat 78 332 (int fd, const char *path, char *buf, int bufsize) +umask 166 60 (int mask) +getgroups 158 205 (int gsize, unsigned int *groups) +setgroups 159 206 (int gsize, unsigned int *groups) +setresuid 147 164 (int uid, int euid, int suid) +getresuid 148 165 (int *uid, int *euid, int *suid) +setresgid 149 170 (int gid, int egid, int sgid) +getresgid 150 171 (int *gid, int *egid, int *sgid) +getpgid 155 132 (pid_t pid) +setfsuid 151 138 (int fsuid) +setfsgid 152 139 (int fsgid) +getsid 156 147 (void) +capget 90 184 (struct cap_header *h, struct cap_data *d) +capset 91 185 (struct cap_header *h, struct cap_data *d) +rt_sigqueueinfo 138 178 (pid_t pid, int sig, siginfo_t *info) +setpriority 140 97 (int which, int who, int nice) +sched_setscheduler 119 156 (int pid, int policy, struct sched_param *p) +sigaltstack 132 186 (const void *uss, void *uoss) +personality 92 136 (unsigned int personality) +prctl 167 172 (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +arch_prctl ! 
17 (int option, unsigned long addr) +setrlimit 164 75 (int resource, struct krlimit *rlim) +mount 40 21 (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +umount2 39 52 (char *name, int flags) +gettid 178 224 (void) +futex 98 240 (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +set_tid_address 96 256 (int *tid_addr) +restart_syscall 128 0 (void) +timer_create 107 257 (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +timer_gettime 108 259 (int timer_id, const struct itimerspec *setting) +timer_getoverrun 109 260 (int timer_id) +timer_delete 111 261 (kernel_timer_t timer_id) +clock_gettime 113 263 (const clockid_t which_clock, const struct timespec *tp) +exit_group 94 248 (int error_code) +set_robust_list 99 338 (struct robust_list_head *head, size_t len) +get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +signalfd4 74 355 (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +rt_tgsigqueueinfo 240 363 (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +vmsplice 75 343 (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +timerfd_settime 86 353 (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +fanotify_init 262 367 (unsigned int flags, unsigned int event_f_flags) +fanotify_mark 263 368 (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +open_by_handle_at 265 371 (int mountdirfd, struct file_handle *handle, int flags) +setns 268 375 (int fd, int nstype) +kcmp 272 378 (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +openat 56 322 (int dirfd, const char *pathname, int flags, mode_t mode) +mkdirat 34 323 (int dirfd, const char *pathname, mode_t mode) +unlinkat 35 328 (int 
dirfd, const char *pathname, int flags) +memfd_create 279 385 (const char *name, unsigned int flags) +io_setup 0 243 (unsigned nr_events, aio_context_t *ctx) +io_submit 2 246 (aio_context_t ctx_id, long nr, struct iocb **iocbpp) +io_getevents 4 245 (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +seccomp 277 383 (unsigned int op, unsigned int flags, const char *uargs) +gettimeofday 169 78 (struct timeval *tv, struct timezone *tz) +preadv_raw 69 361 (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +userfaultfd 282 388 (int flags) +fallocate 47 352 (int fd, int mode, loff_t offset, loff_t len) +cacheflush ! 983042 (void *start, void *end, int flags) +ppoll 73 336 (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +fsopen 430 430 (char *fsname, unsigned int flags) +fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) +fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) +clone3 435 435 (struct clone_args *uargs, size_t size) +pidfd_open 434 434 (pid_t pid, unsigned int flags) +pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) +rseq 293 293 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +move_mount 429 429 (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +open_tree 428 428 (int dirfd, const char *pathname, unsigned int flags) +openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size) +membarrier 283 283 (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscalls.S b/compel/arch/riscv64/plugins/std/syscalls/syscalls.S new file mode 100644 index 000000000..715da4612 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscalls.S @@ -0,0 +1,112 @@ +/* Autogenerated, don't edit */ +#include +#include "std/syscalls/syscall-common.S" +syscall 
sys_read, __NR_read +syscall sys_write, __NR_write +syscall sys_close, __NR_close +syscall sys_lseek, __NR_lseek +syscall sys_mmap, __NR_mmap +syscall sys_mprotect, __NR_mprotect +syscall sys_munmap, __NR_munmap +syscall sys_brk, __NR_brk +syscall sys_sigaction, __NR_rt_sigaction +syscall sys_sigprocmask, __NR_rt_sigprocmask +syscall sys_rt_sigreturn, __NR_rt_sigreturn +syscall sys_ioctl, __NR_ioctl +syscall sys_pread64, __NR_pread64 +syscall sys_ptrace, __NR_ptrace +syscall sys_mremap, __NR_mremap +syscall sys_mincore, __NR_mincore +syscall sys_madvise, __NR_madvise +syscall sys_shmat, __NR_shmat +syscall sys_pause, __NR_pause +syscall sys_nanosleep, __NR_nanosleep +syscall sys_getitimer, __NR_getitimer +syscall sys_setitimer, __NR_setitimer +syscall sys_getpid, __NR_getpid +syscall sys_socket, __NR_socket +syscall sys_connect, __NR_connect +syscall sys_sendto, __NR_sendto +syscall sys_recvfrom, __NR_recvfrom +syscall sys_sendmsg, __NR_sendmsg +syscall sys_recvmsg, __NR_recvmsg +syscall sys_shutdown, __NR_shutdown +syscall sys_bind, __NR_bind +syscall sys_setsockopt, __NR_setsockopt +syscall sys_getsockopt, __NR_getsockopt +syscall sys_clone, __NR_clone +syscall sys_exit, __NR_exit +syscall sys_wait4, __NR_wait4 +syscall sys_waitid, __NR_waitid +syscall sys_kill, __NR_kill +syscall sys_fcntl, __NR_fcntl +syscall sys_flock, __NR_flock +syscall sys_readlinkat, __NR_readlinkat +syscall sys_umask, __NR_umask +syscall sys_getgroups, __NR_getgroups +syscall sys_setgroups, __NR_setgroups +syscall sys_setresuid, __NR_setresuid +syscall sys_getresuid, __NR_getresuid +syscall sys_setresgid, __NR_setresgid +syscall sys_getresgid, __NR_getresgid +syscall sys_getpgid, __NR_getpgid +syscall sys_setfsuid, __NR_setfsuid +syscall sys_setfsgid, __NR_setfsgid +syscall sys_getsid, __NR_getsid +syscall sys_capget, __NR_capget +syscall sys_capset, __NR_capset +syscall sys_rt_sigqueueinfo, __NR_rt_sigqueueinfo +syscall sys_setpriority, __NR_setpriority +syscall sys_sched_setscheduler, 
__NR_sched_setscheduler +syscall sys_sigaltstack, __NR_sigaltstack +syscall sys_personality, __NR_personality +syscall sys_prctl, __NR_prctl +syscall sys_setrlimit, __NR_setrlimit +syscall sys_mount, __NR_mount +syscall sys_umount2, __NR_umount2 +syscall sys_gettid, __NR_gettid +syscall sys_futex, __NR_futex +syscall sys_set_tid_address, __NR_set_tid_address +syscall sys_restart_syscall, __NR_restart_syscall +syscall sys_timer_create, __NR_timer_create +syscall sys_timer_settime, __NR_timer_settime +syscall sys_timer_gettime, __NR_timer_gettime +syscall sys_timer_getoverrun, __NR_timer_getoverrun +syscall sys_timer_delete, __NR_timer_delete +syscall sys_clock_gettime, __NR_clock_gettime +syscall sys_exit_group, __NR_exit_group +syscall sys_set_robust_list, __NR_set_robust_list +syscall sys_get_robust_list, __NR_get_robust_list +syscall sys_signalfd4, __NR_signalfd4 +syscall sys_rt_tgsigqueueinfo, __NR_rt_tgsigqueueinfo +syscall sys_vmsplice, __NR_vmsplice +syscall sys_timerfd_settime, __NR_timerfd_settime +syscall sys_fanotify_init, __NR_fanotify_init +syscall sys_fanotify_mark, __NR_fanotify_mark +syscall sys_open_by_handle_at, __NR_open_by_handle_at +syscall sys_setns, __NR_setns +syscall sys_kcmp, __NR_kcmp +syscall sys_openat, __NR_openat +syscall sys_mkdirat, __NR_mkdirat +syscall sys_unlinkat, __NR_unlinkat +syscall sys_memfd_create, __NR_memfd_create +syscall sys_io_setup, __NR_io_setup +syscall sys_io_submit, __NR_io_submit +syscall sys_io_getevents, __NR_io_getevents +syscall sys_seccomp, __NR_seccomp +syscall sys_gettimeofday, __NR_gettimeofday +syscall sys_preadv_raw, __NR_preadv_raw +syscall sys_userfaultfd, __NR_userfaultfd +syscall sys_fallocate, __NR_fallocate +syscall sys_ppoll, __NR_ppoll +syscall sys_fsopen, __NR_fsopen +syscall sys_fsconfig, __NR_fsconfig +syscall sys_fsmount, __NR_fsmount +syscall sys_clone3, __NR_clone3 +syscall sys_pidfd_open, __NR_pidfd_open +syscall sys_pidfd_getfd, __NR_pidfd_getfd +syscall sys_rseq, __NR_rseq +syscall 
sys_move_mount, __NR_move_mount +syscall sys_open_tree, __NR_open_tree +syscall sys_openat2, __NR_openat2 +#include diff --git a/compel/arch/riscv64/scripts/compel-pack.lds.S b/compel/arch/riscv64/scripts/compel-pack.lds.S new file mode 100644 index 000000000..a61235b44 --- /dev/null +++ b/compel/arch/riscv64/scripts/compel-pack.lds.S @@ -0,0 +1,32 @@ +OUTPUT_ARCH(riscv) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .crblob 0x0 : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + . = ALIGN(32); + *(.data*) + . = ALIGN(32); + *(.rodata*) + . = ALIGN(32); + *(.bss*) + . = ALIGN(32); + *(.got*) + . = ALIGN(32); + *(.toc*) + . = ALIGN(32); + } =0x00000000, + + /DISCARD/ : { + *(.debug*) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + *(*) + } +} \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/cpu.c b/compel/arch/riscv64/src/lib/cpu.c new file mode 100644 index 000000000..9a0291f70 --- /dev/null +++ b/compel/arch/riscv64/src/lib/cpu.c @@ -0,0 +1,78 @@ +#include +#include + +#include "compel-cpu.h" + +#include "common/bitops.h" + +#include "log.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +static compel_cpuinfo_t rt_info; + +static void fetch_rt_cpuinfo(void) +{ + static bool rt_info_done = false; + + if (!rt_info_done) { + compel_cpuid(&rt_info); + rt_info_done = true; + } +} + +void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ +} +void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ +} +int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + return 0; +} +int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + return 0; +} +int compel_cpuid(compel_cpuinfo_t *info) +{ + return 0; +} + +bool compel_cpu_has_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_test_cpu_cap(&rt_info, feature); +} + +bool compel_fpu_has_feature(unsigned int 
feature) +{ + fetch_rt_cpuinfo(); + return compel_test_fpu_cap(&rt_info, feature); +} + +uint32_t compel_fpu_feature_size(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return 0; +} + +uint32_t compel_fpu_feature_offset(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return 0; +} + +void compel_cpu_clear_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_clear_cpu_cap(&rt_info, feature); +} + +void compel_cpu_copy_cpuinfo(compel_cpuinfo_t *c) +{ + fetch_rt_cpuinfo(); + memcpy(c, &rt_info, sizeof(rt_info)); +} \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/handle-elf-host.c b/compel/arch/riscv64/src/lib/handle-elf-host.c new file mode 120000 index 000000000..fe4611886 --- /dev/null +++ b/compel/arch/riscv64/src/lib/handle-elf-host.c @@ -0,0 +1 @@ +handle-elf.c \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/handle-elf.c b/compel/arch/riscv64/src/lib/handle-elf.c new file mode 100644 index 000000000..22420bc78 --- /dev/null +++ b/compel/arch/riscv64/src/lib/handle-elf.c @@ -0,0 +1,32 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const unsigned char __maybe_unused elf_ident_64_be[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +int handle_binary(void *mem, size_t size) +{ + const unsigned char *elf_ident = +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + elf_ident_64_le; +#else + elf_ident_64_be; +#endif + + if (memcmp(mem, elf_ident, sizeof(elf_ident_64_le)) == 0) + return handle_elf_riscv64(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/cpu.h 
b/compel/arch/riscv64/src/lib/include/cpu.h new file mode 100644 index 000000000..e69de29bb diff --git a/compel/arch/riscv64/src/lib/include/handle-elf.h b/compel/arch/riscv64/src/lib/include/handle-elf.h new file mode 100644 index 000000000..582770583 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/handle-elf.h @@ -0,0 +1,12 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf64-types.h" + +#define __handle_elf handle_elf_riscv64 +#define ELF_RISCV +#define arch_is_machine_supported(e_machine) (e_machine == EM_RISCV) + +extern int handle_elf_riscv64(void *mem, size_t size); + +#endif /* COMPEL_HANDLE_ELF_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/syscall.h b/compel/arch/riscv64/src/lib/include/syscall.h new file mode 100644 index 000000000..53f10525d --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/syscall.h @@ -0,0 +1,8 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) +#endif \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 000000000..f2ba799cb --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,15 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT + +static inline int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +static inline int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +#endif \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h b/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 000000000..ac58567e3 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,7 @@ +#ifndef UAPI_COMPEL_ASM_CPU_H__ +#define UAPI_COMPEL_ASM_CPU_H__ + 
+typedef struct { +} compel_cpuinfo_t; + +#endif /* UAPI_COMPEL_ASM_CPU_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h b/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 000000000..a74decc23 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,4 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + +#endif /* __CR_ASM_FPU_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 000000000..192810cac --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,52 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include +#include +#include +#include + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +/* + * Copied from the Linux kernel header arch/riscv/include/uapi/asm/ptrace.h + * + * A thread RISC-V CPU context + */ +typedef struct user_regs_struct user_regs_struct_t; +typedef struct __riscv_d_ext_state user_fpregs_struct_t; + +#define __compel_arch_fetch_thread_area(tid, th) 0 +#define compel_arch_fetch_thread_area(tctl) 0 +#define compel_arch_get_tls_task(ctl, tls) +#define compel_arch_get_tls_thread(tctl, tls) + +#define REG_RES(registers) ((uint64_t)(registers).a0) +#define REG_IP(registers) ((uint64_t)(registers).pc) +#define SET_REG_IP(registers, val) ((registers).pc = (val)) + +/* + * REG_SP is also defined in riscv64-linux-gnu/include/sys/ucontext.h + * with a different meaning, and it's not used in CRIU. So we have to + * undefine it here. 
+ */ +#ifdef REG_SP +#undef REG_SP +#endif + +#define REG_SP(registers) ((uint64_t)((registers).sp)) + +#define REG_SYSCALL_NR(registers) ((uint64_t)(registers).a7) + +#define user_regs_native(pregs) true + +#define ARCH_SI_TRAP TRAP_BRKPT + +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h b/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h new file mode 100644 index 000000000..e231d0465 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h @@ -0,0 +1,26 @@ +#ifndef COMPEL_RELOCATIONS_H__ +#define COMPEL_RELOCATIONS_H__ + +#include + +static inline uint32_t riscv_b_imm(uint32_t val) +{ + return (val & 0x00001000) << 19 | (val & 0x000007e0) << 20 | (val & 0x0000001e) << 7 | (val & 0x00000800) >> 4; +} + +static inline uint32_t riscv_i_imm(uint32_t val) +{ + return val << 20; +} + +static inline uint32_t riscv_u_imm(uint32_t val) +{ + return val & 0xfffff000; +} + +static inline uint32_t riscv_j_imm(uint32_t val) +{ + return (val & 0x00100000) << 11 | (val & 0x000007fe) << 20 | (val & 0x00000800) << 9 | (val & 0x000ff000); +} + +#endif /* COMPEL_RELOCATIONS_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h b/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h new file mode 100644 index 000000000..e40fb6fce --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h @@ -0,0 +1,4 @@ +#ifndef UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ +#define UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ + +#endif /* UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 000000000..761a08f62 --- /dev/null +++ 
b/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,68 @@ +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include + +#include + +#include + +/* Copied from the kernel header arch/riscv/include/uapi/asm/sigcontext.h */ +/* + * Signal context structure + * + * This contains the context saved before a signal handler is invoked; + * it is restored by sys_sigreturn / sys_rt_sigreturn. + */ +// struct sigcontext { +// struct user_regs_struct sc_regs; +// union __riscv_fp_state sc_fpregs; +// /* +// * 4K + 128 reserved for vector state and future expansion. +// * This space is enough to store the vector context whose VLENB +// * is less or equal to 128. +// * (The size of the vector context is 4144 byte as VLENB is 128) +// */ +// __u8 __reserved[4224] __attribute__((__aligned__(16))); +// }; + +#define rt_sigcontext sigcontext + +#include + +/* Copied from the kernel source arch/riscv/kernel/signal.c */ +struct rt_sigframe { + siginfo_t info; + ucontext_t uc; //ucontext_t structure holds the user context, e.g., the signal mask, GP regs +}; + +/* + generates inline assembly code for triggering the rt_sigreturn system call. + used to return from a signal handler back to the normal execution flow of the process. 
+*/ +/* clang-format off */ +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ + asm volatile( \ + "mv sp, %0\n" \ + "li a7, "__stringify(__NR_rt_sigreturn)" \n" \ + "ecall\n" \ + : \ + : "r"(new_sp) \ + : "a7", "memory") +/* clang-format on */ + +#define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.__gregs[REG_PC]) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) 1 +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 + +// #define RT_SIGFRAME_SIGCONTEXT(rt_sigframe) ((struct cr_sigcontext *)&(rt_sigframe)->uc.uc_mcontext) +// #define RT_SIGFRAME_AUX_CONTEXT(rt_sigframe) ((struct sigcontext *)&(RT_SIGFRAME_SIGCONTEXT(rt_sigframe)->__reserved)) +// #define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->fpsimd) + +#define rt_sigframe_erase_sigset(sigframe) \ + memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) // erase the signal mask +#define rt_sigframe_copy_sigset(sigframe, from) \ + memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) // copy the signal mask + +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/infect.c b/compel/arch/riscv64/src/lib/infect.c new file mode 100644 index 000000000..01395a205 --- /dev/null +++ b/compel/arch/riscv64/src/lib/infect.c @@ -0,0 +1,222 @@ +#include +#include +#include +#include +#include +#include +#include "common/page.h" +#include "uapi/compel/asm/infect-types.h" +#include "log.h" +#include "errno.h" +#include "infect.h" +#include "infect-priv.h" + +unsigned __page_size = 0; +unsigned __page_shift = 0; + +/* + * Injected syscall instruction + */ +const char code_syscall[] = { + 0x73, 0x00, 0x00, 0x00, /* ecall */ + 0x73, 0x00, 0x10, 0x00 /* ebreak */ +}; + +static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); + +static inline void __always_unused __check_code_syscall(void) +{ + BUILD_BUG_ON(code_syscall_aligned != 
BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); +} + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + sigframe->uc.uc_mcontext.__gregs[0] = regs->pc; + sigframe->uc.uc_mcontext.__gregs[1] = regs->ra; + sigframe->uc.uc_mcontext.__gregs[2] = regs->sp; + sigframe->uc.uc_mcontext.__gregs[3] = regs->gp; + sigframe->uc.uc_mcontext.__gregs[4] = regs->tp; + sigframe->uc.uc_mcontext.__gregs[5] = regs->t0; + sigframe->uc.uc_mcontext.__gregs[6] = regs->t1; + sigframe->uc.uc_mcontext.__gregs[7] = regs->t2; + sigframe->uc.uc_mcontext.__gregs[8] = regs->s0; + sigframe->uc.uc_mcontext.__gregs[9] = regs->s1; + sigframe->uc.uc_mcontext.__gregs[10] = regs->a0; + sigframe->uc.uc_mcontext.__gregs[11] = regs->a1; + sigframe->uc.uc_mcontext.__gregs[12] = regs->a2; + sigframe->uc.uc_mcontext.__gregs[13] = regs->a3; + sigframe->uc.uc_mcontext.__gregs[14] = regs->a4; + sigframe->uc.uc_mcontext.__gregs[15] = regs->a5; + sigframe->uc.uc_mcontext.__gregs[16] = regs->a6; + sigframe->uc.uc_mcontext.__gregs[17] = regs->a7; + sigframe->uc.uc_mcontext.__gregs[18] = regs->s2; + sigframe->uc.uc_mcontext.__gregs[19] = regs->s3; + sigframe->uc.uc_mcontext.__gregs[20] = regs->s4; + sigframe->uc.uc_mcontext.__gregs[21] = regs->s5; + sigframe->uc.uc_mcontext.__gregs[22] = regs->s6; + sigframe->uc.uc_mcontext.__gregs[23] = regs->s7; + sigframe->uc.uc_mcontext.__gregs[24] = regs->s8; + sigframe->uc.uc_mcontext.__gregs[25] = regs->s9; + sigframe->uc.uc_mcontext.__gregs[26] = regs->s10; + sigframe->uc.uc_mcontext.__gregs[27] = regs->s11; + sigframe->uc.uc_mcontext.__gregs[28] = regs->t3; + sigframe->uc.uc_mcontext.__gregs[29] = regs->t4; + sigframe->uc.uc_mcontext.__gregs[30] = regs->t5; + sigframe->uc.uc_mcontext.__gregs[31] = regs->t6; + + memcpy(sigframe->uc.uc_mcontext.__fpregs.__d.__f, fpregs->f, sizeof(fpregs->f)); + sigframe->uc.uc_mcontext.__fpregs.__d.__fcsr = fpregs->fcsr; + + return 0; +} + +int 
sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} + +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) +{ + user_fpregs_struct_t tmp, *fpsimd = ext_regs ? ext_regs : &tmp; + struct iovec iov; + int ret = -1; + + pr_info("Dumping FPU registers for %d\n", pid); + + iov.iov_base = fpsimd; + iov.iov_len = sizeof(*fpsimd); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { + pr_perror("Failed to obtain FPU registers for %d", pid); + return -1; + } + + ret = save(arg, regs, fpsimd); + return ret; +} + +int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + struct iovec iov; + + pr_info("Restoring GP/FPU registers for %d\n", pid); + + iov.iov_base = ext_regs; + iov.iov_len = sizeof(*ext_regs); + if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { + pr_perror("Failed to set FPU registers for %d", pid); + return -1; + } + return 0; +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) +{ + user_regs_struct_t regs = ctl->orig.regs; + int err; + + regs.a7 = (unsigned long)nr; + regs.a0 = arg1; + regs.a1 = arg2; + regs.a2 = arg3; + regs.a3 = arg4; + regs.a4 = arg5; + regs.a5 = arg6; + regs.a6 = 0; + + err = compel_execute_syscall(ctl, &regs, code_syscall); + + *ret = regs.a0; + return err; +} + +/* + * Calling the mmap system call in the context of the target (victim) process using the compel_syscall function. + * Used during the infection process to allocate memory for the parasite code. 
+*/ +void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + long map; + int err; + + err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset); + if (err < 0 || (long)map < 0) + map = 0; + + return (void *)map; +} + +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->pc = new_ip; + if (stack) + regs->sp = (unsigned long)stack; +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + /* + * TODO: Add proper check here. + */ + return true; +} + +/* + * Fetch the signal alternate stack (sigaltstack), + * sas is a separate memory area for the signal handler to run on, + * avoiding potential issues with the main process stack +*/ +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + long ret; + int err; + + err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->uc.uc_stack, 0, 0, 0, 0); + return err ? err : ret; +} + +/* + * Task size is the maximum virtual address space size that a process can occupy in the memory + * Refer to linux kernel arch/riscv/include/asm/pgtable.h, + * task size is: + * - 0x9fc00000 (~2.5GB) for RV32. + * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu + * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu + * + * Note that PGDIR_SIZE must evenly divide TASK_SIZE since "RISC-V + * Instruction Set Manual Volume II: Privileged Architecture" states that + * "load and store effective addresses, which are 64bits, must have bits + * 63–48 all equal to bit 47, or else a page-fault exception will occur." 
+*/ +#define TASK_SIZE 0x800000000000UL // hardcoded for SV48 MMU + +unsigned long compel_task_size(void) +{ + return TASK_SIZE; +} + +/* + * Get task registers (overwrites weak function) + */ +int ptrace_get_regs(int pid, user_regs_struct_t *regs) +{ + struct iovec iov; + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + return ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov); +} + +/* + * Set task registers (overwrites weak function) + */ +int ptrace_set_regs(int pid, user_regs_struct_t *regs) +{ + struct iovec iov; + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); +} diff --git a/compel/src/main.c b/compel/src/main.c index bc16c0ab4..21e06d7dd 100644 --- a/compel/src/main.c +++ b/compel/src/main.c @@ -60,6 +60,9 @@ static const flags_t flags = { #elif defined CONFIG_LOONGARCH64 .arch = "loongarch64", .cflags = COMPEL_CFLAGS_PIE, +#elif defined CONFIG_RISCV64 + .arch = "riscv64", + .cflags = COMPEL_CFLAGS_PIE, #else #error "CONFIG_ not defined, or unsupported ARCH" #endif diff --git a/scripts/nmk/scripts/include.mk b/scripts/nmk/scripts/include.mk index 55c5be307..603c322cf 100644 --- a/scripts/nmk/scripts/include.mk +++ b/scripts/nmk/scripts/include.mk @@ -21,6 +21,7 @@ ARCH ?= $(shell echo $(SUBARCH) | sed \ -e s/mips.*/mips/ \ -e s/sh[234].*/sh/ \ -e s/aarch64.*/aarch64/ \ + -e s/riscv64.*/riscv64/ \ -e s/loongarch64.*/loongarch64/) export SUBARCH ARCH From 1d028ef44e9de6f8ec9c86eb43753f9156edd1f2 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 11:59:13 -0700 Subject: [PATCH 505/775] images: add riscv64 core image Co-authored-by: Yixue Zhao Co-authored-by: stove Signed-off-by: Haorong Lu --- images/Makefile | 1 + images/core-riscv64.proto | 53 +++++++++++++++++++++++++++++++++++++++ images/core.proto | 3 +++ 3 files changed, 57 insertions(+) create mode 100644 images/core-riscv64.proto diff --git a/images/Makefile b/images/Makefile index 
855d894da..1e40b8a8f 100644 --- a/images/Makefile +++ b/images/Makefile @@ -7,6 +7,7 @@ proto-obj-y += core-arm.o proto-obj-y += core-aarch64.o proto-obj-y += core-ppc64.o proto-obj-y += core-s390.o +proto-obj-y += core-riscv64.o proto-obj-y += cpuinfo.o proto-obj-y += inventory.o proto-obj-y += fdinfo.o diff --git a/images/core-riscv64.proto b/images/core-riscv64.proto new file mode 100644 index 000000000..1ddfdd8bd --- /dev/null +++ b/images/core-riscv64.proto @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +import "opts.proto"; + +// Refer to riscv-gnu-toolchain/linux-headers/include/asm/ptrace.h +message user_riscv64_regs_entry { + required uint64 pc = 1; + required uint64 ra = 2; + required uint64 sp = 3; + required uint64 gp = 4; + required uint64 tp = 5; + required uint64 t0 = 6; + required uint64 t1 = 7; + required uint64 t2 = 8; + required uint64 s0 = 9; + required uint64 s1 = 10; + required uint64 a0 = 11; + required uint64 a1 = 12; + required uint64 a2 = 13; + required uint64 a3 = 14; + required uint64 a4 = 15; + required uint64 a5 = 16; + required uint64 a6 = 17; + required uint64 a7 = 18; + required uint64 s2 = 19; + required uint64 s3 = 20; + required uint64 s4 = 21; + required uint64 s5 = 22; + required uint64 s6 = 23; + required uint64 s7 = 24; + required uint64 s8 = 25; + required uint64 s9 = 26; + required uint64 s10 = 27; + required uint64 s11 = 28; + required uint64 t3 = 29; + required uint64 t4 = 30; + required uint64 t5 = 31; + required uint64 t6 = 32; +} + +message user_riscv64_d_ext_entry { + repeated uint64 f = 1; + required uint32 fcsr = 2; +} + +message thread_info_riscv64 { + required uint64 clear_tid_addr = 1[(criu).hex = true]; + required uint64 tls = 2; + required user_riscv64_regs_entry gpregs = 3[(criu).hex = true]; + required user_riscv64_d_ext_entry fpsimd = 4; +} diff --git a/images/core.proto b/images/core.proto index 5b07b5c44..1fa23868b 100644 --- a/images/core.proto +++ b/images/core.proto @@ -9,6 
+9,7 @@ import "core-ppc64.proto"; import "core-s390.proto"; import "core-mips.proto"; import "core-loongarch64.proto"; +import "core-riscv64.proto"; import "rlimit.proto"; import "timer.proto"; @@ -126,6 +127,7 @@ message core_entry { S390 = 5; MIPS = 6; LOONGARCH64 = 7; + RISCV64 = 8; } required march mtype = 1; @@ -136,6 +138,7 @@ message core_entry { optional thread_info_s390 ti_s390 = 10; optional thread_info_mips ti_mips = 11; optional thread_info_loongarch64 ti_loongarch64 = 12; + optional thread_info_riscv64 ti_riscv64 = 13; optional task_core_entry tc = 3; optional task_kobj_ids_entry ids = 4; From 6d970ed047592a2dacb51d81338ca7e9ecc21005 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 12:06:00 -0700 Subject: [PATCH 506/775] criu: add riscv64 support to parasite and restorer Co-authored-by: Yixue Zhao Co-authored-by: stove Signed-off-by: Haorong Lu --- criu/arch/riscv64/Makefile | 8 + criu/arch/riscv64/cpu.c | 40 ++++ criu/arch/riscv64/crtools.c | 171 ++++++++++++++++++ criu/arch/riscv64/include/asm/dump.h | 15 ++ criu/arch/riscv64/include/asm/int.h | 6 + criu/arch/riscv64/include/asm/kerndat.h | 7 + .../riscv64/include/asm/parasite-syscall.h | 6 + criu/arch/riscv64/include/asm/parasite.h | 16 ++ criu/arch/riscv64/include/asm/restore.h | 29 +++ criu/arch/riscv64/include/asm/restorer.h | 150 +++++++++++++++ .../arch/riscv64/include/asm/thread_pointer.h | 27 +++ criu/arch/riscv64/include/asm/types.h | 40 ++++ criu/arch/riscv64/include/asm/vdso.h | 28 +++ criu/arch/riscv64/restorer.c | 14 ++ criu/arch/riscv64/sigframe.c | 8 + criu/arch/riscv64/vdso-lookup.S | 15 ++ criu/arch/riscv64/vdso-pie.c | 159 ++++++++++++++++ criu/pie/Makefile | 8 + criu/pie/Makefile.library | 4 + 19 files changed, 751 insertions(+) create mode 100644 criu/arch/riscv64/Makefile create mode 100644 criu/arch/riscv64/cpu.c create mode 100644 criu/arch/riscv64/crtools.c create mode 100644 criu/arch/riscv64/include/asm/dump.h create mode 100644 
criu/arch/riscv64/include/asm/int.h create mode 100644 criu/arch/riscv64/include/asm/kerndat.h create mode 100644 criu/arch/riscv64/include/asm/parasite-syscall.h create mode 100644 criu/arch/riscv64/include/asm/parasite.h create mode 100644 criu/arch/riscv64/include/asm/restore.h create mode 100644 criu/arch/riscv64/include/asm/restorer.h create mode 100644 criu/arch/riscv64/include/asm/thread_pointer.h create mode 100644 criu/arch/riscv64/include/asm/types.h create mode 100644 criu/arch/riscv64/include/asm/vdso.h create mode 100644 criu/arch/riscv64/restorer.c create mode 100644 criu/arch/riscv64/sigframe.c create mode 100644 criu/arch/riscv64/vdso-lookup.S create mode 100644 criu/arch/riscv64/vdso-pie.c diff --git a/criu/arch/riscv64/Makefile b/criu/arch/riscv64/Makefile new file mode 100644 index 000000000..d19895471 --- /dev/null +++ b/criu/arch/riscv64/Makefile @@ -0,0 +1,8 @@ +builtin-name := crtools.built-in.o + +ldflags-y += -r + +obj-y += cpu.o +obj-y += crtools.o +obj-y += sigframe.o +obj-y += vdso-lookup.o \ No newline at end of file diff --git a/criu/arch/riscv64/cpu.c b/criu/arch/riscv64/cpu.c new file mode 100644 index 000000000..97a883b8c --- /dev/null +++ b/criu/arch/riscv64/cpu.c @@ -0,0 +1,40 @@ +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +#include +#include "cpu.h" + +int cpu_init(void) +{ + return 0; +} + +int cpu_dump_cpuinfo(void) +{ + return 0; +} + +int cpu_validate_cpuinfo(void) +{ + return 0; +} + +int cpu_dump_cpuinfo_single(void) +{ + return -ENOTSUP; +} + +int cpu_validate_image_cpuinfo_single(void) +{ + return -ENOTSUP; +} + +int cpuinfo_dump(void) +{ + return -ENOTSUP; +} + +int cpuinfo_check(void) +{ + return -ENOTSUP; +} diff --git a/criu/arch/riscv64/crtools.c b/criu/arch/riscv64/crtools.c new file mode 100644 index 000000000..b2d6d2951 --- /dev/null +++ b/criu/arch/riscv64/crtools.c @@ -0,0 +1,171 @@ +#include +#include + +#include + +#include "types.h" +#include + +#include +#include "asm/restorer.h" +#include 
"common/compiler.h" +#include +#include "asm/dump.h" +#include "protobuf.h" +#include "images/core.pb-c.h" +#include "images/creds.pb-c.h" +#include "parasite-syscall.h" +#include "log.h" +#include "util.h" +#include "cpu.h" +#include "restorer.h" +#include "compel/infect.h" + +#define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e + +int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) +{ + int i; + CoreEntry *core = x; + + // Save riscv64 gprs + assign_reg(core->ti_riscv64->gpregs, regs, pc); + assign_reg(core->ti_riscv64->gpregs, regs, ra); + assign_reg(core->ti_riscv64->gpregs, regs, sp); + assign_reg(core->ti_riscv64->gpregs, regs, gp); + assign_reg(core->ti_riscv64->gpregs, regs, tp); + assign_reg(core->ti_riscv64->gpregs, regs, t0); + assign_reg(core->ti_riscv64->gpregs, regs, t1); + assign_reg(core->ti_riscv64->gpregs, regs, t2); + assign_reg(core->ti_riscv64->gpregs, regs, s0); + assign_reg(core->ti_riscv64->gpregs, regs, s1); + assign_reg(core->ti_riscv64->gpregs, regs, a0); + assign_reg(core->ti_riscv64->gpregs, regs, a1); + assign_reg(core->ti_riscv64->gpregs, regs, a2); + assign_reg(core->ti_riscv64->gpregs, regs, a3); + assign_reg(core->ti_riscv64->gpregs, regs, a4); + assign_reg(core->ti_riscv64->gpregs, regs, a5); + assign_reg(core->ti_riscv64->gpregs, regs, a6); + assign_reg(core->ti_riscv64->gpregs, regs, a7); + assign_reg(core->ti_riscv64->gpregs, regs, s2); + assign_reg(core->ti_riscv64->gpregs, regs, s3); + assign_reg(core->ti_riscv64->gpregs, regs, s4); + assign_reg(core->ti_riscv64->gpregs, regs, s5); + assign_reg(core->ti_riscv64->gpregs, regs, s6); + assign_reg(core->ti_riscv64->gpregs, regs, s7); + assign_reg(core->ti_riscv64->gpregs, regs, s8); + assign_reg(core->ti_riscv64->gpregs, regs, s9); + assign_reg(core->ti_riscv64->gpregs, regs, s10); + assign_reg(core->ti_riscv64->gpregs, regs, s11); + assign_reg(core->ti_riscv64->gpregs, regs, t3); + assign_reg(core->ti_riscv64->gpregs, regs, t4); + 
assign_reg(core->ti_riscv64->gpregs, regs, t5); + assign_reg(core->ti_riscv64->gpregs, regs, t6); + + // Save riscv64 fprs + for (i = 0; i < 32; ++i) + assign_reg(core->ti_riscv64->fpsimd, fpsimd, f[i]); + assign_reg(core->ti_riscv64->fpsimd, fpsimd, fcsr); + + return 0; +} + +int arch_alloc_thread_info(CoreEntry *core) +{ + ThreadInfoRiscv64 *ti_riscv64; + UserRiscv64RegsEntry *gpregs; + UserRiscv64DExtEntry *fpsimd; + + ti_riscv64 = xmalloc(sizeof(*ti_riscv64)); + if (!ti_riscv64) + goto err; + thread_info_riscv64__init(ti_riscv64); + core->ti_riscv64 = ti_riscv64; + + gpregs = xmalloc(sizeof(*gpregs)); + if (!gpregs) + goto err; + user_riscv64_regs_entry__init(gpregs); + + ti_riscv64->gpregs = gpregs; + + fpsimd = xmalloc(sizeof(*fpsimd)); + if (!fpsimd) + goto err; + user_riscv64_d_ext_entry__init(fpsimd); + ti_riscv64->fpsimd = fpsimd; + fpsimd->f = xmalloc(32 * sizeof(fpsimd->f[0])); + fpsimd->n_f = 32; + if (!fpsimd->f) + goto err; + + return 0; +err: + return -1; +} + +void arch_free_thread_info(CoreEntry *core) +{ + if (core->ti_riscv64) { + if (core->ti_riscv64->fpsimd) { + xfree(core->ti_riscv64->fpsimd->f); + xfree(core->ti_riscv64->fpsimd); + } + xfree(core->ti_riscv64->gpregs); + xfree(core->ti_riscv64); + core->ti_riscv64 = NULL; + } +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) +{ + int i; + UserRiscv64DExtEntry *fpsimd = core->ti_riscv64->fpsimd; + + if (fpsimd->n_f != 32) + return 1; + + for (i = 0; i < 32; ++i) + sigframe->uc.uc_mcontext.__fpregs.__d.__f[i] = fpsimd->f[i]; + sigframe->uc.uc_mcontext.__fpregs.__d.__fcsr = fpsimd->fcsr; + + return 0; +} + +int restore_gpregs(struct rt_sigframe *f, UserRiscv64RegsEntry *r) +{ + f->uc.uc_mcontext.__gregs[0] = r->pc; + f->uc.uc_mcontext.__gregs[1] = r->ra; + f->uc.uc_mcontext.__gregs[2] = r->sp; + f->uc.uc_mcontext.__gregs[3] = r->gp; + f->uc.uc_mcontext.__gregs[4] = r->tp; + f->uc.uc_mcontext.__gregs[5] = r->t0; + f->uc.uc_mcontext.__gregs[6] = r->t1; + 
f->uc.uc_mcontext.__gregs[7] = r->t2; + f->uc.uc_mcontext.__gregs[8] = r->s0; + f->uc.uc_mcontext.__gregs[9] = r->s1; + f->uc.uc_mcontext.__gregs[10] = r->a0; + f->uc.uc_mcontext.__gregs[11] = r->a1; + f->uc.uc_mcontext.__gregs[12] = r->a2; + f->uc.uc_mcontext.__gregs[13] = r->a3; + f->uc.uc_mcontext.__gregs[14] = r->a4; + f->uc.uc_mcontext.__gregs[15] = r->a5; + f->uc.uc_mcontext.__gregs[16] = r->a6; + f->uc.uc_mcontext.__gregs[17] = r->a7; + f->uc.uc_mcontext.__gregs[18] = r->s2; + f->uc.uc_mcontext.__gregs[19] = r->s3; + f->uc.uc_mcontext.__gregs[20] = r->s4; + f->uc.uc_mcontext.__gregs[21] = r->s5; + f->uc.uc_mcontext.__gregs[22] = r->s6; + f->uc.uc_mcontext.__gregs[23] = r->s7; + f->uc.uc_mcontext.__gregs[24] = r->s8; + f->uc.uc_mcontext.__gregs[25] = r->s9; + f->uc.uc_mcontext.__gregs[26] = r->s10; + f->uc.uc_mcontext.__gregs[27] = r->s11; + f->uc.uc_mcontext.__gregs[28] = r->t3; + f->uc.uc_mcontext.__gregs[29] = r->t4; + f->uc.uc_mcontext.__gregs[30] = r->t5; + f->uc.uc_mcontext.__gregs[31] = r->t6; + + return 0; +} diff --git a/criu/arch/riscv64/include/asm/dump.h b/criu/arch/riscv64/include/asm/dump.h new file mode 100644 index 000000000..c2988f9bf --- /dev/null +++ b/criu/arch/riscv64/include/asm/dump.h @@ -0,0 +1,15 @@ +#ifndef __CR_ASM_DUMP_H__ +#define __CR_ASM_DUMP_H__ + +extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int arch_alloc_thread_info(CoreEntry *core); +extern void arch_free_thread_info(CoreEntry *core); + +static inline void core_put_tls(CoreEntry *core, tls_t tls) +{ + core->ti_riscv64->tls = tls; +} + +#define get_task_futex_robust_list_compat(pid, info) -1 + +#endif diff --git a/criu/arch/riscv64/include/asm/int.h b/criu/arch/riscv64/include/asm/int.h new file mode 100644 index 000000000..642804e9b --- /dev/null +++ b/criu/arch/riscv64/include/asm/int.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_INT_H__ +#define __CR_ASM_INT_H__ + +#include "asm-generic/int.h" + +#endif /* __CR_ASM_INT_H__ */ diff --git 
a/criu/arch/riscv64/include/asm/kerndat.h b/criu/arch/riscv64/include/asm/kerndat.h new file mode 100644 index 000000000..bb70cf6cf --- /dev/null +++ b/criu/arch/riscv64/include/asm/kerndat.h @@ -0,0 +1,7 @@ +#ifndef __CR_ASM_KERNDAT_H__ +#define __CR_ASM_KERNDAT_H__ + +#define kdat_compatible_cr() 0 +#define kdat_can_map_vdso() 0 + +#endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/riscv64/include/asm/parasite-syscall.h b/criu/arch/riscv64/include/asm/parasite-syscall.h new file mode 100644 index 000000000..6008c3792 --- /dev/null +++ b/criu/arch/riscv64/include/asm/parasite-syscall.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_PARASITE_SYSCALL_H__ +#define __CR_ASM_PARASITE_SYSCALL_H__ + +struct parasite_ctl; + +#endif diff --git a/criu/arch/riscv64/include/asm/parasite.h b/criu/arch/riscv64/include/asm/parasite.h new file mode 100644 index 000000000..4798cfd8a --- /dev/null +++ b/criu/arch/riscv64/include/asm/parasite.h @@ -0,0 +1,16 @@ +#ifndef __ASM_PARASITE_H__ +#define __ASM_PARASITE_H__ + +/* + * This function is used to retrieve the value of the thread pointer (tp) + * in RISC-V architecture, which is typically used for thread-local storage (TLS). + * The value is then stored in the provided tls_t pointer. 
+ */ +static inline void arch_get_tls(tls_t *ptls) +{ + tls_t tls; + asm("mv %0, tp" : "=r"(tls)); + *ptls = tls; +} + +#endif diff --git a/criu/arch/riscv64/include/asm/restore.h b/criu/arch/riscv64/include/asm/restore.h new file mode 100644 index 000000000..e4f25a57b --- /dev/null +++ b/criu/arch/riscv64/include/asm/restore.h @@ -0,0 +1,29 @@ +#ifndef __CR_ASM_RESTORE_H__ +#define __CR_ASM_RESTORE_H__ + +#include "asm/restorer.h" + +#include "images/core.pb-c.h" + +/* clang-format off */ +#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ + task_args) \ + asm volatile( \ + "and sp, %0, ~15 \n" \ + "mv a0, %2 \n" \ + "jr %1 \n" \ + : \ + : "r"(new_sp), \ + "r"(restore_task_exec_start), \ + "r"(task_args) \ + : "a0", "memory") +/* clang-format on */ + +static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) +{ + *ptls = pcore->ti_riscv64->tls; +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); + +#endif diff --git a/criu/arch/riscv64/include/asm/restorer.h b/criu/arch/riscv64/include/asm/restorer.h new file mode 100644 index 000000000..45fe847a9 --- /dev/null +++ b/criu/arch/riscv64/include/asm/restorer.h @@ -0,0 +1,150 @@ +#ifndef __CR_ASM_RESTORER_H__ +#define __CR_ASM_RESTORER_H__ + +#include + +#include "asm/types.h" +#include "images/core.pb-c.h" + +#include + +// kernel arg order for clone +// unsigned long clone_flags, +// unsigned long newsp, +// int __user * parent_tidptr, +// unsigned long tls, +// int __user * child_tidptr +/* clang-format off */ +#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ + thread_args, clone_restore_fn) \ + asm volatile( \ + "clone_emul: \n" \ + "ld a1, %2 \n" \ + "andi a1, a1, ~15 \n" \ + "addi a1, a1, -16 \n" \ + "sd %5, 0(a1) \n" \ + "sd %6, 8(a1) \n" \ + "mv a0, %1 \n" \ + "mv a2, %3 \n" \ + "mv a3, %4 \n" \ + "li a7, "__stringify(__NR_clone)" \n" \ + "ecall \n" \ + \ + "beqz a0, thread_run \n" \ + \ + "mv %0, a0 \n" \ + "j clone_end \n" \ + \ + "thread_run: \n" \ + 
"ld a1, 0(sp) \n" \ + "ld a0, 8(sp) \n" \ + "jr a1 \n" \ + \ + "clone_end: \n" \ + : "=r"(ret) \ + : "r"(clone_flags), \ + "m"(new_sp), \ + "r"(&parent_tid), \ + "r"(&thread_args[i].pid), \ + "r"(clone_restore_fn), \ + "r"(&thread_args[i]) \ + : "a0", "a1", "a2", "a3", "a7", "memory") + +/* + * Based on sysdeps/unix/sysv/linux/riscv/clone.S + * + * int clone(int (*fn)(void *arg), x0 + * void *child_stack, x1 + * int flags, x2 + * void *arg, x3 + * pid_t *ptid, x4 + * struct user_desc *tls, x5 + * pid_t *ctid); x6 + * + * int clone3(struct clone_args *args, x0 + * size_t size); x1 + * + * Always consult the CLONE3 wrappers for other architectures + * for additional details. + * + */ +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + /* In contrast to the clone() wrapper above this does not put + * the thread function and its arguments on the child stack, + * but uses registers to pass these parameters to the child process. + * Based on the glibc clone() wrapper at + * sysdeps/unix/sysv/linux/riscv/clone.S. + */ \ + "clone3_emul: \n" \ + /* + * Based on the glibc clone() wrapper, which uses x10 and x11 + * to save the arguments for the child process, this does the same. + * x10 for the thread function and x11 for the thread arguments. 
+ */ \ + "mv t0, %3 /* clone_restore_fn */ \n" \ + "mv t1, %4 /* args */ \n" \ + "mv a0, %1 /* &clone_args */ \n" \ + "mv a1, %2 /* size */ \n" \ + /* Load syscall number */ \ + "li a7, "__stringify(__NR_clone3)" \n" \ + /* Do the syscall */ \ + "ecall \n" \ + \ + "beqz a0, clone3_thread_run \n" \ + \ + "mv %0, a0 \n" \ + "j clone3_end \n" \ + \ + "clone3_thread_run: \n" \ + /* Move args to a0 */ \ + "mv a0, t1 \n" \ + /* Jump to clone_restore_fn */ \ + "jr t0 \n" \ + \ + "clone3_end: \n" \ + : "=r"(ret) \ + : "r"(&clone_args), \ + "r"(size), \ + "r"(clone_restore_fn), \ + "r"(args) \ + : "a0", "a1", "a7", "t0", "t1", "memory") + +#define ARCH_FAIL_CORE_RESTORE \ + asm volatile( \ + "mv sp, %0 \n" \ + "li a0, 0 \n" \ + "jr x0 \n" \ + : \ + : "r"(ret) \ + : "sp", "a0", "memory") +/* clang-format on */ + +#define arch_map_vdso(map, compat) -1 + +int restore_gpregs(struct rt_sigframe *f, UserRiscv64RegsEntry *r); +int restore_nonsigframe_gpregs(UserRiscv64RegsEntry *r); + +static inline void restore_tls(tls_t *ptls) +{ + asm("mv tp, %0" : : "r"(*ptls)); +} + +static inline void *alloc_compat_syscall_stack(void) +{ + return NULL; +} +static inline void free_compat_syscall_stack(void *stack32) +{ +} +static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) +{ + return -1; +} +static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) +{ + return -1; +} + +#endif \ No newline at end of file diff --git a/criu/arch/riscv64/include/asm/thread_pointer.h b/criu/arch/riscv64/include/asm/thread_pointer.h new file mode 100644 index 000000000..f7e07066a --- /dev/null +++ b/criu/arch/riscv64/include/asm/thread_pointer.h @@ -0,0 +1,27 @@ +/* __thread_pointer definition. Generic version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#ifndef _SYS_THREAD_POINTER_H +#define _SYS_THREAD_POINTER_H + +static inline void *__criu_thread_pointer(void) +{ + return __builtin_thread_pointer(); +} + +#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/riscv64/include/asm/types.h b/criu/arch/riscv64/include/asm/types.h new file mode 100644 index 000000000..83bb5f65f --- /dev/null +++ b/criu/arch/riscv64/include/asm/types.h @@ -0,0 +1,40 @@ +#ifndef __CR_ASM_TYPES_H__ +#define __CR_ASM_TYPES_H__ + +#include +#include +#include +#include "images/core.pb-c.h" + +#include "page.h" +#include "bitops.h" +#include "asm/int.h" + +#include + +#define core_is_compat(core) false + +typedef UserRiscv64RegsEntry UserRegsEntry; + +#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__RISCV64 + +#define CORE_THREAD_ARCH_INFO(core) core->ti_riscv64 + +#define TI_SP(core) ((core)->ti_riscv64->gpregs->sp) + +#define TI_IP(core) ((core)->ti_riscv64->gpregs->pc) + +static inline void *decode_pointer(uint64_t v) +{ + return (void *)v; +} +static inline uint64_t encode_pointer(void *p) +{ + return (uint64_t)p; +} + +#define AT_VECTOR_SIZE 64 +typedef uint64_t auxv_t; +typedef uint64_t tls_t; + +#endif /* __CR_ASM_TYPES_H__ */ diff --git a/criu/arch/riscv64/include/asm/vdso.h b/criu/arch/riscv64/include/asm/vdso.h new file mode 100644 index 000000000..322149c6e --- /dev/null +++ 
b/criu/arch/riscv64/include/asm/vdso.h @@ -0,0 +1,28 @@ +#ifndef __CR_ASM_VDSO_H__ +#define __CR_ASM_VDSO_H__ + +#include "asm/int.h" +#include "common/compiler.h" +#include "asm-generic/vdso.h" + +/* + * This is a minimal amount of symbols + * we should support at the moment. + */ +#define VDSO_SYMBOL_MAX 6 +#define VDSO_SYMBOL_GTOD 2 + +#define ARCH_VDSO_SYMBOLS_LIST \ + const char *rv64_vdso_symbol1 = "__vdso_clock_getres"; \ + const char *rv64_vdso_symbol2 = "__vdso_clock_gettime"; \ + const char *rv64_vdso_symbol3 = "__vdso_gettimeofday"; \ + const char *rv64_vdso_symbol4 = "__vdso_getcpu"; \ + const char *rv64_vdso_symbol5 = "__vdso_flush_icache"; \ + const char *rv64_vdso_symbol6 = "__vdso_rt_sigreturn"; + +#define ARCH_VDSO_SYMBOLS \ + rv64_vdso_symbol1, rv64_vdso_symbol2, rv64_vdso_symbol3, rv64_vdso_symbol4, rv64_vdso_symbol5, rv64_vdso_symbol6 + +extern void write_intraprocedure_branch(unsigned long to, unsigned long from); + +#endif /* __CR_ASM_VDSO_H__ */ \ No newline at end of file diff --git a/criu/arch/riscv64/restorer.c b/criu/arch/riscv64/restorer.c new file mode 100644 index 000000000..d605f048d --- /dev/null +++ b/criu/arch/riscv64/restorer.c @@ -0,0 +1,14 @@ +#include + +#include "restorer.h" +#include "asm/restorer.h" + +#include +#include "log.h" +#include +#include "cpu.h" + +int restore_nonsigframe_gpregs(UserRiscv64RegsEntry *r) +{ + return 0; +} diff --git a/criu/arch/riscv64/sigframe.c b/criu/arch/riscv64/sigframe.c new file mode 100644 index 000000000..8096fab66 --- /dev/null +++ b/criu/arch/riscv64/sigframe.c @@ -0,0 +1,8 @@ +#include "asm/types.h" +#include +#include "asm/sigframe.h" + +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} diff --git a/criu/arch/riscv64/vdso-lookup.S b/criu/arch/riscv64/vdso-lookup.S new file mode 100644 index 000000000..50d4ecf08 --- /dev/null +++ b/criu/arch/riscv64/vdso-lookup.S @@ -0,0 +1,15 @@ +#include "common/asm/linkage.h" + +.section .text 
+ +/* Expects t0 to hold the index into the lookup table. */ +GLOBAL(riscv_vdso_lookup) + /* Get the beginning of the lookup table */ + la t1, riscv_vdso_lookup_end + /* Scale the index */ + slli t0, t0, 3 + add t1, t0, t1 + ld t2, 0(t1) + jr t2 + +GLOBAL(riscv_vdso_lookup_end) \ No newline at end of file diff --git a/criu/arch/riscv64/vdso-pie.c b/criu/arch/riscv64/vdso-pie.c new file mode 100644 index 000000000..aa9272fb5 --- /dev/null +++ b/criu/arch/riscv64/vdso-pie.c @@ -0,0 +1,159 @@ +#include + +#include "asm/types.h" + +#include +#include +#include +#include +#include "atomic.h" +#include "parasite-vdso.h" +#include "log.h" +#include "common/bug.h" + +#ifdef LOG_PREFIX +#undef LOG_PREFIX +#endif +#define LOG_PREFIX "vdso: " + +/* These symbols are defined in vdso-lookup.S */ +extern char *riscv_vdso_lookup, *riscv_vdso_lookup_end; + +/* + * li t0, INDEX + * jal x0, riscv_vdso_lookup + */ +#define TRAMP_CALL_SIZE (2 * sizeof(uint32_t)) + +static inline void invalidate_caches(void) +{ + // We're supposed to use the VDSO as the officially sanctioned ABI. But oh well. + int ret; + __smp_mb(); + asm volatile("li a0, 0\n" + "li a1, 0\n" + "li a2, 1\n" /* SYS_RISCV_FLUSH_ICACHE_ALL */ + "li a7, 259\n" /* __NR_arch_specific_syscall */ + "ecall\n" + : "=r"(ret) + : + : "a0", "a1", "a2", "a7", "memory"); /* every register the sequence writes must be declared clobbered */ +} + +static inline size_t vdso_trampoline_size(void) +{ + return (size_t)&riscv_vdso_lookup_end - (size_t)&riscv_vdso_lookup; +} + +static uint64_t put_trampoline(uint64_t at, struct vdso_symtable *sym) +{ + int i, j; + uint64_t total_size, trampoline_size; + uint64_t trampoline = 0; + + /* First of all we have to find a place where to put the trampoline + * code. 
+ */ + trampoline_size = vdso_trampoline_size(); + total_size = trampoline_size + VDSO_SYMBOL_MAX * sizeof(uint64_t); + + for (i = 0; i < ARRAY_SIZE(sym->symbols); i++) { + if (vdso_symbol_empty(&sym->symbols[i])) + continue; + + pr_debug("Checking '%s' at %lx\n", sym->symbols[i].name, sym->symbols[i].offset); + + /* find the nearest following symbol we are interested in */ + for (j = 0; j < ARRAY_SIZE(sym->symbols); j++) { + if (i == j || vdso_symbol_empty(&sym->symbols[j])) + continue; + + if (sym->symbols[j].offset <= sym->symbols[i].offset) + /* this symbol is above the current one */ + continue; + + if ((sym->symbols[i].offset + TRAMP_CALL_SIZE) > sym->symbols[j].offset) { + /* we have a major issue here since we cannot + * even put the trampoline call for this symbol + */ + pr_err("Can't handle small vDSO symbol %s\n", sym->symbols[i].name); + return 0; + } + + if (trampoline) + /* no need to put it twice */ + continue; + + if ((sym->symbols[j].offset - (sym->symbols[i].offset + TRAMP_CALL_SIZE)) <= total_size) + /* not enough place */ + continue; + + /* We can put the trampoline there */ + trampoline = at + sym->symbols[i].offset; + trampoline += TRAMP_CALL_SIZE; + + pr_debug("Putting vDSO trampoline in %s at %lx\n", sym->symbols[i].name, trampoline); + memcpy((void *)trampoline, &riscv_vdso_lookup, trampoline_size); + invalidate_caches(); + return trampoline; + } + } + + return 0; +} + +static inline void put_trampoline_call(uint64_t from, uint64_t to, uint64_t trampoline, unsigned int idx) +{ + size_t trampoline_size = vdso_trampoline_size(); + uint64_t *lookup_table = NULL; + /* + * li t0, INDEX + * addi t0, x0 INDEX + * jal x0, riscv_vdso_lookup + */ + uint32_t trampoline_call[2] = { + 0x00000293, + 0x0000006f, + }; + const size_t insts_len = ARRAY_SIZE(trampoline_call); + uint32_t *call_addr = (uint32_t *)from; + // Offset from the jal instruction to the lookup trampoline. 
+ ssize_t trampoline_offset = trampoline - (from + sizeof(uint32_t)); + + trampoline_call[0] = trampoline_call[0] | (idx << 20); /* addi is I-type: its immediate occupies instruction bits 31:20 */ + trampoline_call[1] = trampoline_call[1] | riscv_j_imm(trampoline_offset); + + for (unsigned int i = 0; i < insts_len; i++) { + call_addr[i] = trampoline_call[i]; + } + + // Set the lookup table pointer for this vdso symbol. + lookup_table = (uint64_t *)(trampoline + trampoline_size); + lookup_table[idx] = to; +} + +int vdso_redirect_calls(uint64_t base_to, uint64_t base_from, struct vdso_symtable *to, struct vdso_symtable *from, + bool __always_unused compat_vdso) +{ + unsigned int i, valid_idx = 0; + + uint64_t trampoline = (uint64_t)put_trampoline(base_from, from); + if (!trampoline) + return 1; + + for (i = 0; i < ARRAY_SIZE(to->symbols); i++) { + if (vdso_symbol_empty(&from->symbols[i])) + continue; + + pr_debug("br: %lx/%lx -> %lx/%lx (index %d) '%s'\n", base_from, from->symbols[i].offset, base_to, + to->symbols[i].offset, i, from->symbols[i].name); + + put_trampoline_call(base_from + from->symbols[i].offset, base_to + to->symbols[i].offset, trampoline, + valid_idx); + valid_idx++; + } + + invalidate_caches(); + + return 0; +} \ No newline at end of file diff --git a/criu/pie/Makefile b/criu/pie/Makefile index 912fab24b..60c7f1e94 100644 --- a/criu/pie/Makefile +++ b/criu/pie/Makefile @@ -23,6 +23,10 @@ ifeq ($(ARCH),x86) ccflags-y += -mshstk endif +ifeq ($(ARCH),riscv64) + ccflags-y += -fno-stack-protector +endif + LDS := compel/arch/$(ARCH)/scripts/compel-pack.lds.S restorer-obj-y += parasite-vdso.o ./$(ARCH_DIR)/vdso-pie.o @@ -43,6 +47,10 @@ ifeq ($(ARCH),ppc64) restorer-obj-y += ./$(ARCH_DIR)/vdso-trampoline.o endif +ifeq ($(ARCH),riscv64) + restorer-obj-y += ./$(ARCH_DIR)/vdso-lookup.o +endif + define gen-pie-rules $(1)-obj-y += $(1).o $(1)-obj-e += pie.lib.a diff --git a/criu/pie/Makefile.library b/criu/pie/Makefile.library index da2a2fab3..d96a7ac32 100644 --- 
a/criu/pie/Makefile.library +++ b/criu/pie/Makefile.library @@ 
-27,3 +27,7 @@ CFLAGS += $(CFLAGS_PIE) ifeq ($(ARCH),mips) CFLAGS += -fno-stack-protector -DCR_NOGLIBC -mno-abicalls -fno-pic endif + +ifeq ($(ARCH),riscv64) + ccflags-y += -fno-stack-protector +endif \ No newline at end of file From bb29067de9ee853fd132f88b4b3e62dbd87aa915 Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 12:09:16 -0700 Subject: [PATCH 507/775] zdtm: add riscv64 support Signed-off-by: Haorong Lu --- .../lib/arch/riscv64/include/asm/atomic.h | 107 ++++++++++++++++++ test/zdtm/lib/test.c | 2 +- test/zdtm/static/fanotify00.c | 2 +- test/zdtm/static/netns-nf.desc | 2 +- test/zdtm/static/netns-nft-ipt.desc | 2 +- .../static/socket-tcp-closed-last-ack.desc | 4 +- test/zdtm/static/socket-tcp-reseted.desc | 6 +- test/zdtm/static/socket-tcp-syn-sent.desc | 4 +- 8 files changed, 118 insertions(+), 11 deletions(-) create mode 100644 test/zdtm/lib/arch/riscv64/include/asm/atomic.h diff --git a/test/zdtm/lib/arch/riscv64/include/asm/atomic.h b/test/zdtm/lib/arch/riscv64/include/asm/atomic.h new file mode 100644 index 000000000..a4faf1322 --- /dev/null +++ b/test/zdtm/lib/arch/riscv64/include/asm/atomic.h @@ -0,0 +1,107 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +typedef uint32_t atomic_t; + +/* Copied from the Linux header arch/riscv/include/asm/barrier.h */ + +#define nop() __asm__ __volatile__("nop") + +#define RISCV_FENCE(p, s) __asm__ __volatile__("fence " #p "," #s : : : "memory") + +/* These barriers need to enforce ordering on both devices or memory. */ +#define mb() RISCV_FENCE(iorw, iorw) +#define rmb() RISCV_FENCE(ir, ir) +#define wmb() RISCV_FENCE(ow, ow) + +/* These barriers do not need to enforce ordering on devices, just memory. 
*/ +#define __smp_mb() RISCV_FENCE(rw, rw) +#define __smp_rmb() RISCV_FENCE(r, r) +#define __smp_wmb() RISCV_FENCE(w, w) + +#define __smp_store_release(p, v) \ + do { \ + compiletime_assert_atomic_type(*p); \ + RISCV_FENCE(rw, w); \ + WRITE_ONCE(*p, v); \ + } while (0) + +#define __smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + RISCV_FENCE(r, rw); \ + ___p1; \ + }) + +/* Copied from the Linux kernel header arch/riscv/include/asm/atomic.h */ + +static inline int atomic_read(const atomic_t *v) +{ + return (*(volatile int *)v); +} + +static inline void atomic_set(atomic_t *v, int i) +{ + *v = i; +} + +#define atomic_get atomic_read + +static inline int atomic_add_return(int i, atomic_t *v) +{ + int result; + + asm volatile("amoadd.w.aqrl %1, %2, %0" : "+A"(*v), "=r"(result) : "r"(i) : "memory"); + __smp_mb(); + return result + i; +} + +static inline int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i, v); +} + +static inline int atomic_inc(atomic_t *v) +{ + return atomic_add_return(1, v) - 1; +} + +static inline int atomic_add(int val, atomic_t *v) +{ + return atomic_add_return(val, v) - val; +} + +static inline int atomic_dec(atomic_t *v) +{ + return atomic_sub_return(1, v) + 1; +} + +/* true if the result is 0, or false for all other cases. 
*/ +#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +#define atomic_inc_return(v) (atomic_add_return(1, v)) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + unsigned long tmp; + int oldval; + + __smp_mb(); + + asm volatile("1:\n" + " lr.w %1, %2\n" + " bne %1, %3, 2f\n" + " sc.w %0, %4, %2\n" + " bnez %0, 1b\n" + "2:" + : "=&r"(tmp), "=&r"(oldval), "+A"(*ptr) + : "r"(old), "r"(new) + : "memory"); + + __smp_mb(); + return oldval; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/test/zdtm/lib/test.c b/test/zdtm/lib/test.c index a5ba38b2d..95017e42e 100644 --- a/test/zdtm/lib/test.c +++ b/test/zdtm/lib/test.c @@ -406,7 +406,7 @@ pid_t sys_clone_unified(unsigned long flags, void *child_stack, void *parent_tid { #ifdef __x86_64__ return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, child_tid, newtls); -#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__ || __loongarch64) +#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__ || __loongarch64 || __riscv) return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, newtls, child_tid); #elif __s390x__ return (pid_t)syscall(__NR_clone, child_stack, flags, parent_tid, child_tid, newtls); diff --git a/test/zdtm/static/fanotify00.c b/test/zdtm/static/fanotify00.c index 69ead43e7..0400cc74b 100644 --- a/test/zdtm/static/fanotify00.c +++ b/test/zdtm/static/fanotify00.c @@ -22,7 +22,7 @@ #elif defined(__PPC64__) #define __NR_fanotify_init 323 #define __NR_fanotify_mark 324 -#elif __aarch64__ +#elif (__aarch64__ || __riscv) #define __NR_fanotify_init 262 #define __NR_fanotify_mark 263 #elif __s390x__ diff --git a/test/zdtm/static/netns-nf.desc b/test/zdtm/static/netns-nf.desc index e7e73b1ae..c99696d1c 100644 --- a/test/zdtm/static/netns-nf.desc +++ b/test/zdtm/static/netns-nf.desc @@ -1,6 +1,6 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - 
'/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', '/usr/bin/diff'], 'flags': 'suid', 'flavor': 'ns uns'} diff --git a/test/zdtm/static/netns-nft-ipt.desc b/test/zdtm/static/netns-nft-ipt.desc index 4120f74d6..6d04589b3 100644 --- a/test/zdtm/static/netns-nft-ipt.desc +++ b/test/zdtm/static/netns-nft-ipt.desc @@ -2,7 +2,7 @@ 'deps': [ '/bin/sh', '/usr/sbin/nft', '/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', '/usr/bin/diff'], 'flags': 'suid', 'flavor': 'ns uns'} diff --git a/test/zdtm/static/socket-tcp-closed-last-ack.desc 
b/test/zdtm/static/socket-tcp-closed-last-ack.desc index d4cfe5064..309854fa5 100644 --- a/test/zdtm/static/socket-tcp-closed-last-ack.desc +++ b/test/zdtm/static/socket-tcp-closed-last-ack.desc @@ -1,7 +1,7 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_tcp.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', diff --git a/test/zdtm/static/socket-tcp-reseted.desc b/test/zdtm/static/socket-tcp-reseted.desc index 3ebdfeef8..4aa48ad87 100644 --- a/test/zdtm/static/socket-tcp-reseted.desc +++ b/test/zdtm/static/socket-tcp-reseted.desc @@ -1,8 +1,8 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - 
'/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', - '/lib/xtables/libipt_REJECT.so|/usr/lib64/xtables/libipt_REJECT.so|/usr/lib/powerpc64le-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/x86_64-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/xtables/libipt_REJECT.so|/usr/lib/s390x-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/aarch64-linux-gnu/xtables/libipt_REJECT.so', + '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_tcp.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libipt_REJECT.so|/usr/lib64/xtables/libipt_REJECT.so|/usr/lib/powerpc64le-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/x86_64-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/xtables/libipt_REJECT.so|/usr/lib/s390x-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/aarch64-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/riscv64-linux-gnu/xtables/libipt_REJECT.so', ], 
'opts': '--tcp-established', 'flags': 'suid nouser samens', diff --git a/test/zdtm/static/socket-tcp-syn-sent.desc b/test/zdtm/static/socket-tcp-syn-sent.desc index 4cc23c8fc..71cd26d72 100644 --- a/test/zdtm/static/socket-tcp-syn-sent.desc +++ b/test/zdtm/static/socket-tcp-syn-sent.desc @@ -1,7 +1,7 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_tcp.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', From da6b1807ef76fee6b744f0d3f8f50af26c492baa Mon Sep 17 00:00:00 2001 From: Haorong Lu Date: Tue, 1 Aug 2023 12:10:46 -0700 Subject: [PATCH 508/775] ci: add workflow for riscv64 Signed-off-by: Haorong Lu --- .github/workflows/cross-compile-daily.yml | 2 
+- .github/workflows/cross-compile.yml | 1 + .../build/Dockerfile.riscv64-stable-cross.hdr | 5 ++ .../Dockerfile.riscv64-stable-cross.tmpl | 57 +++++++++++++++++++ scripts/build/Makefile | 2 +- scripts/ci/riscv64-cross/amd64-sources.list | 10 ++++ scripts/ci/riscv64-cross/riscv64-sources.list | 42 ++++++++++++++ 7 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 scripts/build/Dockerfile.riscv64-stable-cross.hdr create mode 100644 scripts/build/Dockerfile.riscv64-stable-cross.tmpl create mode 100644 scripts/ci/riscv64-cross/amd64-sources.list create mode 100644 scripts/ci/riscv64-cross/riscv64-sources.list diff --git a/.github/workflows/cross-compile-daily.yml b/.github/workflows/cross-compile-daily.yml index b8c8c86d4..c709cca00 100644 --- a/.github/workflows/cross-compile-daily.yml +++ b/.github/workflows/cross-compile-daily.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - target: [armv7-stable-cross, aarch64-stable-cross, ppc64-stable-cross, mips64el-stable-cross] + target: [armv7-stable-cross, aarch64-stable-cross, ppc64-stable-cross, mips64el-stable-cross, riscv64-stable-cross] branches: [criu-dev, master] steps: diff --git a/.github/workflows/cross-compile.yml b/.github/workflows/cross-compile.yml index 06b812823..96672b294 100644 --- a/.github/workflows/cross-compile.yml +++ b/.github/workflows/cross-compile.yml @@ -21,6 +21,7 @@ jobs: aarch64-stable-cross, ppc64-stable-cross, mips64el-stable-cross, + riscv64-stable-cross, ] include: - experimental: true diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.hdr b/scripts/build/Dockerfile.riscv64-stable-cross.hdr new file mode 100644 index 000000000..d4c414023 --- /dev/null +++ b/scripts/build/Dockerfile.riscv64-stable-cross.hdr @@ -0,0 +1,5 @@ +FROM ubuntu:jammy + +ENV ARCH=riscv64 +ENV DEBIAN_ARCH=riscv64 +ENV CROSS_TRIPLET=riscv64-linux-gnu diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl new 
file mode 100644 index 000000000..39a0c33c6 --- /dev/null +++ b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl @@ -0,0 +1,57 @@ +COPY scripts/ci/apt-install /bin/apt-install + +# Add the cross compiler sources +RUN apt-get clean -y && apt-get update -y && apt-get install -y --no-install-recommends gnupg2 + +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 871920D1991BC93C 8D69674688B6CB36 B523E5F3FC4E5F2C + +COPY scripts/ci/riscv64-cross/amd64-sources.list /etc/apt/sources.list + +COPY scripts/ci/riscv64-cross/riscv64-sources.list /etc/apt/sources.list.d/ + +RUN dpkg --add-architecture ${DEBIAN_ARCH} && \ + apt-get update -y + +# Install required packages +RUN apt-get install -y --no-install-recommends \ + build-essential \ + pkg-config \ + git \ + crossbuild-essential-${DEBIAN_ARCH} \ + libc6-dev-${DEBIAN_ARCH}-cross \ + libc6-${DEBIAN_ARCH}-cross \ + libbz2-dev:${DEBIAN_ARCH} \ + libexpat1-dev:${DEBIAN_ARCH} \ + ncurses-dev:${DEBIAN_ARCH} \ + libssl-dev:${DEBIAN_ARCH} \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-protobuf \ + libnl-3-dev:${DEBIAN_ARCH} \ + libprotobuf-dev:${DEBIAN_ARCH} \ + libnet-dev:${DEBIAN_ARCH} \ + libprotobuf-c-dev:${DEBIAN_ARCH} \ + libcap-dev:${DEBIAN_ARCH} \ + libaio-dev:${DEBIAN_ARCH} \ + libnl-route-3-dev:${DEBIAN_ARCH} \ + libnftables-dev:${DEBIAN_ARCH} \ + libgnutls28-dev:${DEBIAN_ARCH} \ + iproute2:${DEBIAN_ARCH} + +ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ + CROSS_ROOT=/usr/${CROSS_TRIPLET} \ + AS=/usr/bin/${CROSS_TRIPLET}-as \ + AR=/usr/bin/${CROSS_TRIPLET}-ar \ + CC=/usr/bin/${CROSS_TRIPLET}-gcc \ + CPP=/usr/bin/${CROSS_TRIPLET}-cpp \ + CXX=/usr/bin/${CROSS_TRIPLET}-g++ \ + LD=/usr/bin/${CROSS_TRIPLET}-ld \ + FC=/usr/bin/${CROSS_TRIPLET}-gfortran + +ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ + PKG_CONFIG_PATH=/usr/lib/${CROSS_TRIPLET}/pkgconfig + +COPY . 
/criu +WORKDIR /criu + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Makefile b/scripts/build/Makefile index bc4a59db1..389315227 100644 --- a/scripts/build/Makefile +++ b/scripts/build/Makefile @@ -1,5 +1,5 @@ ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf centos8 -STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross +STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross riscv64-stable-cross UNSTABLE_CROSS_ARCHES := armv7-unstable-cross aarch64-unstable-cross ppc64-unstable-cross mips64el-unstable-cross NON_CLANG := $(UNSTABLE_CROSS_ARCHES) $(STABLE_CROSS_ARCHES) CREATE_DOCKERFILES := $(ARCHES) $(NON_CLANG) diff --git a/scripts/ci/riscv64-cross/amd64-sources.list b/scripts/ci/riscv64-cross/amd64-sources.list new file mode 100644 index 000000000..72dad920c --- /dev/null +++ b/scripts/ci/riscv64-cross/amd64-sources.list @@ -0,0 +1,10 @@ +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy main restricted +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-updates main restricted +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy universe +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-updates universe +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy multiverse +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-updates multiverse +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse +deb [arch=amd64] http://security.ubuntu.com/ubuntu/ jammy-security main restricted +deb [arch=amd64] http://security.ubuntu.com/ubuntu/ jammy-security universe +deb [arch=amd64] http://security.ubuntu.com/ubuntu/ jammy-security multiverse \ No newline at end of file diff --git a/scripts/ci/riscv64-cross/riscv64-sources.list b/scripts/ci/riscv64-cross/riscv64-sources.list new file mode 100644 index 000000000..67b8067b6 --- /dev/null +++ 
b/scripts/ci/riscv64-cross/riscv64-sources.list @@ -0,0 +1,42 @@ +# See http://help.ubuntu.com/community/UpgradeNotes for how to upgrade to +# newer versions of the distribution. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy main restricted +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy main restricted + +## Major bug fix updates produced after the final release of the +## distribution. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-updates main restricted +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-updates main restricted + +## N.B. software from this repository is ENTIRELY UNSUPPORTED by the Ubuntu +## team. Also, please note that software in universe WILL NOT receive any +## review or updates from the Ubuntu security team. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy universe +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy universe +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-updates universe +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-updates universe + +## N.B. software from this repository is ENTIRELY UNSUPPORTED by the Ubuntu +## team, and may not be under a free licence. Please satisfy yourself as to +## your rights to use the software. Also, please note that software in +## multiverse WILL NOT receive any review or updates from the Ubuntu +## security team. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy multiverse +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy multiverse +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-updates multiverse +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-updates multiverse + +## N.B. software from this repository may not have been tested as +## extensively as that contained in the main release, although it includes +## newer versions of some applications which may provide useful features. 
+## Also, please note that software in backports WILL NOT receive any review +## or updates from the Ubuntu security team. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-backports main restricted universe multiverse +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-backports main restricted universe multiverse + +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-security main restricted +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-security main restricted +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-security universe +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-security universe +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-security multiverse +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-security multiverse \ No newline at end of file From 2be958d22ea4d78f5ba718688025ff0509a47ac2 Mon Sep 17 00:00:00 2001 From: Cryolitia PukNgae Date: Mon, 14 Oct 2024 01:35:44 +0800 Subject: [PATCH 509/775] include: don't use GCC's __builtin_ffs on riscv64 Link: https://github.com/SerenityOS/serenity/commit/e300da4db42e2484d98f4982d03150d83436304e Signed-off-by: PukNgae Cryolitia --- - cherry-picked Signed-off-by: Alexander Mikhalitsyn --- include/common/arch/riscv64/asm/bitops.h | 111 ++++++++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/include/common/arch/riscv64/asm/bitops.h b/include/common/arch/riscv64/asm/bitops.h index 400cc3e15..eabab27c7 100644 --- a/include/common/arch/riscv64/asm/bitops.h +++ b/include/common/arch/riscv64/asm/bitops.h @@ -2,7 +2,116 @@ #define __CR_ASM_BITOPS_H__ #include "common/compiler.h" -#include "common/asm-generic/bitops.h" +#include "common/asm/bitsperlong.h" + +#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) + +#define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] +#define BITMAP_SIZE(name) (sizeof(name) * CHAR_BIT) + +#if __GNUC__ < 
4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) +/* Technically wrong, but this avoids compilation errors on some gcc + versions. */ +#define BITOP_ADDR(x) "=m"(*(volatile long *)(x)) +#else +#define BITOP_ADDR(x) "+m"(*(volatile long *)(x)) +#endif + +#define ADDR BITOP_ADDR(addr) + +static inline void set_bit(int nr, volatile unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + *addr |= (1UL << (nr % BITS_PER_LONG)); +} + +static inline void change_bit(int nr, volatile unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + *addr ^= (1UL << (nr % BITS_PER_LONG)); +} + +static inline int test_bit(int nr, volatile const unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + return (*addr & (1UL << (nr % BITS_PER_LONG))) ? -1 : 0; +} + +static inline void clear_bit(int nr, volatile unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + *addr &= ~(1UL << (nr % BITS_PER_LONG)); +} + +/** + * __ffs - find first set bit in word + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static inline unsigned long __ffs(unsigned long word) +{ + int p = 0; + + for (; p < 8*sizeof(word); ++p) { + if (word & 1) { + break; + } + + word >>= 1; + } + + return p; +} + +#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) + +/* + * Find the next set bit in a memory region. 
+ */ +static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BITOP_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset %= BITS_PER_LONG; + if (offset) { + tmp = *(p++); + tmp &= (~0UL << offset); + if (size < BITS_PER_LONG) + goto found_first; + if (tmp) + goto found_middle; + size -= BITS_PER_LONG; + result += BITS_PER_LONG; + } + while (size & ~(BITS_PER_LONG - 1)) { + if ((tmp = *(p++))) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp &= (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __ffs(tmp); +} + +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), 0); i < BITMAP_SIZE(bitmask); \ + i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), i + 1)) + #define BITS_PER_LONG 64 From 9052ef93c79f92634eb163080292d79af496617a Mon Sep 17 00:00:00 2001 From: Liu Hua Date: Sat, 12 Oct 2024 15:29:40 +0800 Subject: [PATCH 510/775] uffd: Disable image deduplication after fork After a fork, both the child and parent processes may trigger a page fault (#PF) at the same virtual address, referencing the same position in the page image. If deduplication is enabled, the last process to trigger the page fault will fail. Therefore, deduplication should be disabled after a fork to prevent this issue. 
Signed-off-by: Liu Hua --- criu/include/pagemap.h | 5 +++++ criu/pagemap.c | 11 ++++++++++- criu/uffd.c | 2 ++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index 8c7180559..3ae15deb9 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -58,6 +58,9 @@ struct page_read { /* Whether or not pages can be read in PIE code */ bool pieok; + /* Whether or not disable image deduplication*/ + bool disable_dedup; + /* Private data of reader */ struct cr_img *pmi; struct cr_img *pi; @@ -112,6 +115,8 @@ int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta); */ extern void dup_page_read(struct page_read *src, struct page_read *dst); +extern void page_read_disable_dedup(struct page_read *pr); + extern int dedup_one_iovec(struct page_read *pr, unsigned long base, unsigned long len); static inline unsigned long pagemap_len(PagemapEntry *pe) diff --git a/criu/pagemap.c b/criu/pagemap.c index 83f69bba3..85bb92259 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -261,7 +261,7 @@ static int read_local_page(struct page_read *pr, unsigned long vaddr, unsigned l break; } - if (opts.auto_dedup) { + if (opts.auto_dedup && !pr->disable_dedup) { ret = punch_hole(pr, pr->pi_off, len, false); if (ret == -1) return -1; @@ -792,6 +792,7 @@ int open_page_read_at(int dfd, unsigned long img_id, struct page_read *pr, int p pr->bunch.iov_base = NULL; pr->pmes = NULL; pr->pieok = false; + pr->disable_dedup = false; pr->pmi = open_image_at(dfd, i_typ, O_RSTR, img_id); if (!pr->pmi) @@ -852,6 +853,14 @@ int open_page_read(unsigned long img_id, struct page_read *pr, int pr_flags) #define DUP_IDS_BASE 1000 +void page_read_disable_dedup(struct page_read *pr) +{ + pr_debug("disable dedup, id: %d\n", pr->id); + pr->disable_dedup = true; + if (pr->parent) + page_read_disable_dedup(pr->parent); +} + void dup_page_read(struct page_read *src, struct page_read *dst) { static int dup_ids = 1; diff --git 
a/criu/uffd.c b/criu/uffd.c index e07b21b69..98c2b7e07 100644 --- a/criu/uffd.c +++ b/criu/uffd.c @@ -1098,6 +1098,8 @@ static int handle_fork(struct lazy_pages_info *parent_lpi, struct uffd_msg *msg) lpi_get(lpi->parent); + page_read_disable_dedup(&parent_lpi->pr); + page_read_disable_dedup(&lpi->pr); return 1; out: From 622b43392fcb330343243a0f2842dd2919f977cc Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Fri, 18 Oct 2024 18:51:18 +0200 Subject: [PATCH 511/775] criu: Initialize util before service worker starts When restoring dumps in new mount + pid namespaces where multiple dumps share the same network namespace, CRIU may fail due to conflicting unix socket names. This happens because the service worker creates sockets using a pattern that includes criu_run_id, but util_init() is called after cr_service_work() starts. The socket naming pattern "crtools-fd-%d-%d" uses the restore PID and criu_run_id, however criu_run_id is always 0 when not initialized, leading to conflicts when multiple restores run simultaneously either in the same CRIU process or because of multiple CRIU processes doing the same operation in different PID namespaces. Fix this by: - Moving util_init() before cr_service_work() starts - Adding a second util_init() call in the service worker fork to ensure unique IDs across multiple worker runs - Making sure that dump and restore operations have util_init() called early to generate unique socket names With this fix, socket names always include the namespace ID, preventing conflicts when multiple processes with the same pid share a network namespace. 
Fixes #2499 [ avagin: minor code changes ] Signed-off-by: Lorenzo Fontana Signed-off-by: Andrei Vagin --- criu/cr-service.c | 8 ++++++++ criu/crtools.c | 10 +++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index 61a04c5ff..b9d11ced2 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -1310,6 +1310,14 @@ int cr_service_work(int sk) int ret = -1; CriuReq *msg = 0; + /* + * util_init initializes criu_run_id and compel_run_id so that sockets + * are generated with an unique name identifying the specific process + * even in cases where multiple processes with the same pid in + * different pid namespaces are sharing the same network namespace. + */ + util_init(); + more: opts.mode = CR_SWRK; diff --git a/criu/crtools.c b/criu/crtools.c index 94657f418..6f493850b 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -169,7 +169,13 @@ int main(int argc, char *argv[], char *envp[]) pr_err("unknown command: %s\n", argv[optind]); goto usage; } - + /* + * util_init initializes criu_run_id and compel_run_id so that sockets + * are generated with an unique name identifying the specific process + * even in cases where multiple processes with the same pid in + * different pid namespaces are sharing the same network namespace. + */ + util_init(); if (opts.mode == CR_SWRK) { if (argc != optind + 2) { fprintf(stderr, "Usage: criu swrk \n"); @@ -254,8 +260,6 @@ int main(int argc, char *argv[], char *envp[]) return 1; } - util_init(); - if (log_init(opts.output)) return 1; From ff9dbef902361bfdda8e30e46c8f6b0df710de9f Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 4 Nov 2024 19:30:26 +0000 Subject: [PATCH 512/775] seize: fix error handling for check_freezer_cgroup When `check_freezer_cgroup()` has non-zero return value, `goto err` calls `return ret`. However, the value of `ret` has been set to `0` in the lines above and CRIU does not handle the error properly.
This problem is related to https://github.com/checkpoint-restore/criu/issues/2508 Signed-off-by: Radostin Stoyanov --- criu/seize.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index edeb57cc8..ab394f9ca 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1009,7 +1009,7 @@ static int cgroup_version(void) int collect_pstree(void) { pid_t pid = root_item->pid->real; - int ret = -1; + int ret, exit_code = -1; struct proc_status_creds creds; struct pstree_item *iter; @@ -1069,7 +1069,6 @@ int collect_pstree(void) if (opts.freeze_cgroup && !freeze_cgroup_disabled && freezer_wait_processes()) { - ret = -1; goto err; } @@ -1081,12 +1080,12 @@ int collect_pstree(void) goto err; } - ret = 0; + exit_code = 0; timing_stop(TIME_FREEZING); timing_start(TIME_FROZEN); err: /* Freezing stage finished in time - disable timer. */ alarm(0); - return ret; + return exit_code; } From 4196268eef099833cd77c0fb93d367bcd8ce1463 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 4 Nov 2024 19:57:30 +0000 Subject: [PATCH 513/775] seize: enable support for frozen containers Container runtimes like CRI-O and containerd utilize the freezer cgroup to create a consistent snapshot of container root filesystem (rootfs) changes. In this case, the container is frozen before invoking CRIU. After CRIU successfully completes, a copy of the container rootfs diff is saved, and the container is then unfrozen. However, the `cuda-checkpoint` tool is not able to perform a 'lock' action on frozen threads. To support GPU checkpointing with these container runtimes, we need to unfreeze the cgroup and return it to its original state once the checkpointing is complete. 
To reflect this new behavior, the following changes are applied: - `dont_use_freeze_cgroup(void)` -> `set_compel_interrupt_only_mode(void)` - `bool freeze_cgroup_disabled` -> `bool compel_interrupt_only_mode` - `check_freezer_cgroup(void)` -> `prepare_freezer_for_interrupt_only_mode(void)` Note that when `compel_interrupt_only_mode` is set to `true`, `compel_interrupt_task()` is used instead of `freeze_processes()` to prevent tasks from running during `criu dump`. Fixes: #2508 Signed-off-by: Radostin Stoyanov --- criu/fault-injection.c | 4 +-- criu/include/fault-injection.h | 2 +- criu/include/seize.h | 2 +- criu/seize.c | 46 +++++++++++++++++++--------------- plugins/cuda/cuda_plugin.c | 2 +- test/jenkins/criu-fault.sh | 2 +- 6 files changed, 32 insertions(+), 26 deletions(-) diff --git a/criu/fault-injection.c b/criu/fault-injection.c index 2272e6d84..5dd9acf60 100644 --- a/criu/fault-injection.c +++ b/criu/fault-injection.c @@ -24,8 +24,8 @@ int fault_injection_init(void) fi_strategy = start; switch (fi_strategy) { - case FI_DISABLE_FREEZE_CGROUP: - dont_use_freeze_cgroup(); + case FI_COMPEL_INTERRUPT_ONLY_MODE: + set_compel_interrupt_only_mode(); break; default: break; diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index 59adf05b9..e987c18ce 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -21,7 +21,7 @@ enum faults { FI_CORRUPT_EXTREGS = 134, FI_DONT_USE_PAGEMAP_SCAN = 135, FI_DUMP_CRASH = 136, - FI_DISABLE_FREEZE_CGROUP = 137, + FI_COMPEL_INTERRUPT_ONLY_MODE = 137, FI_PLUGIN_CUDA_FORCE_ENABLE = 138, FI_MAX, }; diff --git a/criu/include/seize.h b/criu/include/seize.h index f5ea76b16..64e8d2d12 100644 --- a/criu/include/seize.h +++ b/criu/include/seize.h @@ -9,6 +9,6 @@ extern bool alarm_timeouted(void); extern char *task_comm_info(pid_t pid, char *comm, size_t size); extern char *__task_comm_info(pid_t pid); -extern void dont_use_freeze_cgroup(void); +extern void 
set_compel_interrupt_only_mode(void); #endif diff --git a/criu/seize.c b/criu/seize.c index ab394f9ca..9bd1832d9 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -25,17 +25,17 @@ #include "xmalloc.h" #include "util.h" -static bool freeze_cgroup_disabled; +static bool compel_interrupt_only_mode; /* * Disables the use of freeze cgroups for process seizing, even if explicitly - * requested via the --freeze-cgroup option. This is necessary for plugins - * (e.g., CUDA) that do not function correctly when processes are frozen using - * cgroups. + * requested via the --freeze-cgroup option or already set in a frozen state. + * This is necessary for plugins (e.g., CUDA) that do not function correctly + * when processes are frozen using cgroups. */ -void __attribute__((used)) dont_use_freeze_cgroup(void) +void __attribute__((used)) set_compel_interrupt_only_mode(void) { - freeze_cgroup_disabled = true; + compel_interrupt_only_mode = true; } char *task_comm_info(pid_t pid, char *comm, size_t size) @@ -410,7 +410,7 @@ static int freezer_detach(void) { int i; - if (!opts.freeze_cgroup || freeze_cgroup_disabled) + if (!opts.freeze_cgroup || compel_interrupt_only_mode) return 0; for (i = 0; i < processes_to_wait && processes_to_wait_pids; i++) { @@ -505,29 +505,35 @@ static int log_unfrozen_stacks(char *root) return 0; } -static int check_freezer_cgroup(void) +static int prepare_freezer_for_interrupt_only_mode(void) { enum freezer_state state = THAWED; int fd; + int exit_code = -1; - BUG_ON(!freeze_cgroup_disabled); + BUG_ON(!compel_interrupt_only_mode); fd = freezer_open(); if (fd < 0) return -1; state = get_freezer_state(fd); - close(fd); if (state == FREEZER_ERROR) { - return -1; + goto err; } + origin_freezer_state = state == FREEZING ? 
FROZEN : state; + if (state != THAWED) { - pr_err("One or more plugins are incompatible with the freezer cgroup in the FROZEN state.\n"); - return -1; + pr_warn("unfreezing cgroup for plugin compatibility\n"); + if (freezer_write_state(fd, THAWED)) + goto err; } - return 0; + exit_code = 0; +err: + close(fd); + return exit_code; } static int freeze_processes(void) @@ -681,7 +687,7 @@ static int collect_children(struct pstree_item *item) goto free; } - if (!opts.freeze_cgroup || freeze_cgroup_disabled) + if (!opts.freeze_cgroup || compel_interrupt_only_mode) /* fails when meets a zombie */ __ignore_value(compel_interrupt_task(pid)); @@ -869,7 +875,7 @@ static int collect_threads(struct pstree_item *item) pr_info("\tSeizing %d's %d thread\n", item->pid->real, pid); - if ((!opts.freeze_cgroup || freeze_cgroup_disabled) && + if ((!opts.freeze_cgroup || compel_interrupt_only_mode) && compel_interrupt_task(pid)) continue; @@ -926,7 +932,7 @@ static int collect_loop(struct pstree_item *item, int (*collect)(struct pstree_i { int attempts = NR_ATTEMPTS, nr_inprogress = 1; - if (opts.freeze_cgroup && !freeze_cgroup_disabled) + if (opts.freeze_cgroup && !compel_interrupt_only_mode) attempts = 1; /* @@ -1032,11 +1038,11 @@ int collect_pstree(void) pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 
2 : 1); - if (opts.freeze_cgroup && !freeze_cgroup_disabled) { + if (opts.freeze_cgroup && !compel_interrupt_only_mode) { if (freeze_processes()) goto err; } else { - if (opts.freeze_cgroup && check_freezer_cgroup()) + if (opts.freeze_cgroup && prepare_freezer_for_interrupt_only_mode()) goto err; if (compel_interrupt_task(pid)) { set_cr_errno(ESRCH); @@ -1067,7 +1073,7 @@ int collect_pstree(void) if (ret < 0) goto err; - if (opts.freeze_cgroup && !freeze_cgroup_disabled && + if (opts.freeze_cgroup && !compel_interrupt_only_mode && freezer_wait_processes()) { goto err; } diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index c4fc67fa9..3d624750e 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -509,7 +509,7 @@ int cuda_plugin_init(int stage) INIT_LIST_HEAD(&cuda_pids); } - dont_use_freeze_cgroup(); + set_compel_interrupt_only_mode(); return 0; } diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index fc0eddc2b..8cb71d8ca 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -40,7 +40,7 @@ fi # also check for the main thread corruption ./test/zdtm.py run -t zdtm/static/fpu00 --fault 134 -f h --norst || fail -# check dont_use_freeze_cgroup +# check set_compel_interrupt_only_mode ./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 ./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 --norst From 36a53fe23c4092ee1ad68144e4e216ab8979b3ab Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 8 Nov 2024 13:41:20 +0000 Subject: [PATCH 514/775] ci: test interrupt-only mode with frozen cgroup Signed-off-by: Radostin Stoyanov --- test/jenkins/criu-fault.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index 8cb71d8ca..6ee7ce33a 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -43,6 +43,8 @@ fi # check set_compel_interrupt_only_mode ./test/zdtm.py run -t 
zdtm/static/env00 --freezecg zdtm:t --fault 137 ./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 --norst +# check set_compel_interrupt_only_mode when test cgroup is frozen +./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:f --fault 137 if ./test/zdtm.py run -t zdtm/static/vfork00 --fault 136 --report report -f h ; then fail From b1cac7a8e580bb023d84d07a9c6f738f9eaf602d Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 2 Nov 2024 08:29:43 +0000 Subject: [PATCH 515/775] cuda: fix check for GPU device availability The check for `/dev/nvidiactl` to determine if the CUDA plugin can be used is unreliable because in some cases the default path for driver installation is different [1]. This patch changes the logic to check if a GPU device is available in `/proc/driver/nvidia/gpus/`. This approach is similar to `torch.cuda.is_available()` and it is a more accurate indicator. The subsequent check for support of the `cuda-checkpoint --action` option would confirm if the driver supports checkpoint/restore. 
[1] https://github.com/NVIDIA/gpu-operator Fixes: #2509 Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 3d624750e..718db3025 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -470,6 +470,20 @@ int cuda_plugin_resume_devices_late(int pid) } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late) +/** + * Check if a CUDA device is available on the system + */ +static bool is_cuda_device_available(void) +{ + const char *gpu_path = "/proc/driver/nvidia/gpus/"; + struct stat sb; + + if (stat(gpu_path, &sb) != 0) + return false; + + return S_ISDIR(sb.st_mode); +} + int cuda_plugin_init(int stage) { int ret; @@ -481,8 +495,8 @@ int cuda_plugin_init(int stage) } } - if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) { - pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n"); + if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && !is_cuda_device_available()) { + pr_info("No GPU device found; CUDA plugin is disabled\n"); plugin_disabled = true; return 0; } From 7125bfc69579a93e2df9720a615b7ad29d79120b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 6 Nov 2024 22:08:24 +0530 Subject: [PATCH 516/775] pidfd: one process creates a helper and opens all fds to it Currently, the `waitpid()` call on the tmp process can be made by a process which is not its parent. This causes restore to fail. This patch instead selects one process to create the tmp process and open all the fds that point to it. These fds are sent to the correct process(es). 
Fixes: #2496 Signed-off-by: Andrei Vagin Signed-off-by: Bhavik Sachdev --- criu/files.c | 7 +-- criu/include/pidfd.h | 2 +- criu/pidfd.c | 128 +++++++++++++++++++++---------------------- 3 files changed, 64 insertions(+), 73 deletions(-) diff --git a/criu/files.c b/criu/files.c index a57fb860f..31e705bcc 100644 --- a/criu/files.c +++ b/criu/files.c @@ -1811,11 +1811,6 @@ int prepare_files(void) { init_fdesc_hash(); init_sk_info_hash(); - - if (init_dead_pidfd_hash()) { - pr_err("Could not initialise hash map for dead pidfds\n"); - return -1; - } - + init_dead_pidfd_hash(); return collect_image(&files_cinfo); } diff --git a/criu/include/pidfd.h b/criu/include/pidfd.h index 4d2d71700..bcc0fb45a 100644 --- a/criu/include/pidfd.h +++ b/criu/include/pidfd.h @@ -7,7 +7,7 @@ extern const struct fdtype_ops pidfd_dump_ops; extern struct collect_image_info pidfd_cinfo; extern int is_pidfd_link(char *link); -extern int init_dead_pidfd_hash(void); +extern void init_dead_pidfd_hash(void); struct pidfd_dump_info { PidfdEntry pidfe; pid_t pid; diff --git a/criu/pidfd.c b/criu/pidfd.c index 3ea3c9309..53b9bcf71 100644 --- a/criu/pidfd.c +++ b/criu/pidfd.c @@ -21,32 +21,26 @@ struct pidfd_info { PidfdEntry *pidfe; struct file_desc d; + + struct dead_pidfd *dead; + struct pidfd_info *next; }; struct dead_pidfd { unsigned int ino; - int pid; - size_t count; - mutex_t pidfd_lock; + int creator_id; + struct hlist_node hash; + struct pidfd_info *list; }; #define DEAD_PIDFD_HASH_SIZE 32 static struct hlist_head dead_pidfd_hash[DEAD_PIDFD_HASH_SIZE]; -static mutex_t *dead_pidfd_hash_lock; -int init_dead_pidfd_hash(void) +void init_dead_pidfd_hash(void) { for (int i = 0; i < DEAD_PIDFD_HASH_SIZE; i++) INIT_HLIST_HEAD(&dead_pidfd_hash[i]); - - dead_pidfd_hash_lock = shmalloc(sizeof(*dead_pidfd_hash_lock)); - if (!dead_pidfd_hash_lock) - return -1; - - mutex_init(dead_pidfd_hash_lock); - - return 0; } static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino) @@ -54,15 +48,12 @@ static 
struct dead_pidfd *lookup_dead_pidfd(unsigned int ino) struct dead_pidfd *dead; struct hlist_head *chain; - mutex_lock(dead_pidfd_hash_lock); chain = &dead_pidfd_hash[ino % DEAD_PIDFD_HASH_SIZE]; hlist_for_each_entry(dead, chain, hash) { if (dead->ino == ino) { - mutex_unlock(dead_pidfd_hash_lock); return dead; } } - mutex_unlock(dead_pidfd_hash_lock); return NULL; } @@ -142,7 +133,7 @@ static int create_tmp_process(void) return tmp_process; } -static int free_dead_pidfd(struct dead_pidfd *dead) +static int kill_helper(pid_t pid) { int status; sigset_t blockmask, oldmask; @@ -160,15 +151,13 @@ static int free_dead_pidfd(struct dead_pidfd *dead) goto err; } - if (kill(dead->pid, SIGKILL) < 0) { - pr_perror("Could not kill temporary process with pid: %d", - dead->pid); + if (kill(pid, SIGKILL) < 0) { + pr_perror("Could not kill temporary process with pid: %d", pid); goto err; } - if (waitpid(dead->pid, &status, 0) != dead->pid) { - pr_perror("Could not wait on temporary process with pid: %d", - dead->pid); + if (waitpid(pid, &status, 0) != pid) { + pr_perror("Could not wait on temporary process with pid: %d", pid); goto err; } @@ -188,9 +177,6 @@ static int free_dead_pidfd(struct dead_pidfd *dead) goto err; } - mutex_lock(dead_pidfd_hash_lock); - hlist_del(&dead->hash); - mutex_unlock(dead_pidfd_hash_lock); return 0; err: return -1; @@ -198,8 +184,9 @@ err: static int open_one_pidfd(struct file_desc *d, int *new_fd) { - struct pidfd_info *info; + struct pidfd_info *info, *child; struct dead_pidfd *dead = NULL; + pid_t pid; int pidfd; info = container_of(d, struct pidfd_info, d); @@ -215,34 +202,44 @@ static int open_one_pidfd(struct file_desc *d, int *new_fd) dead = lookup_dead_pidfd(info->pidfe->ino); BUG_ON(!dead); - mutex_lock(&dead->pidfd_lock); - BUG_ON(dead->count == 0); - dead->count--; - if (dead->pid == -1) { - dead->pid = create_tmp_process(); - if (dead->pid < 0) { - mutex_unlock(&dead->pidfd_lock); - goto err_close; + if (info->dead && 
info->dead->creator_id != info->pidfe->id) { + int ret = recv_desc_from_peer(&info->d, &pidfd); + if (ret != 0) { + if (ret != 1) + pr_err("Can't get fd\n"); + return ret; } + goto out; } - pidfd = pidfd_open(dead->pid, info->pidfe->flags); + pid = create_tmp_process(); + if (pid < 0) + goto err_close; + + for (child = dead->list; child; child = child->next) { + if (child == info) + continue; + pidfd = pidfd_open(pid, child->pidfe->flags); + if (pidfd < 0) { + pr_perror("Could not open pidfd for %d", child->pidfe->nspid); + goto err_close; + } + + if (send_desc_to_peer(pidfd, &child->d)) { + pr_perror("Can't send file descriptor"); + close(pidfd); + return -1; + } + close(pidfd); + } + + pidfd = pidfd_open(pid, info->pidfe->flags); if (pidfd < 0) { pr_perror("Could not open pidfd for %d", info->pidfe->nspid); - mutex_unlock(&dead->pidfd_lock); goto err_close; } - - if (dead->count == 0) { - if (free_dead_pidfd(dead)) { - pr_err("Failed to delete dead_pidfd struct\n"); - mutex_unlock(&dead->pidfd_lock); - close(pidfd); - goto err_close; - } - } - mutex_unlock(&dead->pidfd_lock); - + if (kill_helper(pid)) + goto err_close; out: if (rst_file_params(pidfd, info->pidfe->fown, info->pidfe->flags)) { goto err_close; @@ -269,32 +266,31 @@ static int collect_one_pidfd(void *obj, ProtobufCMessage *msg, struct cr_img *i) info->pidfe = pb_msg(msg, PidfdEntry); pr_info_pidfd("Collected ", info->pidfe); + info->dead = NULL; if (info->pidfe->nspid != -1) goto out; dead = lookup_dead_pidfd(info->pidfe->ino); - if (dead) { - mutex_lock(&dead->pidfd_lock); - dead->count++; - mutex_unlock(&dead->pidfd_lock); - goto out; - } - - dead = shmalloc(sizeof(*dead)); if (!dead) { - pr_err("Could not allocate shared memory..\n"); - return -1; + dead = xmalloc(sizeof(*dead)); + if (!dead) { + pr_err("Could not allocate memory..\n"); + return -1; + } + + INIT_HLIST_NODE(&dead->hash); + dead->list = NULL; + dead->ino = info->pidfe->ino; + dead->creator_id = info->pidfe->id; + 
hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % DEAD_PIDFD_HASH_SIZE]); } - INIT_HLIST_NODE(&dead->hash); - dead->ino = info->pidfe->ino; - dead->count = 1; - dead->pid = -1; - mutex_init(&dead->pidfd_lock); + info->dead = dead; + info->next = dead->list; + dead->list = info; + if (dead->creator_id > info->pidfe->id) + dead->creator_id = info->pidfe->id; - mutex_lock(dead_pidfd_hash_lock); - hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % DEAD_PIDFD_HASH_SIZE]); - mutex_unlock(dead_pidfd_hash_lock); out: return file_desc_add(&info->d, info->pidfe->id, &pidfd_desc_ops); } From 498bcf28067624b1fca1b939000f8314574a5e80 Mon Sep 17 00:00:00 2001 From: Bhavik Sachdev Date: Wed, 6 Nov 2024 22:10:08 +0530 Subject: [PATCH 517/775] zdtm: Check many processes with common dead pidfd We have multiple processes open a pidfd to a common dead process. After C/R we check that the inode numbers for these pidfds are equal or not. Signed-off-by: Bhavik Sachdev --- test/zdtm/static/Makefile | 1 + test/zdtm/static/pidfd_diffdead.c | 228 ++++++++++++++++++++++++++++++ 2 files changed, 229 insertions(+) create mode 100644 test/zdtm/static/pidfd_diffdead.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 44ac64fe5..71a1b6a53 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -56,6 +56,7 @@ TST_NOFILE := \ pidfd_self \ pidfd_of_thread \ pidfd_dead \ + pidfd_diffdead \ pidfd_child \ pidfd_kill \ fd_from_pidfd \ diff --git a/test/zdtm/static/pidfd_diffdead.c b/test/zdtm/static/pidfd_diffdead.c new file mode 100644 index 000000000..5bc1911a5 --- /dev/null +++ b/test/zdtm/static/pidfd_diffdead.c @@ -0,0 +1,228 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check C/R of processes that point to a common dead pidfd\n"; +const char *test_author = "Bhavik Sachdev "; + +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + +/* + * main + * `- child + * `- 
grandchild + * + * main and child open a pidfd for grandchild. + * Before C/R we kill grandchild. + * We end up with two pidfds in two diff processes that point to the same dead process. + */ + +static long get_fs_type(int lfd) +{ + struct statfs fst; + + if (fstatfs(lfd, &fst)) { + return -1; + } + return fst.f_type; +} + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int check_for_pidfs(void) +{ + long type; + int pidfd = pidfd_open(getpid(), 0); + if (pidfd < 0) { + pr_perror("pidfd open() failed"); + return -1; + } + type = get_fs_type(pidfd); + close(pidfd); + return type == PID_FS_MAGIC; +} + +int main(int argc, char *argv[]) +{ +#define READ 0 +#define WRITE 1 + + int child, ret, gchild, status; + struct statx stat; + task_waiter_t t; + unsigned long long ino; + + /* + * We use the inop pipe to send the inode number of the + * pidfd opened in the child to the main process for + * comparison. + */ + int p[2]; + int pidfd; + + test_init(argc, argv); + task_waiter_init(&t); + + ret = check_for_pidfs(); + if (ret < 0) + return 1; + + if (ret == 0) { + test_daemon(); + test_waitsig(); + skip("Test requires pidfs. 
skipping..."); + pass(); + return 0; + } + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + child = test_fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } else if (child == 0) { + int gchild; + gchild = test_fork(); + if (gchild < 0) { + pr_perror("fork"); + return 1; + } else if (gchild == 0) { + close(p[READ]); + close(p[WRITE]); + while (1) + sleep(1000); + } else { + if (write(p[WRITE], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + + pidfd = pidfd_open(gchild, 0); + if (pidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + if (waitpid(gchild, &status, 0) != gchild) { + pr_perror("waitpid"); + return 1; + } + + if (!WIFSIGNALED(status)) { + fail("Expected grandchild to be terminated by a signal"); + return 1; + } + + if (WTERMSIG(status) != SIGKILL) { + fail("Expected grandchild to be terminated by SIGKILL"); + return 1; + } + task_waiter_complete(&t, 1); + + test_waitsig(); + + if (statx(pidfd, "", AT_EMPTY_PATH, STATX_ALL, &stat) < 0) { + pr_perror("statx"); + return 1; + } + + close(p[WRITE]); + if (read(p[READ], &ino, sizeof(ino)) != sizeof(ino)) { + pr_perror("read"); + return 1; + } + close(p[READ]); + close(pidfd); + + /* ino number should be same because both pidfds were for the same process */ + if (ino != stat.stx_ino) { + exit(1); + } + exit(0); + } + } + + if (read(p[READ], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + + pidfd = pidfd_open(gchild, 0); + if (pidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + /* + * We kill grandchild process only after opening pidfd. 
+ */ + if (pidfd_send_signal(pidfd, SIGKILL, NULL, 0)) { + pr_perror("pidfd_send_signal"); + return 1; + } + + /* Wait for child to waitpid on gchild */ + task_waiter_wait4(&t, 1); + + test_daemon(); + test_waitsig(); + + close(p[READ]); + if (statx(pidfd, "", AT_EMPTY_PATH, STATX_ALL, &stat) < 0) { + pr_perror("statx"); + goto err; + } + + /* Send inode number of pidfd to child for comparison */ + if (write(p[WRITE], &stat.stx_ino, sizeof(stat.stx_ino)) != sizeof(stat.stx_ino)) { + pr_perror("write"); + goto err; + } + close(p[WRITE]); + + if (kill(child, SIGTERM)) { + pr_perror("kill"); + goto err; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid"); + goto err; + } + + if (!WIFEXITED(status)) { + fail("Expected child to terminate normally"); + goto err; + } + + if (WEXITSTATUS(status) != 0) { + fail("Child failed"); + goto err; + } + + pass(); + close(pidfd); + return 0; +err: + close(pidfd); + return 1; +} From 28c2cb3fd6121f3280484665915d1ef5d8b9df14 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 12 Nov 2024 13:04:31 +0000 Subject: [PATCH 518/775] cuda: enable checkpoint support for paused tasks If a CUDA process is already in a "locked" or "checkpointed" state during criu dump, the CUDA plugin currently fails with an error because it attempts an unnecessary "lock" action using the cuda-checkpoint tool. This patch extends the CUDA plugin to handle such cases by first verifying the initial state of the CUDA processes and skipping unnecessary "lock" and "checkpoint" actions when a process has been locked or checkpointed before CRIU is invoked. In particular, CUDA tasks may already be in a "locked" or "checkpointed" state to ensure consistent checkpoint/restore for distributed workloads, such as model training, where multiple containers run across different cluster nodes. 
Another use case for this functionality is optimizing resource utilization, where CUDA tasks with low-priority are preempted immediately to release GPU resources needed by high-priority tasks, and the paused workloads are later resumed or migrated to another node. Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 155 +++++++++++++++++++++++++++---------- 1 file changed, 116 insertions(+), 39 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 718db3025..7764cf3c7 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -26,6 +26,13 @@ #define ACTION_RESTORE "restore" #define ACTION_UNLOCK "unlock" +typedef enum { + CUDA_TASK_RUNNING = 0, + CUDA_TASK_LOCKED, + CUDA_TASK_CHECKPOINTED, + CUDA_TASK_UNKNOWN = -1 +} cuda_task_state_t; + #define CUDA_CKPT_BUF_SIZE (128) #ifdef LOG_PREFIX @@ -43,6 +50,7 @@ bool plugin_added_to_inventory = false; struct pid_info { int pid; char checkpointed; + cuda_task_state_t initial_task_state; struct list_head list; }; @@ -62,7 +70,7 @@ static void dealloc_pid_buffer(struct list_head *pid_buf) } } -static int add_pid_to_buf(struct list_head *pid_buf, int pid) +static int add_pid_to_buf(struct list_head *pid_buf, int pid, cuda_task_state_t state) { struct pid_info *new = xmalloc(sizeof(*new)); @@ -72,25 +80,12 @@ static int add_pid_to_buf(struct list_head *pid_buf, int pid) new->pid = pid; new->checkpointed = 0; + new->initial_task_state = state; list_add_tail(&new->list, pid_buf); return 0; } -static int update_checkpointed_pid(struct list_head *pid_buf, int pid) -{ - struct pid_info *info; - - list_for_each_entry(info, pid_buf, list) { - if (info->pid == pid) { - info->checkpointed = 1; - return 0; - } - } - - return -1; -} - static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) { #define READ 0 @@ -231,6 +226,37 @@ static int get_cuda_restore_tid(int root_pid) return atoi(pid_out); } +static cuda_task_state_t get_task_state_enum(const char 
*state_str) +{ + if (strncmp(state_str, "running", 7) == 0) + return CUDA_TASK_RUNNING; + + if (strncmp(state_str, "locked", 6) == 0) + return CUDA_TASK_LOCKED; + + if (strncmp(state_str, "checkpointed", 12) == 0) + return CUDA_TASK_CHECKPOINTED; + + pr_err("Unknown CUDA state: %s\n", state_str); + return CUDA_TASK_UNKNOWN; +} + +static cuda_task_state_t get_cuda_state(pid_t pid) +{ + char pid_buf[16]; + char state_str[CUDA_CKPT_BUF_SIZE]; + const char *args[] = { CUDA_CHECKPOINT, "--get-state", "--pid", pid_buf, NULL }; + + snprintf(pid_buf, sizeof(pid_buf), "%d", pid); + + if (launch_cuda_checkpoint(args, state_str, sizeof(state_str))) { + pr_err("Failed to launch cuda-checkpoint to retrieve state: %s\n", state_str); + return CUDA_TASK_UNKNOWN; + } + + return get_task_state_enum(state_str); +} + static int cuda_process_checkpoint_action(int pid, const char *action, unsigned int timeout, char *msg_buf, int buf_size) { @@ -319,6 +345,8 @@ int cuda_plugin_checkpoint_devices(int pid) int int_ret; int status; k_rtsigset_t save_sigset; + struct pid_info *task_info; + bool pid_found = false; if (plugin_disabled) { return -ENOTSUP; @@ -336,6 +364,26 @@ int cuda_plugin_checkpoint_devices(int pid) return 0; } + /* Check if the process is already in a checkpointed state */ + list_for_each_entry(task_info, &cuda_pids, list) { + if (task_info->pid == pid) { + if (task_info->initial_task_state == CUDA_TASK_CHECKPOINTED) { + pr_info("pid %d already in a checkpointed state\n", pid); + return 0; + } + pid_found = true; + break; + } + } + + if (pid_found == false) { + /* We return an error here. The task should be restored + * to its original state at cuda_plugin_fini(). 
+ */ + pr_err("Failed to track pid %d\n", pid); + return -1; + } + pr_info("Checkpointing CUDA devices on pid %d restore_tid %d\n", pid, restore_tid); /* We need to resume the checkpoint thread to prepare the mappings for * checkpointing @@ -348,22 +396,8 @@ int cuda_plugin_checkpoint_devices(int pid) pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf); goto interrupt; } - status = update_checkpointed_pid(&cuda_pids, pid); - if (status) { - pr_err("Failed to track checkpointed pid %d\n", pid); - status = cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf)); - if (status) { - pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid); - } - } - if (!status && !plugin_added_to_inventory) { - status = add_inventory_plugin(CR_PLUGIN_DESC.name); - if (status) - pr_err("Failed to add CUDA plugin to inventory image\n"); - else - plugin_added_to_inventory = true; - } + task_info->checkpointed = 1; interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); @@ -376,6 +410,7 @@ int cuda_plugin_pause_devices(int pid) { int restore_tid; char msg_buf[CUDA_CKPT_BUF_SIZE]; + cuda_task_state_t task_state; if (plugin_disabled) { return -ENOTSUP; @@ -388,6 +423,34 @@ int cuda_plugin_pause_devices(int pid) return 0; } + task_state = get_cuda_state(restore_tid); + if (task_state == CUDA_TASK_UNKNOWN) { + pr_err("Failed to get CUDA state for PID %d\n", restore_tid); + return -1; + } + + if (!plugin_added_to_inventory) { + if (add_inventory_plugin(CR_PLUGIN_DESC.name)) { + pr_err("Failed to add CUDA plugin to inventory image\n"); + return -1; + } + plugin_added_to_inventory = true; + } + + if (task_state == CUDA_TASK_LOCKED) { + pr_info("pid %d already in a locked state\n", pid); + /* Leave this PID in a "locked" state at resume_device() */ + add_pid_to_buf(&cuda_pids, pid, CUDA_TASK_LOCKED); + return 0; + } + + if (task_state == CUDA_TASK_CHECKPOINTED) { + /* We need to skip this PID in cuda_plugin_checkpoint_devices(), + * 
and leave it in a "checkpoined" state at resume_device(). */ + add_pid_to_buf(&cuda_pids, pid, CUDA_TASK_CHECKPOINTED); + return 0; + } + pr_info("pausing devices on pid %d\n", pid); int status = cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf)); if (status) { @@ -397,7 +460,7 @@ int cuda_plugin_pause_devices(int pid) return -1; } - if (add_pid_to_buf(&cuda_pids, pid)) { + if (add_pid_to_buf(&cuda_pids, pid, CUDA_TASK_RUNNING)) { pr_err("unable to track paused pid %d\n", pid); goto unlock; } @@ -412,7 +475,7 @@ unlock: } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices) -int resume_device(int pid, int checkpointed) +int resume_device(int pid, int checkpointed, cuda_task_state_t initial_task_state) { char msg_buf[CUDA_CKPT_BUF_SIZE]; int status; @@ -420,6 +483,11 @@ int resume_device(int pid, int checkpointed) int int_ret; k_rtsigset_t save_sigset; + if (initial_task_state == CUDA_TASK_UNKNOWN) { + pr_info("skip resume for PID %d (unknown state)\n", pid); + return 0; + } + int restore_tid = get_cuda_restore_tid(pid); if (restore_tid == -1) { pr_info("No need to resume devices on pid %d\n", pid); @@ -439,7 +507,8 @@ int resume_device(int pid, int checkpointed) return -1; } - if (checkpointed) { + if (checkpointed && (initial_task_state == CUDA_TASK_RUNNING || initial_task_state == CUDA_TASK_LOCKED)) { + /* If the process was "locked" or "running" before checkpointing it, we need to restore it */ status = cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf)); if (status) { pr_err("RESUME_DEVICES RESTORE failed with %s\n", msg_buf); @@ -448,10 +517,13 @@ int resume_device(int pid, int checkpointed) } } - status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); - if (status) { - pr_err("RESUME_DEVICES UNLOCK failed with %s\n", msg_buf); - ret = -1; + if (initial_task_state == CUDA_TASK_RUNNING) { + /* If the process was "running" 
before we paused it, we need to unlock it */ + status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("RESUME_DEVICES UNLOCK failed with %s\n", msg_buf); + ret = -1; + } } interrupt: @@ -466,7 +538,12 @@ int cuda_plugin_resume_devices_late(int pid) return -ENOTSUP; } - return resume_device(pid, 1); + /* RESUME_DEVICES_LATE is used during `criu restore`. + * Here, we assume that users expect the target process + * to be in a "running" state after restore, even if it was + * in a "locked" or "checkpointed" state during `criu dump`. + */ + return resume_device(pid, 1, CUDA_TASK_RUNNING); } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late) @@ -542,7 +619,7 @@ void cuda_plugin_fini(int stage, int ret) if (stage == CR_PLUGIN_STAGE__DUMP && (opts.final_state == TASK_ALIVE || ret != 0)) { struct pid_info *info; list_for_each_entry(info, &cuda_pids, list) { - resume_device(info->pid, info->checkpointed); + resume_device(info->pid, info->checkpointed, info->initial_task_state); } } if (stage == CR_PLUGIN_STAGE__DUMP) { From 21e5f4cfd55b8d6837d0f01441d9772c3f09f707 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 12 Nov 2024 15:14:51 +0000 Subject: [PATCH 519/775] test: add get-state to mocked cuda-checkpoint tool Signed-off-by: Radostin Stoyanov --- test/cuda-checkpoint/cuda-checkpoint.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/cuda-checkpoint/cuda-checkpoint.c b/test/cuda-checkpoint/cuda-checkpoint.c index f35a4b41d..3b7ce8b9f 100644 --- a/test/cuda-checkpoint/cuda-checkpoint.c +++ b/test/cuda-checkpoint/cuda-checkpoint.c @@ -11,6 +11,7 @@ int main(int argc, char *argv[]) int option_index = 0; static struct option long_options[] = { { "pid", required_argument, 0, 'p' }, + { "get-state", no_argument, 0, 's' }, { "get-restore-tid", no_argument, 0, 'g' }, { "action", required_argument, 0, 'a' }, { "timeout", required_argument, 0, 't' }, @@ -31,6 
+32,9 @@ int main(int argc, char *argv[]) case 'a': case 't': break; + case 's': + printf("running\n"); + break; case 'h': printf("--action - execute an action"); break; From 399d7bdcbb94bdcbbca2ec7bef881cdfd6c9f404 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 17 Nov 2024 16:10:20 +0000 Subject: [PATCH 520/775] compel: fix gitignore and remove autogenerated code We don't need to have compel/arch/riscv64/plugins/std/syscalls/syscalls.S tracked in git. It is autogenerated. We also need to update our .gitignore to ignore autogenerated files with syscall tables. Signed-off-by: Alexander Mikhalitsyn --- compel/.gitignore | 3 + .../riscv64/plugins/std/syscalls/syscalls.S | 112 ------------------ 2 files changed, 3 insertions(+), 112 deletions(-) delete mode 100644 compel/arch/riscv64/plugins/std/syscalls/syscalls.S diff --git a/compel/.gitignore b/compel/.gitignore index eab3337d6..5e770a86c 100644 --- a/compel/.gitignore +++ b/compel/.gitignore @@ -4,6 +4,9 @@ arch/arm/plugins/std/syscalls/syscalls.S arch/aarch64/plugins/std/syscalls/syscalls.S arch/s390/plugins/std/syscalls/syscalls.S arch/ppc64/plugins/std/syscalls/syscalls.S +arch/mips/plugins/std/syscalls/syscalls-64.S +arch/loongarch64/plugins/std/syscalls/syscalls-64.S +arch/riscv64/plugins/std/syscalls/syscalls.S include/version.h plugins/include/uapi/std/asm/syscall-types.h plugins/include/uapi/std/syscall-64.h diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscalls.S b/compel/arch/riscv64/plugins/std/syscalls/syscalls.S deleted file mode 100644 index 715da4612..000000000 --- a/compel/arch/riscv64/plugins/std/syscalls/syscalls.S +++ /dev/null @@ -1,112 +0,0 @@ -/* Autogenerated, don't edit */ -#include -#include "std/syscalls/syscall-common.S" -syscall sys_read, __NR_read -syscall sys_write, __NR_write -syscall sys_close, __NR_close -syscall sys_lseek, __NR_lseek -syscall sys_mmap, __NR_mmap -syscall sys_mprotect, __NR_mprotect -syscall sys_munmap, __NR_munmap -syscall sys_brk, __NR_brk 
-syscall sys_sigaction, __NR_rt_sigaction -syscall sys_sigprocmask, __NR_rt_sigprocmask -syscall sys_rt_sigreturn, __NR_rt_sigreturn -syscall sys_ioctl, __NR_ioctl -syscall sys_pread64, __NR_pread64 -syscall sys_ptrace, __NR_ptrace -syscall sys_mremap, __NR_mremap -syscall sys_mincore, __NR_mincore -syscall sys_madvise, __NR_madvise -syscall sys_shmat, __NR_shmat -syscall sys_pause, __NR_pause -syscall sys_nanosleep, __NR_nanosleep -syscall sys_getitimer, __NR_getitimer -syscall sys_setitimer, __NR_setitimer -syscall sys_getpid, __NR_getpid -syscall sys_socket, __NR_socket -syscall sys_connect, __NR_connect -syscall sys_sendto, __NR_sendto -syscall sys_recvfrom, __NR_recvfrom -syscall sys_sendmsg, __NR_sendmsg -syscall sys_recvmsg, __NR_recvmsg -syscall sys_shutdown, __NR_shutdown -syscall sys_bind, __NR_bind -syscall sys_setsockopt, __NR_setsockopt -syscall sys_getsockopt, __NR_getsockopt -syscall sys_clone, __NR_clone -syscall sys_exit, __NR_exit -syscall sys_wait4, __NR_wait4 -syscall sys_waitid, __NR_waitid -syscall sys_kill, __NR_kill -syscall sys_fcntl, __NR_fcntl -syscall sys_flock, __NR_flock -syscall sys_readlinkat, __NR_readlinkat -syscall sys_umask, __NR_umask -syscall sys_getgroups, __NR_getgroups -syscall sys_setgroups, __NR_setgroups -syscall sys_setresuid, __NR_setresuid -syscall sys_getresuid, __NR_getresuid -syscall sys_setresgid, __NR_setresgid -syscall sys_getresgid, __NR_getresgid -syscall sys_getpgid, __NR_getpgid -syscall sys_setfsuid, __NR_setfsuid -syscall sys_setfsgid, __NR_setfsgid -syscall sys_getsid, __NR_getsid -syscall sys_capget, __NR_capget -syscall sys_capset, __NR_capset -syscall sys_rt_sigqueueinfo, __NR_rt_sigqueueinfo -syscall sys_setpriority, __NR_setpriority -syscall sys_sched_setscheduler, __NR_sched_setscheduler -syscall sys_sigaltstack, __NR_sigaltstack -syscall sys_personality, __NR_personality -syscall sys_prctl, __NR_prctl -syscall sys_setrlimit, __NR_setrlimit -syscall sys_mount, __NR_mount -syscall sys_umount2, 
__NR_umount2 -syscall sys_gettid, __NR_gettid -syscall sys_futex, __NR_futex -syscall sys_set_tid_address, __NR_set_tid_address -syscall sys_restart_syscall, __NR_restart_syscall -syscall sys_timer_create, __NR_timer_create -syscall sys_timer_settime, __NR_timer_settime -syscall sys_timer_gettime, __NR_timer_gettime -syscall sys_timer_getoverrun, __NR_timer_getoverrun -syscall sys_timer_delete, __NR_timer_delete -syscall sys_clock_gettime, __NR_clock_gettime -syscall sys_exit_group, __NR_exit_group -syscall sys_set_robust_list, __NR_set_robust_list -syscall sys_get_robust_list, __NR_get_robust_list -syscall sys_signalfd4, __NR_signalfd4 -syscall sys_rt_tgsigqueueinfo, __NR_rt_tgsigqueueinfo -syscall sys_vmsplice, __NR_vmsplice -syscall sys_timerfd_settime, __NR_timerfd_settime -syscall sys_fanotify_init, __NR_fanotify_init -syscall sys_fanotify_mark, __NR_fanotify_mark -syscall sys_open_by_handle_at, __NR_open_by_handle_at -syscall sys_setns, __NR_setns -syscall sys_kcmp, __NR_kcmp -syscall sys_openat, __NR_openat -syscall sys_mkdirat, __NR_mkdirat -syscall sys_unlinkat, __NR_unlinkat -syscall sys_memfd_create, __NR_memfd_create -syscall sys_io_setup, __NR_io_setup -syscall sys_io_submit, __NR_io_submit -syscall sys_io_getevents, __NR_io_getevents -syscall sys_seccomp, __NR_seccomp -syscall sys_gettimeofday, __NR_gettimeofday -syscall sys_preadv_raw, __NR_preadv_raw -syscall sys_userfaultfd, __NR_userfaultfd -syscall sys_fallocate, __NR_fallocate -syscall sys_ppoll, __NR_ppoll -syscall sys_fsopen, __NR_fsopen -syscall sys_fsconfig, __NR_fsconfig -syscall sys_fsmount, __NR_fsmount -syscall sys_clone3, __NR_clone3 -syscall sys_pidfd_open, __NR_pidfd_open -syscall sys_pidfd_getfd, __NR_pidfd_getfd -syscall sys_rseq, __NR_rseq -syscall sys_move_mount, __NR_move_mount -syscall sys_open_tree, __NR_open_tree -syscall sys_openat2, __NR_openat2 -#include From 40b7f04b7c0475813d3e4809cfee2b918715f9c9 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 17 Nov 2024 
18:32:03 +0000 Subject: [PATCH 521/775] compel/arch/riscv64: properly implement compel_task_size() We need to dynamically calculate TASK_SIZE depending on the MMU of the RISC-V system. [We are using an analogous approach on aarch64/ppc64le.] This change was tested on a physical machine: StarFive VisionFive 2 isa : rv64imafdc_zicntr_zicsr_zifencei_zihpm_zca_zcd_zba_zbb mmu : sv39 uarch : sifive,u74-mc mvendorid : 0x489 marchid : 0x8000000000000007 mimpid : 0x4210427 hart isa : rv64imafdc_zicntr_zicsr_zifencei_zihpm_zca_zcd_zba_zbb Signed-off-by: Alexander Mikhalitsyn --- compel/arch/riscv64/src/lib/infect.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/compel/arch/riscv64/src/lib/infect.c b/compel/arch/riscv64/src/lib/infect.c index 01395a205..861fe3b2f 100644 --- a/compel/arch/riscv64/src/lib/infect.c +++ b/compel/arch/riscv64/src/lib/infect.c @@ -181,20 +181,22 @@ int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) * Task size is the maximum virtual address space size that a process can occupy in the memory * Refer to linux kernel arch/riscv/include/asm/pgtable.h, * task size is: - * - 0x9fc00000 (~2.5GB) for RV32. - * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu - * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu - * - * Note that PGDIR_SIZE must evenly divide TASK_SIZE since "RISC-V - * Instruction Set Manual Volume II: Privileged Architecture" states that - * "load and store effective addresses, which are 64bits, must have bits - * 63–48 all equal to bit 47, or else a page-fault exception will occur." -*/ -#define TASK_SIZE 0x800000000000UL // hardcoded for SV48 MMU + * - 0x9fc00000 (~2.5GB) for RV32.
+ * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu + * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu + * - 0x100000000000000 ( 64PB) for RV64 using SV57 mmu + */ +#define TASK_SIZE_MIN (1UL << 38) +#define TASK_SIZE_MAX (1UL << 56) unsigned long compel_task_size(void) { - return TASK_SIZE; + unsigned long task_size; + + for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size <<= 1) + if (munmap((void *)task_size, page_size())) + break; + return task_size; } /* From ed560a3491079157b9044d3e14aa522159e9450b Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 23 Nov 2024 22:29:45 +0000 Subject: [PATCH 522/775] pidfd: add missing include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix for the following error when building CRIU on Rocky Linux 8 criu/pidfd.c: In function ‘pidfd_open’: criu/pidfd.c:119:17: error: ‘__NR_pidfd_open’ undeclared (first use in this function); did you mean ‘pidfd_open’? return syscall(__NR_pidfd_open, pid, flags); ^~~~~~~~~~~~~~~ pidfd_open criu/pidfd.c:119:17: note: each undeclared identifier is reported only once for each function it appears in criu/pidfd.c:120:1: error: control reaches end of non-void function [-Werror=return-type] } ^ criu/pidfd.c: At top level: cc1: error: unrecognized command line option ‘-Wno-unknown-warning-option’ [-Werror] cc1: error: unrecognized command line option ‘-Wno-dangling-pointer’ [-Werror] cc1: all warnings being treated as errors Signed-off-by: Radostin Stoyanov --- criu/pidfd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/criu/pidfd.c b/criu/pidfd.c index 53b9bcf71..ae32025b0 100644 --- a/criu/pidfd.c +++ b/criu/pidfd.c @@ -11,6 +11,8 @@ #include "common/bug.h" #include "rst-malloc.h" +#include "compel/plugins/std/syscall-codes.h" + #undef LOG_PREFIX #define LOG_PREFIX "pidfd: " From 8ee2eba47c0c540026311b24af7a74784e370750 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 5 Dec 2024 22:17:38 +0000 Subject: [PATCH 523/775] 
vdso: handle vvar_vclock vma-s The vvar_vclock was introduced by [1]. Basically, the old vvar vma has been split into two parts. In terms of C/R, these two vma-s can still be treated as one. [1] e93d2521b27f ("x86/vdso: Split virtual clock pages into dedicated mapping") Signed-off-by: Andrei Vagin --- criu/include/util-vdso.h | 1 + criu/pie/parasite-vdso.c | 19 ++++++++++++++++++- criu/proc_parse.c | 23 +++++++++++++++++++---- criu/vdso.c | 28 +++++++++++++++++++++------- 4 files changed, 59 insertions(+), 12 deletions(-) diff --git a/criu/include/util-vdso.h b/criu/include/util-vdso.h index c4386cf8e..9fd9a6de4 100644 --- a/criu/include/util-vdso.h +++ b/criu/include/util-vdso.h @@ -30,6 +30,7 @@ struct vdso_symbol { struct vdso_symtable { unsigned long vdso_size; unsigned long vvar_size; + unsigned long vvar_vclock_size; struct vdso_symbol symbols[VDSO_SYMBOL_MAX]; bool vdso_before_vvar; /* order of vdso/vvar pair */ }; diff --git a/criu/pie/parasite-vdso.c b/criu/pie/parasite-vdso.c index 355007fa9..f3ad3107f 100644 --- a/criu/pie/parasite-vdso.c +++ b/criu/pie/parasite-vdso.c @@ -45,6 +45,7 @@ static int remap_one(char *who, unsigned long *from, unsigned long to, size_t si static int park_at(struct vdso_maps *rt, unsigned long vdso, unsigned long vvar) { unsigned long vvar_size = rt->sym.vvar_size; + unsigned long vvar_vclock_size = rt->sym.vvar_vclock_size; unsigned long vdso_size = rt->sym.vdso_size; int ret; @@ -54,8 +55,24 @@ static int park_at(struct vdso_maps *rt, unsigned long vdso, unsigned long vvar) std_log_set_gettimeofday(NULL); /* stop using vdso for timings */ - if (vvar) + if (vvar) { + /* + * v6.13-rc1~172^2~9 splits the vvar vma in two parts vvar and + * vvar_clock. The last one is mapped right after the first + * one.
+ */ + if (vvar_vclock_size) { + unsigned long from; + + vvar_size -= vvar_vclock_size; + from = rt->vvar_start + vvar_size; + + ret = remap_one("rt-vvar", &from, vvar + vvar_size, vvar_vclock_size); + if (ret) + return ret; + } ret = remap_one("rt-vvar", &rt->vvar_start, vvar, vvar_size); + } if (!ret) vdso_update_gtod_addr(rt); diff --git a/criu/proc_parse.c b/criu/proc_parse.c index eb869dbbd..be0c3d531 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -579,7 +579,8 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat } else if (!strcmp(file_path, "[vdso]")) { if (handle_vdso_vma(vma_area)) goto err; - } else if (!strcmp(file_path, "[vvar]")) { + } else if (!strcmp(file_path, "[vvar]") || + !strcmp(file_path, "[vvar_vclock]")) { if (handle_vvar_vma(vma_area)) goto err; } else if (!strcmp(file_path, "[heap]")) { @@ -771,7 +772,7 @@ static int task_size_check(pid_t pid, VmaEntry *entry) int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t dump_filemap) { - struct vma_area *vma_area = NULL; + struct vma_area *vma_area = NULL, *prev_vma_area = NULL; unsigned long start, end, pgoff, prev_end = 0; char r, w, x, s; int ret = -1, vm_file_fd = -1; @@ -813,8 +814,22 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t du continue; } - if (vma_area && vma_list_add(vma_area, vma_area_list, &prev_end, &vfi, &prev_vfi)) - goto err; + if (vma_area && vma_area_is(vma_area, VMA_AREA_VVAR) && + prev_vma_area && vma_area_is(prev_vma_area, VMA_AREA_VVAR)) { + if (prev_vma_area->e->end != vma_area->e->start) { + pr_err("two nonconsecutive vvar vma-s: " + "%" PRIx64 "-%" PRIx64 " %" PRIx64 "-%" PRIx64 "\n", + prev_vma_area->e->start, prev_vma_area->e->end, + vma_area->e->start, vma_area->e->end); + goto err; + } + /* Merge all vvar vma-s into one. 
*/ + prev_vma_area->e->end = vma_area->e->end; + } else { + if (vma_area && vma_list_add(vma_area, vma_area_list, &prev_end, &vfi, &prev_vfi)) + goto err; + prev_vma_area = vma_area; + } if (eof) break; diff --git a/criu/vdso.c b/criu/vdso.c index 7de2fae78..d4d351131 100644 --- a/criu/vdso.c +++ b/criu/vdso.c @@ -310,7 +310,7 @@ static int vdso_parse_maps(pid_t pid, struct vdso_maps *s) while (1) { unsigned long start, end; - char *has_vdso, *has_vvar; + char *has_vdso, *has_vvar, *has_vvar_vclock; buf = breadline(&f); if (buf == NULL) @@ -318,13 +318,19 @@ static int vdso_parse_maps(pid_t pid, struct vdso_maps *s) if (IS_ERR(buf)) goto err; - has_vdso = strstr(buf, "[vdso]"); - if (!has_vdso) + has_vvar = NULL; + has_vvar_vclock = NULL; + do { + has_vdso = strstr(buf, "[vdso]"); + if (has_vdso) + break; has_vvar = strstr(buf, "[vvar]"); - else - has_vvar = NULL; + if (has_vvar) + break; + has_vvar_vclock = strstr(buf, "[vvar_vclock]"); + } while (0); - if (!has_vdso && !has_vvar) + if (!has_vdso && !has_vvar && !has_vvar_vclock) continue; if (sscanf(buf, "%lx-%lx", &start, &end) != 2) { @@ -339,13 +345,21 @@ static int vdso_parse_maps(pid_t pid, struct vdso_maps *s) } s->vdso_start = start; s->sym.vdso_size = end - start; - } else { + } else if (has_vvar) { if (s->vvar_start != VVAR_BAD_ADDR) { pr_err("Got second VVAR entry\n"); goto err; } s->vvar_start = start; s->sym.vvar_size = end - start; + } else { + if (s->vvar_start == VDSO_BAD_ADDR || + s->vvar_start + s->sym.vvar_size != start) { + pr_err("VVAR and VVAR_VCLOCK entries are not subsequent\n"); + goto err; + } + s->sym.vvar_vclock_size = end - start; + s->sym.vvar_size += s->sym.vvar_vclock_size; } } From dc6cef0b4cb1c2de60ee0300fa9705835dff0f45 Mon Sep 17 00:00:00 2001 From: Jesus Ramos Date: Tue, 10 Dec 2024 12:11:57 -0800 Subject: [PATCH 524/775] cuda: Fix return value from CHECKPOINT_DEVICES hook so that dump's fail properly cuda-checkpoint returns the positive CUDA error code when it runs into an 
issue and passing that along as the return value would cause errors to get ignored Signed-off-by: Jesus Ramos --- plugins/cuda/cuda_plugin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 7764cf3c7..e78828b18 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -402,7 +402,7 @@ int cuda_plugin_checkpoint_devices(int pid) interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); - return status != 0 ? status : int_ret; + return status != 0 ? -1 : int_ret; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices); From 15c81c12629c516a1e58097de0cee157515e2401 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 7 Dec 2024 00:08:59 +0000 Subject: [PATCH 525/775] test/java: increase the ghost file limit Right now, this test fails with this error: Error (criu/files-reg.c:1031): Can't dump ghost file /criu/test/javaTests/omrvmem_000000626_Mlm48x of 2097152 size, increase limit Signed-off-by: Andrei Vagin --- scripts/build/Dockerfile.openj9-ubuntu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index c2cf20a36..e190c2792 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -24,9 +24,10 @@ RUN apt-install protobuf-c-compiler \ gcc \ maven +RUN mkdir -p /etc/criu && echo 'ghost-limit 16777216' > /etc/criu/default.conf COPY . 
/criu WORKDIR /criu RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -q -f test/javaTests/pom.xml test +ENTRYPOINT mvn -f test/javaTests/pom.xml test From a8754905c05a08f560194d90bb53ba86b27577d2 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 13 Dec 2024 09:03:42 -0800 Subject: [PATCH 526/775] test: run scm06 in the ns and uns flavors The kernel releases a test socket asynchronously, so the restore can fail if it is executed before the kernel actually destroys the socket. Fixes #2537 Signed-off-by: Andrei Vagin --- test/zdtm/static/scm06.desc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/zdtm/static/scm06.desc b/test/zdtm/static/scm06.desc index 2eac7e654..38cc3be51 100644 --- a/test/zdtm/static/scm06.desc +++ b/test/zdtm/static/scm06.desc @@ -1 +1,4 @@ -{'flags': 'suid'} +# This test isn't executed in the host flavor (in the same network namespace, +# because the kernel releases a test socket asynchronously, so the restore +# can fail if it is executed before the kernel actually destroys the socket. +{'flags': 'suid', 'flavor': 'ns uns'} From 99e1fbd8a2cfd6eaf3193062c2925ac80a9893b6 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 16 Dec 2024 16:38:31 -0800 Subject: [PATCH 527/775] criu/seize.c: clang-format it Done using clang-format 19.1.5 with .clang-format obtained via scripts/fetch-clang-format.sh. Signed-off-by: Kir Kolyshkin --- criu/seize.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/criu/seize.c b/criu/seize.c index 9bd1832d9..529fff562 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -87,7 +87,10 @@ static const char frozen[] = "FROZEN"; static const char freezing[] = "FREEZING"; static const char thawed[] = "THAWED"; -enum freezer_state { FREEZER_ERROR = -1, THAWED, FROZEN, FREEZING }; +enum freezer_state { FREEZER_ERROR = -1, + THAWED, + FROZEN, + FREEZING }; /* Track if we are running on cgroup v2 system. 
*/ static bool cgroup_v2 = false; From 82f4ecda6922020346a6a544b53476e0d527d366 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 12 Dec 2024 17:29:34 -0800 Subject: [PATCH 528/775] freeze_processes: fix logic There are a few issues with the freeze_processes logic: 1. Commit 9fae23fbe2 grossly (by 1000x) miscalculated the number of attempts required, as a result, we are seeing something like this: > (00.000340) freezing processes: 100000 attempts with 100 ms steps > (00.000351) freezer.state=THAWED > (00.000358) freezer.state=FREEZING > (00.100446) freezer.state=FREEZING > ...close to 100 lines skipped... > (09.915110) freezer.state=FREEZING > (10.000432) Error (criu/cr-dump.c:1467): Timeout reached. Try to interrupt: 0 > (10.000563) freezer.state=FREEZING For 10s with 100ms steps we only need 100 attempts, not 100000. 2. When the timeout is hit, the "failed to freeze cgroup" error is not printed, and the log_unfrozen_stacks is not called either. 3. The nanosleep at the last iteration is useless (this was hidden by issue 1 above, as the timeout was hit first). Fix all these. While at it, 4. Amend the error message with the number of attempts, sleep duration, and timeout. 5. Modify the "freezing cgroup" debug message to be in sync with the above error. Was: > freezing processes: 100000 attempts with 100 ms steps Now: > freezing cgroup some/name: 100 x 100ms attempts, timeout: 10s Signed-off-by: Kir Kolyshkin --- criu/seize.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index 529fff562..6701446ae 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -545,7 +545,8 @@ static int freeze_processes(void) enum freezer_state state = THAWED; static const unsigned long step_ms = 100; - unsigned long nr_attempts = (opts.timeout * 1000000) / step_ms; + /* Since opts.timeout is in seconds, multiply it by 1000 to convert to milliseconds. 
*/ + unsigned long nr_attempts = (opts.timeout * 1000) / step_ms; unsigned long i = 0; const struct timespec req = { @@ -554,14 +555,12 @@ static int freeze_processes(void) }; if (unlikely(!nr_attempts)) { - /* - * If timeout is turned off, lets - * wait for at least 10 seconds. - */ - nr_attempts = (10 * 1000000) / step_ms; + /* If the timeout is 0, wait for at least 10 seconds. */ + nr_attempts = (10 * 1000) / step_ms; } - pr_debug("freezing processes: %lu attempts with %lu ms steps\n", nr_attempts, step_ms); + pr_debug("freezing cgroup %s: %lu x %lums attempts, timeout: %us\n", + opts.freeze_cgroup, nr_attempts, step_ms, opts.timeout); fd = freezer_open(); if (fd < 0) @@ -588,22 +587,22 @@ static int freeze_processes(void) * not read @tasks pids while freezer in * transition stage. */ - for (; i <= nr_attempts; i++) { + while (1) { state = get_freezer_state(fd); if (state == FREEZER_ERROR) { close(fd); return -1; } - if (state == FROZEN) + if (state == FROZEN || i++ == nr_attempts || alarm_timeouted()) break; - if (alarm_timeouted()) - goto err; + nanosleep(&req, NULL); } - if (i > nr_attempts) { - pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup); + if (state != FROZEN) { + pr_err("Unable to freeze cgroup %s (%lu x %lums attempts, timeout: %us)\n", + opts.freeze_cgroup, i, step_ms, opts.timeout); if (!pr_quelled(LOG_DEBUG)) log_unfrozen_stacks(opts.freeze_cgroup); goto err; From 94b9b3c5daf1237493f75d2e538d72d81013c2a2 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 12 Dec 2024 17:34:17 -0800 Subject: [PATCH 529/775] freeze_processes: implement kludges for cgroup v1 Cgroup v1 freezer has always been problematic, failing to freeze a cgroup. In runc, we have implemented a few kludges to increase the chance of succeeding, but those are used when runc freezes a cgroup for its own purposes (for "runc pause" and to modify device properties for cgroup v1). When criu is used, it fails to freeze a cgroup from time to time (see [1], [2]). 
Let's try adding kludges similar to ones in runc. Alas, I have absolutely no way to test this, so please review carefully. [1]: https://github.com/opencontainers/runc/issues/4273 [2]: https://github.com/opencontainers/runc/issues/4457 Signed-off-by: Kir Kolyshkin --- criu/seize.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/criu/seize.c b/criu/seize.c index 6701446ae..829d7c278 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -539,6 +539,34 @@ err: return exit_code; } +static void cgroupv1_freezer_kludges(int fd, int iter, const struct timespec *req) { + /* As per older kernel docs (freezer-subsystem.txt before + * the kernel commit ef9fe980c6fcc1821), if FREEZING is seen, + * userspace should either retry or thaw. While current + * kernel cgroup v1 docs no longer mention a need to retry, + * even recent kernels can't reliably freeze a cgroup v1. + * + * Let's keep asking the kernel to freeze from time to time. + * In addition, do occasional thaw/sleep/freeze. + * + * This is still a game of chances (the real fix belongs to the kernel) + * but these kludges might improve the probability of success. + * + * Cgroup v2 does not have this problem. 
+ */ + switch (iter % 32) { + case 9: + case 20: + freezer_write_state(fd, FROZEN); + break; + case 31: + freezer_write_state(fd, THAWED); + nanosleep(req, NULL); + freezer_write_state(fd, FROZEN); + break; + } +} + static int freeze_processes(void) { int fd, exit_code = -1; @@ -597,6 +625,9 @@ static int freeze_processes(void) if (state == FROZEN || i++ == nr_attempts || alarm_timeouted()) break; + if (!cgroup_v2) + cgroupv1_freezer_kludges(fd, i, &req); + nanosleep(&req, NULL); } From 6f8efad304a4a65bc5b45ec5985fc7ac3763b5ff Mon Sep 17 00:00:00 2001 From: Liu Chao Date: Thu, 19 Dec 2024 08:16:36 +0000 Subject: [PATCH 530/775] cr: Task CapAmb support Signed-off-by: Liu Chao --- criu/cr-restore.c | 2 ++ criu/include/parasite.h | 1 + criu/include/prctl.h | 9 +++++++++ criu/include/proc_parse.h | 1 + criu/include/restorer.h | 1 + criu/parasite-syscall.c | 3 +++ criu/pie/parasite.c | 13 +++++++++++++ criu/pie/restorer.c | 16 ++++++++++++++++ criu/proc_parse.c | 11 +++++++++-- criu/pstree.c | 3 +++ images/creds.proto | 2 ++ 11 files changed, 60 insertions(+), 2 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 646300bdb..ddca6b8ec 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2992,6 +2992,7 @@ static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo args->creds.cap_eff = NULL; args->creds.cap_prm = NULL; args->creds.cap_bnd = NULL; + args->creds.cap_amb = NULL; args->creds.groups = NULL; args->creds.lsm_profile = NULL; @@ -2999,6 +3000,7 @@ static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo copy_caps(args->cap_eff, ce->cap_eff, ce->n_cap_eff); copy_caps(args->cap_prm, ce->cap_prm, ce->n_cap_prm); copy_caps(args->cap_bnd, ce->cap_bnd, ce->n_cap_bnd); + copy_caps(args->cap_amb, ce->cap_amb, ce->n_cap_amb); if (ce->n_groups && !groups_match(ce->groups, ce->n_groups)) { unsigned int *groups; diff --git a/criu/include/parasite.h b/criu/include/parasite.h index 1244220f6..b33d6710f 100644 
--- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -148,6 +148,7 @@ struct parasite_dump_creds { u32 cap_prm[CR_CAP_SIZE]; u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; + u32 cap_amb[CR_CAP_SIZE]; int uids[4]; int gids[4]; diff --git a/criu/include/prctl.h b/criu/include/prctl.h index 4c2a548b1..f5f23c969 100644 --- a/criu/include/prctl.h +++ b/criu/include/prctl.h @@ -36,6 +36,15 @@ #ifndef PR_SET_NO_NEW_PRIVS #define PR_SET_NO_NEW_PRIVS 38 #endif +#ifndef PR_CAP_AMBIENT +#define PR_CAP_AMBIENT 47 +#endif +#ifndef PR_CAP_AMBIENT_IS_SET +#define PR_CAP_AMBIENT_IS_SET 1 +#endif +#ifndef PR_CAP_AMBIENT_RAISE +#define PR_CAP_AMBIENT_RAISE 2 +#endif #ifndef PR_SET_MM #define PR_SET_MM 35 diff --git a/criu/include/proc_parse.h b/criu/include/proc_parse.h index 0c334a190..0bd79bf55 100644 --- a/criu/include/proc_parse.h +++ b/criu/include/proc_parse.h @@ -81,6 +81,7 @@ struct proc_status_creds { u32 cap_prm[PROC_CAP_SIZE]; u32 cap_eff[PROC_CAP_SIZE]; u32 cap_bnd[PROC_CAP_SIZE]; + u32 cap_amb[PROC_CAP_SIZE]; }; #define INVALID_UID ((uid_t)-1) diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 3fb5322a4..a4fb7ea79 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -75,6 +75,7 @@ struct thread_creds_args { u32 cap_prm[CR_CAP_SIZE]; u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; + u32 cap_amb[CR_CAP_SIZE]; char *lsm_profile; unsigned int *groups; diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index a88f8a66f..6db9d21fe 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -103,16 +103,19 @@ static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c BUILD_BUG_ON(sizeof(ce->cap_prm[0]) != sizeof(c->cap_prm[0])); BUILD_BUG_ON(sizeof(ce->cap_eff[0]) != sizeof(c->cap_eff[0])); BUILD_BUG_ON(sizeof(ce->cap_bnd[0]) != sizeof(c->cap_bnd[0])); + BUILD_BUG_ON(sizeof(ce->cap_amb[0]) != sizeof(c->cap_amb[0])); BUG_ON(ce->n_cap_inh != CR_CAP_SIZE); BUG_ON(ce->n_cap_prm != 
CR_CAP_SIZE); BUG_ON(ce->n_cap_eff != CR_CAP_SIZE); BUG_ON(ce->n_cap_bnd != CR_CAP_SIZE); + BUG_ON(ce->n_cap_amb != CR_CAP_SIZE); memcpy(ce->cap_inh, c->cap_inh, sizeof(c->cap_inh[0]) * CR_CAP_SIZE); memcpy(ce->cap_prm, c->cap_prm, sizeof(c->cap_prm[0]) * CR_CAP_SIZE); memcpy(ce->cap_eff, c->cap_eff, sizeof(c->cap_eff[0]) * CR_CAP_SIZE); memcpy(ce->cap_bnd, c->cap_bnd, sizeof(c->cap_bnd[0]) * CR_CAP_SIZE); + memcpy(ce->cap_amb, c->cap_amb, sizeof(c->cap_amb[0]) * CR_CAP_SIZE); if (c->no_new_privs > 0) { ce->no_new_privs = c->no_new_privs; diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index e151ed656..1bc03dc2a 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -324,6 +324,7 @@ static int dump_creds(struct parasite_dump_creds *args) args->cap_prm[i] = data[i].prm; args->cap_inh[i] = data[i].inh; args->cap_bnd[i] = 0; + args->cap_amb[i] = 0; for (j = 0; j < 32; j++) { if (j + i * 32 > args->cap_last_cap) @@ -336,6 +337,18 @@ static int dump_creds(struct parasite_dump_creds *args) if (ret) args->cap_bnd[i] |= (1 << j); } + + for (j = 0; j < 32; j++) { + if (j + i * 32 > args->cap_last_cap) + break; + ret = sys_prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, j + i * 32, 0, 0); + if (ret < 0) { + pr_err("Unable to read ambient capability %d: %d\n", j + i * 32, ret); + return -1; + } + if (ret) + args->cap_amb[i] |= (1 << j); + } } args->no_new_privs = sys_prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 51ed6ed4c..0a6a7977c 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -347,6 +347,22 @@ skip_xids: return -1; } + for (b = 0; b < CR_CAP_SIZE; b++) { + for (i = 0; i < 32; i++) { + if (b * 32 + i > args->cap_last_cap) + break; + if ((args->cap_amb[b] & (1 << i)) == 0) + /* don't set */ + continue; + ret = sys_prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i + b * 32, 0, 0); + if (!ret) + continue; + pr_err("Unable to raise ambient capability %d: %d\n", i + b * 32, ret); + return -1; + } + } + 
+ if (lsm_type != LSMTYPE__SELINUX) { /* * SELinux does not support setting the process context for diff --git a/criu/proc_parse.c b/criu/proc_parse.c index be0c3d531..99dc518a5 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -1071,7 +1071,7 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) if (bfdopenr(&f)) return -1; - while (done < 13) { + while (done < 14) { str = breadline(&f); if (str == NULL) break; @@ -1155,6 +1155,13 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) continue; } + if (!strncmp(str, "CapAmb:", 7)) { + if (cap_parse(str + 8, cr->cap_amb)) + goto err_parse; + done++; + continue; + } + if (!strncmp(str, "Seccomp:", 8)) { if (sscanf(str + 9, "%d", &cr->s.seccomp_mode) != 1) { goto err_parse; @@ -1198,7 +1205,7 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) } /* seccomp and nspids are optional */ - expected_done = (parsed_seccomp ? 12 : 11); + expected_done = (parsed_seccomp ? 13 : 12); if (kdat.has_nspid) expected_done++; if (done == expected_done) diff --git a/criu/pstree.c b/criu/pstree.c index 8c44e7134..41df846ed 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -63,6 +63,7 @@ CoreEntry *core_entry_alloc(int th, int tsk) sz += CR_CAP_SIZE * sizeof(ce->cap_prm[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_eff[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_bnd[0]); + sz += CR_CAP_SIZE * sizeof(ce->cap_amb[0]); /* * @groups are dynamic and allocated * on demand. 
@@ -122,10 +123,12 @@ CoreEntry *core_entry_alloc(int th, int tsk) ce->n_cap_prm = CR_CAP_SIZE; ce->n_cap_eff = CR_CAP_SIZE; ce->n_cap_bnd = CR_CAP_SIZE; + ce->n_cap_amb = CR_CAP_SIZE; ce->cap_inh = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_inh[0])); ce->cap_prm = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_prm[0])); ce->cap_eff = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_eff[0])); ce->cap_bnd = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_bnd[0])); + ce->cap_amb = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_amb[0])); if (arch_alloc_thread_info(core)) { xfree(core); diff --git a/images/creds.proto b/images/creds.proto index 220ed3858..932a40ccf 100644 --- a/images/creds.proto +++ b/images/creds.proto @@ -25,4 +25,6 @@ message creds_entry { optional string lsm_sockcreate = 16; optional bytes apparmor_data = 17; optional uint32 no_new_privs = 18; + + repeated uint32 cap_amb = 19; } From 260c08418bf5540477b55b485ddcf9c86652cb62 Mon Sep 17 00:00:00 2001 From: Liu Chao Date: Fri, 3 Jan 2025 03:33:27 +0000 Subject: [PATCH 531/775] zdtm: Check CapAmb is restored correctly after C/R This test sets CapAmb according to CapPrm and CapInh and check CapAmb after C/R. 
Signed-off-by: Liu Chao --- test/zdtm/static/Makefile | 1 + test/zdtm/static/caps01.c | 168 +++++++++++++++++++++++++++++++++++ test/zdtm/static/caps01.desc | 1 + 3 files changed, 170 insertions(+) create mode 100644 test/zdtm/static/caps01.c create mode 100644 test/zdtm/static/caps01.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 71a1b6a53..78f96430e 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -8,6 +8,7 @@ TST_NOFILE := \ sleeping00 \ pid00 \ caps00 \ + caps01 \ wait00 \ zombie00 \ zombie01 \ diff --git a/test/zdtm/static/caps01.c b/test/zdtm/static/caps01.c new file mode 100644 index 000000000..0f8a7101e --- /dev/null +++ b/test/zdtm/static/caps01.c @@ -0,0 +1,168 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that CapAmb are preserved"; +const char *test_author = "Liu Chao "; + +struct cap_hdr { + unsigned int version; + int pid; +}; + +struct cap_data { + unsigned int eff; + unsigned int prm; + unsigned int inh; +}; + +#define _LINUX_CAPABILITY_VERSION_3 0x20080522 +#define _LINUX_CAPABILITY_U32S_3 2 +#define CAP_DAC_OVERRIDE 1 +#define PR_CAP_AMBIENT 47 +#define PR_CAP_AMBIENT_IS_SET 1 +#define PR_CAP_AMBIENT_RAISE 2 +#define PR_CAP_AMBIENT_LOWER 3 + +int capget(struct cap_hdr *hdrp, struct cap_data *datap); +int capset(struct cap_hdr *hdrp, const struct cap_data *datap); + +static int cap_last_cap = 63; + +int main(int argc, char **argv) +{ + task_waiter_t t; + int pid, result_pipe[2]; + unsigned int amb[_LINUX_CAPABILITY_U32S_3]; + unsigned int amb_2[_LINUX_CAPABILITY_U32S_3]; + char res = 'x'; + FILE *f; + + test_init(argc, argv); + task_waiter_init(&t); + + f = fopen("/proc/sys/kernel/cap_last_cap", "r"); + if (f) { + if (fscanf(f, "%d", &cap_last_cap) != 1) { + pr_perror("Unable to read cal_last_cap"); + fclose(f); + return 1; + } + fclose(f); + } else + test_msg("/proc/sys/kernel/cap_last_cap is not available\n"); + + if 
(pipe(result_pipe)) { + pr_perror("Can't create pipe"); + return 1; + } + + pid = test_fork(); + if (pid == 0) { + int b, i, ret; + struct cap_hdr hdr; + struct cap_data data[_LINUX_CAPABILITY_U32S_3]; + + hdr.version = _LINUX_CAPABILITY_VERSION_3; + hdr.pid = 0; + + if (capget(&hdr, data) < 0) { + pr_perror("capget"); + return -1; + } + + hdr.version = _LINUX_CAPABILITY_VERSION_3; + hdr.pid = 0; + + data[0].eff &= ~((1 << CAP_CHOWN) | (1 << CAP_DAC_OVERRIDE)); + data[0].prm &= ~(1 << CAP_DAC_OVERRIDE); + data[0].inh = data[0].prm; + data[1].inh = data[1].prm; + + if (capset(&hdr, data) < 0) { + pr_perror("capset"); + return -1; + } + + for (b = 0; b < _LINUX_CAPABILITY_U32S_3; b++) { + amb[b] = data[b].prm; + for (i = 0; i < 32; i++) { + if (b * 32 + i > cap_last_cap) + break; + if ((amb[b] & (1 << i)) > 0) + ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i + b * 32, 0, 0); + else + ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_LOWER, i + b * 32, 0, 0); + if (ret) { + pr_perror("Unable to set ambient capability %d to %d: %d", i + b * 32, amb[b] & (1 << i), ret); + return -1; + } + } + } + + task_waiter_complete_current(&t); + task_waiter_wait4(&t, getppid()); + + for (b = 0; b < _LINUX_CAPABILITY_U32S_3; b++) { + amb_2[b] = 0; + for (i = 0; i < 32; i++) { + if (b * 32 + i > cap_last_cap) + break; + ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, i + b * 32, 0, 0); + if (ret < 0) { + pr_perror("Unable to read ambient capability %d: %d", i + b * 32, ret); + goto bad; + } + + amb_2[b] |= (ret << i); + } + } + + for (b = 0; b < _LINUX_CAPABILITY_U32S_3; b++) { + if (amb[b] != amb_2[b]) { + res = '1'; + goto bad; + } + } + + res = '0'; + bad: + write(result_pipe[1], &res, 1); + + if (res != '0') { + write(result_pipe[1], amb, sizeof(amb)); + write(result_pipe[1], amb_2, sizeof(amb_2)); + } + + close(result_pipe[0]); + close(result_pipe[1]); + _exit(0); + } + + task_waiter_wait4(&t, pid); + + test_daemon(); + test_waitsig(); + + task_waiter_complete_current(&t); + + 
read(result_pipe[0], &res, 1); + + if (res == '0') + pass(); + else { + read(result_pipe[0], amb, sizeof(amb)); + read(result_pipe[0], amb_2, sizeof(amb_2)); + test_msg("amb[]=%08x, %08x\n", amb[0], amb[1]); + test_msg("amb[]=%08x, %08x\n", amb_2[0], amb_2[1]); + fail("Fail: %c", res); + } + close(result_pipe[0]); + close(result_pipe[1]); + + return 0; +} diff --git a/test/zdtm/static/caps01.desc b/test/zdtm/static/caps01.desc new file mode 100644 index 000000000..2eac7e654 --- /dev/null +++ b/test/zdtm/static/caps01.desc @@ -0,0 +1 @@ +{'flags': 'suid'} From 6dce80c533df445c7926e952387edd59209ff3ee Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 16 Jan 2025 07:52:42 +0000 Subject: [PATCH 532/775] util: added cleanup_file attribute. Signed-off-by: Adrian Reber --- criu/include/util.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/criu/include/util.h b/criu/include/util.h index ae293a68c..4793f7f20 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -406,6 +406,14 @@ static inline void cleanup_freep(void *p) free(*pp); } +#define cleanup_file __attribute__((cleanup(cleanup_filep))) +static inline void cleanup_filep(FILE **f) +{ + FILE *file = *f; + if (file) + (void)fclose(file); +} + extern int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args); /* From 97398068b16fd24e244deeb6f8b0b52061b46bbd Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 17 Dec 2024 08:52:46 +0100 Subject: [PATCH 533/775] net: redirect nftables stdout and stderr to CRIU's log file When using the nftables network locking backend and restoring a process a second time the network locking has already been deleted by the first restore. The second restore will print out to the console text like: Error: Could not process rule: No such file or directory delete table inet CRIU-202621 With this change CRIU's log FD is used by libnftables stdout and stderr. 
Signed-off-by: Adrian Reber --- criu/net.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/criu/net.c b/criu/net.c index eee331108..efd52db32 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3066,9 +3066,43 @@ err: return ret; } +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) +static inline FILE *redirect_nftables_output(struct nft_ctx *nft) +{ + FILE *fp; + int fd; + + fd = dup(log_get_fd()); + if (fd < 0) { + pr_perror("dup() to redirect nftables output failed"); + return NULL; + } + + fp = fdopen(fd, "w"); + if (!fp) { + pr_perror("fdopen() to redirect nftables output failed"); + return NULL; + } + + /** + * Without setvbuf() the output from libnftables will be + * somewhere in the log file, probably at the end. + * With setvbuf() potential output will be at the correct + * position. + */ + setvbuf(fp, NULL, _IONBF, 0); + + nft_ctx_set_output(nft, fp); + nft_ctx_set_error(nft, fp); + + return fp; +} +#endif + static inline int nftables_lock_network_internal(void) { #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) + cleanup_file FILE *fp = NULL; struct nft_ctx *nft; int ret = 0; char table[32]; @@ -3081,6 +3115,10 @@ static inline int nftables_lock_network_internal(void) if (!nft) return -1; + fp = redirect_nftables_output(nft); + if (!fp) + goto out; + snprintf(buf, sizeof(buf), "create table %s", table); if (NFT_RUN_CMD(nft, buf)) goto err2; @@ -3168,6 +3206,7 @@ static inline int nftables_network_unlock(void) { #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) int ret = 0; + cleanup_file FILE *fp = NULL; struct nft_ctx *nft; char table[32]; char buf[128]; @@ -3179,6 +3218,10 @@ static inline int nftables_network_unlock(void) if (!nft) return -1; + fp = redirect_nftables_output(nft); + if (!fp) + return -1; + snprintf(buf, sizeof(buf), "delete table %s", table); if (NFT_RUN_CMD(nft, buf)) ret = -1; From 
6fdac508186fd645cc0a05a8bf82ea17e4662eb9 Mon Sep 17 00:00:00 2001 From: Yuanhong Peng Date: Thu, 19 Dec 2024 14:30:41 +0800 Subject: [PATCH 534/775] seize: Adjust the position of the log message Based on the code, the `ret` variable at this point does not represent the task state, so this log message should be moved to a position after the `compel_wait_task()` function. Signed-off-by: Yuanhong Peng --- criu/seize.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index 829d7c278..007e8e580 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -707,8 +707,6 @@ static int collect_children(struct pstree_item *item) goto free; } - pr_info("Seized task %d, state %d\n", pid, ret); - c = alloc_pstree_item(); if (c == NULL) { ret = -1; @@ -746,6 +744,8 @@ static int collect_children(struct pstree_item *item) if (ret == TASK_STOPPED) c->pid->stop_signo = compel_parse_stop_signo(pid); + pr_info("Seized task %d, state %d\n", pid, ret); + c->pid->real = pid; c->parent = item; c->pid->state = ret; From 2b74924805730c10d4fea7f6e332c3f09167c628 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 21 Jan 2025 15:05:42 +0100 Subject: [PATCH 535/775] files-reg: fix buffer overflow on aarch64 Running the zdtm/static/unlink_regular00 test on Ubuntu 24.04 on aarch64 results in following error: # ./zdtm.py run -t zdtm/static/unlink_regular00 -k always userns is supported === Run 1/1 ================ zdtm/static/unlink_regular00 ==================== Run zdtm/static/unlink_regular00 in ns ==================== Skipping rtc at root Start test Test is SUID ./unlink_regular00 --pidfile=unlink_regular00.pid --outfile=unlink_regular00.out --dirname=unlink_regular00.test Run criu dump *** buffer overflow detected ***: terminated ############# Test zdtm/static/unlink_regular00 FAIL at CRIU dump ############## Test output: ================================ <<< ================================ Send the 9 signal to 47 Wait for 
zdtm/static/unlink_regular00(47) to die for 0.100000 ##################################### FAIL ##################################### According to the backtrace: #0 __pthread_kill_implementation (threadid=281473158467616, signo=signo@entry=6, no_tid=no_tid@entry=0) at ./nptl/pthread_kill.c:44 #1 0x0000ffff93477690 in __pthread_kill_internal (signo=6, threadid=) at ./nptl/pthread_kill.c:78 #2 0x0000ffff9342cb3c in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26 #3 0x0000ffff93417e00 in __GI_abort () at ./stdlib/abort.c:79 #4 0x0000ffff9346abf0 in __libc_message_impl (fmt=fmt@entry=0xffff93552a78 "*** %s ***: terminated\n") at ../sysdeps/posix/libc_fatal.c:132 #5 0x0000ffff934e81a8 in __GI___fortify_fail (msg=msg@entry=0xffff93552a28 "buffer overflow detected") at ./debug/fortify_fail.c:24 #6 0x0000ffff934e79e4 in __GI___chk_fail () at ./debug/chk_fail.c:28 #7 0x0000ffff934e9070 in ___snprintf_chk (s=s@entry=0xffffc6ed04a3 "testfile", maxlen=maxlen@entry=4056, flag=flag@entry=2, slen=slen@entry=4053, format=format@entry=0xaaaacffe3888 "link_remap.%d") at ./debug/snprintf_chk.c:29 #8 0x0000aaaacff4b8b8 in snprintf (__fmt=0xaaaacffe3888 "link_remap.%d", __n=4056, __s=0xffffc6ed04a3 "testfile") at /usr/include/aarch64-linux-gnu/bits/stdio2.h:54 #9 create_link_remap (path=path@entry=0xffffc6ed2901 "/zdtm/static/unlink_regular00.test/subdir/testfile", len=len@entry=60, lfd=lfd@entry=20, idp=idp@entry=0xffffc6ed14ec, nsid=nsid@entry=0xaaaada2bac00, parms=parms@entry=0xffffc6ed2808, fallback=0xaaaacff4c6c0 , fallback@entry=0xffffc6ed2797) at criu/files-reg.c:1164 #10 0x0000aaaacff4c6c0 in dump_linked_remap (path=path@entry=0xffffc6ed2901 "/zdtm/static/unlink_regular00.test/subdir/testfile", len=len@entry=60, parms=parms@entry=0xffffc6ed2808, lfd=lfd@entry=20, id=id@entry=12, nsid=nsid@entry=0xaaaada2bac00, fallback=fallback@entry=0xffffc6ed2797) at criu/files-reg.c:1198 #11 0x0000aaaacff4d8b0 in check_path_remap (nsid=0xaaaada2bac00, id=12, lfd=20, 
parms=0xffffc6ed2808, link=) at criu/files-reg.c:1426 #12 dump_one_reg_file (lfd=20, id=12, p=0xffffc6ed2808) at criu/files-reg.c:1827 #13 0x0000aaaacff51078 in dump_one_file (pid=, fd=4, lfd=20, opts=opts@entry=0xaaaada2ba2c0, ctl=ctl@entry=0xaaaada2c4d50, e=e@entry=0xffffc6ed39c8, dfds=dfds@entry=0xaaaada2c3d40) at criu/files.c:581 #14 0x0000aaaacff5176c in dump_task_files_seized (ctl=ctl@entry=0xaaaada2c4d50, item=item@entry=0xaaaada2b8f80, dfds=dfds@entry=0xaaaada2c3d40) at criu/files.c:657 #15 0x0000aaaacff3d3c0 in dump_one_task (parent_ie=0x0, item=0xaaaada2b8f80) at criu/cr-dump.c:1679 #16 cr_dump_tasks (pid=) at criu/cr-dump.c:2224 #17 0x0000aaaacff163a0 in main (argc=, argv=0xffffc6ed40e8, envp=) at criu/crtools.c:293 This line is the problem: snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name - 1), "link_remap.%d", rfe.id); The problem was that the `-1` was on the inside of the braces and not on the outside. This way the destination size was increase by 1 instead of being decreased by 1 which triggered the buffer overflow detection. Signed-off-by: Adrian Reber --- criu/files-reg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/files-reg.c b/criu/files-reg.c index fc6149350..66c0e6cda 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -1150,7 +1150,7 @@ static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_i rfe.name = link_name + 1; /* Any 'unique' name works here actually. Remap works by reg-file ids. 
*/ - snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name - 1), "link_remap.%d", rfe.id); + snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name) - 1, "link_remap.%d", rfe.id); mntns_root = mntns_get_root_fd(nsid); From aad66a4f7c7affb59d4d59823c35849e1b3421f5 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Mon, 20 Jan 2025 19:00:16 +0100 Subject: [PATCH 536/775] test: fix cmdlinenv00 on aarch64 On aarch64 the test cmdlinenv00 was failing with: FAIL: cmdlinenv00.c:120: auxv corrupted on restore (errno = 11 (Resource temporarily unavailable)) Starting with Linux kernel version 6.3 the size of AUXV was changed: commit 28c8e088427ad30b4260953f3b6f908972b77c2d Author: Mathieu Desnoyers Date: Wed Jan 4 14:20:54 2023 -0500 rseq: Increase AT_VECTOR_SIZE_BASE to match rseq auxvec entries Two new auxiliary vector entries are introduced for rseq without matching increment of the AT_VECTOR_SIZE_BASE, which causes failures with CONFIG_HARDENED_USERCOPY=y. Fixes: 317c8194e6ae ("rseq: Introduce feature size and alignment ELF auxiliary vector entries") With this change AT_VECTOR_SIZE increases from 40 to 50 on aarch64. CRIU uses AT_VECTOR_SIZE to read the content of /proc/PID/auxv auxv_t mm_saved_auxv[AT_VECTOR_SIZE]; ret = read(fd, mm_saved_auxv, sizeof(mm_saved_auxv)); Now the test works again on aarch64. 
Signed-off-by: Adrian Reber --- criu/arch/aarch64/include/asm/types.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/criu/arch/aarch64/include/asm/types.h b/criu/arch/aarch64/include/asm/types.h index 363c1cae2..db118cafd 100644 --- a/criu/arch/aarch64/include/asm/types.h +++ b/criu/arch/aarch64/include/asm/types.h @@ -33,7 +33,16 @@ static inline uint64_t encode_pointer(void *p) return (uint64_t)p; } -#define AT_VECTOR_SIZE 40 +/** + * See also: + * * arch/arm64/include/uapi/asm/auxvec.h + * * include/linux/auxvec.h + * * include/linux/mm_types.h + */ +#define AT_VECTOR_SIZE_BASE 22 +#define AT_VECTOR_SIZE_ARCH 2 +#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) + typedef uint64_t auxv_t; typedef uint64_t tls_t; From 09dc2e9584b70e345fd12402dc3467a806ceb9e8 Mon Sep 17 00:00:00 2001 From: Austin Kuo <104871462+hckuo@users.noreply.github.com> Date: Tue, 7 Jan 2025 04:31:05 +0000 Subject: [PATCH 537/775] timer: Refine itimer_armed logic and improve timer value handling Right now, CRIU skips non-periodic timers. This change addresses this issue. Signed-off-by: Austin Kuo --- criu/pie/restorer.c | 2 +- criu/timer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 0a6a7977c..6d048c3f1 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -2226,7 +2226,7 @@ __visible long __export_restore_task(struct task_restore_args *args) * code below doesn't fail due to bad timing values. 
*/ -#define itimer_armed(args, i) (args->itimers[i].it_interval.tv_sec || args->itimers[i].it_interval.tv_usec) +#define itimer_armed(args, i) (args->itimers[i].it_value.tv_sec || args->itimers[i].it_value.tv_usec) if (itimer_armed(args, 0)) sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL); diff --git a/criu/timer.c b/criu/timer.c index e94cf0280..0413e2a72 100644 --- a/criu/timer.c +++ b/criu/timer.c @@ -16,7 +16,7 @@ static inline int timeval_valid(struct timeval *tv) static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) { - if (ie->isec == 0 && ie->iusec == 0) { + if (ie->isec == 0 && ie->iusec == 0 && ie->vsec == 0 && ie->vusec == 0) { memzero_p(val); return 0; } From 061f4266e80dbfc00e5ac58d3f1432e8d833b000 Mon Sep 17 00:00:00 2001 From: Austin Kuo Date: Tue, 21 Jan 2025 12:04:33 -0800 Subject: [PATCH 538/775] test/zdtm: add a new test to check non-periodic timers It creates a few timers with long expiration intervals, waits for C/R and checks that timers are armed and their intervals have been restored. 
Signed-off-by: Austin Kuo --- test/zdtm/static/Makefile | 1 + test/zdtm/static/timers01.c | 74 +++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 test/zdtm/static/timers01.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 78f96430e..f72fb2a77 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -24,6 +24,7 @@ TST_NOFILE := \ sse20 \ mprotect00 \ timers \ + timers01 \ timerfd \ unbound_sock \ sched_prio00 \ diff --git a/test/zdtm/static/timers01.c b/test/zdtm/static/timers01.c new file mode 100644 index 000000000..10ecc3481 --- /dev/null +++ b/test/zdtm/static/timers01.c @@ -0,0 +1,74 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Checks non-periodic timers\n"; +const char *test_author = "Andrei Vagin "; + +static struct { + const int timer_type; + const int signal; + volatile sig_atomic_t count; +} timer_tests[] = { + /* from slowest to fastest */ + { ITIMER_VIRTUAL, SIGVTALRM }, + { ITIMER_PROF, SIGPROF }, + { ITIMER_REAL, SIGALRM }, +}; + +#define NUM_TIMERS (sizeof(timer_tests) / sizeof(timer_tests[0])) +#define TIMER_TIMEOUT 3600 +#define TIMER_ALLOWED_DELTA 300 + +static void setup_timers(void) +{ + int i; + struct itimerval tv = { + .it_interval = { .tv_sec = 0, .tv_usec = 0 }, + .it_value = { .tv_sec = TIMER_TIMEOUT, .tv_usec = 0 }, + }; + + for (i = 0; i < NUM_TIMERS; i++) { + if (setitimer(timer_tests[i].timer_type, &tv, NULL) < 0) { + pr_perror("can't set timer %d", i); + exit(1); + } + } +} + +static void check_timers(void) +{ + int i; + + for (i = 0; i < NUM_TIMERS; i++) { + struct itimerval tv = {}; + + if (getitimer(timer_tests[i].timer_type, &tv)) { + pr_perror("gettimer"); + exit(1); + } + if (tv.it_value.tv_sec > TIMER_TIMEOUT || + tv.it_value.tv_sec < TIMER_TIMEOUT - TIMER_ALLOWED_DELTA) { + fail("%ld isn't in [%d, %d]", (long)tv.it_value.tv_sec, + TIMER_TIMEOUT, + TIMER_TIMEOUT - TIMER_ALLOWED_DELTA); + exit(1); + } + } + 
pass(); +} + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + setup_timers(); + + test_daemon(); + test_waitsig(); + + check_timers(); + return 0; +} From 815ef68848ad79642273026e68761e6680c047a1 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 24 Jan 2025 09:27:16 +0100 Subject: [PATCH 539/775] ci: two check-commits.yml changes * Switch to v4 actions/checkout (from v3) * Use our apt wrapper to gracefully handle temporary repository errors Signed-off-by: Adrian Reber --- .github/workflows/check-commits.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml index be2fbd285..94861ab52 100644 --- a/.github/workflows/check-commits.yml +++ b/.github/workflows/check-commits.yml @@ -12,14 +12,14 @@ jobs: # Check if pull request does not have label "not-selfcontained-ok" if: "!contains(github.event.pull_request.labels.*.name, 'not-selfcontained-ok')" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: # Needed to rebase against the base branch fetch-depth: 0 # Checkout pull request HEAD commit instead of merge commit ref: ${{ github.event.pull_request.head.sha }} - name: Install dependencies - run: sudo apt-get install -y libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev + run: sudo scripts/ci/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev - name: Configure git user details run: | git config --global user.email "checkpoint-restore@users.noreply.github.com" From 54795f174b606cbe2c134e7be2094ea53be3559d Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 23 Jan 2025 09:26:15 +0000 Subject: [PATCH 540/775] criu: use libuuid for criu_run_id generation criu_run_id will be used in upcoming changes to create and remove network rules for network locking. 
Instead of trying to come up with a way to create unique IDs, just use an existing library. libuuid should be installed on most systems as it is indirectly required by systemd (via libmount). Signed-off-by: Adrian Reber --- .cirrus.yml | 2 +- .github/workflows/check-commits.yml | 2 +- compel/include/uapi/infect-util.h | 11 ++++++++++- compel/src/lib/infect-util.c | 2 +- compel/src/lib/infect.c | 2 +- criu/Makefile.packages | 4 +++- criu/fdstore.c | 2 +- criu/files.c | 2 +- criu/include/util.h | 4 +++- criu/pidfd-store.c | 2 +- criu/unittest/mock.c | 4 +++- criu/util.c | 17 +++++++---------- scripts/build/Dockerfile.alpine | 3 ++- scripts/build/Dockerfile.amd-rocm | 1 + scripts/build/Dockerfile.archlinux | 1 + scripts/build/Dockerfile.hotspot-alpine | 1 + scripts/build/Dockerfile.hotspot-ubuntu | 1 + scripts/build/Dockerfile.linux32.tmpl | 1 + scripts/build/Dockerfile.openj9-ubuntu | 1 + .../build/Dockerfile.riscv64-stable-cross.tmpl | 1 + scripts/build/Dockerfile.stable-cross.tmpl | 1 + scripts/build/Dockerfile.tmpl | 1 + scripts/build/Dockerfile.unstable-cross.tmpl | 1 + scripts/ci/prepare-for-fedora-rawhide.sh | 1 + scripts/ci/run-ci-tests.sh | 2 +- scripts/ci/vagrant.sh | 2 +- 26 files changed, 48 insertions(+), 24 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 5e30ca2c2..a4b53a54b 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -36,7 +36,7 @@ task: ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel 
libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel libuuid-devel # The image has a too old version of nettle which does not work with gnutls. # Just upgrade to the latest to make the error go away. dnf -y upgrade nettle nettle-devel diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml index 94861ab52..354873909 100644 --- a/.github/workflows/check-commits.yml +++ b/.github/workflows/check-commits.yml @@ -19,7 +19,7 @@ jobs: # Checkout pull request HEAD commit instead of merge commit ref: ${{ github.event.pull_request.head.sha }} - name: Install dependencies - run: sudo scripts/ci/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev + run: sudo scripts/ci/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev uuid-dev - name: Configure git user details run: | git config --global user.email "checkpoint-restore@users.noreply.github.com" diff --git a/compel/include/uapi/infect-util.h b/compel/include/uapi/infect-util.h index ace6f6b6b..658df9393 100644 --- a/compel/include/uapi/infect-util.h +++ b/compel/include/uapi/infect-util.h @@ -3,11 +3,20 @@ #include "common/compiler.h" +/** + * The length of the hash is based on what libuuid provides. + * According to the manpage this is: + * + * The uuid_unparse() function converts the supplied UUID uu from the binary + * representation into a 36-byte string (plus trailing '\0') + */ +#define RUN_ID_HASH_LENGTH 37 + /* * compel_run_id is a unique value of the current run. It can be used to * generate resource ID-s to avoid conflicts with other processes. 
*/ -extern uint64_t compel_run_id; +extern char compel_run_id[RUN_ID_HASH_LENGTH]; struct parasite_ctl; extern int __must_check compel_util_send_fd(struct parasite_ctl *ctl, int fd); diff --git a/compel/src/lib/infect-util.c b/compel/src/lib/infect-util.c index 00a7c83f7..dc57e28f7 100644 --- a/compel/src/lib/infect-util.c +++ b/compel/src/lib/infect-util.c @@ -7,7 +7,7 @@ #include "infect-rpc.h" #include "infect-util.h" -uint64_t compel_run_id; +char compel_run_id[RUN_ID_HASH_LENGTH]; int compel_util_send_fd(struct parasite_ctl *ctl, int fd) { diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 1e3ffb967..caf54e03f 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -427,7 +427,7 @@ static int gen_parasite_saddr(struct sockaddr_un *saddr, int key) int sun_len; saddr->sun_family = AF_UNIX; - snprintf(saddr->sun_path, UNIX_PATH_MAX, "X/crtools-pr-%d-%" PRIx64, key, compel_run_id); + snprintf(saddr->sun_path, UNIX_PATH_MAX, "X/crtools-pr-%d-%s", key, compel_run_id); sun_len = SUN_LEN(saddr); *saddr->sun_path = '\0'; diff --git a/criu/Makefile.packages b/criu/Makefile.packages index 7f6113c8f..3e2e6efd1 100644 --- a/criu/Makefile.packages +++ b/criu/Makefile.packages @@ -6,6 +6,7 @@ REQ-RPM-PKG-NAMES += protobuf-devel REQ-RPM-PKG-NAMES += protobuf-python REQ-RPM-PKG-NAMES += libnl3-devel REQ-RPM-PKG-NAMES += libcap-devel +REQ-RPM-PKG-NAMES += libuuid-devel REQ-RPM-PKG-TEST-NAMES += libaio-devel @@ -16,6 +17,7 @@ REQ-DEB-PKG-NAMES += protobuf-compiler REQ-DEB-PKG-NAMES += $(PYTHON)-protobuf REQ-DEB-PKG-NAMES += libnl-3-dev REQ-DEB-PKG-NAMES += libcap-dev +REQ-DEB-PKG-NAMES += uuid-dev REQ-DEB-PKG-TEST-NAMES += $(PYTHON)-yaml REQ-DEB-PKG-TEST-NAMES += libaio-dev @@ -25,7 +27,7 @@ REQ-DEB-PKG-TEST-NAMES += libaio-dev REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-PyYAML -export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet +export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet -luuid check-packages-failed: $(warning Can not find 
some of the required libraries) diff --git a/criu/fdstore.c b/criu/fdstore.c index d615ad15d..6ac639c55 100644 --- a/criu/fdstore.c +++ b/criu/fdstore.c @@ -58,7 +58,7 @@ int fdstore_init(void) } addr.sun_family = AF_UNIX; - addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-fdstore-%" PRIx64 "-%" PRIx64, st.st_ino, + addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-fdstore-%" PRIx64 "-%s", st.st_ino, criu_run_id); addrlen += sizeof(addr.sun_family); diff --git a/criu/files.c b/criu/files.c index 31e705bcc..f16ec32a2 100644 --- a/criu/files.c +++ b/criu/files.c @@ -978,7 +978,7 @@ static int receive_fd(struct fdinfo_list_entry *fle); static void transport_name_gen(struct sockaddr_un *addr, int *len, int pid) { addr->sun_family = AF_UNIX; - snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-fd-%d-%" PRIx64, pid, criu_run_id); + snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-fd-%d-%s", pid, criu_run_id); *len = SUN_LEN(addr); *addr->sun_path = '\0'; } diff --git a/criu/include/util.h b/criu/include/util.h index 4793f7f20..194e94dee 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -21,6 +21,8 @@ #include "log.h" #include "common/err.h" +#include "compel/infect-util.h" + #define PREF_SHIFT_OP(pref, op, size) ((size)op(pref##BYTES_SHIFT)) #define KBYTES_SHIFT 10 #define MBYTES_SHIFT 20 @@ -420,7 +422,7 @@ extern int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void * criu_run_id is a unique value of the current run. It can be used to * generate resource ID-s to avoid conflicts with other CRIU processes. 
*/ -extern uint64_t criu_run_id; +extern char criu_run_id[RUN_ID_HASH_LENGTH]; extern void util_init(void); extern char *resolve_mountpoint(char *path); diff --git a/criu/pidfd-store.c b/criu/pidfd-store.c index 9fdc74cb7..110f7802a 100644 --- a/criu/pidfd-store.c +++ b/criu/pidfd-store.c @@ -99,7 +99,7 @@ int init_pidfd_store_sk(pid_t pid, int sk) goto err; } - addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-pidfd-store-%d-%d-%" PRIx64, pid, sk, + addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-pidfd-store-%d-%d-%s", pid, sk, criu_run_id); addrlen += sizeof(addr.sun_family); diff --git a/criu/unittest/mock.c b/criu/unittest/mock.c index e517720e4..b2d507278 100644 --- a/criu/unittest/mock.c +++ b/criu/unittest/mock.c @@ -5,6 +5,8 @@ #include #include +#include "compel/infect-util.h" + int add_external(char *key) { return 0; @@ -141,4 +143,4 @@ int check_mount_v2(void) return 0; } -uint64_t compel_run_id; +char compel_run_id[RUN_ID_HASH_LENGTH]; diff --git a/criu/util.c b/criu/util.c index d2bc9a865..58c18e20b 100644 --- a/criu/util.c +++ b/criu/util.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "linux/mount.h" @@ -2026,20 +2027,16 @@ int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args) return fret; } -uint64_t criu_run_id; +char criu_run_id[RUN_ID_HASH_LENGTH]; void util_init(void) { - struct stat statbuf; + uuid_t uuid; - criu_run_id = getpid(); - if (!stat("/proc/self/ns/pid", &statbuf)) - criu_run_id |= (uint64_t)statbuf.st_ino << 32; - else if (errno != ENOENT) - pr_perror("Can't stat /proc/self/ns/pid - CRIU run id might not be unique"); - - compel_run_id = criu_run_id; - pr_info("CRIU run id = %#" PRIx64 "\n", criu_run_id); + uuid_generate(uuid); + uuid_unparse(uuid, criu_run_id); + pr_info("CRIU run id = %s\n", criu_run_id); + memcpy(compel_run_id, criu_run_id, sizeof(criu_run_id)); } /* diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index 
329d7791d..d843793ea 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -24,7 +24,8 @@ RUN apk update && apk add \ sudo \ libcap-utils \ libdrm-dev \ - util-linux + util-linux \ + util-linux-dev COPY . /criu WORKDIR /criu diff --git a/scripts/build/Dockerfile.amd-rocm b/scripts/build/Dockerfile.amd-rocm index c466a73d2..ed66ae4fe 100644 --- a/scripts/build/Dockerfile.amd-rocm +++ b/scripts/build/Dockerfile.amd-rocm @@ -56,6 +56,7 @@ RUN apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-insta python-protobuf \ python3-minimal \ python-ipaddress \ + uuid-dev \ curl \ wget \ vim \ diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index 405651489..9d11194bb 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -35,6 +35,7 @@ RUN pacman -Syu --noconfirm \ python-junit-xml \ python-importlib-metadata \ libdrm \ + util-linux-libs \ diffutils COPY . /criu diff --git a/scripts/build/Dockerfile.hotspot-alpine b/scripts/build/Dockerfile.hotspot-alpine index cb9332fd0..6caf9d0b1 100644 --- a/scripts/build/Dockerfile.hotspot-alpine +++ b/scripts/build/Dockerfile.hotspot-alpine @@ -19,6 +19,7 @@ RUN apk update && apk add \ maven \ ip6tables \ iptables \ + util-linux-dev \ bash COPY . /criu diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 0318f650f..67de916ac 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -22,6 +22,7 @@ RUN apt-install protobuf-c-compiler \ pkg-config \ iptables \ gcc \ + uuid-dev \ maven COPY . 
/criu diff --git a/scripts/build/Dockerfile.linux32.tmpl b/scripts/build/Dockerfile.linux32.tmpl index 13e992642..d218e0641 100644 --- a/scripts/build/Dockerfile.linux32.tmpl +++ b/scripts/build/Dockerfile.linux32.tmpl @@ -21,6 +21,7 @@ RUN apt-install \ pkg-config \ protobuf-c-compiler \ protobuf-compiler \ + uuid-dev \ python3-minimal COPY . /criu diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index e190c2792..0ae4727d2 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -22,6 +22,7 @@ RUN apt-install protobuf-c-compiler \ pkg-config \ iptables \ gcc \ + uuid-dev \ maven RUN mkdir -p /etc/criu && echo 'ghost-limit 16777216' > /etc/criu/default.conf diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl index 39a0c33c6..e95a43306 100644 --- a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl +++ b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl @@ -33,6 +33,7 @@ RUN apt-get install -y --no-install-recommends \ libprotobuf-c-dev:${DEBIAN_ARCH} \ libcap-dev:${DEBIAN_ARCH} \ libaio-dev:${DEBIAN_ARCH} \ + uuid-dev:${DEBIAN_ARCH} \ libnl-route-3-dev:${DEBIAN_ARCH} \ libnftables-dev:${DEBIAN_ARCH} \ libgnutls28-dev:${DEBIAN_ARCH} \ diff --git a/scripts/build/Dockerfile.stable-cross.tmpl b/scripts/build/Dockerfile.stable-cross.tmpl index 078372c38..65ae55833 100644 --- a/scripts/build/Dockerfile.stable-cross.tmpl +++ b/scripts/build/Dockerfile.stable-cross.tmpl @@ -18,6 +18,7 @@ RUN apt-install \ libnl-3-dev:${DEBIAN_ARCH} \ libprotobuf-dev:${DEBIAN_ARCH} \ libnet-dev:${DEBIAN_ARCH} \ + uuid-dev:${DEBIAN_ARCH} \ libprotobuf-c-dev:${DEBIAN_ARCH} \ libcap-dev:${DEBIAN_ARCH} \ libaio-dev:${DEBIAN_ARCH} \ diff --git a/scripts/build/Dockerfile.tmpl b/scripts/build/Dockerfile.tmpl index 9b53a76aa..3d6de1044 100644 --- a/scripts/build/Dockerfile.tmpl +++ b/scripts/build/Dockerfile.tmpl @@ -29,6 +29,7 @@ RUN apt-install \ 
protobuf-compiler \ python3-minimal \ python3-protobuf \ + uuid-dev \ python3-yaml COPY . /criu diff --git a/scripts/build/Dockerfile.unstable-cross.tmpl b/scripts/build/Dockerfile.unstable-cross.tmpl index dacfd96ef..3504b0433 100644 --- a/scripts/build/Dockerfile.unstable-cross.tmpl +++ b/scripts/build/Dockerfile.unstable-cross.tmpl @@ -17,6 +17,7 @@ RUN apt-install \ python3-protobuf \ libnl-3-dev:${DEBIAN_ARCH} \ libprotobuf-dev:${DEBIAN_ARCH} \ + uuid-dev:${DEBIAN_ARCH} \ libnet-dev:${DEBIAN_ARCH} \ libprotobuf-c-dev:${DEBIAN_ARCH} \ libcap-dev:${DEBIAN_ARCH} \ diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index 09085c403..42252c93c 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -36,6 +36,7 @@ dnf install -y \ e2fsprogs \ rubygem-asciidoctor \ libdrm-devel \ + libuuid-devel \ kmod # /tmp is no longer 755 in the rawhide container image and breaks CI - fix it diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index b472e954c..611ff7803 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -4,7 +4,7 @@ set -x -e CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor - libnl-route-3-dev time libbsd-dev python3-yaml + libnl-route-3-dev time libbsd-dev python3-yaml uuid-dev libperl-dev pkg-config python3-protobuf python3-pip python3-importlib-metadata python3-junit.xml libdrm-dev) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 3904c51d2..ed5a01178 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -39,7 +39,7 @@ setup() { ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ protobuf-devel python3-protobuf python3-importlib-metadata 
python3-junit_xml \ - rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml + rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml libuuid-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd ssh default cat /proc/cmdline From f83931542afc3eff5bc02344a522bb8662425d4f Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 23 Jan 2025 17:42:45 +0000 Subject: [PATCH 541/775] net: remember the name of the lock chain (nftables) Using libnftables the chain to lock the network is composed of ("CRIU-%d", real_pid). This leads to around 40 zdtm tests failing with errors like this: Error: No such file or directory; did you mean table 'CRIU-62' in family inet? delete table inet CRIU-86 The reason is that as soon as a process is running in a namespace the real PID can be anything and only the PID in the namespace is restored correctly. Relying on the real PID does not work for the chain name. Using the PID of the innermost namespace would lead to the chain being called 'CRIU-1' most of the time, which is also not really unique. With this commit the chain is now named using the already existing CRIU run ID. To be able to correctly restore the process and delete the locking table, the CRIU run id during checkpointing is now stored in the inventory as dump_criu_run_id. 
Signed-off-by: Adrian Reber --- criu/image.c | 30 ++++++++++++++++++++++++++++++ criu/include/util.h | 2 ++ criu/netfilter.c | 20 +++++++++++++++++++- images/inventory.proto | 4 ++++ 4 files changed, 55 insertions(+), 1 deletion(-) diff --git a/criu/image.c b/criu/image.c index 9589167fb..f3747d6ff 100644 --- a/criu/image.c +++ b/criu/image.c @@ -25,6 +25,7 @@ bool img_common_magic = true; TaskKobjIdsEntry *root_ids; u32 root_cg_set; Lsmtype image_lsm; +char dump_criu_run_id[RUN_ID_HASH_LENGTH]; struct inventory_plugin { struct list_head node; @@ -120,6 +121,24 @@ int check_img_inventory(bool restore) goto out_err; } } + + /** + * This contains the criu_run_id during dumping of the process. + * For things like removing network locking (nftables) this + * information is needed to identify the name of the network + * locking table. + */ + if (he->dump_criu_run_id) { + strncpy(dump_criu_run_id, he->dump_criu_run_id, sizeof(dump_criu_run_id) - 1); + pr_info("Dump CRIU run id = %s\n", dump_criu_run_id); + } else { + /** + * If restoring from an old image this is a marker + * that no dump_criu_run_id exists. + */ + dump_criu_run_id[0] = NO_DUMP_CRIU_RUN_ID; + } + } ret = 0; @@ -367,6 +386,17 @@ int prepare_inventory(InventoryEntry *he) he->has_network_lock_method = true; he->network_lock_method = opts.network_lock_method; + /** + * This contains the criu_run_id during dumping of the process. + * For things like removing network locking (nftables) this + * information is needed to identify the name of the network + * locking table. 
+ */ + he->dump_criu_run_id = xstrdup(criu_run_id); + + if (!he->dump_criu_run_id) + return -1; + return 0; } diff --git a/criu/include/util.h b/criu/include/util.h index 194e94dee..55ad5b63c 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -424,6 +424,8 @@ extern int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void */ extern char criu_run_id[RUN_ID_HASH_LENGTH]; extern void util_init(void); +#define NO_DUMP_CRIU_RUN_ID 0x7f +extern char dump_criu_run_id[RUN_ID_HASH_LENGTH]; extern char *resolve_mountpoint(char *path); diff --git a/criu/netfilter.c b/criu/netfilter.c index 9e78dc4b0..e2c82764f 100644 --- a/criu/netfilter.c +++ b/criu/netfilter.c @@ -299,7 +299,25 @@ int nftables_lock_connection(struct inet_sk_desc *sk) int nftables_get_table(char *table, int n) { - if (snprintf(table, n, "inet CRIU-%d", root_item->pid->real) < 0) { + int ret; + + switch(dump_criu_run_id[0]) { + case 0: + /* This is not a restore.*/ + ret = snprintf(table, n, "inet CRIU-%s", criu_run_id); + break; + case NO_DUMP_CRIU_RUN_ID: + /** + * This is a restore from an older image with no + * dump_criu_run_id available. Let's use the old ID. + */ + ret = snprintf(table, n, "inet CRIU-%d", root_item->pid->real); + break; + default: + ret = snprintf(table, n, "inet CRIU-%s", dump_criu_run_id); + } + + if (ret < 0) { pr_err("Cannot generate CRIU's nftables table name\n"); return -1; } diff --git a/images/inventory.proto b/images/inventory.proto index 7f655031b..1e18815bb 100644 --- a/images/inventory.proto +++ b/images/inventory.proto @@ -29,4 +29,8 @@ message inventory_entry { optional bool tcp_close = 10; optional uint32 network_lock_method = 11; optional plugins_entry plugins_entry = 12; + // Remember the criu_run_id when CRIU dumped the process. + // This is currently used to delete the correct nftables + // network locking rule. 
+ optional string dump_criu_run_id = 13; } From 02056bf41aaef1522a6d9fae18cd45c3f119ca83 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 14 Dec 2024 21:14:58 +0000 Subject: [PATCH 542/775] cuda: prevent task lockup on timeout error When creating a checkpoint of large models, the `checkpoint` action of `cuda-checkpoint` can exceed the CRIU timeout. This causes CRIU to fail with the following error, leaving the CUDA task in a locked state: cuda_plugin: Checkpointing CUDA devices on pid 84145 restore_tid 84202 Error (criu/cr-dump.c:1791): Timeout reached. Try to interrupt: 0 Error (cuda_plugin.c:139): cuda_plugin: Unable to read output of cuda-checkpoint: Interrupted system call Error (cuda_plugin.c:396): cuda_plugin: CHECKPOINT_DEVICES failed with net: Unlock network cuda_plugin: finished cuda_plugin stage 0 err -1 cuda_plugin: resuming devices on pid 84145 cuda_plugin: Restore thread pid 84202 found for real pid 84145 Unfreezing tasks into 1 Unseizing 84145 into 1 Error (criu/cr-dump.c:2111): Dumping FAILED. To fix this, we set `task_info->checkpointed` before invoking the `checkpoint` action to ensure that the CUDA task is resumed even if CRIU times out. 
Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index e78828b18..976ce824c 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -391,14 +391,14 @@ int cuda_plugin_checkpoint_devices(int pid) if (resume_restore_thread(restore_tid, &save_sigset)) { return -1; } + + task_info->checkpointed = 1; status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf)); if (status) { pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf); goto interrupt; } - task_info->checkpointed = 1; - interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); From 7f0d107fe576e7e0b521c2df6e96fe1501a8e1f6 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 21 Dec 2024 14:17:35 +0000 Subject: [PATCH 543/775] seize: use separate checkpoint_devices function Move `run_plugins(CHECKPOINT_DEVICES)` out of `collect_pstree()` to ensure that the function's sole responsibility is to use the cgroup freezer for the process tree. This allows us to avoid a time-out error when checkpointing applications with large GPU state. v2: This patch calls `checkpoint_devices()` only for `criu dump`. Support for GPU checkpointing with `pre-dump` will be introduced in a separate patch. 
Suggested-by: Andrei Vagin Suggested-by: Jesus Ramos Signed-off-by: Radostin Stoyanov --- criu/cr-dump.c | 3 +++ criu/include/seize.h | 1 + criu/seize.c | 27 ++++++++++++++++++--------- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 1bc5d934f..302078caa 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2192,6 +2192,9 @@ int cr_dump_tasks(pid_t pid) if (collect_pstree()) goto err; + if (checkpoint_devices()) + goto err; + if (collect_pstree_ids()) goto err; diff --git a/criu/include/seize.h b/criu/include/seize.h index 64e8d2d12..fc7facad3 100644 --- a/criu/include/seize.h +++ b/criu/include/seize.h @@ -2,6 +2,7 @@ #define __CR_SEIZE_H__ extern int collect_pstree(void); +extern int checkpoint_devices(void); struct pstree_item; extern void pstree_switch_state(struct pstree_item *root_item, int st); extern const char *get_real_freezer_state(void); diff --git a/criu/seize.c b/criu/seize.c index 007e8e580..f56357ac7 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1050,7 +1050,6 @@ int collect_pstree(void) pid_t pid = root_item->pid->real; int ret, exit_code = -1; struct proc_status_creds creds; - struct pstree_item *iter; timing_start(TIME_FREEZING); @@ -1111,14 +1110,6 @@ int collect_pstree(void) goto err; } - for_each_pstree_item(iter) { - if (!task_alive(iter)) - continue; - ret = run_plugins(CHECKPOINT_DEVICES, iter->pid->real); - if (ret < 0 && ret != -ENOTSUP) - goto err; - } - exit_code = 0; timing_stop(TIME_FREEZING); timing_start(TIME_FROZEN); @@ -1128,3 +1119,21 @@ err: alarm(0); return exit_code; } + +int checkpoint_devices(void) +{ + struct pstree_item *iter; + int ret, exit_code = -1; + + for_each_pstree_item(iter) { + if (!task_alive(iter)) + continue; + ret = run_plugins(CHECKPOINT_DEVICES, iter->pid->real); + if (ret < 0 && ret != -ENOTSUP) + goto err; + } + + exit_code = 0; +err: + return exit_code; +} \ No newline at end of file From 82b03429b71d2334f7444cec80a827b2604cd981 Mon Sep 17 00:00:00 
2001 From: Radostin Stoyanov Date: Wed, 15 Jan 2025 20:54:10 +0000 Subject: [PATCH 544/775] cuda: disable CUDA plugin for pre-dump Temporarily disable CUDA plugin for `criu pre-dump`. pre-dump currently fails with the following error: Handling VMA with the following smaps entry: 1822c000-18da5000 rw-p 00000000 00:00 0 [heap] Handling VMA with the following smaps entry: 200000000-200200000 ---p 00000000 00:00 0 Handling VMA with the following smaps entry: 200200000-200400000 rw-s 00000000 00:06 895 /dev/nvidia0 Error (criu/proc_parse.c:116): handle_device_vma plugin failed: No such file or directory Error (criu/proc_parse.c:632): Can't handle non-regular mapping on 705693's map 200200000 Error (criu/cr-dump.c:1486): Collect mappings (pid: 705693) failed with -1 We plan to enable support for pre-dump by skipping nvidia mappings in a separate patch. Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 976ce824c..99e4caf74 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -565,6 +565,12 @@ int cuda_plugin_init(int stage) { int ret; + /* Disable CUDA checkpointing with pre-dump */ + if (stage == CR_PLUGIN_STAGE__PRE_DUMP) { + plugin_disabled = true; + return 0; + } + if (stage == CR_PLUGIN_STAGE__RESTORE) { if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) { plugin_disabled = true; From e2dffcbc8e717c8a837f476c8b9f552821a58753 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 19 Dec 2024 10:33:54 +0000 Subject: [PATCH 545/775] lib: do not set protobuf has_* field too early For two cases libcriu was setting the RPC protobuf field `has_*` before checking if the given parameter is valid. 
This can lead to situations, if the caller doesn't check the return value, that we pass an RPC struct to CRIU which has the `has_*` protobuf field set to true, but does not have a verified value (or none at all) set for the actual RPC entry. Signed-off-by: Adrian Reber --- lib/c/criu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/c/criu.c b/lib/c/criu.c index 7f766db85..c16fe5dcd 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -352,8 +352,8 @@ int criu_set_parent_images(const char *path) int criu_local_set_pre_dump_mode(criu_opts *opts, enum criu_pre_dump_mode mode) { - opts->rpc->has_pre_dump_mode = true; if (mode == CRIU_PRE_DUMP_SPLICE || mode == CRIU_PRE_DUMP_READ) { + opts->rpc->has_pre_dump_mode = true; opts->rpc->pre_dump_mode = (CriuPreDumpMode)mode; return 0; } @@ -1867,8 +1867,8 @@ void criu_set_pidfd_store_sk(int sk) int criu_local_set_network_lock(criu_opts *opts, enum criu_network_lock_method method) { - opts->rpc->has_network_lock = true; if (method == CRIU_NETWORK_LOCK_IPTABLES || method == CRIU_NETWORK_LOCK_NFTABLES || method == CRIU_NETWORK_LOCK_SKIP) { + opts->rpc->has_network_lock = true; opts->rpc->network_lock = (CriuNetworkLockMethod)method; return 0; } From d226bd4f670b6a001d1b6809c90495e8710e387a Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 29 Jan 2025 15:19:16 +0000 Subject: [PATCH 546/775] ci: handle results from latest codespell CI pulls in a newer version of codespell. This fixes complaints from that codespell version. 
Signed-off-by: Adrian Reber --- .codespellrc | 2 +- criu/include/rbtree.h | 2 +- criu/include/rst_info.h | 2 +- criu/page-xfer.c | 4 ++-- test/zdtm/static/packet_sock.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.codespellrc b/.codespellrc index dd31dd851..15e6fc7bc 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,3 +1,3 @@ [codespell] skip = ./.git,./test/pki -ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen +ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen,sems diff --git a/criu/include/rbtree.h b/criu/include/rbtree.h index ba0a8100e..6981aa8f9 100644 --- a/criu/include/rbtree.h +++ b/criu/include/rbtree.h @@ -14,7 +14,7 @@ #define RB_MASK 3 struct rb_node { - unsigned long rb_parent_color; /* Keeps both parent anc color */ + unsigned long rb_parent_color; /* Keeps both parent and color */ struct rb_node *rb_right; struct rb_node *rb_left; } __aligned(sizeof(long)); diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index 59b891fa2..df9f9de01 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -22,7 +22,7 @@ struct fdt { pid_t pid; /* Who should restore this fd table */ /* * The fd table is ready for restoing, if fdt_lock is equal to nr - * The fdt table was restrored, if fdt_lock is equal to nr + 1 + * The fdt table was restored, if fdt_lock is equal to nr + 1 */ futex_t fdt_lock; }; diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 94f477414..0314963e6 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -1421,7 +1421,7 @@ int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd) if (opts.ps_socket != -1) { ask = opts.ps_socket; - pr_info("Re-using ps socket %d\n", ask); + pr_info("Reusing ps socket %d\n", ask); goto no_server; } @@ -1467,7 +1467,7 @@ static int connect_to_page_server(void) if (opts.ps_socket != -1) { page_server_sk = opts.ps_socket; - pr_info("Re-using ps socket %d\n", page_server_sk); + 
pr_info("Reusing ps socket %d\n", page_server_sk); goto out; } diff --git a/test/zdtm/static/packet_sock.c b/test/zdtm/static/packet_sock.c index 4a9078f81..c1c94ac21 100644 --- a/test/zdtm/static/packet_sock.c +++ b/test/zdtm/static/packet_sock.c @@ -5,7 +5,7 @@ const char *test_author = "Pavel Emelyanov "; /* * Description: - * Create and bind several packet sockets, check thet getname + * Create and bind several packet sockets, check that getname * reports same result before and after c/r cycle. This is enough * for _basic_ packet functionality only, but still. */ From 9c40781c2674ff99fb56b339fabe62301a0f6ea4 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 16 Jan 2025 10:25:24 +0800 Subject: [PATCH 547/775] net/sysctl: put common multiplier outside the brackets Also add an explanation of the logic behind this calculation. Signed-off-by: Pavel Tikhomirov --- criu/net.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/criu/net.c b/criu/net.c index efd52db32..97c53f84f 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2149,10 +2149,16 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) list_for_each_entry(p, &ns->net.ids, node) i++; + /* + * Here we allocate one single big buffer for storing multiple arrays + * of protobuf entries and pointers to entries in it and we later use + * xptr_pull_s to claim a part of this buffer of proper size for each + * particular array. Next we read data from sysctl files to those + * arrays and then finally save them into images. 
+ */ o_buf = buf = xmalloc(i * (sizeof(NetnsId *) + sizeof(NetnsId)) + - size4 * (sizeof(SysctlEntry *) + sizeof(SysctlEntry)) * 2 + - size6 * (sizeof(SysctlEntry *) + sizeof(SysctlEntry)) * 2 + - sizex * (sizeof(SysctlEntry *) + sizeof(SysctlEntry))); + (size4 * 2 + size6 * 2 + sizex) * + (sizeof(SysctlEntry *) + sizeof(SysctlEntry))); if (!buf) goto out; From 4ca74b9aff301b1db3696077b5bea7e32d466152 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 16 Jan 2025 11:00:28 +0800 Subject: [PATCH 548/775] net/sysctl: c/r ipv4/ping_group_range value It is per net namespace, we need it to allow creation of unprivileged ICMP sockets. Note: in case this sysctl was disabled after unprivileged ICMP socket was created we still need to somehow handle it on restore. Signed-off-by: Pavel Tikhomirov --- criu/net.c | 103 +++++++++++++++++++++++++++++++++++++++++++- images/netdev.proto | 1 + 2 files changed, 103 insertions(+), 1 deletion(-) diff --git a/criu/net.c b/criu/net.c index 97c53f84f..ee46f1c49 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2128,6 +2128,79 @@ nft_ctx_free_out: } #endif +static const char *ipv4_sysctl_entries[] = { + "ping_group_range", +}; + +#define IPV4_SYSCTL_BASE "net/ipv4" +#define IPV4_SYSCTL_FMT IPV4_SYSCTL_BASE"/%s" +#define MAX_IPV4_SYSCTL_OPT 32 +#define MAX_IPV4_SYSCTL_PATH (sizeof(IPV4_SYSCTL_FMT) + MAX_IPV4_SYSCTL_OPT - 2) +#define MAX_STR_IPV4_SYSCTL_LEN 200 + +static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) +{ + int i, ret = -1, flags = 0; + char path[ARRAY_SIZE(ipv4_sysctl_entries)][MAX_IPV4_SYSCTL_PATH] = {}; + struct sysctl_req req[ARRAY_SIZE(ipv4_sysctl_entries)] = {}; + SysctlEntry **sysctl = *rsysctl; + size_t n = *pn; + + if (n != ARRAY_SIZE(ipv4_sysctl_entries)) { + pr_err("unix: Unexpected entries in sysctlig (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); + return -EINVAL; + } + + if (opts.weak_sysctls || op == CTL_READ) + flags = CTL_FLAGS_OPTIONAL; + + for (i = 0; i < n; i++) { + snprintf(path[i], 
MAX_IPV4_SYSCTL_PATH, IPV4_SYSCTL_FMT, ipv4_sysctl_entries[i]); + req[i].name = path[i]; + req[i].flags = flags; + + switch (sysctl[i]->type) { + case SYSCTL_TYPE__CTL_STR: + req[i].type = CTL_STR(MAX_STR_IPV4_SYSCTL_LEN); + + /* skip write if have no value */ + if (op == CTL_WRITE && !sysctl[i]->sarg) + continue; + + req[i].arg = sysctl[i]->sarg; + break; + default: + pr_err("ipv4: Unknown sysctl type %d\n", sysctl[i]->type); + return -1; + } + } + + ret = sysctl_op(req, n, op, CLONE_NEWNET); + if (ret < 0) { + pr_err("unix: Failed to %s %s/\n", (op == CTL_READ) ? "read" : "write", IPV4_SYSCTL_BASE); + return -1; + } + + if (op == CTL_READ) { + bool has_entries = false; + + for (i = 0; i < n; i++) { + if (req[i].flags & CTL_FLAGS_HAS) { + sysctl[i]->has_iarg = true; + if (!has_entries) + has_entries = true; + } + } + + if (!has_entries) { + *pn = 0; + *rsysctl = NULL; + } + } + + return 0; +} + static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) { void *buf, *o_buf; @@ -2142,6 +2215,9 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) int size6 = ARRAY_SIZE(devconfs6); char def_stable_secret[MAX_STR_CONF_LEN + 1] = {}; char all_stable_secret[MAX_STR_CONF_LEN + 1] = {}; + SysctlEntry *ipv4_sysctls = NULL; + size_t ipv4_sysctl_size = ARRAY_SIZE(ipv4_sysctl_entries); + char ping_group_range[MAX_STR_IPV4_SYSCTL_LEN + 1] = {}; NetnsId *ids; struct netns_id *p; @@ -2157,7 +2233,7 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) * arrays and then finally save them into images. 
*/ o_buf = buf = xmalloc(i * (sizeof(NetnsId *) + sizeof(NetnsId)) + - (size4 * 2 + size6 * 2 + sizex) * + (2 * size4 + 2 * size6 + sizex + ipv4_sysctl_size) * (sizeof(SysctlEntry *) + sizeof(SysctlEntry))); if (!buf) goto out; @@ -2223,6 +2299,21 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) netns.unix_conf[i]->type = SYSCTL_TYPE__CTL_32; } + netns.n_ipv4_sysctl = ipv4_sysctl_size; + netns.ipv4_sysctl = xptr_pull_s(&buf, ipv4_sysctl_size * sizeof(SysctlEntry *)); + ipv4_sysctls = xptr_pull_s(&buf, ipv4_sysctl_size * sizeof(SysctlEntry)); + for (i = 0; i < ipv4_sysctl_size; i++) { + sysctl_entry__init(&ipv4_sysctls[i]); + netns.ipv4_sysctl[i] = &ipv4_sysctls[i]; + if (!strcmp(ipv4_sysctl_entries[i], "ping_group_range")) { + netns.ipv4_sysctl[i]->type = SYSCTL_TYPE__CTL_STR; + netns.ipv4_sysctl[i]->sarg = ping_group_range; + } else { + /* Need to handle this case when we have more sysctls */ + BUG(); + } + } + ret = ipv4_conf_op("default", netns.def_conf4, size4, CTL_READ, NULL); if (ret < 0) goto err_free; @@ -2241,6 +2332,10 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) if (ret < 0) goto err_free; + ret = ipv4_sysctls_op(&netns.ipv4_sysctl, &netns.n_ipv4_sysctl, CTL_READ); + if (ret < 0) + goto err_free; + ret = pb_write_one(img_from_set(fds, CR_FD_NETNS), &netns, PB_NETNS); err_free: xfree(o_buf); @@ -2593,6 +2688,12 @@ static int restore_netns_conf(struct ns_id *ns) goto out; } + if ((netns)->ipv4_sysctl) { + ret = ipv4_sysctls_op(&(netns)->ipv4_sysctl, &(netns)->n_ipv4_sysctl, CTL_WRITE); + if (ret) + goto out; + } + ns->net.netns = netns; out: return ret; diff --git a/images/netdev.proto b/images/netdev.proto index 748fd0200..42e2bc7d7 100644 --- a/images/netdev.proto +++ b/images/netdev.proto @@ -74,4 +74,5 @@ message netns_entry { repeated netns_id nsids = 7; optional string ext_key = 8; repeated sysctl_entry unix_conf = 9; + repeated sysctl_entry ipv4_sysctl = 10; } From 
6710cfce10d32a4ebabae61420fa495e42385966 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 16 Jan 2025 11:56:00 +0800 Subject: [PATCH 549/775] zdtm/netns_sub_sysctl: add ipv4/ping_group_range sysctl check Signed-off-by: Pavel Tikhomirov --- test/zdtm/lib/sysctl.c | 43 +++++++++++++++++++++ test/zdtm/lib/sysctl.h | 2 + test/zdtm/static/netns_sub_sysctl.c | 58 +++++++++++++++++++++++------ 3 files changed, 91 insertions(+), 12 deletions(-) diff --git a/test/zdtm/lib/sysctl.c b/test/zdtm/lib/sysctl.c index 9583ec3df..3b1ebc168 100644 --- a/test/zdtm/lib/sysctl.c +++ b/test/zdtm/lib/sysctl.c @@ -3,6 +3,49 @@ #include "zdtmtst.h" #include "sysctl.h" +int sysctl_read_str(const char *name, char *data, size_t size) +{ + int fd, ret; + + fd = open(name, O_RDONLY); + if (fd < 0) { + pr_perror("Can't open %s", name); + return -1; + } + + ret = read(fd, data, size - 1); + if (ret < 0) { + pr_perror("Can't read %s", name); + close(fd); + return -1; + } + data[ret] = '\0'; + close(fd); + + return 0; +} + +int sysctl_write_str(const char *name, char *data) +{ + int fd, ret; + + fd = open(name, O_WRONLY); + if (fd < 0) { + pr_perror("Can't open %s", name); + return -1; + } + + ret = write(fd, data, strlen(data)); + if (ret < 0) { + pr_perror("Can't write %s into %s", data, name); + close(fd); + return -1; + } + close(fd); + + return 0; +} + int sysctl_read_int(const char *name, int *data) { int fd; diff --git a/test/zdtm/lib/sysctl.h b/test/zdtm/lib/sysctl.h index 67129102f..d435bd7e9 100644 --- a/test/zdtm/lib/sysctl.h +++ b/test/zdtm/lib/sysctl.h @@ -3,5 +3,7 @@ extern int sysctl_read_int(const char *name, int *data); extern int sysctl_write_int(const char *name, int val); +extern int sysctl_read_str(const char *name, char *data, size_t size); +extern int sysctl_write_str(const char *name, char *data); #endif diff --git a/test/zdtm/static/netns_sub_sysctl.c b/test/zdtm/static/netns_sub_sysctl.c index 545a17308..0f94c40a7 100644 --- a/test/zdtm/static/netns_sub_sysctl.c 
+++ b/test/zdtm/static/netns_sub_sysctl.c @@ -3,18 +3,33 @@ #include "zdtmtst.h" #include "sysctl.h" -const char *test_doc = "Check dump and restore a net.unix.max_dgram_qlen sysctl parameter in subns"; +const char *test_doc = "Check dump and restore of sysctls in subns"; const char *test_author = "Alexander Mikhalitsyn "; +#define MAX_STR_SYSCTL_LEN 200 + +enum { + SYSCTL_INT, + SYSCTL_STR, +}; + typedef struct { const char *path; + int type; int old; int new; + char s_old[MAX_STR_SYSCTL_LEN]; + char s_new[MAX_STR_SYSCTL_LEN]; } sysctl_opt_t; #define CONF_UNIX_BASE "/proc/sys/net/unix" +#define IPV4_SYSCTL_BASE "/proc/sys/net/ipv4" -static sysctl_opt_t net_unix_params[] = { { CONF_UNIX_BASE "/max_dgram_qlen", 0, 0 }, { NULL, 0, 0 } }; +static sysctl_opt_t net_unix_params[] = { + {CONF_UNIX_BASE "/max_dgram_qlen", SYSCTL_INT}, + {IPV4_SYSCTL_BASE "/ping_group_range", SYSCTL_STR, 0, 0, "40000\t50000\n"}, + {NULL, 0, 0} +}; int main(int argc, char **argv) { @@ -23,10 +38,17 @@ int main(int argc, char **argv) test_init(argc, argv); for (p = net_unix_params; p->path != NULL; p++) { - p->old = (((unsigned)lrand48()) % 1023) + 1; - if (sysctl_write_int(p->path, p->old)) { - pr_perror("Can't change %s", p->path); - return -1; + if (p->type == SYSCTL_INT) { + p->old = (((unsigned)lrand48()) % 1023) + 1; + if (sysctl_write_int(p->path, p->old)) { + pr_perror("Can't change %s", p->path); + return -1; + } + } else if (p->type == SYSCTL_STR) { + if (sysctl_write_str(p->path, p->s_old)) { + pr_perror("Can't change %s", p->path); + return -1; + } } } @@ -34,13 +56,25 @@ int main(int argc, char **argv) test_waitsig(); for (p = net_unix_params; p->path != NULL; p++) { - if (sysctl_read_int(p->path, &p->new)) - ret = 1; + if (p->type == SYSCTL_INT) { + if (sysctl_read_int(p->path, &p->new)) + ret = 1; - if (p->old != p->new) { - errno = EINVAL; - pr_perror("%s changed: %d ---> %d", p->path, p->old, p->new); - ret = 1; + if (p->old != p->new) { + errno = EINVAL; + pr_perror("%s 
changed: %d ---> %d", p->path, p->old, p->new); + ret = 1; + } + } else if (p->type == SYSCTL_STR) { + if (sysctl_read_str(p->path, p->s_new, MAX_STR_SYSCTL_LEN)) { + ret = 1; + } else { + if (strcmp(p->s_old, p->s_new)) { + errno = EINVAL; + pr_perror("%s changed: %s ---> %s", p->path, p->s_old, p->s_new); + ret = 1; + } + } } } From 8a06ca27cc9ac711faf818b8ac4d061be4d810a8 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 22 Jan 2025 14:35:26 +0100 Subject: [PATCH 550/775] vdso: switch from DT_HASH to DT_GNU_HASH (aarch64) Trying to run latest CRIU on CentOS Stream 10 or Ubuntu 24.04 (aarch64) fails like this: # criu/criu check -v4 [...] (00.096460) vdso: Parsing at ffffb2e2a000 ffffb2e2c000 (00.096539) vdso: PT_LOAD p_vaddr: 0 (00.096567) vdso: DT_STRTAB: 1d0 (00.096592) vdso: DT_SYMTAB: 128 (00.096616) vdso: DT_STRSZ: 8a (00.096640) vdso: DT_SYMENT: 18 (00.096663) Error (criu/pie-util-vdso.c:193): vdso: Not all dynamic entries are present (00.096688) Error (criu/vdso.c:627): vdso: Failed to fill self vdso symtable (00.096713) Error (criu/kerndat.c:1906): kerndat_vdso_fill_symtable failed when initializing kerndat. (00.096812) Found mmap_min_addr 0x10000 (00.096881) files stat: fs/nr_open 1073741816 (00.096908) Error (criu/crtools.c:267): Could not initialize kernel features detection. This seems to be related to the kernel (6.12.0-41.el10.aarch64). The Ubuntu user-space is running in a container on the same kernel. Looking at the kernel this seems to be related to: commit 48f6430505c0b0498ee9020ce3cf9558b1caaaeb Author: Fangrui Song Date: Thu Jul 18 10:34:23 2024 -0700 arm64/vdso: Remove --hash-style=sysv glibc added support for .gnu.hash in 2006 and .hash has been obsoleted for more than one decade in many Linux distributions. Using --hash-style=sysv might imply unaddressed issues and confuse readers. Just drop the option and rely on the linker default, which is likely "both", or "gnu" when the distribution really wants to eliminate sysv hash overhead. 
Similar to commit 6b7e26547fad ("x86/vdso: Emit a GNU hash"). The commit basically does: -ldflags-y := -shared -soname=linux-vdso.so.1 --hash-style=sysv \ +ldflags-y := -shared -soname=linux-vdso.so.1 \ Which results in only a GNU hash being added to the ELF header. This change has been merged with 6.11. Looking at the referenced x86 commit: commit 6b7e26547fad7ace3dcb27a5babd2317fb9d1e12 Author: Andy Lutomirski Date: Thu Aug 6 14:45:45 2015 -0700 x86/vdso: Emit a GNU hash Some dynamic loaders may be slightly faster if a GNU hash is available. Strangely, this seems to have no effect at all on the vdso size. This is unlikely to have any measurable effect on the time it takes to resolve vdso symbols (since there are so few of them). In some contexts, it can be a win for a different reason: if every DSO has a GNU hash section, then libc can avoid calculating SysV hashes at all. Both musl and glibc appear to have this optimization. It's plausible that this breaks some ancient glibc version. If so, then, depending on what glibc versions break, we could either require COMPAT_VDSO for them or consider reverting. Which is also a really simple change: -VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ +VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=both) \ The big difference here is that for x86 both hash sections are generated. For aarch64 only the newer GNU hash is generated. That is why we only see this error on kernel >= 6.11 and aarch64. Changing from DT_HASH to DT_GNU_HASH seems to work on aarch64. The test suite runs without any errors. Unfortunately I am not aware of all implications of this change and if a successful test suite run means that it still works. 
Looking at the kernel I see following hash styles for the VDSO: aarch64: not specified (only GNU hash style) arm: --hash-style=sysv loongarch: --hash-style=sysv mips: --hash-style=sysv powerpc: --hash-style=both riscv: --hash-style=both s390: --hash-style=both x86: --hash-style=both Only aarch64 on kernels >= 6.11 is a problem right now, because all other platforms provide the old style hashing. Signed-off-by: Adrian Reber Co-developed-by: Dmitry Safonov Co-authored-by: Dmitry Safonov Signed-off-by: Dmitry Safonov --- criu/pie/util-vdso.c | 243 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 197 insertions(+), 46 deletions(-) diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index f1e3239ff..9819335d8 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -48,10 +49,25 @@ static bool __ptr_struct_oob(uintptr_t ptr, size_t struct_size, uintptr_t start, return __ptr_oob(ptr, start, size) || __ptr_struct_end_oob(ptr, struct_size, start, size); } +/* Local strlen implementation */ +static size_t __strlen(const char *str) +{ + const char *ptr; + + if (!str) + return 0; + + ptr = str; + while (*ptr != '\0') + ptr++; + + return ptr - str; +} + /* * Elf hash, see format specification. */ -static unsigned long elf_hash(const unsigned char *name) +static unsigned long elf_sysv_hash(const unsigned char *name) { unsigned long h = 0, g; @@ -65,6 +81,15 @@ static unsigned long elf_hash(const unsigned char *name) return h; } +/* * The GNU hash format. Taken from glibc. 
*/ +static unsigned long elf_gnu_hash(const unsigned char *name) +{ + unsigned long h = 5381; + for (unsigned char c = *name; c != '\0'; c = *++name) + h = h * 33 + c; + return h; +} + #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define BORD ELFDATA2MSB /* 0x02 */ #else @@ -149,11 +174,14 @@ err_oob: * Output parameters are: * @dyn_strtab - address of the symbol table * @dyn_symtab - address of the string table section - * @dyn_hash - address of the symbol hash table + * @dyn_hash - address of the symbol hash table + * @use_gnu_hash - the format of hash DT_HASH or DT_GNU_HASH */ -static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, Dyn_t **dyn_strtab, Dyn_t **dyn_symtab, - Dyn_t **dyn_hash) +static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, + Dyn_t **dyn_strtab, Dyn_t **dyn_symtab, + Dyn_t **dyn_hash, bool *use_gnu_hash) { + Dyn_t *dyn_gnu_hash = NULL, *dyn_sysv_hash = NULL; Dyn_t *dyn_syment = NULL; Dyn_t *dyn_strsz = NULL; uintptr_t addr; @@ -184,16 +212,52 @@ static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, Dyn_t dyn_syment = d; pr_debug("DT_SYMENT: %lx\n", (unsigned long)d->d_un.d_val); } else if (d->d_tag == DT_HASH) { - *dyn_hash = d; + dyn_sysv_hash = d; pr_debug("DT_HASH: %lx\n", (unsigned long)d->d_un.d_ptr); + } else if (d->d_tag == DT_GNU_HASH) { + /* + * This is complicated. + * + * Looking at the Linux kernel source, the following can be seen + * regarding which hashing style the VDSO uses on each arch: + * + * aarch64: not specified (depends on linker, can be + * only GNU hash style) + * arm: --hash-style=sysv + * loongarch: --hash-style=sysv + * mips: --hash-style=sysv + * powerpc: --hash-style=both + * riscv: --hash-style=both + * s390: --hash-style=both + * x86: --hash-style=both + * + * Some architectures are using both hash-styles, that + * is the easiest for CRIU. Some architectures are only + * using the old style (sysv), that is what CRIU supports. 
+ * + * Starting with Linux 6.11, aarch64 unfortunately decided + * to switch from '--hash-style=sysv' to ''. Specifying + * nothing unfortunately may mean GNU hash style only and not + * 'both' (depending on the linker). + */ + dyn_gnu_hash = d; + pr_debug("DT_GNU_HASH: %lx\n", (unsigned long)d->d_un.d_ptr); } } - if (!*dyn_strtab || !*dyn_symtab || !dyn_strsz || !dyn_syment || !*dyn_hash) { + if (!*dyn_strtab || !*dyn_symtab || !dyn_strsz || !dyn_syment || + (!dyn_gnu_hash && !dyn_sysv_hash)) { pr_err("Not all dynamic entries are present\n"); return -EINVAL; } + /* + * Prefer DT_HASH over DT_GNU_HASH as it's been more tested and + * as a result more stable. + */ + *use_gnu_hash = !dyn_sysv_hash; + *dyn_hash = dyn_sysv_hash ?: dyn_gnu_hash; + return 0; err_oob: @@ -208,60 +272,141 @@ typedef unsigned long Hash_t; typedef Word_t Hash_t; #endif -static void parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, struct vdso_symtable *t, - uintptr_t dynsymbol_names, Hash_t *hash, Dyn_t *dyn_symtab) +static bool elf_symbol_match(uintptr_t mem, size_t size, + uintptr_t dynsymbol_names, Sym_t *sym, + const char *symbol, const size_t vdso_symbol_length) +{ + uintptr_t addr = (uintptr_t)sym; + char *name; + + if (__ptr_struct_oob(addr, sizeof(Sym_t), mem, size)) + return false; + + if (ELF_ST_TYPE(sym->st_info) != STT_FUNC && ELF_ST_BIND(sym->st_info) != STB_GLOBAL) + return false; + + addr = dynsymbol_names + sym->st_name; + if (__ptr_struct_oob(addr, vdso_symbol_length, mem, size)) + return false; + name = (void *)addr; + + return !std_strncmp(name, symbol, vdso_symbol_length); +} + + +static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size, + const char *symbol, uint32_t symbol_hash, unsigned int sym_off, + uintptr_t dynsymbol_names, Dyn_t *dyn_symtab, Phdr_t *load, + Hash_t nbucket, Hash_t nchain, Hash_t *bucket, Hash_t *chain, + const size_t vdso_symbol_length, bool use_gnu_hash) +{ + unsigned int j; + uintptr_t addr; + + j = bucket[symbol_hash % 
nbucket]; + if (j == STN_UNDEF) + return 0; + + addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr; + + if (use_gnu_hash) { + uint32_t *h = bucket + nbucket + (j - sym_off); + uint32_t hash_val; + + symbol_hash |= 1; + do { + Sym_t *sym = (void *)addr + sizeof(Sym_t) * j; + + hash_val = *h++; + if ((hash_val | 1) == symbol_hash && + elf_symbol_match(mem, size, dynsymbol_names, sym, + symbol, vdso_symbol_length)) + return sym->st_value; + j++; + } while (!(hash_val & 1)); + } else { + for (; j < nchain && j != STN_UNDEF; j = chain[j]) { + Sym_t *sym = (void *)addr + sizeof(Sym_t) * j; + + if (elf_symbol_match(mem, size, dynsymbol_names, sym, + symbol, vdso_symbol_length)) + return sym->st_value; + } + } + return 0; +} + +static int parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, + struct vdso_symtable *t, uintptr_t dynsymbol_names, + Hash_t *hash, Dyn_t *dyn_symtab, bool use_gnu_hash) { ARCH_VDSO_SYMBOLS_LIST const char *vdso_symbols[VDSO_SYMBOL_MAX] = { ARCH_VDSO_SYMBOLS }; const size_t vdso_symbol_length = sizeof(t->symbols[0].name) - 1; - Hash_t nbucket, nchain; - Hash_t *bucket, *chain; + Hash_t *bucket = NULL; + Hash_t *chain = NULL; + Hash_t nbucket = 0; + Hash_t nchain = 0; - unsigned int i, j, k; - uintptr_t addr; + unsigned int sym_off = 0; + unsigned int i = 0; - nbucket = hash[0]; - nchain = hash[1]; - bucket = &hash[2]; - chain = &hash[nbucket + 2]; + unsigned long (*elf_hash)(const unsigned char *); + + if (use_gnu_hash) { + uint32_t *gnu_hash = (uint32_t *)hash; + uint32_t bloom_sz; + size_t *bloom; + + nbucket = gnu_hash[0]; + sym_off = gnu_hash[1]; + bloom_sz = gnu_hash[2]; + bloom = (size_t *)&gnu_hash[4]; + bucket = (Hash_t *)(&bloom[bloom_sz]); + elf_hash = &elf_gnu_hash; + pr_debug("nbucket %lx sym_off %lx bloom_sz %lx bloom %lx bucket %lx\n", + (unsigned long)nbucket, (unsigned long)sym_off, + (unsigned long)bloom_sz, (unsigned long)bloom, + (unsigned long)bucket); + } else { + nbucket = hash[0]; + nchain = hash[1]; + bucket = 
&hash[2]; + chain = &hash[nbucket + 2]; + elf_hash = &elf_sysv_hash; + pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n", + (unsigned long)nbucket, (unsigned long)nchain, + (unsigned long)bucket, (unsigned long)chain); + } - pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n", (long)nbucket, (long)nchain, (unsigned long)bucket, - (unsigned long)chain); for (i = 0; i < VDSO_SYMBOL_MAX; i++) { const char *symbol = vdso_symbols[i]; - k = elf_hash((const unsigned char *)symbol); + unsigned long addr, symbol_hash; + const size_t symbol_length = __strlen(symbol); - for (j = bucket[k % nbucket]; j < nchain && j != STN_UNDEF; j = chain[j]) { - Sym_t *sym; - char *name; + symbol_hash = elf_hash((const unsigned char *)symbol); + addr = elf_symbol_lookup(mem, size, symbol, symbol_hash, + sym_off, dynsymbol_names, dyn_symtab, load, + nbucket, nchain, bucket, chain, + vdso_symbol_length, use_gnu_hash); + pr_debug("symbol %s at address %lx\n", symbol, addr); + if (!addr) + continue; - addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr; - - addr += sizeof(Sym_t) * j; - if (__ptr_struct_oob(addr, sizeof(Sym_t), mem, size)) - continue; - sym = (void *)addr; - - if (ELF_ST_TYPE(sym->st_info) != STT_FUNC && ELF_ST_BIND(sym->st_info) != STB_GLOBAL) - continue; - - addr = dynsymbol_names + sym->st_name; - if (__ptr_struct_oob(addr, vdso_symbol_length, mem, size)) - continue; - name = (void *)addr; - - if (std_strncmp(name, symbol, vdso_symbol_length)) - continue; - - /* XXX: provide strncpy() implementation for PIE */ - memcpy(t->symbols[i].name, name, vdso_symbol_length); - t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr; - break; + /* XXX: provide strncpy() implementation for PIE */ + if (symbol_length > vdso_symbol_length) { + pr_err("strlen(%s) %zd, only %zd bytes available\n", + symbol, symbol_length, vdso_symbol_length); + return -EINVAL; } + memcpy(t->symbols[i].name, symbol, symbol_length); + t->symbols[i].offset = addr - load->p_vaddr; } + + 
return 0; } int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) @@ -271,6 +416,7 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) Dyn_t *dyn_symtab = NULL; Dyn_t *dyn_hash = NULL; Hash_t *hash = NULL; + bool use_gnu_hash; uintptr_t dynsymbol_names; uintptr_t addr; @@ -296,7 +442,8 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) * needed. Note that we're interested in a small set of tags. */ - ret = parse_elf_dynamic(mem, size, dynamic, &dyn_strtab, &dyn_symtab, &dyn_hash); + ret = parse_elf_dynamic(mem, size, dynamic, &dyn_strtab, &dyn_symtab, + &dyn_hash, &use_gnu_hash); if (ret < 0) return ret; @@ -310,7 +457,11 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) goto err_oob; hash = (void *)addr; - parse_elf_symbols(mem, size, load, t, dynsymbol_names, hash, dyn_symtab); + ret = parse_elf_symbols(mem, size, load, t, dynsymbol_names, hash, dyn_symtab, + use_gnu_hash); + + if (ret <0) + return ret; return 0; From d66bc349957137113cdeab0ca679898f0b379395 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 31 Jan 2025 14:45:03 -0800 Subject: [PATCH 551/775] Makefile: move codespell options to .codespellrc This way, - Makefile is less cluttered; - one can run codespell from the command line. 
Fixes: fd7e97fcf ("lint: exclude tags file from codespell") Signed-off-by: Kir Kolyshkin --- .codespellrc | 2 +- Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.codespellrc b/.codespellrc index 15e6fc7bc..e91a6d2eb 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,3 +1,3 @@ [codespell] -skip = ./.git,./test/pki +skip = ./.git,./test/pki,./tags ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen,sems diff --git a/Makefile b/Makefile index 60b78a074..90908de83 100644 --- a/Makefile +++ b/Makefile @@ -466,7 +466,7 @@ shellcheck: shellcheck -x test/others/action-script/*.sh codespell: - codespell -S tags + codespell lint: ruff shellcheck codespell # Do not append \n to pr_perror, pr_pwarn or fail From 528c94c48b3d25b27a22bd672e700ccf413b5945 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 6 Feb 2025 17:32:48 +0100 Subject: [PATCH 552/775] ci: install gawk for Fedora based tests Currently Fedora rawhide based CI runs fail with: /bin/sh: line 1: awk: command not found Let's install it. Signed-off-by: Adrian Reber --- scripts/ci/prepare-for-fedora-rawhide.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index 42252c93c..f8ad9cf97 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -4,6 +4,7 @@ set -e -x dnf install -y \ diffutils \ findutils \ + gawk \ gcc \ git \ gnutls-devel \ From b7fa7d304c12860405f1aacb012bdc5fc23f7636 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 20 Feb 2025 04:31:12 +0000 Subject: [PATCH 553/775] kerndat: run iptables with -n to not resolve service names Resolving service names can be slow and it isn't needed here. 
Fixes #2032 Signed-off-by: Andrei Vagin --- criu/kerndat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/kerndat.c b/criu/kerndat.c index fa1ed21fa..5939005a4 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -646,7 +646,7 @@ static int kerndat_loginuid(void) static int kerndat_iptables_has_xtlocks(void) { int fd; - char *argv[4] = { "sh", "-c", "iptables -w -L", NULL }; + char *argv[4] = { "sh", "-c", "iptables -n -w -L", NULL }; fd = open("/dev/null", O_RDWR); if (fd < 0) { From 030fa4affda75402bf9c2819c7050de27da1a3b0 Mon Sep 17 00:00:00 2001 From: dschervov Date: Wed, 5 Feb 2025 20:04:37 +0300 Subject: [PATCH 554/775] criu: fix internal representation of cgroups hierarchical structure The strstartswith() function is an incorrect choice for finding the parent directory, so I changed it to the issubpath() function. Signed-off-by: Dmitrii Chervov --- criu/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/cgroup.c b/criu/cgroup.c index fcaed0708..9246be639 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -248,7 +248,7 @@ static int find_dir(const char *path, struct list_head *dirs, struct cgroup_dir return EXACT_MATCH; } - if (strstartswith(path, d->path)) { + if (issubpath(path, d->path)) { int ret = find_dir(path, &d->children, rdir); if (ret == NO_MATCH) { *rdir = d; From da90b33a42a071ee1702a84e076a03d733037632 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=A4=B8=E0=A4=AE=E0=A5=80=E0=A4=B0=20=E0=A4=B8=E0=A4=BF?= =?UTF-8?q?=E0=A4=82=E0=A4=B9=20Sameer=20Singh?= Date: Thu, 23 Jan 2025 04:07:42 +0530 Subject: [PATCH 555/775] coredump: enable coredump generation on aarch64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add relevant elf header constants and notes for the aarch64 platform to enable coredump generation.
Signed-off-by: समीर सिंह Sameer Singh --- coredump/coredump | 6 +- coredump/criu_coredump/coredump.py | 164 +++++++++++++++++++++-------- coredump/criu_coredump/elf.py | 55 +++++++++- test/others/criu-coredump/test.sh | 5 +- 4 files changed, 178 insertions(+), 52 deletions(-) diff --git a/coredump/coredump b/coredump/coredump index 3fbdafe81..f1027773d 100755 --- a/coredump/coredump +++ b/coredump/coredump @@ -6,6 +6,8 @@ import sys import criu_coredump +PLATFORMS = ["aarch64", "x86_64"] + def coredump(opts): generator = criu_coredump.coredump_generator() @@ -37,8 +39,8 @@ def main(): opts = vars(parser.parse_args()) - if platform.machine() != 'x86_64': - print('ERROR: %s only supported on x86_64' % sys.argv[0]) + if platform.machine() not in PLATFORMS: + print("ERROR: %s is only supported on: %s" % (sys.argv[0], ', '.join(PLATFORMS))) sys.exit(1) try: diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 20ec8e5dc..6bfc462f2 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -31,6 +31,7 @@ import io import sys import ctypes +import platform from pycriu import images from . import elf @@ -130,6 +131,11 @@ class coredump_generator: reg_files = None # reg-files; pagemaps = {} # pagemap by pid; + # thread info key based on the current arch + thread_info_key = {"aarch64": "ti_aarch64", "x86_64": "thread_info"} + + machine = platform.machine() # current arch + def _img_open_and_strip(self, name, single=False, pid=None): """ Load criu image and strip it from magic and redundant list. 
@@ -213,7 +219,7 @@ class coredump_generator: ehdr.e_ident[elf.EI_VERSION] = elf.EV_CURRENT ehdr.e_type = elf.ET_CORE - ehdr.e_machine = elf.EM_X86_64 + ehdr.e_machine = self._get_e_machine() ehdr.e_version = elf.EV_CURRENT ehdr.e_phoff = ctypes.sizeof(elf.Elf64_Ehdr()) ehdr.e_ehsize = ctypes.sizeof(elf.Elf64_Ehdr()) @@ -224,6 +230,13 @@ class coredump_generator: return ehdr + def _get_e_machine(self): + """ + Get the e_machine field based on the current architecture. + """ + e_machine_dict = {"aarch64": elf.EM_AARCH64, "x86_64": elf.EM_X86_64} + return e_machine_dict[self.machine] + def _gen_phdrs(self, pid, notes, vmas): """ Generate program headers for process pid. @@ -332,7 +345,7 @@ class coredump_generator: Generate NT_PRSTATUS note for thread tid of process pid. """ core = self.cores[tid] - regs = core["thread_info"]["gpregs"] + regs = self._get_gpregs(core) pstree = self.pstree[pid] prstatus = elf.elf_prstatus() @@ -345,33 +358,7 @@ class coredump_generator: prstatus.pr_pgrp = pstree["pgid"] prstatus.pr_sid = pstree["sid"] - prstatus.pr_reg.r15 = regs["r15"] - prstatus.pr_reg.r14 = regs["r14"] - prstatus.pr_reg.r13 = regs["r13"] - prstatus.pr_reg.r12 = regs["r12"] - prstatus.pr_reg.rbp = regs["bp"] - prstatus.pr_reg.rbx = regs["bx"] - prstatus.pr_reg.r11 = regs["r11"] - prstatus.pr_reg.r10 = regs["r10"] - prstatus.pr_reg.r9 = regs["r9"] - prstatus.pr_reg.r8 = regs["r8"] - prstatus.pr_reg.rax = regs["ax"] - prstatus.pr_reg.rcx = regs["cx"] - prstatus.pr_reg.rdx = regs["dx"] - prstatus.pr_reg.rsi = regs["si"] - prstatus.pr_reg.rdi = regs["di"] - prstatus.pr_reg.orig_rax = regs["orig_ax"] - prstatus.pr_reg.rip = regs["ip"] - prstatus.pr_reg.cs = regs["cs"] - prstatus.pr_reg.eflags = regs["flags"] - prstatus.pr_reg.rsp = regs["sp"] - prstatus.pr_reg.ss = regs["ss"] - prstatus.pr_reg.fs_base = regs["fs_base"] - prstatus.pr_reg.gs_base = regs["gs_base"] - prstatus.pr_reg.ds = regs["ds"] - prstatus.pr_reg.es = regs["es"] - prstatus.pr_reg.fs = regs["fs"] - 
prstatus.pr_reg.gs = regs["gs"] + self._set_pr_regset(prstatus.pr_reg, regs) nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 @@ -385,28 +372,64 @@ class coredump_generator: return note + def _get_gpregs(self, core): + """ + Get the general purpose registers based on the current architecture. + """ + thread_info_key = self.thread_info_key[self.machine] + thread_info = core[thread_info_key] + + return thread_info["gpregs"] + + def _set_pr_regset(self, pr_reg, regs): + """ + Set the pr_reg struct based on the current architecture. + """ + if self.machine == "aarch64": + pr_reg.regs = (ctypes.c_ulonglong * len(regs["regs"]))(*regs["regs"]) + pr_reg.sp = regs["sp"] + pr_reg.pc = regs["pc"] + pr_reg.pstate = regs["pstate"] + elif self.machine == "x86_64": + pr_reg.r15 = regs["r15"] + pr_reg.r14 = regs["r14"] + pr_reg.r13 = regs["r13"] + pr_reg.r12 = regs["r12"] + pr_reg.rbp = regs["bp"] + pr_reg.rbx = regs["bx"] + pr_reg.r11 = regs["r11"] + pr_reg.r10 = regs["r10"] + pr_reg.r9 = regs["r9"] + pr_reg.r8 = regs["r8"] + pr_reg.rax = regs["ax"] + pr_reg.rcx = regs["cx"] + pr_reg.rdx = regs["dx"] + pr_reg.rsi = regs["si"] + pr_reg.rdi = regs["di"] + pr_reg.orig_rax = regs["orig_ax"] + pr_reg.rip = regs["ip"] + pr_reg.cs = regs["cs"] + pr_reg.eflags = regs["flags"] + pr_reg.rsp = regs["sp"] + pr_reg.ss = regs["ss"] + pr_reg.fs_base = regs["fs_base"] + pr_reg.gs_base = regs["gs_base"] + pr_reg.ds = regs["ds"] + pr_reg.es = regs["es"] + pr_reg.fs = regs["fs"] + pr_reg.gs = regs["gs"] + def _gen_fpregset(self, pid, tid): """ Generate NT_FPREGSET note for thread tid of process pid. 
""" core = self.cores[tid] - regs = core["thread_info"]["fpregs"] + regs = self._get_fpregs(core) fpregset = elf.elf_fpregset_t() ctypes.memset(ctypes.addressof(fpregset), 0, ctypes.sizeof(fpregset)) - fpregset.cwd = regs["cwd"] - fpregset.swd = regs["swd"] - fpregset.ftw = regs["twd"] - fpregset.fop = regs["fop"] - fpregset.rip = regs["rip"] - fpregset.rdp = regs["rdp"] - fpregset.mxcsr = regs["mxcsr"] - fpregset.mxcr_mask = regs["mxcsr_mask"] - fpregset.st_space = (ctypes.c_uint * len(regs["st_space"]))( - *regs["st_space"]) - fpregset.xmm_space = (ctypes.c_uint * len(regs["xmm_space"]))( - *regs["xmm_space"]) + self._set_fpregset(fpregset, regs) nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 @@ -420,6 +443,58 @@ class coredump_generator: return note + def _get_fpregs(self, core): + """ + Get the floating point register dictionary based on the current architecture. + """ + fpregs_key_dict = {"aarch64": "fpsimd", "x86_64": "fpregs"} + fpregs_key = fpregs_key_dict[self.machine] + + thread_info_key = self.thread_info_key[self.machine] + + return core[thread_info_key][fpregs_key] + + def _set_fpregset(self, fpregset, regs): + """ + Set the fpregset struct based on the current architecture. + """ + if self.machine == "aarch64": + fpregset.vregs = (ctypes.c_ulonglong * len(regs["vregs"]))(*regs["vregs"]) + fpregset.fpsr = regs["fpsr"] + fpregset.fpcr = regs["fpcr"] + elif self.machine == "x86_64": + fpregset.cwd = regs["cwd"] + fpregset.swd = regs["swd"] + fpregset.ftw = regs["twd"] + fpregset.fop = regs["fop"] + fpregset.rip = regs["rip"] + fpregset.rdp = regs["rdp"] + fpregset.mxcsr = regs["mxcsr"] + fpregset.mxcr_mask = regs["mxcsr_mask"] + fpregset.st_space = (ctypes.c_uint * len(regs["st_space"]))( + *regs["st_space"]) + fpregset.xmm_space = (ctypes.c_uint * len(regs["xmm_space"]))( + *regs["xmm_space"]) + + def _gen_arm_tls(self, tid): + """ + Generate NT_ARM_TLS note for thread tid of process pid. 
+ """ + core = self.cores[tid] + tls = ctypes.c_ulonglong(core["ti_aarch64"]["tls"]) + + nhdr = elf.Elf64_Nhdr() + nhdr.n_namesz = 6 + nhdr.n_descsz = ctypes.sizeof(ctypes.c_ulonglong) + nhdr.n_type = elf.NT_ARM_TLS + + note = elf_note() + note.data = tls + note.owner = b"LINUX" + note.nhdr = nhdr + + return note + def _gen_x86_xstate(self, pid, tid): """ Generate NT_X86_XSTATE note for thread tid of process pid. @@ -593,8 +668,11 @@ class coredump_generator: notes.append(self._gen_prstatus(pid, tid)) notes.append(self._gen_fpregset(pid, tid)) - notes.append(self._gen_x86_xstate(pid, tid)) notes.append(self._gen_siginfo(pid, tid)) + if self.machine == "aarch64": + notes.append(self._gen_arm_tls(tid)) + elif self.machine == "x86_64": + notes.append(self._gen_x86_xstate(pid, tid)) return notes diff --git a/coredump/criu_coredump/elf.py b/coredump/criu_coredump/elf.py index 092b47857..2697fad07 100644 --- a/coredump/criu_coredump/elf.py +++ b/coredump/criu_coredump/elf.py @@ -1,5 +1,8 @@ # Define structures and constants for generating elf file. import ctypes +import platform + +MACHINE = platform.machine() Elf64_Half = ctypes.c_uint16 # typedef uint16_t Elf64_Half; Elf64_Word = ctypes.c_uint32 # typedef uint32_t Elf64_Word; @@ -39,6 +42,7 @@ ET_CORE = 4 # #define ET_CORE 4 /* Core file */ # Legal values for e_machine (architecture). EM_X86_64 = 62 # #define EM_X86_64 62 /* AMD x86-64 architecture */ +EM_AARCH64 = 183 # #define EM_AARCH64 183 /* ARM AARCH64 */ # Legal values for e_version (version). 
EV_CURRENT = 1 # #define EV_CURRENT 1 /* Current version */ @@ -119,6 +123,7 @@ NT_AUXV = 6 # #define NT_AUXV 6 /* Contains copy of auxv array */ NT_SIGINFO = 0x53494749 # #define NT_SIGINFO 0x53494749 /* Contains copy of siginfo_t, size might increase */ NT_FILE = 0x46494c45 # #define NT_FILE 0x46494c45 /* Contains information about mapped files */ NT_X86_XSTATE = 0x202 # #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ +NT_ARM_TLS = 0x401 # #define NT_ARM_TLS 0x401 /* ARM TLS register */ class Elf64_Nhdr(ctypes.Structure): # typedef struct @@ -218,7 +223,7 @@ class timeval(ctypes.Structure): # struct timeval ] -class user_regs_struct(ctypes.Structure): # struct user_regs_struct +class x86_64_user_regs_struct(ctypes.Structure): # struct x86_64_user_regs_struct _fields_ = [ ("r15", ctypes.c_ulonglong), # __extension__ unsigned long long int r15; @@ -277,10 +282,31 @@ class user_regs_struct(ctypes.Structure): # struct user_regs_struct ] +class aarch64_user_regs_struct(ctypes.Structure): # struct aarch64_user_regs_struct + _fields_ = [ + ("regs", + ctypes.c_ulonglong * 31), # unsigned long long int regs[31]; + ("sp", + ctypes.c_ulonglong), # unsigned long long int sp; + ("pc", + ctypes.c_ulonglong), # unsigned long long int pc; + ("pstate", + ctypes.c_ulonglong), # unsigned long long int pstate; + ] + + # elf_greg_t = ctypes.c_ulonglong # ELF_NGREG = ctypes.sizeof(user_regs_struct)/ctypes.sizeof(elf_greg_t) # elf_gregset_t = elf_greg_t*ELF_NGREG -elf_gregset_t = user_regs_struct +user_regs_dict = { + "aarch64": aarch64_user_regs_struct, + "x86_64": x86_64_user_regs_struct, +} + +try: + elf_gregset_t = user_regs_dict[MACHINE] +except KeyError: + raise ValueError("Current architecture %s is not supported." 
% MACHINE) class elf_prstatus(ctypes.Structure): # struct elf_prstatus @@ -420,7 +446,7 @@ class elf_prpsinfo(ctypes.Structure): # struct elf_prpsinfo ] -class user_fpregs_struct(ctypes.Structure): # struct user_fpregs_struct +class x86_64_user_fpregs_struct(ctypes.Structure): # struct x86_64_user_fpregs_struct _fields_ = [ # unsigned short int cwd; ("cwd", ctypes.c_ushort), @@ -447,7 +473,28 @@ class user_fpregs_struct(ctypes.Structure): # struct user_fpregs_struct ] -elf_fpregset_t = user_fpregs_struct +class aarch64_user_fpregs_struct(ctypes.Structure): # struct aarch64_user_fpregs_struct + _fields_ = [ + # unsigned long long int vregs[64]; + ("vregs", ctypes.c_ulonglong * 64), + # unsigned int fpsr; + ("fpsr", ctypes.c_uint), + # unsigned int fpcr; + ("fpcr", ctypes.c_uint), + # unsigned int padding[2]; + ("padding", ctypes.c_uint * 2), + ] + + +user_fpregs_dict = { + "aarch64": aarch64_user_fpregs_struct, + "x86_64": x86_64_user_fpregs_struct, +} + +try: + elf_fpregset_t = user_fpregs_dict[MACHINE] +except KeyError: + raise ValueError("Current architecture %s is not supported." % MACHINE) # siginfo_t related constants. diff --git a/test/others/criu-coredump/test.sh b/test/others/criu-coredump/test.sh index 4399044d7..e0ddce58d 100755 --- a/test/others/criu-coredump/test.sh +++ b/test/others/criu-coredump/test.sh @@ -45,9 +45,8 @@ function run_test { UNAME_M=$(uname -m) -if [ "$UNAME_M" != "x86_64" ]; then - # the criu-coredump script is only x86_64 aware - echo "criu-coredump only support x86_64. skipping." +if [[ "$UNAME_M" != "aarch64" && "$UNAME_M" != "x86_64" ]]; then + echo "criu-coredump only supports aarch64 and x86_64. skipping." 
exit 0 fi From 38b9807cd5ef74fa1cd5359b32861e7d94c1897c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=A4=B8=E0=A4=AE=E0=A5=80=E0=A4=B0=20=E0=A4=B8=E0=A4=BF?= =?UTF-8?q?=E0=A4=82=E0=A4=B9=20Sameer=20Singh?= Date: Mon, 17 Feb 2025 18:06:10 +0530 Subject: [PATCH 556/775] coredump: enable coredump generation on arm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add relevant elf header constants and notes for the arm platform to enable coredump generation. Signed-off-by: समीर सिंह Sameer Singh --- coredump/coredump | 2 +- coredump/criu_coredump/coredump.py | 124 +++++++++++++++---- coredump/criu_coredump/elf.py | 188 ++++++++++++++++++++++++++++- test/others/criu-coredump/test.sh | 4 +- 4 files changed, 288 insertions(+), 30 deletions(-) diff --git a/coredump/coredump b/coredump/coredump index f1027773d..5b3e6f366 100755 --- a/coredump/coredump +++ b/coredump/coredump @@ -6,7 +6,7 @@ import sys import criu_coredump -PLATFORMS = ["aarch64", "x86_64"] +PLATFORMS = ["aarch64", "armv7l", "x86_64"] def coredump(opts): diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 6bfc462f2..c6a758c8a 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -95,8 +95,13 @@ class coredump: buf.write(b"\0" * (8 - len(note.owner))) buf.write(note.data) - offset = ctypes.sizeof(elf.Elf64_Ehdr()) - offset += (len(self.vmas) + 1) * ctypes.sizeof(elf.Elf64_Phdr()) + bits = platform.architecture()[0] # 32 or 64 bits + + ehdr = {"32bit": elf.Elf32_Ehdr, "64bit": elf.Elf64_Ehdr} + phdr = {"32bit": elf.Elf32_Phdr, "64bit": elf.Elf64_Phdr} + + offset = ctypes.sizeof(ehdr[bits]()) + offset += (len(self.vmas) + 1) * ctypes.sizeof(phdr[bits]()) filesz = 0 for note in self.notes: @@ -132,9 +137,18 @@ class coredump_generator: pagemaps = {} # pagemap by pid; # thread info key based on the current arch - thread_info_key = {"aarch64": "ti_aarch64", "x86_64": "thread_info"} + thread_info_key 
= { + "aarch64": "ti_aarch64", + "armv7l": "ti_arm", + "x86_64": "thread_info", + } machine = platform.machine() # current arch + bits = platform.architecture()[0] # 32 or 64 bits + + ehdr = {"32bit": elf.Elf32_Ehdr, "64bit": elf.Elf64_Ehdr} # 32 or 64 bits Ehdr + nhdr = {"32bit": elf.Elf32_Nhdr, "64bit": elf.Elf64_Nhdr} # 32 or 64 bits Nhdr + phdr = {"32bit": elf.Elf32_Phdr, "64bit": elf.Elf64_Phdr} # 32 or 64 bits Phdr def _img_open_and_strip(self, name, single=False, pid=None): """ @@ -207,23 +221,30 @@ class coredump_generator: """ Generate elf header for process pid with program headers phdrs. """ - ehdr = elf.Elf64_Ehdr() + ei_class = {"32bit": elf.ELFCLASS32, "64bit": elf.ELFCLASS64} + + ehdr = self.ehdr[self.bits]() ctypes.memset(ctypes.addressof(ehdr), 0, ctypes.sizeof(ehdr)) ehdr.e_ident[elf.EI_MAG0] = elf.ELFMAG0 ehdr.e_ident[elf.EI_MAG1] = elf.ELFMAG1 ehdr.e_ident[elf.EI_MAG2] = elf.ELFMAG2 ehdr.e_ident[elf.EI_MAG3] = elf.ELFMAG3 - ehdr.e_ident[elf.EI_CLASS] = elf.ELFCLASS64 + ehdr.e_ident[elf.EI_CLASS] = ei_class[self.bits] ehdr.e_ident[elf.EI_DATA] = elf.ELFDATA2LSB ehdr.e_ident[elf.EI_VERSION] = elf.EV_CURRENT + if self.machine == "armv7l": + ehdr.e_ident[elf.EI_OSABI] = elf.ELFOSABI_ARM + else: + ehdr.e_ident[elf.EI_OSABI] = elf.ELFOSABI_NONE + ehdr.e_type = elf.ET_CORE ehdr.e_machine = self._get_e_machine() ehdr.e_version = elf.EV_CURRENT - ehdr.e_phoff = ctypes.sizeof(elf.Elf64_Ehdr()) - ehdr.e_ehsize = ctypes.sizeof(elf.Elf64_Ehdr()) - ehdr.e_phentsize = ctypes.sizeof(elf.Elf64_Phdr()) + ehdr.e_phoff = ctypes.sizeof(self.ehdr[self.bits]()) + ehdr.e_ehsize = ctypes.sizeof(self.ehdr[self.bits]()) + ehdr.e_phentsize = ctypes.sizeof(self.phdr[self.bits]()) # FIXME Case len(phdrs) > PN_XNUM should be handled properly. # See fs/binfmt_elf.c from linux kernel. ehdr.e_phnum = len(phdrs) @@ -234,7 +255,11 @@ class coredump_generator: """ Get the e_machine field based on the current architecture. 
""" - e_machine_dict = {"aarch64": elf.EM_AARCH64, "x86_64": elf.EM_X86_64} + e_machine_dict = { + "aarch64": elf.EM_AARCH64, + "armv7l": elf.EM_ARM, + "x86_64": elf.EM_X86_64, + } return e_machine_dict[self.machine] def _gen_phdrs(self, pid, notes, vmas): @@ -243,15 +268,15 @@ class coredump_generator: """ phdrs = [] - offset = ctypes.sizeof(elf.Elf64_Ehdr()) - offset += (len(vmas) + 1) * ctypes.sizeof(elf.Elf64_Phdr()) + offset = ctypes.sizeof(self.ehdr[self.bits]()) + offset += (len(vmas) + 1) * ctypes.sizeof(self.phdr[self.bits]()) filesz = 0 for note in notes: filesz += ctypes.sizeof(note.nhdr) + ctypes.sizeof(note.data) + 8 # PT_NOTE - phdr = elf.Elf64_Phdr() + phdr = self.phdr[self.bits]() ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr)) phdr.p_type = elf.PT_NOTE phdr.p_offset = offset @@ -271,7 +296,7 @@ class coredump_generator: for vma in vmas: offset += filesz filesz = vma.filesz - phdr = elf.Elf64_Phdr() + phdr = self.phdr[self.bits]() ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr)) phdr.p_type = elf.PT_LOAD phdr.p_align = PAGESIZE @@ -328,7 +353,7 @@ class coredump_generator: prpsinfo.pr_psargs = self._gen_cmdline(pid)[:80] prpsinfo.pr_fname = core["tc"]["comm"].encode() - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_prpsinfo()) nhdr.n_type = elf.NT_PRPSINFO @@ -360,7 +385,7 @@ class coredump_generator: self._set_pr_regset(prstatus.pr_reg, regs) - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_prstatus()) nhdr.n_type = elf.NT_PRSTATUS @@ -390,6 +415,25 @@ class coredump_generator: pr_reg.sp = regs["sp"] pr_reg.pc = regs["pc"] pr_reg.pstate = regs["pstate"] + elif self.machine == "armv7l": + pr_reg.r0 = regs["r0"] + pr_reg.r1 = regs["r1"] + pr_reg.r2 = regs["r2"] + pr_reg.r3 = regs["r3"] + pr_reg.r4 = regs["r4"] + pr_reg.r5 = regs["r5"] + pr_reg.r6 = regs["r6"] + pr_reg.r7 = regs["r7"] + pr_reg.r8 
= regs["r8"] + pr_reg.r9 = regs["r9"] + pr_reg.r10 = regs["r10"] + pr_reg.fp = regs["fp"] + pr_reg.ip = regs["ip"] + pr_reg.sp = regs["sp"] + pr_reg.lr = regs["lr"] + pr_reg.pc = regs["pc"] + pr_reg.cpsr = regs["cpsr"] + pr_reg.orig_r0 = regs["orig_r0"] elif self.machine == "x86_64": pr_reg.r15 = regs["r15"] pr_reg.r14 = regs["r14"] @@ -495,6 +539,34 @@ class coredump_generator: return note + def _gen_arm_vfp(self, tid): + """ + Generate NT_ARM_VFP note for thread tid of process pid. + """ + core = self.cores[tid] + fpstate = core["ti_arm"]["fpstate"] + + data = elf.vfp_hard_struct() + ctypes.memset(ctypes.addressof(data), 0, ctypes.sizeof(data)) + + data.vfp_regs = (ctypes.c_uint64 * len(fpstate["vfp_regs"]))(*fpstate["vfp_regs"]) + data.fpexc = fpstate["fpexc"] + data.fpscr = fpstate["fpscr"] + data.fpinst = fpstate["fpinst"] + data.fpinst2 = fpstate["fpinst2"] + + nhdr = elf.Elf32_Nhdr() + nhdr.n_namesz = 6 + nhdr.n_descsz = ctypes.sizeof(data) + nhdr.n_type = elf.NT_ARM_VFP + + note = elf_note() + note.data = data + note.owner = b"LINUX" + note.nhdr = nhdr + + return note + def _gen_x86_xstate(self, pid, tid): """ Generate NT_X86_XSTATE note for thread tid of process pid. 
@@ -544,7 +616,7 @@ class coredump_generator: # FIXME zeroify everything for now ctypes.memset(ctypes.addressof(siginfo), 0, ctypes.sizeof(siginfo)) - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.siginfo_t()) nhdr.n_type = elf.NT_SIGINFO @@ -563,17 +635,22 @@ class coredump_generator: mm = self.mms[pid] num_auxv = len(mm["mm_saved_auxv"]) // 2 - class elf_auxv(ctypes.Structure): + class elf32_auxv(ctypes.Structure): + _fields_ = [("auxv", elf.Elf32_auxv_t * num_auxv)] + + class elf64_auxv(ctypes.Structure): _fields_ = [("auxv", elf.Elf64_auxv_t * num_auxv)] - auxv = elf_auxv() + elf_auxv = {"32bit": elf32_auxv(), "64bit": elf64_auxv()} + + auxv = elf_auxv[self.bits] for i in range(num_auxv): auxv.auxv[i].a_type = mm["mm_saved_auxv"][i] auxv.auxv[i].a_val = mm["mm_saved_auxv"][i + 1] - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 - nhdr.n_descsz = ctypes.sizeof(elf_auxv()) + nhdr.n_descsz = ctypes.sizeof(elf_auxv[self.bits]) nhdr.n_type = elf.NT_AUXV note = elf_note() @@ -650,7 +727,7 @@ class coredump_generator: setattr(data, "file_ofs" + str(i), info.file_ofs) setattr(data, "name" + str(i), info.name.encode()) - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 # strlen + 1 nhdr.n_descsz = ctypes.sizeof(elf_files()) @@ -667,10 +744,13 @@ class coredump_generator: notes = [] notes.append(self._gen_prstatus(pid, tid)) - notes.append(self._gen_fpregset(pid, tid)) + if self.machine != "armv7l": + notes.append(self._gen_fpregset(pid, tid)) notes.append(self._gen_siginfo(pid, tid)) if self.machine == "aarch64": notes.append(self._gen_arm_tls(tid)) + elif self.machine == "armv7l": + notes.append(self._gen_arm_vfp(tid)) elif self.machine == "x86_64": notes.append(self._gen_x86_xstate(pid, tid)) diff --git a/coredump/criu_coredump/elf.py b/coredump/criu_coredump/elf.py index 2697fad07..2911f491e 100644 --- a/coredump/criu_coredump/elf.py +++ 
b/coredump/criu_coredump/elf.py @@ -4,13 +4,19 @@ import platform MACHINE = platform.machine() +Elf32_Half = ctypes.c_uint16 # typedef uint16_t Elf32_Half; +Elf32_Word = ctypes.c_uint32 # typedef uint32_t Elf32_Word; +Elf32_Addr = ctypes.c_uint32 # typedef uint32_t Elf32_Addr; +Elf32_Off = ctypes.c_uint32 # typedef uint32_t Elf32_Off; +Elf32_Xword = ctypes.c_uint64 # typedef uint64_t Elf32_Xword; + Elf64_Half = ctypes.c_uint16 # typedef uint16_t Elf64_Half; Elf64_Word = ctypes.c_uint32 # typedef uint32_t Elf64_Word; Elf64_Addr = ctypes.c_uint64 # typedef uint64_t Elf64_Addr; Elf64_Off = ctypes.c_uint64 # typedef uint64_t Elf64_Off; Elf64_Xword = ctypes.c_uint64 # typedef uint64_t Elf64_Xword; -# Elf64_Ehdr related constants. +# Elf_Ehdr related constants. # e_ident size. EI_NIDENT = 16 # #define EI_NIDENT (16) @@ -31,22 +37,50 @@ EI_CLASS = 4 # #define EI_CLASS 4 /* File class byte index EI_DATA = 5 # #define EI_DATA 5 /* Data encoding byte index */ +EI_OSABI = 7 # #define EI_OSABI 7 /* OS ABI identification */ + EI_VERSION = 6 # #define EI_VERSION 6 /* File version byte index */ ELFDATA2LSB = 1 # #define ELFDATA2LSB 1 /* 2's complement, little endian */ +ELFCLASS32 = 1 # #define ELFCLASS32 1 /* 32-bit objects */ ELFCLASS64 = 2 # #define ELFCLASS64 2 /* 64-bit objects */ # Legal values for e_type (object file type). ET_CORE = 4 # #define ET_CORE 4 /* Core file */ # Legal values for e_machine (architecture). +EM_ARM = 40 # #define EM_ARM 40 /* ARM */ EM_X86_64 = 62 # #define EM_X86_64 62 /* AMD x86-64 architecture */ EM_AARCH64 = 183 # #define EM_AARCH64 183 /* ARM AARCH64 */ # Legal values for e_version (version). 
EV_CURRENT = 1 # #define EV_CURRENT 1 /* Current version */ +# Legal values for e_osabi +ELFOSABI_NONE = 0 # #define ELFOSABI_NONE 0 /* UNIX System V ABI */ +ELFOSABI_ARM = 97 # #define ELFOSABI_ARM 97 /* ARM */ + + +class Elf32_Ehdr(ctypes.Structure): # typedef struct + _fields_ = [ + ("e_ident", + ctypes.c_ubyte * EI_NIDENT), # unsigned char e_ident[EI_NIDENT]; + ("e_type", Elf32_Half), # Elf32_Half e_type; + ("e_machine", Elf32_Half), # Elf32_Half e_machine; + ("e_version", Elf32_Word), # Elf32_Word e_version; + ("e_entry", Elf32_Addr), # Elf32_Addr e_entry; + ("e_phoff", Elf32_Off), # Elf32_Off e_phoff; + ("e_shoff", Elf32_Off), # Elf32_Off e_shoff; + ("e_flags", Elf32_Word), # Elf32_Word e_flags; + ("e_ehsize", Elf32_Half), # Elf32_Half e_ehsize; + ("e_phentsize", Elf32_Half), # Elf32_Half e_phentsize; + ("e_phnum", Elf32_Half), # Elf32_Half e_phnum; + ("e_shentsize", Elf32_Half), # Elf32_Half e_shentsize; + ("e_shnum", Elf32_Half), # Elf32_Half e_shnum; + ("e_shstrndx", Elf32_Half) # Elf32_Half e_shstrndx; + ] # } Elf32_Ehdr; + class Elf64_Ehdr(ctypes.Structure): # typedef struct _fields_ = [ @@ -68,7 +102,7 @@ class Elf64_Ehdr(ctypes.Structure): # typedef struct ] # } Elf64_Ehdr; -# Elf64_Phdr related constants. +# Elf_Phdr related constants. # Legal values for p_type (segment type). 
PT_LOAD = 1 # #define PT_LOAD 1 /* Loadable program segment */ @@ -80,6 +114,19 @@ PF_W = 1 << 1 # #define PF_W (1 << 1) /* Segment is writable PF_R = 1 << 2 # #define PF_R (1 << 2) /* Segment is readable */ +class Elf32_Phdr(ctypes.Structure): # typedef struct + _fields_ = [ + ("p_type", Elf32_Word), # Elf32_Word p_type; + ("p_offset", Elf32_Off), # Elf32_Off p_offset; + ("p_vaddr", Elf32_Addr), # Elf32_Addr p_vaddr; + ("p_paddr", Elf32_Addr), # Elf32_Addr p_paddr; + ("p_filesz", Elf32_Word), # Elf32_Word p_filesz; + ("p_memsz", Elf32_Word), # Elf32_Word p_memsz; + ("p_flags", Elf32_Word), # Elf32_Word p_flags; + ("p_align", Elf32_Word), # Elf32_Word p_align; + ] # } Elf32_Phdr; + + class Elf64_Phdr(ctypes.Structure): # typedef struct _fields_ = [ ("p_type", Elf64_Word), # Elf64_Word p_type; @@ -93,7 +140,25 @@ class Elf64_Phdr(ctypes.Structure): # typedef struct ] # } Elf64_Phdr; -# Elf64_auxv_t related constants. +# Elf_auxv_t related constants. + + +class _Elf32_auxv_t_U(ctypes.Union): + _fields_ = [("a_val", ctypes.c_uint32)] + + +class Elf32_auxv_t(ctypes.Structure): # typedef struct + _fields_ = [ + ("a_type", + ctypes.c_uint32), # uint32_t a_type; /* Entry type */ + ("a_un", _Elf32_auxv_t_U) # union + + # uint32_t a_val; /* Integer value */ + # /* We use to have pointer elements added here. We cannot do that, + # though, since it does not work when using 32-bit definitions + # on 64-bit platforms and vice versa. */ + # } a_un; + ] # } Elf32_auxv_t; class _Elf64_auxv_t_U(ctypes.Union): @@ -114,7 +179,7 @@ class Elf64_auxv_t(ctypes.Structure): # typedef struct ] # } Elf64_auxv_t; -# Elf64_Nhdr related constants. +# Elf_Nhdr related constants. 
NT_PRSTATUS = 1 # #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ NT_FPREGSET = 2 # #define NT_FPREGSET 2 /* Contains copy of fpregset struct */ @@ -123,9 +188,24 @@ NT_AUXV = 6 # #define NT_AUXV 6 /* Contains copy of auxv array */ NT_SIGINFO = 0x53494749 # #define NT_SIGINFO 0x53494749 /* Contains copy of siginfo_t, size might increase */ NT_FILE = 0x46494c45 # #define NT_FILE 0x46494c45 /* Contains information about mapped files */ NT_X86_XSTATE = 0x202 # #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ +NT_ARM_VFP = 0x400 # #define NT_ARM_VFP 0x400 /* ARM VFP/NEON registers */ NT_ARM_TLS = 0x401 # #define NT_ARM_TLS 0x401 /* ARM TLS register */ +class Elf32_Nhdr(ctypes.Structure): # typedef struct + _fields_ = [ + ( + "n_namesz", Elf32_Word + ), # Elf32_Word n_namesz; /* Length of the note's name. */ + ( + "n_descsz", Elf32_Word + ), # Elf32_Word n_descsz; /* Length of the note's descriptor. */ + ( + "n_type", Elf32_Word + ), # Elf32_Word n_type; /* Type of the note. */ + ] # } Elf32_Nhdr; + + class Elf64_Nhdr(ctypes.Structure): # typedef struct _fields_ = [ ( @@ -139,7 +219,52 @@ class Elf64_Nhdr(ctypes.Structure): # typedef struct ] # } Elf64_Nhdr; -# Elf64_Shdr related constants. +# Elf_Shdr related constants. 
+ + +class Elf32_Shdr(ctypes.Structure): + _fields_ = [ + ( + # Section name (string tbl index) + "sh_name", Elf32_Word + ), + ( + # Section type + "sh_type", Elf32_Word + ), + ( + # Section flags + "sh_flags", Elf32_Word + ), + ( + # Section virtual addr at execution + "sh_addr", Elf32_Addr + ), + ( + # Section file offset + "sh_offset", Elf32_Off + ), + ( + # Section size in bytes + "sh_size", Elf32_Word + ), + ( + # Link to another section + "sh_link", Elf32_Word + ), + ( + # Additional section information + "sh_info", Elf32_Word + ), + ( + # Section alignment + "sh_addralign", Elf32_Word + ), + ( + # Entry size if section holds table + "sh_entsize", Elf32_Word + ) + ] class Elf64_Shdr(ctypes.Structure): @@ -295,11 +420,53 @@ class aarch64_user_regs_struct(ctypes.Structure): # struct aarch64_user_regs_st ] +class arm_user_regs_struct(ctypes.Structure): # struct arm_user_regs_struct + _fields_ = [ + ("r0", + ctypes.c_ulong), # unsigned ulong int r0; + ("r1", + ctypes.c_ulong), # unsigned ulong int r1; + ("r2", + ctypes.c_ulong), # unsigned ulong int r2; + ("r3", + ctypes.c_ulong), # unsigned ulong int r3; + ("r4", + ctypes.c_ulong), # unsigned ulong int r4; + ("r5", + ctypes.c_ulong), # unsigned ulong int r5; + ("r6", + ctypes.c_ulong), # unsigned ulong int r6; + ("r7", + ctypes.c_ulong), # unsigned ulong int r7; + ("r8", + ctypes.c_ulong), # unsigned ulong int r8; + ("r9", + ctypes.c_ulong), # unsigned ulong int r9; + ("r10", + ctypes.c_ulong), # unsigned ulong int r10; + ("fp", + ctypes.c_ulong), # unsigned ulong int fp; + ("ip", + ctypes.c_ulong), # unsigned ulong int ip; + ("sp", + ctypes.c_ulong), # unsigned ulong int sp; + ("lr", + ctypes.c_ulong), # unsigned ulong int lr; + ("pc", + ctypes.c_ulong), # unsigned ulong int pc; + ("cpsr", + ctypes.c_ulong), # unsigned ulong int cpsr; + ("orig_r0", + ctypes.c_ulong), # unsigned ulong int orig_r0; + ] + + # elf_greg_t = ctypes.c_ulonglong # ELF_NGREG = ctypes.sizeof(user_regs_struct)/ctypes.sizeof(elf_greg_t) # 
elf_gregset_t = elf_greg_t*ELF_NGREG user_regs_dict = { "aarch64": aarch64_user_regs_struct, + "armv7l": arm_user_regs_struct, "x86_64": x86_64_user_regs_struct, } @@ -488,6 +655,7 @@ class aarch64_user_fpregs_struct(ctypes.Structure): # struct aarch64_user_fpreg user_fpregs_dict = { "aarch64": aarch64_user_fpregs_struct, + "armv7l": None, "x86_64": x86_64_user_fpregs_struct, } @@ -889,3 +1057,13 @@ class elf_xsave_struct(ctypes.Structure): # struct xsave_struct { # struct ymmh_struct ymmh; ("ymmh", ymmh_struct) ] # } __aligned(FP_MIN_ALIGN_BYTES) __packed; + + +class vfp_hard_struct(ctypes.Structure): # struct vfp_hard_struct { + _fields_ = [ + ("vfp_regs", ctypes.c_ulonglong * 32), # __u64 fpregs[32]; + ("fpexc", ctypes.c_ulong), # __u32 fpexc; + ("fpscr", ctypes.c_ulong), # __u32 fpscr; + ("fpinst", ctypes.c_ulong), # __u32 fpinst; + ("fpinst2", ctypes.c_ulong), # __u32 fpinst2; + ] # }; diff --git a/test/others/criu-coredump/test.sh b/test/others/criu-coredump/test.sh index e0ddce58d..2be82e64c 100755 --- a/test/others/criu-coredump/test.sh +++ b/test/others/criu-coredump/test.sh @@ -45,8 +45,8 @@ function run_test { UNAME_M=$(uname -m) -if [[ "$UNAME_M" != "aarch64" && "$UNAME_M" != "x86_64" ]]; then - echo "criu-coredump only supports aarch64 and x86_64. skipping." +if [[ "$UNAME_M" != "aarch64" && "$UNAME_M" != "armv7l" &&"$UNAME_M" != "x86_64" ]]; then + echo "criu-coredump only supports aarch64 armv7l, and x86_64. skipping." exit 0 fi From c298b51a6989a26611fe3cbccd1f3da64a23eb50 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 3 Mar 2025 15:03:51 +0000 Subject: [PATCH 557/775] scripts/uninstall_module: import signal module With Python 3.13, the `subprocess` module now uses the `posix_spawn()` function [1], which requires the `signal` module to be imported. 
Fixes: #2607 [1] https://docs.python.org/3/whatsnew/3.13.html#subprocess Signed-off-by: Radostin Stoyanov --- scripts/uninstall_module.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/uninstall_module.py b/scripts/uninstall_module.py index 8a9b70892..2da63c800 100755 --- a/scripts/uninstall_module.py +++ b/scripts/uninstall_module.py @@ -10,6 +10,16 @@ import site import subprocess import sys +# With Python 3.13 the subprocess module now uses the `posix_spawn()` +# function which requires loading the `signal` module: +# https://docs.python.org/3/whatsnew/3.13.html#subprocess +# +# We need to load this module here, before PYTHONPATH and sys.path +# have been modified to use the path specified with `--prefix`. +# +# flake8: noqa: F401 +import signal + import importlib_metadata From d35808f5eec67810df70e34b99ec9064c40cbc13 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 6 Feb 2025 16:38:33 +0100 Subject: [PATCH 558/775] ci: update to latest actions for codeql CI job Signed-off-by: Adrian Reber --- .github/workflows/codeql.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 518d9b8ae..88e21d3d1 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -29,22 +29,22 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Packages (cpp) if: ${{ matrix.language == 'cpp' }} run: | sudo scripts/ci/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev python3-yaml libnl-route-3-dev gnutls-dev - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} queries: +security-and-quality - name: 
Autobuild - uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 + uses: github/codeql-action/analyze@v3 with: category: "/language:${{ matrix.language }}" From ed6374b48c5923bca53d760ac6f04a2817236407 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 6 Feb 2025 11:07:17 +0100 Subject: [PATCH 559/775] lsm: use the user provided lsm label Currently CRIU has the possibility to specify a LSM label during restore. Unfortunately the information is completely ignored in the case of SELinux. This change selects the lsm label from the user if it is provided and else the label from the checkpoint image is used. Signed-off-by: Adrian Reber --- criu/lsm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/lsm.c b/criu/lsm.c index d1b73cc79..70b66d42e 100644 --- a/criu/lsm.c +++ b/criu/lsm.c @@ -370,7 +370,7 @@ int render_lsm_profile(char *profile, char **val) case LSMTYPE__APPARMOR: return render_aa_profile(val, profile); case LSMTYPE__SELINUX: - if (asprintf(val, "%s", profile) < 0) { + if (asprintf(val, "%s", opts.lsm_supplied ? opts.lsm_profile : profile) < 0) { *val = NULL; return -1; } From d8555015759724c1e90462105ec21c77f89127ec Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 7 Feb 2025 09:24:19 +0100 Subject: [PATCH 560/775] vdso: Fixes in DT_GNU_HASH handling * Hash buckets is an array of 32-bit words. While DT_HASH is 32-bit on most platforms except s390 (where it's 64-bit). * The bloom filter word size differs between 32-bit and 64-bit ELF files. This commit adjusts the code to handle both cases. 
Signed-off-by: Andrei Vagin --- criu/pie/util-vdso.c | 57 +++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index 9819335d8..af3c08985 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -121,7 +121,8 @@ static int has_elf_identity(Ehdr_t *ehdr) return true; } -static int parse_elf_phdr(uintptr_t mem, size_t size, Phdr_t **dynamic, Phdr_t **load) +static int parse_elf_phdr(uintptr_t mem, size_t size, + Phdr_t **dynamic, Phdr_t **load, bool *is_32bit) { Ehdr_t *ehdr = (void *)mem; uintptr_t addr; @@ -136,6 +137,8 @@ static int parse_elf_phdr(uintptr_t mem, size_t size, Phdr_t **dynamic, Phdr_t * if (!has_elf_identity(ehdr)) return -EINVAL; + *is_32bit = ehdr->e_ident[EI_CLASS] != ELFCLASS64; + addr = mem + ehdr->e_phoff; if (__ptr_oob(addr, mem, size)) goto err_oob; @@ -272,6 +275,8 @@ typedef unsigned long Hash_t; typedef Word_t Hash_t; #endif +typedef uint32_t Hash32_t; + static bool elf_symbol_match(uintptr_t mem, size_t size, uintptr_t dynsymbol_names, Sym_t *sym, const char *symbol, const size_t vdso_symbol_length) @@ -297,21 +302,22 @@ static bool elf_symbol_match(uintptr_t mem, size_t size, static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size, const char *symbol, uint32_t symbol_hash, unsigned int sym_off, uintptr_t dynsymbol_names, Dyn_t *dyn_symtab, Phdr_t *load, - Hash_t nbucket, Hash_t nchain, Hash_t *bucket, Hash_t *chain, + uint64_t nbucket, uint64_t nchain, void *_bucket, Hash_t *chain, const size_t vdso_symbol_length, bool use_gnu_hash) { unsigned int j; uintptr_t addr; - j = bucket[symbol_hash % nbucket]; - if (j == STN_UNDEF) - return 0; - addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr; if (use_gnu_hash) { - uint32_t *h = bucket + nbucket + (j - sym_off); - uint32_t hash_val; + Hash32_t *h, hash_val, *bucket = _bucket; + + j = bucket[symbol_hash % nbucket]; + if (j == STN_UNDEF) + return 0; + + h = bucket + nbucket + (j 
- sym_off); symbol_hash |= 1; do { @@ -325,6 +331,12 @@ static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size, j++; } while (!(hash_val & 1)); } else { + Hash_t *bucket = _bucket; + + j = bucket[symbol_hash % nbucket]; + if (j == STN_UNDEF) + return 0; + for (; j < nchain && j != STN_UNDEF; j = chain[j]) { Sym_t *sym = (void *)addr + sizeof(Sym_t) * j; @@ -338,17 +350,17 @@ static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size, static int parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, struct vdso_symtable *t, uintptr_t dynsymbol_names, - Hash_t *hash, Dyn_t *dyn_symtab, bool use_gnu_hash) + Hash_t *hash, Dyn_t *dyn_symtab, bool use_gnu_hash, + bool is_32bit) { ARCH_VDSO_SYMBOLS_LIST const char *vdso_symbols[VDSO_SYMBOL_MAX] = { ARCH_VDSO_SYMBOLS }; const size_t vdso_symbol_length = sizeof(t->symbols[0].name) - 1; - Hash_t *bucket = NULL; + void *bucket = NULL; Hash_t *chain = NULL; - Hash_t nbucket = 0; - Hash_t nchain = 0; + uint64_t nbucket, nchain = 0; unsigned int sym_off = 0; unsigned int i = 0; @@ -358,17 +370,23 @@ static int parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, if (use_gnu_hash) { uint32_t *gnu_hash = (uint32_t *)hash; uint32_t bloom_sz; - size_t *bloom; nbucket = gnu_hash[0]; sym_off = gnu_hash[1]; bloom_sz = gnu_hash[2]; - bloom = (size_t *)&gnu_hash[4]; - bucket = (Hash_t *)(&bloom[bloom_sz]); + if (is_32bit) { + uint32_t *bloom; + bloom = (uint32_t *)&gnu_hash[4]; + bucket = (Hash_t *)(&bloom[bloom_sz]); + } else { + uint64_t *bloom; + bloom = (uint64_t *)&gnu_hash[4]; + bucket = (Hash_t *)(&bloom[bloom_sz]); + } elf_hash = &elf_gnu_hash; - pr_debug("nbucket %lx sym_off %lx bloom_sz %lx bloom %lx bucket %lx\n", + pr_debug("nbucket %lx sym_off %lx bloom_sz %lx bucket %lx\n", (unsigned long)nbucket, (unsigned long)sym_off, - (unsigned long)bloom_sz, (unsigned long)bloom, + (unsigned long)bloom_sz, (unsigned long)bucket); } else { nbucket = hash[0]; @@ -417,6 +435,7 @@ int 
vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) Dyn_t *dyn_hash = NULL; Hash_t *hash = NULL; bool use_gnu_hash; + bool is_32bit; uintptr_t dynsymbol_names; uintptr_t addr; @@ -427,7 +446,7 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) /* * We need PT_LOAD and PT_DYNAMIC here. Each once. */ - ret = parse_elf_phdr(mem, size, &dynamic, &load); + ret = parse_elf_phdr(mem, size, &dynamic, &load, &is_32bit); if (ret < 0) return ret; if (!load || !dynamic) { @@ -458,7 +477,7 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) hash = (void *)addr; ret = parse_elf_symbols(mem, size, load, t, dynsymbol_names, hash, dyn_symtab, - use_gnu_hash); + use_gnu_hash, is_32bit); if (ret <0) return ret; From 7748b3fe7326f6f987fc9fd0d3fa267800420264 Mon Sep 17 00:00:00 2001 From: Han-Wen Nienhuys Date: Mon, 10 Mar 2025 14:43:24 +0100 Subject: [PATCH 561/775] pstree: print clone flags in error message Signed-off-by: Han-Wen Nienhuys --- criu/pstree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/pstree.c b/criu/pstree.c index 41df846ed..660f1b9d9 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -962,7 +962,7 @@ static int prepare_pstree_kobj_ids(void) * this namespace is either inherited from the * criu or is created for the init task (only) */ - pr_err("Can't restore sub-task in NS\n"); + pr_err("Can't restore sub-task in NS (cflags %lx)\n", cflags); return -1; } } From c5d46d86a8b07b063bca7e2de762f3c3b1f7b364 Mon Sep 17 00:00:00 2001 From: Han-Wen Nienhuys Date: Thu, 13 Mar 2025 08:46:16 +0100 Subject: [PATCH 562/775] restorer: Add a lock around cgroupd communication. Threads are put into cgroups through the cgroupd thread, which communicates with other threads using a socketpair. Previously, each thread received a dup'd copy of the socket, and did the following sendmsg(socket_dup_fd, my_cgroup_set); // wait for ack. 
while (1) { recvmsg(socket_dup_fd, &h, MSG_PEEK); if (h.pid != my_pid) continue; recvmsg(socket_dup_fd, &h, 0); } close(socket_dup_fd); When restoring many threads, many threads would be spinning in the above loop waiting for their PID to appear. In my test-case, restoring a process with a 11.5G heap and 491 threads could take anywhere between 10 seconds and 60 seconds to complete. To avoid the spinning, we drop the loop and MSG_PEEK, and add a lock around the above code. This does not decrease parallelism, as the cgroupd daemon uses a single thread anyway. With the lock in place, the same restore consistently takes around 10 seconds on my machine (Thinkpad P14s, AMD Ryzen 8840HS). There is a similar "daemon" thread for user namespaces. That already is protected with a similar userns_sync_lock in __userns_call(). Fixes #2614 Signed-off-by: Han-Wen Nienhuys --- criu/cr-restore.c | 1 + criu/include/rst_info.h | 1 + criu/pie/restorer.c | 61 ++++++++++++++++++++--------------------- 3 files changed, 31 insertions(+), 32 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index ddca6b8ec..e906da0ce 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2329,6 +2329,7 @@ int prepare_task_entries(void) task_entries->nr_helpers = 0; futex_set(&task_entries->start, CR_STATE_FAIL); mutex_init(&task_entries->userns_sync_lock); + mutex_init(&task_entries->cgroupd_sync_lock); mutex_init(&task_entries->last_pid_mutex); return 0; diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index df9f9de01..4c9335a73 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -14,6 +14,7 @@ struct task_entries { futex_t start; atomic_t cr_err; mutex_t userns_sync_lock; + mutex_t cgroupd_sync_lock; mutex_t last_pid_mutex; }; diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 6d048c3f1..348ce6659 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -704,9 +704,8 @@ static int send_cg_set(int sk, int cg_set) } /* - * As this socket is 
shared among threads, recvmsg(MSG_PEEK) - * from the socket until getting its own thread id as an - * acknowledge of successful threaded cgroup fixup + * As the cgroupd socket is shared among threads and processes, this + * should be called with task_entries->cgroupd_sync_lock held. */ static int recv_cg_set_restore_ack(int sk) { @@ -719,33 +718,22 @@ static int recv_cg_set_restore_ack(int sk) h.msg_control = cmsg; h.msg_controllen = sizeof(cmsg); - while (1) { - ret = sys_recvmsg(sk, &h, MSG_PEEK); - if (ret < 0) { - pr_err("Unable to peek from cgroupd %d\n", ret); - return -1; - } + ret = sys_recvmsg(sk, &h, 0); + if (ret < 0) { + pr_err("Unable to receive from cgroupd %d\n", ret); + return -1; + } - if (h.msg_controllen != sizeof(cmsg)) { - pr_err("The message from cgroupd is truncated\n"); - return -1; - } + if (h.msg_controllen != sizeof(cmsg)) { + pr_err("The message from cgroupd is truncated\n"); + return -1; + } - ch = CMSG_FIRSTHDR(&h); - cred = (struct ucred *)CMSG_DATA(ch); - if (cred->pid != sys_gettid()) - continue; - - /* - * Actual remove message from recv queue of socket - */ - ret = sys_recvmsg(sk, &h, 0); - if (ret < 0) { - pr_err("Unable to receive from cgroupd %d\n", ret); - return -1; - } - - break; + ch = CMSG_FIRSTHDR(&h); + cred = (struct ucred *)CMSG_DATA(ch); + if (cred->pid != sys_gettid()) { + pr_err("cred pid %d != gettid\n", cred->pid); + return -1; } return 0; } @@ -782,12 +770,21 @@ __visible long __export_restore_thread(struct thread_restore_args *args) rt_sigframe = (void *)&args->mz->rt_sigframe; if (args->cg_set != -1) { + int err = 0; + + mutex_lock(&task_entries_local->cgroupd_sync_lock); + pr_info("Restore cg_set in thread cg_set: %d\n", args->cg_set); - if (send_cg_set(args->cgroupd_sk, args->cg_set)) - goto core_restore_end; - if (recv_cg_set_restore_ack(args->cgroupd_sk)) - goto core_restore_end; + + err = send_cg_set(args->cgroupd_sk, args->cg_set); + if (!err) + err = recv_cg_set_restore_ack(args->cgroupd_sk); + + 
mutex_unlock(&task_entries_local->cgroupd_sync_lock); sys_close(args->cgroupd_sk); + + if (err) + goto core_restore_end; } if (restore_thread_common(args)) From 8ae5db37bb01f405ece0a08160a35cd92034e26a Mon Sep 17 00:00:00 2001 From: AV Date: Mon, 3 Mar 2025 19:14:54 +0000 Subject: [PATCH 563/775] arm64: C/R PAC keys PAC stands for Pointer Authentication Code. Each process has 5 PAC keys and a mask of enabled keys. All these properties have to be C/R-ed. As they are per-process properties, we can save/restore them just for one thread. Signed-off-by: Andrei Vagin --- compel/arch/aarch64/src/lib/infect.c | 2 +- compel/arch/arm/src/lib/infect.c | 2 +- compel/arch/loongarch64/src/lib/infect.c | 2 +- compel/arch/mips/src/lib/infect.c | 2 +- compel/arch/ppc64/src/lib/infect.c | 2 +- compel/arch/riscv64/src/lib/infect.c | 2 +- compel/arch/s390/src/lib/infect.c | 2 +- compel/arch/x86/src/lib/infect.c | 2 +- compel/include/uapi/infect.h | 2 +- compel/src/lib/infect.c | 2 +- criu/arch/aarch64/crtools.c | 167 ++++++++++++++++++++++- criu/arch/aarch64/include/asm/dump.h | 2 +- criu/arch/aarch64/include/asm/restore.h | 10 ++ criu/arch/arm/crtools.c | 2 +- criu/arch/arm/include/asm/dump.h | 2 +- criu/arch/loongarch64/crtools.c | 2 +- criu/arch/loongarch64/include/asm/dump.h | 2 +- criu/arch/mips/crtools.c | 2 +- criu/arch/mips/include/asm/dump.h | 2 +- criu/arch/ppc64/crtools.c | 2 +- criu/arch/ppc64/include/asm/dump.h | 2 +- criu/arch/riscv64/crtools.c | 2 +- criu/arch/riscv64/include/asm/dump.h | 2 +- criu/arch/s390/crtools.c | 2 +- criu/arch/s390/include/asm/dump.h | 2 +- criu/arch/x86/crtools.c | 2 +- criu/arch/x86/include/asm/compat.h | 2 + criu/arch/x86/include/asm/dump.h | 2 +- criu/cr-restore.c | 10 ++ criu/include/rst_info.h | 8 ++ images/core-aarch64.proto | 23 ++++ 31 files changed, 244 insertions(+), 26 deletions(-) diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 812ba34a3..ec1d0d59e 100644 ---
a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -81,7 +81,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct goto err; } - ret = save(arg, regs, fpsimd); + ret = save(pid, arg, regs, fpsimd); err: return ret; } diff --git a/compel/arch/arm/src/lib/infect.c b/compel/arch/arm/src/lib/infect.c index 8b810a88f..a9fb639e2 100644 --- a/compel/arch/arm/src/lib/infect.c +++ b/compel/arch/arm/src/lib/infect.c @@ -94,7 +94,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct } } - ret = save(arg, regs, vfp); + ret = save(pid, arg, regs, vfp); err: return ret; } diff --git a/compel/arch/loongarch64/src/lib/infect.c b/compel/arch/loongarch64/src/lib/infect.c index 8e3c19aff..190c39227 100644 --- a/compel/arch/loongarch64/src/lib/infect.c +++ b/compel/arch/loongarch64/src/lib/infect.c @@ -91,7 +91,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct goto err; } - ret = save(arg, regs, fpregs); + ret = save(pid, arg, regs, fpregs); err: return 0; } diff --git a/compel/arch/mips/src/lib/infect.c b/compel/arch/mips/src/lib/infect.c index 0e98aaee3..a1d4865cc 100644 --- a/compel/arch/mips/src/lib/infect.c +++ b/compel/arch/mips/src/lib/infect.c @@ -149,7 +149,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct regs->regs[0] = 0; } - ret = save(arg, regs, xs); + ret = save(pid, arg, regs, xs); return ret; } diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c index 84c2b1d7c..54abd48a4 100644 --- a/compel/arch/ppc64/src/lib/infect.c +++ b/compel/arch/ppc64/src/lib/infect.c @@ -400,7 +400,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct if (ret) return ret; - return save(arg, regs, fpregs); + return save(pid, arg, regs, fpregs); } int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) diff --git 
a/compel/arch/riscv64/src/lib/infect.c b/compel/arch/riscv64/src/lib/infect.c index 861fe3b2f..3f3a4b7ec 100644 --- a/compel/arch/riscv64/src/lib/infect.c +++ b/compel/arch/riscv64/src/lib/infect.c @@ -92,7 +92,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct return -1; } - ret = save(arg, regs, fpsimd); + ret = save(pid, arg, regs, fpsimd); return ret; } diff --git a/compel/arch/s390/src/lib/infect.c b/compel/arch/s390/src/lib/infect.c index 85dfc3a4d..a77b38917 100644 --- a/compel/arch/s390/src/lib/infect.c +++ b/compel/arch/s390/src/lib/infect.c @@ -348,7 +348,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct } } /* Call save_task_regs() */ - return save(arg, regs, fpregs); + return save(pid, arg, regs, fpregs); } int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index a07b1c9f3..644c483b4 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -453,7 +453,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct goto err; out: - ret = save(arg, regs, xs); + ret = save(pid, arg, regs, xs); err: return ret; } diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 7e6134f4b..ed97d64dd 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -97,7 +97,7 @@ extern k_rtsigset_t *compel_thread_sigmask(struct parasite_thread_ctl *tctl); struct rt_sigframe; typedef int (*open_proc_fn)(int pid, int mode, const char *fmt, ...) 
__attribute__((__format__(__printf__, 3, 4))); -typedef int (*save_regs_t)(void *, user_regs_struct_t *, user_fpregs_struct_t *); +typedef int (*save_regs_t)(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); typedef int (*make_sigframe_t)(void *, struct rt_sigframe *, struct rt_sigframe *, k_rtsigset_t *); struct infect_ctx { diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index caf54e03f..a9bbd6400 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -1300,7 +1300,7 @@ struct plain_regs_struct { user_fpregs_struct_t fpregs; }; -static int save_regs_plain(void *to, user_regs_struct_t *r, user_fpregs_struct_t *f) +static int save_regs_plain(pid_t pid, void *to, user_regs_struct_t *r, user_fpregs_struct_t *f) { struct plain_regs_struct *prs = to; diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index e87b8629a..6cde03ee3 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -1,5 +1,6 @@ #include #include +#include #include @@ -20,10 +21,86 @@ #include "cpu.h" #include "restorer.h" #include "compel/infect.h" +#include "pstree.h" + +extern unsigned long getauxval(unsigned long type); #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) +static int save_pac_keys(int pid, CoreEntry *core) +{ + struct user_pac_address_keys paca; + struct user_pac_generic_keys pacg; + PacKeys *pac_entry; + long pac_enabled_key; + struct iovec iov; + int ret; + + unsigned long hwcaps = getauxval(AT_HWCAP); + + pac_entry = xmalloc(sizeof(PacKeys)); + if (!pac_entry) + return -1; + core->ti_aarch64->pac_keys = pac_entry; + pac_keys__init(pac_entry); + + if (hwcaps & HWCAP_PACA) { + PacAddressKeys *pac_address_keys; + + pr_debug("%d: Dumping address authentication keys\n", pid); + iov.iov_base = &paca; + iov.iov_len = sizeof(paca); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { 
+ pr_perror("Failed to get address authentication key for %d", pid); + return -1; + } + pac_address_keys = xmalloc(sizeof(PacAddressKeys)); + if (!pac_address_keys) + return -1; + pac_address_keys__init(pac_address_keys); + pac_entry->pac_address_keys = pac_address_keys; + pac_address_keys->apiakey_lo = paca.apiakey; + pac_address_keys->apiakey_hi = paca.apiakey >> 64; + pac_address_keys->apibkey_lo = paca.apibkey; + pac_address_keys->apibkey_hi = paca.apibkey >> 64; + pac_address_keys->apdakey_lo = paca.apdakey; + pac_address_keys->apdakey_hi = paca.apdakey >> 64; + pac_address_keys->apdbkey_lo = paca.apdbkey; + pac_address_keys->apdbkey_hi = paca.apdbkey >> 64; + + iov.iov_base = &pac_enabled_key; + iov.iov_len = sizeof(pac_enabled_key); + ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov); + if (ret) { + pr_perror("Failed to get authentication key mask for %d", pid); + return -1; + } + + pac_address_keys->pac_enabled_key = pac_enabled_key; + + } + if (hwcaps & HWCAP_PACG) { + PacGenericKeys *pac_generic_keys; + + pr_debug("%d: Dumping generic authentication keys\n", pid); + iov.iov_base = &pacg; + iov.iov_len = sizeof(pacg); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { + pr_perror("Failed to get a generic authantication key for %d", pid); + return -1; + } + pac_generic_keys = xmalloc(sizeof(PacGenericKeys)); + if (!pac_generic_keys) + return -1; + pac_generic_keys__init(pac_generic_keys); + pac_entry->pac_generic_keys = pac_generic_keys; + pac_generic_keys->apgakey_lo = pacg.apgakey; + pac_generic_keys->apgakey_hi = pacg.apgakey >> 64; + } + return 0; +} + +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) { int i; CoreEntry *core = x; @@ -43,6 +120,8 @@ int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsi assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpsr); assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpcr); + if (save_pac_keys(pid, core)) + return 
-1; return 0; } @@ -92,6 +171,12 @@ void arch_free_thread_info(CoreEntry *core) xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd->vregs); xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd); } + if (CORE_THREAD_ARCH_INFO(core)->pac_keys) { + PacKeys *pac_entry = CORE_THREAD_ARCH_INFO(core)->pac_keys; + xfree(pac_entry->pac_address_keys); + xfree(pac_entry->pac_generic_keys); + xfree(pac_entry); + } xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs); xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); xfree(CORE_THREAD_ARCH_INFO(core)); @@ -135,3 +220,83 @@ int restore_gpregs(struct rt_sigframe *f, UserRegsEntry *r) return 0; } + +int arch_ptrace_restore(int pid, struct pstree_item *item) +{ + unsigned long hwcaps = getauxval(AT_HWCAP); + struct user_pac_address_keys upaca; + struct user_pac_generic_keys upacg; + PacAddressKeys *paca; + PacGenericKeys *pacg; + long pac_enabled_keys; + struct iovec iov; + int ret; + + + pr_debug("%d: Restoring PAC keys\n", pid); + + paca = &rsti(item)->arch_info.pac_address_keys; + pacg = &rsti(item)->arch_info.pac_generic_keys; + if (rsti(item)->arch_info.has_paca) { + if (!(hwcaps & HWCAP_PACA)) { + pr_err("PACG support is required from the source system.\n"); + return 1; + } + pac_enabled_keys = rsti(item)->arch_info.pac_address_keys.pac_enabled_key; + + upaca.apiakey = paca->apiakey_lo + ((__uint128_t)paca->apiakey_hi << 64); + upaca.apibkey = paca->apibkey_lo + ((__uint128_t)paca->apibkey_hi << 64); + upaca.apdakey = paca->apdakey_lo + ((__uint128_t)paca->apdakey_hi << 64); + upaca.apdbkey = paca->apdbkey_lo + ((__uint128_t)paca->apdbkey_hi << 64); + + iov.iov_base = &upaca; + iov.iov_len = sizeof(upaca); + + if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { + pr_perror("Failed to set address authentication keys for %d", pid); + return 1; + } + iov.iov_base = &pac_enabled_keys; + iov.iov_len = sizeof(pac_enabled_keys); + if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov))) { + pr_perror("Failed to set enabled key 
mask for %d", pid); + return 1; + } + } + + if (rsti(item)->arch_info.has_pacg) { + if (!(hwcaps & HWCAP_PACG)) { + pr_err("PACG support is required from the source system.\n"); + return 1; + } + upacg.apgakey = pacg->apgakey_lo + ((__uint128_t)pacg->apgakey_hi << 64); + iov.iov_base = &upacg; + iov.iov_len = sizeof(upacg); + if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { + pr_perror("Failed to set the generic authentication key for %d", pid); + return 1; + } + } + + return 0; +} + +void arch_rsti_init(struct pstree_item *p) +{ + PacKeys *pac_keys = p->core[0]->ti_aarch64->pac_keys; + + rsti(p)->arch_info.has_paca = false; + rsti(p)->arch_info.has_pacg = false; + + if (!pac_keys) + return; + + if (pac_keys->pac_address_keys) { + rsti(p)->arch_info.has_paca = true; + rsti(p)->arch_info.pac_address_keys = *pac_keys->pac_address_keys; + } + if (pac_keys->pac_generic_keys) { + rsti(p)->arch_info.has_pacg = true; + rsti(p)->arch_info.pac_generic_keys = *pac_keys->pac_generic_keys; + } +} diff --git a/criu/arch/aarch64/include/asm/dump.h b/criu/arch/aarch64/include/asm/dump.h index 90cd8bca8..ecab061c3 100644 --- a/criu/arch/aarch64/include/asm/dump.h +++ b/criu/arch/aarch64/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/aarch64/include/asm/restore.h b/criu/arch/aarch64/include/asm/restore.h index 75e87996a..c79605c40 100644 --- a/criu/arch/aarch64/include/asm/restore.h +++ b/criu/arch/aarch64/include/asm/restore.h @@ -26,4 +26,14 @@ static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); +#define ARCH_RST_INFO y +struct rst_arch_info { + bool 
has_paca, has_pacg; + PacAddressKeys pac_address_keys; + PacGenericKeys pac_generic_keys; +}; + +int arch_ptrace_restore(int pid, struct pstree_item *item); +void arch_rsti_init(struct pstree_item *current); + #endif diff --git a/criu/arch/arm/crtools.c b/criu/arch/arm/crtools.c index 26b94e157..6a5e4c89a 100644 --- a/criu/arch/arm/crtools.c +++ b/criu/arch/arm/crtools.c @@ -22,7 +22,7 @@ #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))((src)->ARM_##e) -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; diff --git a/criu/arch/arm/include/asm/dump.h b/criu/arch/arm/include/asm/dump.h index 485986065..b0ac5715d 100644 --- a/criu/arch/arm/include/asm/dump.h +++ b/criu/arch/arm/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/loongarch64/crtools.c b/criu/arch/loongarch64/crtools.c index eeb0731ca..783951b5b 100644 --- a/criu/arch/loongarch64/crtools.c +++ b/criu/arch/loongarch64/crtools.c @@ -29,7 +29,7 @@ #define assign_reg(dst, src, e) (dst)->e = (__typeof__(dst->e))(src)->e -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { int i; CoreEntry *core = x; diff --git a/criu/arch/loongarch64/include/asm/dump.h b/criu/arch/loongarch64/include/asm/dump.h index 04347155c..a1c0c4c58 100644 --- a/criu/arch/loongarch64/include/asm/dump.h +++ b/criu/arch/loongarch64/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ 
-extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/mips/crtools.c b/criu/arch/mips/crtools.c index ed4da9b7e..eabbd85f4 100644 --- a/criu/arch/mips/crtools.c +++ b/criu/arch/mips/crtools.c @@ -27,7 +27,7 @@ #include "images/core.pb-c.h" #include "images/creds.pb-c.h" -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; diff --git a/criu/arch/mips/include/asm/dump.h b/criu/arch/mips/include/asm/dump.h index 58015833d..ec59b051b 100644 --- a/criu/arch/mips/include/asm/dump.h +++ b/criu/arch/mips/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); extern int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info); diff --git a/criu/arch/ppc64/crtools.c b/criu/arch/ppc64/crtools.c index a08a2ca5b..d57040008 100644 --- a/criu/arch/ppc64/crtools.c +++ b/criu/arch/ppc64/crtools.c @@ -404,7 +404,7 @@ static int __copy_task_regs(user_regs_struct_t *regs, user_fpregs_struct_t *fpre return 0; } -int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) +int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) { return __copy_task_regs(u, f, (CoreEntry *)arg); } diff --git a/criu/arch/ppc64/include/asm/dump.h b/criu/arch/ppc64/include/asm/dump.h index eb488900a..7393654fa 100644 --- 
a/criu/arch/ppc64/include/asm/dump.h +++ b/criu/arch/ppc64/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/riscv64/crtools.c b/criu/arch/riscv64/crtools.c index b2d6d2951..eea98d6de 100644 --- a/criu/arch/riscv64/crtools.c +++ b/criu/arch/riscv64/crtools.c @@ -23,7 +23,7 @@ #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) { int i; CoreEntry *core = x; diff --git a/criu/arch/riscv64/include/asm/dump.h b/criu/arch/riscv64/include/asm/dump.h index c2988f9bf..4f0a2d209 100644 --- a/criu/arch/riscv64/include/asm/dump.h +++ b/criu/arch/riscv64/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/s390/crtools.c b/criu/arch/s390/crtools.c index 5cf160d82..96cef819e 100644 --- a/criu/arch/s390/crtools.c +++ b/criu/arch/s390/crtools.c @@ -282,7 +282,7 @@ static void free_ri_cb(UserS390RiEntry *ri_cb) /* * Copy internal structures into Google Protocol Buffers */ -int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) +int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) { UserS390VxrsHighEntry *vxrs_high = NULL; UserS390VxrsLowEntry *vxrs_low = NULL; diff 
--git a/criu/arch/s390/include/asm/dump.h b/criu/arch/s390/include/asm/dump.h index c200724d7..5a24c5b3d 100644 --- a/criu/arch/s390/include/asm/dump.h +++ b/criu/arch/s390/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f); +int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f); int arch_alloc_thread_info(CoreEntry *core); void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c index e068a9a02..1f4d0736b 100644 --- a/criu/arch/x86/crtools.c +++ b/criu/arch/x86/crtools.c @@ -15,7 +15,7 @@ #define XSAVE_PB_NELEMS(__s, __obj, __member) (sizeof(__s) / sizeof(*(__obj)->__member)) -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; UserX86RegsEntry *gpregs = core->thread_info->gpregs; diff --git a/criu/arch/x86/include/asm/compat.h b/criu/arch/x86/include/asm/compat.h index 867357fa2..4ca704fd7 100644 --- a/criu/arch/x86/include/asm/compat.h +++ b/criu/arch/x86/include/asm/compat.h @@ -11,6 +11,8 @@ #include +#include "log.h" + static inline void *alloc_compat_syscall_stack(void) { void *mem = (void *)sys_mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, diff --git a/criu/arch/x86/include/asm/dump.h b/criu/arch/x86/include/asm/dump.h index 192f6bd02..925ea91ff 100644 --- a/criu/arch/x86/include/asm/dump.h +++ b/criu/arch/x86/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); 
extern int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info); diff --git a/criu/cr-restore.c b/criu/cr-restore.c index e906da0ce..1f4881dab 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "types.h" #include @@ -1707,6 +1708,9 @@ static int restore_task_with_children(void *_arg) arg); } +int __attribute((weak)) arch_ptrace_restore(int pid, struct pstree_item *item); +int arch_ptrace_restore(int pid, struct pstree_item *item) { return 0; } + static int attach_to_tasks(bool root_seized) { struct pstree_item *item; @@ -1747,6 +1751,8 @@ static int attach_to_tasks(bool root_seized) pr_perror("Unable to set PTRACE_O_TRACESYSGOOD for %d", pid); return -1; } + if (arch_ptrace_restore(pid, item)) + return -1; /* * Suspend seccomp if necessary. We need to do this because * although seccomp is restored at the very end of the @@ -3104,6 +3110,9 @@ static void *restorer_munmap_addr(CoreEntry *core, void *restorer_blob) return restorer_sym(restorer_blob, arch_export_unmap); } +void arch_rsti_init(struct pstree_item *p) __attribute__((weak)); +void arch_rsti_init(struct pstree_item *p) {} + static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, unsigned long alen, CoreEntry *core) { void *mem = MAP_FAILED; @@ -3323,6 +3332,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns */ creds_pos_next = creds_pos; siginfo_n = task_args->siginfo_n; + arch_rsti_init(current); for (i = 0; i < current->nr_threads; i++) { CoreEntry *tcore; struct rt_sigframe *sigframe; diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index 4c9335a73..deb297e5f 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -1,6 +1,7 @@ #ifndef __CR_RST_INFO_H__ #define __CR_RST_INFO_H__ +#include "asm/restore.h" #include "common/lock.h" #include "common/list.h" #include "vma.h" @@ -33,6 +34,11 @@ struct rst_rseq { uint64_t rseq_cs_pointer; }; 
+#ifndef ARCH_RST_INFO +struct rst_arch_info { +}; +#endif + struct rst_info { struct list_head fds; @@ -80,6 +86,8 @@ struct rst_info { futex_t shstk_unlock; void *breakpoint; + + struct rst_arch_info arch_info; }; extern struct task_entries *task_entries; diff --git a/images/core-aarch64.proto b/images/core-aarch64.proto index 3356e6b75..64b0ee9fb 100644 --- a/images/core-aarch64.proto +++ b/images/core-aarch64.proto @@ -17,9 +17,32 @@ message user_aarch64_fpsimd_context_entry { required uint32 fpcr = 3; } +message pac_address_keys { + required uint64 apiakey_lo = 1; + required uint64 apiakey_hi = 2; + required uint64 apibkey_lo = 3; + required uint64 apibkey_hi = 4; + required uint64 apdakey_lo = 5; + required uint64 apdakey_hi = 6; + required uint64 apdbkey_lo = 7; + required uint64 apdbkey_hi = 8; + required uint64 pac_enabled_key = 9; +} + +message pac_generic_keys { + required uint64 apgakey_lo = 1; + required uint64 apgakey_hi = 2; +} + +message pac_keys { + optional pac_address_keys pac_address_keys = 6; + optional pac_generic_keys pac_generic_keys = 7; +} + message thread_info_aarch64 { required uint64 clear_tid_addr = 1[(criu).hex = true]; required uint64 tls = 2; required user_aarch64_regs_entry gpregs = 3[(criu).hex = true]; required user_aarch64_fpsimd_context_entry fpsimd = 4; + optional pac_keys pac_keys = 5; } From b8553d19edc1d5278c619420844b24aad2bdd415 Mon Sep 17 00:00:00 2001 From: AV Date: Mon, 3 Mar 2025 20:09:05 +0000 Subject: [PATCH 564/775] test/zdtm: check that PAC keys are C/R-ed Add another variation of pthread00 compiled with branch-protection enabled.
Signed-off-by: Andrei Vagin --- test/zdtm/static/Makefile | 9 +++++++++ test/zdtm/static/pthread00-pac.c | 1 + 2 files changed, 10 insertions(+) create mode 120000 test/zdtm/static/pthread00-pac.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index f72fb2a77..6a19cad3c 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -66,6 +66,7 @@ TST_NOFILE := \ pipe01 \ pipe02 \ pthread00 \ + pthread00-pac \ pthread01 \ pthread02 \ pthread_timers \ @@ -497,6 +498,12 @@ STATE_OUT = $(TST_STATE:%=%.out) include ../Makefile.inc +ifeq ($(ARCH),aarch64) + PAC_CFLAGS := -mbranch-protection=standard +else + PAC_CFLAGS := +endif + all: $(TST) criu-rtc.so install: all .PHONY: all install @@ -588,6 +595,8 @@ uptime_grow: LDLIBS += -lrt -pthread unlink_largefile: CFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE inotify_system_nodel: CFLAGS += -DNO_DEL pthread00: LDLIBS += -pthread +pthread00-pac: CFLAGS += ${PAC_CFLAGS} +pthread00-pac: LDLIBS += -pthread pthread01: LDLIBS += -pthread pthread02: LDLIBS += -pthread pthread_timers: LDLIBS += -lrt -pthread diff --git a/test/zdtm/static/pthread00-pac.c b/test/zdtm/static/pthread00-pac.c new file mode 120000 index 000000000..3ee8dc1f1 --- /dev/null +++ b/test/zdtm/static/pthread00-pac.c @@ -0,0 +1 @@ +pthread00.c \ No newline at end of file From 62a4a5874b4b0bd462f28d659b93c73c5c06a900 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 12 Mar 2025 23:46:05 +0000 Subject: [PATCH 565/775] vdso: correct data types for ELF hash table sizes Let's change the data types of `nbucket` and `nchain` to uint32. 
This should fix the following compile-time error on arm32: /criu/criu/pie/util-vdso.c:336: undefined reference to `__aeabi_uldivmod' Signed-off-by: Andrei Vagin --- criu/pie/util-vdso.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index af3c08985..8daf5c71f 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -302,7 +302,7 @@ static bool elf_symbol_match(uintptr_t mem, size_t size, static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size, const char *symbol, uint32_t symbol_hash, unsigned int sym_off, uintptr_t dynsymbol_names, Dyn_t *dyn_symtab, Phdr_t *load, - uint64_t nbucket, uint64_t nchain, void *_bucket, Hash_t *chain, + uint32_t nbucket, uint32_t nchain, void *_bucket, Hash_t *chain, const size_t vdso_symbol_length, bool use_gnu_hash) { unsigned int j; @@ -360,7 +360,7 @@ static int parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, void *bucket = NULL; Hash_t *chain = NULL; - uint64_t nbucket, nchain = 0; + uint32_t nbucket, nchain = 0; unsigned int sym_off = 0; unsigned int i = 0; From 720bf67e065525133f4b0209baa7142192fbb667 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 16 Mar 2025 22:23:14 +0000 Subject: [PATCH 566/775] zdtm/vdso02: unmap vvar_vclock mappings It is a part of vvar and this test intends to unmap vdso and all vvar mappings. 
Fixes #2622 Signed-off-by: Andrei Vagin --- test/zdtm/static/vdso02.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/test/zdtm/static/vdso02.c b/test/zdtm/static/vdso02.c index 2050bca71..5779b7fd6 100644 --- a/test/zdtm/static/vdso02.c +++ b/test/zdtm/static/vdso02.c @@ -29,7 +29,8 @@ static int parse_vm_area(char *buf, struct vm_area *vma) return -1; } -static int find_blobs(pid_t pid, struct vm_area *vdso, struct vm_area *vvar) +static int find_blobs(pid_t pid, struct vm_area *vdso, + struct vm_area *vvar, struct vm_area *vvar_vclock) { char buf[BUF_SZ]; int ret = -1; @@ -39,6 +40,8 @@ static int find_blobs(pid_t pid, struct vm_area *vdso, struct vm_area *vvar) vdso->end = VDSO_BAD_ADDR; vvar->start = VVAR_BAD_ADDR; vvar->end = VVAR_BAD_ADDR; + vvar_vclock->start = VVAR_BAD_ADDR; + vvar_vclock->end = VVAR_BAD_ADDR; if (snprintf(buf, BUF_SZ, "/proc/%d/maps", pid) < 0) { pr_perror("snprintf() failure for path"); @@ -57,12 +60,18 @@ static int find_blobs(pid_t pid, struct vm_area *vdso, struct vm_area *vvar) if (strstr(buf, "[vvar]") && parse_vm_area(buf, vvar)) goto err; + if (strstr(buf, "[vvar_vclock]") && + parse_vm_area(buf, vvar_vclock)) + goto err; } if (vdso->start != VDSO_BAD_ADDR) test_msg("[vdso] %lx-%lx\n", vdso->start, vdso->end); if (vvar->start != VVAR_BAD_ADDR) test_msg("[vvar] %lx-%lx\n", vvar->start, vvar->end); + if (vvar_vclock->start != VVAR_BAD_ADDR) + test_msg("[vvar_vclock] %lx-%lx\n", + vvar_vclock->start, vvar_vclock->end); ret = 0; err: fclose(maps); @@ -143,10 +152,10 @@ void sys_exit(int status) static int unmap_blobs(void) { - struct vm_area vdso, vvar; + struct vm_area vdso, vvar, vvar_vclock; int ret; - if (find_blobs(getpid(), &vdso, &vvar)) + if (find_blobs(getpid(), &vdso, &vvar, &vvar_vclock)) return -1; if (vdso.start != VDSO_BAD_ADDR) { @@ -159,13 +168,19 @@ static int unmap_blobs(void) if (ret) return ret; } + if (vvar_vclock.start != VVAR_BAD_ADDR) { + ret = sys_munmap((void 
*)vvar_vclock.start, + vvar_vclock.end - vvar_vclock.start); + if (ret) + return ret; + } return 0; } int main(int argc, char *argv[]) { - struct vm_area vdso, vvar; + struct vm_area vdso, vvar, vvar_vclock; pid_t child; int status, ret = -1; @@ -201,9 +216,11 @@ int main(int argc, char *argv[]) goto out_kill; } - if (find_blobs(child, &vdso, &vvar)) + if (find_blobs(child, &vdso, &vvar, &vvar_vclock)) goto out_kill; - if (vdso.start != VDSO_BAD_ADDR || vvar.start != VVAR_BAD_ADDR) { + if (vdso.start != VDSO_BAD_ADDR || + vvar.start != VVAR_BAD_ADDR || + vvar_vclock.start != VVAR_BAD_ADDR) { pr_err("Found vvar or vdso blob(s) in child, which should have unmapped them\n"); goto out_kill; } @@ -211,7 +228,7 @@ int main(int argc, char *argv[]) test_daemon(); test_waitsig(); - if (find_blobs(child, &vdso, &vvar)) + if (find_blobs(child, &vdso, &vvar, &vvar_vclock)) goto out_kill; if (vdso.start != VDSO_BAD_ADDR || vvar.start != VVAR_BAD_ADDR) { pr_err("Child without vdso got it after C/R\n"); From 867c773031aef74e66cd15b55418141bcc538b95 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 4 Feb 2025 15:50:02 +0000 Subject: [PATCH 567/775] make: allow setting the default network locking backend As different Linux distributions are switching away from iptables to nftables, this makes it easier to compile CRIU with a different default network locking backend. 
Instead of changing the source code it is now possible to select the nft backend like this: make NETWORK_LOCK_DEFAULT=NETWORK_LOCK_NFTABLES Signed-off-by: Adrian Reber --- Makefile | 4 ++++ criu/include/cr_options.h | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/Makefile b/Makefile index 90908de83..5d8e89ac1 100644 --- a/Makefile +++ b/Makefile @@ -140,6 +140,10 @@ ifneq ($(GCOV),) CFLAGS += $(CFLAGS-GCOV) endif +ifneq ($(NETWORK_LOCK_DEFAULT),) + CFLAGS += -DNETWORK_LOCK_DEFAULT=$(NETWORK_LOCK_DEFAULT) +endif + ifeq ($(ASAN),1) CFLAGS-ASAN := -fsanitize=address export CFLAGS-ASAN diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 60cf9437e..ab0bd8fa3 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -70,7 +70,15 @@ enum NETWORK_LOCK_METHOD { NETWORK_LOCK_SKIP, }; +/** + * CRIU currently defaults to the iptables locking backend. + * + * It is, however, possible to change this by defining + * NETWORK_LOCK_DEFAULT to a different value on the command-line. + */ +#ifndef NETWORK_LOCK_DEFAULT #define NETWORK_LOCK_DEFAULT NETWORK_LOCK_IPTABLES +#endif /* * Ghost file size we allow to carry by default. From 2cd9d5ded86204e1a43f57102b86cc06e9ecf0eb Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 4 Feb 2025 15:56:27 +0000 Subject: [PATCH 568/775] docs: update INSTALL.md with a section about building CRIU The building section also contains the information how to change the network locking backend without source code changes. Signed-off-by: Adrian Reber --- INSTALL.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/INSTALL.md b/INSTALL.md index d786d06eb..76ace5b02 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,3 +1,23 @@ +## Building CRIU from source code + +First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. 
+ +To compile CRIU, run: +``` +make +``` +This should create the `./criu/criu` executable. + +To change the default behaviour of CRIU, the following variables can be passed +to the make command: + + * **NETWORK_LOCK_DEFAULT**, can be set to one of the following + values: `NETWORK_LOCK_IPTABLES`, `NETWORK_LOCK_NFTABLES`, + `NETWORK_LOCK_SKIP`. CRIU defaults to `NETWORK_LOCK_IPTABLES` + if nothing is specified. If another network locking backend is + needed, `make` can be called like this: + `make NETWORK_LOCK_DEFAULT=NETWORK_LOCK_NFTABLES` + ## Installing CRIU from source code Once CRIU is built one can easily setup the complete CRIU package From 95729ec328a02a81824ce2b8c3ecd5eb90a170d4 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 4 Feb 2025 15:57:47 +0000 Subject: [PATCH 569/775] docs: mark make commands with same format as elsewhere This uses the same formatting for the make command examples as seen in README.md. Signed-off-by: Adrian Reber --- INSTALL.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 76ace5b02..af0702518 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -23,9 +23,9 @@ to the make command: Once CRIU is built one can easily setup the complete CRIU package (which includes executable itself, CRIT tool, libraries, manual and etc) simply typing - - make install - +``` +make install +``` this command accepts the following variables: * **DESTDIR**, to specify global root where all components will be placed under (empty by default); @@ -36,17 +36,17 @@ this command accepts the following variables: * **LIBDIR**, to specify directory where to put libraries (guess the correct path by default). Thus one can type - - make DESTDIR=/some/new/place install - +``` +make DESTDIR=/some/new/place install +``` and get everything installed under `/some/new/place`. 
## Uninstalling CRIU To clean up previously installed CRIU instance one can type - - make uninstall - +``` +make uninstall +``` and everything should be removed. Note though that if some variable (**DESTDIR**, **BINDIR** and such) has been used during installation procedure, the same *must* be passed with uninstall action. From 29ccb5b625a5cf915f87d1d85952dde6b9b572ee Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 6 Feb 2025 07:34:26 +0000 Subject: [PATCH 570/775] test: others/rpc do not use nftables locking backend The tests in others/rpc are running as non-root and fail silently if the nftables network locking backend is used. This switches those tests to skip the network locking. Signed-off-by: Adrian Reber --- test/others/rpc/errno.py | 2 ++ test/others/rpc/ps_test.py | 1 + test/others/rpc/run.sh | 2 +- test/others/rpc/test-c.c | 2 ++ test/others/rpc/test.py | 1 + 5 files changed, 7 insertions(+), 1 deletion(-) diff --git a/test/others/rpc/errno.py b/test/others/rpc/errno.py index b600b6d1c..4ea6c9d44 100755 --- a/test/others/rpc/errno.py +++ b/test/others/rpc/errno.py @@ -67,6 +67,7 @@ class test: req = self.get_base_req() req.type = rpc.DUMP req.opts.pid = pid + req.opts.network_lock = rpc.SKIP self.send_req(req) resp = self.recv_resp() @@ -84,6 +85,7 @@ class test: req = self.get_base_req() req.type = rpc.DUMP req.opts.leave_running = True + req.opts.network_lock = rpc.SKIP self.send_req(req) resp = self.recv_resp() diff --git a/test/others/rpc/ps_test.py b/test/others/rpc/ps_test.py index daeda49bc..259f22e77 100755 --- a/test/others/rpc/ps_test.py +++ b/test/others/rpc/ps_test.py @@ -23,6 +23,7 @@ req.type = rpc.PAGE_SERVER req.opts.log_file = 'page-server.log' req.opts.log_level = 4 req.opts.images_dir_fd = os.open(args['dir'], os.O_DIRECTORY) +req.opts.network_lock = rpc.SKIP s.send(req.SerializeToString()) diff --git a/test/others/rpc/run.sh b/test/others/rpc/run.sh index afd4fb5e3..3d5a53ae6 100755 --- a/test/others/rpc/run.sh +++ 
b/test/others/rpc/run.sh @@ -51,7 +51,7 @@ function test_restore_loop { title_print "Dump loop process" # So theoretically '-j' (--shell-job) should not be necessary, but on alpine # this test fails without it. - ${CRIU} dump -j -v4 -o dump-loop.log -D build/imgs_loop -t ${P} + ${CRIU} dump -j -v4 -o dump-loop.log --network-lock skip -D build/imgs_loop -t ${P} title_print "Run restore-loop" ./restore-loop.py build/criu_service.socket build/imgs_loop diff --git a/test/others/rpc/test-c.c b/test/others/rpc/test-c.c index 792dbbf9c..b3507975f 100644 --- a/test/others/rpc/test-c.c +++ b/test/others/rpc/test-c.c @@ -99,6 +99,8 @@ int main(int argc, char *argv[]) req.opts->images_dir_fd = dir_fd; req.opts->has_log_level = true; req.opts->log_level = 4; + req.opts->has_network_lock = true; + req.opts->network_lock = CRIU_NETWORK_LOCK_METHOD__SKIP; /* * Connect to service socket diff --git a/test/others/rpc/test.py b/test/others/rpc/test.py index ce8411bc6..6f692f755 100755 --- a/test/others/rpc/test.py +++ b/test/others/rpc/test.py @@ -24,6 +24,7 @@ req.type = rpc.DUMP req.opts.leave_running = True req.opts.log_level = 4 req.opts.images_dir_fd = os.open(args['dir'], os.O_DIRECTORY) +req.opts.network_lock = rpc.SKIP # Send request s.send(req.SerializeToString()) From f22330ff07354fd8007a42247fb1e29bcc346033 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Thu, 6 Feb 2025 09:57:52 +0000 Subject: [PATCH 571/775] test: print out logs if tests fail If the tests in others/rpc are failing no information about that error can be seen in a CI run. This change displays the log files if the test fails. Signed-off-by: Adrian Reber --- test/others/rpc/Makefile | 10 +++++++++- test/others/rpc/run.sh | 10 ++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index 69537bb0d..b2f907abe 100644 --- a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -8,9 +8,17 @@ PYTHON ?= python3 run: all @make -C .. 
loop - mkdir -p build + mkdir -p build/{imgs_errno,imgs_ps,imgs_c,imgs_loop,imgs_py} chmod a+rwx build + chmod a+rwx build/{imgs_errno,imgs_ps,imgs_c,imgs_loop,imgs_py} rm -f build/status + @# Create all log files to be accessible for anybody + @# so that they can be displayed by any user. + for i in imgs_errno/criu.log imgs_ps/page-server.log imgs_ps/dump.log \ + imgs_c/restore-c.log imgs_loop/criu.log imgs_loop/dump-loop.log \ + imgs_py/criu.log imgs_py/restore-py.log imgs_c/criu.log service.log; do \ + touch build/$$i; chmod 666 build/$$i; \ + done sudo -g '#1000' -u '#1000' mkfifo build/status @# Need to start the criu daemon here to access the pidfile. @# The script read.py is used to wait until 'criu service' diff --git a/test/others/rpc/run.sh b/test/others/rpc/run.sh index 3d5a53ae6..b6158dfea 100755 --- a/test/others/rpc/run.sh +++ b/test/others/rpc/run.sh @@ -3,6 +3,7 @@ set -e CRIU=./criu +FAIL=1 export PROTODIR=`readlink -f "${PWD}/../../protobuf"` @@ -19,6 +20,13 @@ function stop_server { title_print "Shutdown service server" kill -SIGTERM $(cat build/pidfile) unlink build/pidfile + if [ "${FAIL}" == "1" ]; then + for i in build/output*; do + echo "File: $i" + cat $i + done + find . -name "*.log" -print -exec cat {} \; || true + fi } function test_c { @@ -80,6 +88,8 @@ test_restore_loop test_ps test_errno +FAIL=0 + stop_server trap 'echo "Success"' EXIT From 700a8c4b5ebeef536612a95f4d697f8ff0bf9b34 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 5 Feb 2025 07:51:38 +0000 Subject: [PATCH 572/775] ci: do not run tests requiring iptables if it is missing There are a couple of tests that require the iptables binary. Instead of adding a checkskip script, which could also handle this, this change now uses CRIU's feature detection to see if the CRIU feature 'has_ipt_legacy' exists. 
Signed-off-by: Adrian Reber --- test/zdtm/static/net_lock_socket_iptables.desc | 1 + test/zdtm/static/net_lock_socket_iptables6.desc | 1 + test/zdtm/static/netns-nf.desc | 1 + test/zdtm/static/netns_lock_iptables.desc | 1 + test/zdtm/static/socket-tcp-closed-last-ack.desc | 2 +- test/zdtm/static/socket-tcp-reseted.desc | 2 +- test/zdtm/static/socket-tcp-syn-sent.desc | 2 +- 7 files changed, 7 insertions(+), 3 deletions(-) diff --git a/test/zdtm/static/net_lock_socket_iptables.desc b/test/zdtm/static/net_lock_socket_iptables.desc index 936ff8702..cb622536f 100644 --- a/test/zdtm/static/net_lock_socket_iptables.desc +++ b/test/zdtm/static/net_lock_socket_iptables.desc @@ -1,5 +1,6 @@ { 'flavor': 'h', + 'feature': 'has_ipt_legacy', 'flags': 'suid excl reqrst', 'dopts': '--tcp-established --network-lock iptables', 'ropts': '--tcp-established', diff --git a/test/zdtm/static/net_lock_socket_iptables6.desc b/test/zdtm/static/net_lock_socket_iptables6.desc index 936ff8702..cb622536f 100644 --- a/test/zdtm/static/net_lock_socket_iptables6.desc +++ b/test/zdtm/static/net_lock_socket_iptables6.desc @@ -1,5 +1,6 @@ { 'flavor': 'h', + 'feature': 'has_ipt_legacy', 'flags': 'suid excl reqrst', 'dopts': '--tcp-established --network-lock iptables', 'ropts': '--tcp-established', diff --git a/test/zdtm/static/netns-nf.desc b/test/zdtm/static/netns-nf.desc index c99696d1c..58c23e8ba 100644 --- a/test/zdtm/static/netns-nf.desc +++ b/test/zdtm/static/netns-nf.desc @@ -3,4 +3,5 @@ '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', '/usr/bin/diff'], 'flags': 'suid', + 'feature': 'has_ipt_legacy', 'flavor': 'ns uns'} diff --git 
a/test/zdtm/static/netns_lock_iptables.desc b/test/zdtm/static/netns_lock_iptables.desc index 69020f34e..b465706b8 100644 --- a/test/zdtm/static/netns_lock_iptables.desc +++ b/test/zdtm/static/netns_lock_iptables.desc @@ -1,6 +1,7 @@ { 'flavor': 'h', 'flags': 'suid excl reqrst', + 'feature': 'has_ipt_legacy', 'opts': '--tcp-established', 'dopts': '--network-lock iptables', 'ropts': '--join-ns net:/var/run/netns/criu-net-lock-test' diff --git a/test/zdtm/static/socket-tcp-closed-last-ack.desc b/test/zdtm/static/socket-tcp-closed-last-ack.desc index 309854fa5..c77d58477 100644 --- a/test/zdtm/static/socket-tcp-closed-last-ack.desc +++ b/test/zdtm/static/socket-tcp-closed-last-ack.desc @@ -5,6 +5,6 @@ ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', - 'feature' : 'tcp_half_closed', + 'feature' : 'tcp_half_closed has_ipt_legacy', 'flavor': 'ns uns', } diff --git a/test/zdtm/static/socket-tcp-reseted.desc b/test/zdtm/static/socket-tcp-reseted.desc index 4aa48ad87..ff92e9f9f 100644 --- a/test/zdtm/static/socket-tcp-reseted.desc +++ b/test/zdtm/static/socket-tcp-reseted.desc @@ -6,5 +6,5 @@ ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', - 'feature' : 'tcp_half_closed' + 'feature' : 'tcp_half_closed has_ipt_legacy' } diff --git a/test/zdtm/static/socket-tcp-syn-sent.desc b/test/zdtm/static/socket-tcp-syn-sent.desc index 71cd26d72..52382414b 100644 --- a/test/zdtm/static/socket-tcp-syn-sent.desc +++ b/test/zdtm/static/socket-tcp-syn-sent.desc @@ -5,5 +5,5 @@ ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', - 'feature' : 'tcp_half_closed' + 'feature' : 'tcp_half_closed has_ipt_legacy' } From 6826ac58ce842393a8a7d8cf73cd0478d4456330 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 4 Feb 2025 16:10:02 +0000 Subject: [PATCH 573/775] ci: run tests on a nftables only system Signed-off-by: Adrian Reber --- .github/workflows/nftables-test.yml | 24 ++++++++++++++++++++++++ scripts/ci/run-ci-tests.sh | 13 ++++++++++++- 2 files 
changed, 36 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/nftables-test.yml diff --git a/.github/workflows/nftables-test.yml b/.github/workflows/nftables-test.yml new file mode 100644 index 000000000..eb3d8e814 --- /dev/null +++ b/.github/workflows/nftables-test.yml @@ -0,0 +1,24 @@ +name: Nftables bases testing + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: nftables-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + - name: Remove iptables + run: sudo apt remove -y iptables + - name: Install libnftables-dev + run: sudo scripts/ci/apt-install libnftables-dev + - name: chmod 755 /home/runner + # CRIU's tests are sometimes running as some random user and need + # to be able to access the test files. + run: sudo chmod 755 /home/runner + - name: Build with nftables network locking backend + run: sudo make -C scripts/ci local COMPILE_FLAGS="NETWORK_LOCK_DEFAULT=NETWORK_LOCK_NFTABLES" diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 611ff7803..0c4a08975 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -39,6 +39,10 @@ ci_prep () { # This can fail on aarch64 travis service apport stop || : + # Ubuntu has set up AppArmor in 24.04 so that it blocks use of user + # namespaces by unprivileged users. We need this for some of our tests. 
+ sysctl kernel.apparmor_restrict_unprivileged_userns=0 || : + if [ "$CLANG" = "1" ]; then # clang support CC=clang @@ -121,8 +125,14 @@ if [ "${CD_TO_TOP}" = "1" ]; then fi export GCOV CC +if [ -z "$COMPILE_FLAGS" ]; then + LOCAL_COMPILE_FLAGS=("V=1") +else + IFS=" " read -r -a LOCAL_COMPILE_FLAGS <<< "$COMPILE_FLAGS" + LOCAL_COMPILE_FLAGS=("V=1" "${LOCAL_COMPILE_FLAGS[@]}") +fi $CC --version -time make CC="$CC" -j4 V=1 +time make CC="$CC" -j4 "${LOCAL_COMPILE_FLAGS[@]}" ./criu/criu -v4 cpuinfo dump || : ./criu/criu -v4 cpuinfo check || : @@ -150,6 +160,7 @@ ulimit -c unlimited cgid=$$ cleanup_cgroup() { ./test/zdtm_umount_cgroups $cgid + dmesg } trap cleanup_cgroup EXIT ./test/zdtm_mount_cgroups $cgid From 0f647094424811aaa83839ea10f49d94596a3d15 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Wed, 19 Mar 2025 23:19:31 +0700 Subject: [PATCH 574/775] namespace: skip cleaning up the uid/gid map in error cases free_userns_maps is called to clean up uid/gid map when the dump finishes. If we try to clean up these maps in error cases, it can lead to double free panic. So just skip cleaning up these maps and let free_userns_maps do its job. Signed-off-by: Bui Quang Minh --- criu/namespaces.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/criu/namespaces.c b/criu/namespaces.c index b7c0ab400..0c9b16a87 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -1009,36 +1009,31 @@ int dump_user_ns(pid_t pid, int ns_id) ret = parse_id_map(pid, "uid_map", &e->uid_map); if (ret < 0) - goto err; + /* + * The uid_map and gid_map is clean up in free_userns_maps + * later, so we don't need to clean these up in error cases. 
+ */ + return -1; + e->n_uid_map = ret; ret = parse_id_map(pid, "gid_map", &e->gid_map); if (ret < 0) - goto err; + return -1; e->n_gid_map = ret; if (check_user_ns(pid)) - goto err; + return -1; img = open_image(CR_FD_USERNS, O_DUMP, ns_id); if (!img) - goto err; + return -1; ret = pb_write_one(img, e, PB_USERNS); close_image(img); if (ret < 0) - goto err; + return -1; return 0; -err: - if (e->uid_map) { - xfree(e->uid_map[0]); - xfree(e->uid_map); - } - if (e->gid_map) { - xfree(e->gid_map[0]); - xfree(e->gid_map); - } - return -1; } void free_userns_maps(void) From bc1415317379c45b08ac6f8eb98698ca2df9b78c Mon Sep 17 00:00:00 2001 From: Ivan Pravdin Date: Sat, 22 Mar 2025 19:31:02 -0400 Subject: [PATCH 575/775] criu: fix log_keep_err signal deadlock When using pr_err in signal handler, locking is used in an unsafe manner. If another signal happens while holding the lock, deadlock can happen. To fix this, we can introduce mutex_trylock similar to pthread_mutex_trylock that returns immediately. Due to the fact that lock is used only for writing first_err, this change guarantees that deadlock cannot happen. Fixes: #358 Signed-off-by: Ivan Pravdin --- criu/log.c | 9 +++++---- include/common/lock.h | 6 ++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/criu/log.c b/criu/log.c index 89ae8f820..70e267fd6 100644 --- a/criu/log.c +++ b/criu/log.c @@ -132,10 +132,11 @@ static void log_note_err(char *msg) * anyway, so it doesn't make much sense to try hard * and optimize this out.
*/ - mutex_lock(&first_err->l); - if (first_err->s[0] == '\0') - __strlcpy(first_err->s, msg, sizeof(first_err->s)); - mutex_unlock(&first_err->l); + if (mutex_trylock(&first_err->l)) { + if (first_err->s[0] == '\0') + __strlcpy(first_err->s, msg, sizeof(first_err->s)); + mutex_unlock(&first_err->l); + } } } diff --git a/include/common/lock.h b/include/common/lock.h index ccfa468b8..4733d7287 100644 --- a/include/common/lock.h +++ b/include/common/lock.h @@ -2,6 +2,7 @@ #define __CR_COMMON_LOCK_H__ #include +#include #include #include #include @@ -162,6 +163,11 @@ static inline void mutex_lock(mutex_t *m) } } +static inline bool mutex_trylock(mutex_t *m) +{ + return atomic_inc_return(&m->raw) == 1; +} + static inline void mutex_unlock(mutex_t *m) { uint32_t c = 0; From b6059ff193a9b0dff98e997134d662c3ccfd1600 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 24 Mar 2025 15:23:34 -0700 Subject: [PATCH 576/775] criu: Version 4.1 (CRISC-V) Major changes: * RISC-V Support * PIDFD Support * CUDA Enhancements * Fixes here and there The full changelog can be found here: https://criu.org/Download/criu/4.1. Signed-off-by: Andrei Vagin --- Makefile.versions | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.versions b/Makefile.versions index c5859801a..85653c217 100644 --- a/Makefile.versions +++ b/Makefile.versions @@ -1,10 +1,10 @@ # # CRIU version. 
CRIU_VERSION_MAJOR := 4 -CRIU_VERSION_MINOR := 0 +CRIU_VERSION_MINOR := 1 CRIU_VERSION_SUBLEVEL := CRIU_VERSION_EXTRA := -CRIU_VERSION_NAME := CRIUDA +CRIU_VERSION_NAME := CRISCV CRIU_VERSION := $(CRIU_VERSION_MAJOR)$(if $(CRIU_VERSION_MINOR),.$(CRIU_VERSION_MINOR))$(if $(CRIU_VERSION_SUBLEVEL),.$(CRIU_VERSION_SUBLEVEL))$(if $(CRIU_VERSION_EXTRA),.$(CRIU_VERSION_EXTRA)) export CRIU_VERSION_MAJOR CRIU_VERSION_MINOR CRIU_VERSION_SUBLEVEL From 570621a48a81664a37a97f38d0ed65c1c0f56110 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 25 Jul 2025 00:05:06 +0000 Subject: [PATCH 577/775] mount-v2: enter the mount namespace to restore propagation properties A kernel change (commit 12f147ddd6de, "do_change_type(): refuse to operate on unmounted/not ours mounts") modified how mount propagation properties can be changed. Previously, these properties could be changed from any mount namespace. Now, they can only be modified from the specific mount namespace where the target mount is actually mounted. This commit addresses this new restriction by ensuring that CRIU enters the correct mount namespace before attempting to restore mount propagation properties (MS_SLAVE or MS_SHARED) for a mount.
Signed-off-by: Andrei Vagin --- criu/mount-v2.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/criu/mount-v2.c b/criu/mount-v2.c index 5d53e9a22..cdebc8318 100644 --- a/criu/mount-v2.c +++ b/criu/mount-v2.c @@ -927,8 +927,12 @@ static int move_mount_set_group(int src_id, char *source, int dst_id) static int restore_one_sharing(struct sharing_group *sg, struct mount_info *target) { + int nsfd = -1, orig_nsfd = -1, exit_code = -1; char target_path[PATH_MAX]; - int target_fd; + int target_fd = -1; + + if (!sg->master_id && !sg->shared_id) + return 0; target_fd = fdstore_get(target->mnt_fd_id); BUG_ON(target_fd < 0); @@ -943,8 +947,7 @@ static int restore_one_sharing(struct sharing_group *sg, struct mount_info *targ first = get_first_mount(sg->parent); if (move_mount_set_group(first->mnt_fd_id, NULL, target->mnt_fd_id)) { pr_err("Failed to copy sharing from %d to %d\n", first->mnt_id, target->mnt_id); - close(target_fd); - return -1; + goto err; } } else { /* @@ -956,16 +959,23 @@ static int restore_one_sharing(struct sharing_group *sg, struct mount_info *targ */ if (move_mount_set_group(-1, sg->source, target->mnt_fd_id)) { pr_err("Failed to copy sharing from source %s to %d\n", sg->source, target->mnt_id); - close(target_fd); - return -1; + goto err; } } + } + nsfd = fdstore_get(target->nsid->mnt.nsfd_id); + if (nsfd < 0) + goto err; + + if (switch_ns_by_fd(nsfd, &mnt_ns_desc, &orig_nsfd)) + goto err; + + if (sg->master_id) { /* Convert shared_id to master_id */ if (mount(NULL, target_path, NULL, MS_SLAVE, NULL)) { pr_perror("Failed to make mount %d slave", target->mnt_id); - close(target_fd); - return -1; + goto err; } } @@ -973,13 +983,16 @@ static int restore_one_sharing(struct sharing_group *sg, struct mount_info *targ if (sg->shared_id) { if (mount(NULL, target_path, NULL, MS_SHARED, NULL)) { pr_perror("Failed to make mount %d shared", target->mnt_id); - close(target_fd); - return -1; + goto err; } } - 
close(target_fd); - - return 0; + exit_code = 0; +err: + close_safe(&target_fd); + close_safe(&nsfd); + if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) + exit_code = -1; + return exit_code; } static int restore_one_sharing_group(struct sharing_group *sg) From ced15c302b3f5f11f529e335d4b54ad88b45075e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 8 Jun 2025 17:19:52 -0700 Subject: [PATCH 578/775] test/zdtm: remove unused compiler argument Fixes a clang compile-time error: "argument unused during compilation: '-c'". Signed-off-by: Andrei Vagin --- test/zdtm/Makefile.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index 24f32c606..c19888da3 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -76,7 +76,7 @@ endef %.d: %.c $(E) " DEP " $@ - $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -MM -MP -c $< -o $@ + $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -MM -MP $< -o $@ %.o: %.c | %.d $(E) " CC " $@ From a44aa6d985472d995d04fef7eae22d63c7500f8c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 28 Jul 2025 21:32:02 +0000 Subject: [PATCH 579/775] criu: Version 4.1.1 This release of CRIU (4.1.1) addresses a critical compatibility issue introduced in the Linux kernel and back-ported to all stable releases. The kernel commit (12f147ddd6de "do_change_type(): refuse to operate on unmounted/not ours mounts") addressed the security issue introduced almost 20 years ago. Unfortunately, this change inadvertently broke the restore functionality of mount namespaces within CRIU. Users attempting to restore a container on updated kernels would encounter the error: "mnt-v2: Failed to make mount 476 slave: Invalid argument." This release contains the necessary adjustments to CRIU, allowing it to work seamlessly with kernels incorporating this security change. 
Signed-off-by: Andrei Vagin --- Makefile.versions | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.versions b/Makefile.versions index 85653c217..0b1a46a16 100644 --- a/Makefile.versions +++ b/Makefile.versions @@ -2,7 +2,7 @@ # CRIU version. CRIU_VERSION_MAJOR := 4 CRIU_VERSION_MINOR := 1 -CRIU_VERSION_SUBLEVEL := +CRIU_VERSION_SUBLEVEL := 1 CRIU_VERSION_EXTRA := CRIU_VERSION_NAME := CRISCV CRIU_VERSION := $(CRIU_VERSION_MAJOR)$(if $(CRIU_VERSION_MINOR),.$(CRIU_VERSION_MINOR))$(if $(CRIU_VERSION_SUBLEVEL),.$(CRIU_VERSION_SUBLEVEL))$(if $(CRIU_VERSION_EXTRA),.$(CRIU_VERSION_EXTRA)) From 34226fd243b599b8c02dad3ef1530cef2016dabe Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Sat, 18 Jan 2025 13:43:15 +0000 Subject: [PATCH 580/775] ci: try GitHub arm runners Signed-off-by: Adrian Reber --- .github/workflows/actuated-aarch64-test.yaml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/.github/workflows/actuated-aarch64-test.yaml b/.github/workflows/actuated-aarch64-test.yaml index 8b0a63fc7..567746a5f 100644 --- a/.github/workflows/actuated-aarch64-test.yaml +++ b/.github/workflows/actuated-aarch64-test.yaml @@ -1,4 +1,4 @@ -name: Actuated aarch64 test +name: aarch64 test on: [push, pull_request] @@ -11,32 +11,38 @@ jobs: build: # Actuated runners are not available in all repositories. if: ${{ github.repository == 'checkpoint-restore/criu' }} - # The memory size and the number of CPUs can be freely selected. - # 3GB and 4 CPUs seems to be enough according to the result from 'vmmeter'. - runs-on: actuated-arm64-4cpu-3gb + # The memory size and the number of CPUs can be freely selected for + # the actuated runners. 3GB and 4 CPUs seems to be enough according to the + # result from 'vmmeter'. 
+ runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: + os: [actuated-arm64-4cpu-3gb, ubuntu-24.04-arm] target: [GCC=1, CLANG=1] steps: # https://gist.github.com/alexellis/1f33e581c75e11e161fe613c46180771#file-metering-gha-md # vmmeter start - name: Prepare arkade + if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} uses: alexellis/arkade-get@master with: crane: latest print-summary: false - name: Install vmmeter + if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} run: | crane export --platform linux/arm64 ghcr.io/openfaasltd/vmmeter:latest | sudo tar -xvf - -C /usr/local/bin - name: Run vmmeter + if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} uses: self-actuated/vmmeter-action@master # vmmeter end - uses: actions/checkout@v4 - - name: Run Tests ${{ matrix.target }} + - name: Run Tests ${{ matrix.target }}/${{ matrix.os }} # Following tests are failing on the actuated VMs: # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) From daa548bbfb189beb3c2b632a39081f8713b5222f Mon Sep 17 00:00:00 2001 From: Yuanhong Peng Date: Wed, 2 Apr 2025 18:48:12 +0800 Subject: [PATCH 581/775] criu: Do not print failed message when there is no late stage hook This is highly confusing, and it seems that the ret variable is not handled in the subsequent process. Signed-off-by: Yuanhong Peng --- criu/cr-restore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 1f4881dab..583b446e0 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2258,7 +2258,7 @@ skip_ns_bouncing: * might actually be a true error code but that would be also * captured in the plugin so no need to print the error here. 
*/ - if (ret < 0) + if (ret < 0 && ret != -ENOTSUP) pr_debug("restore late stage hook for external plugin failed\n"); } From 9a1e979666275f2b94aa42f83bb4bd86ef00b7ea Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 2 Apr 2025 21:13:16 +0000 Subject: [PATCH 582/775] compel: fix the stack test The stack test incorrectly assumed the page immediately following the stack pointer could never be changed. This doesn't work, because this page can be a part of another mapping. This commit introduces a dedicated "stack redzone," a small guard region directly after the stack. The stack test is modified to specifically check for corruption within this redzone. Signed-off-by: Andrei Vagin --- compel/include/uapi/infect.h | 9 +++ compel/src/lib/infect.c | 6 +- compel/test/stack/spy.c | 113 +---------------------------------- 3 files changed, 12 insertions(+), 116 deletions(-) diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index ed97d64dd..1f61876ff 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -13,6 +13,15 @@ #define PARASITE_START_AREA_MIN (4096) +#define PARASITE_STACK_SIZE (16 << 10) +/* + * A stack redzone is a small, protected region of memory located immediately + * after a parasite stack. It is intended to remain unchanged. While it can be + * implemented as a guard page, we want to avoid the overhead of additional + * remote system calls. 
+ */ +#define PARASITE_STACK_REDZONE 128 + extern int __must_check compel_interrupt_task(int pid); struct seize_task_status { diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index a9bbd6400..4ea27bc63 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -38,8 +38,6 @@ #define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - (size_t)((struct sockaddr_un *)0)->sun_path) #endif -#define PARASITE_STACK_SIZE (16 << 10) - #ifndef SECCOMP_MODE_DISABLED #define SECCOMP_MODE_DISABLED 0 #endif @@ -1064,7 +1062,7 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, p += RESTORE_STACK_SIGFRAME; p += PARASITE_STACK_SIZE; - ctl->rstack = ctl->remote_map + p; + ctl->rstack = ctl->remote_map + p - PARASITE_STACK_REDZONE; /* * x86-64 ABI requires a 16 bytes aligned stack. @@ -1078,7 +1076,7 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, if (nr_threads > 1) { p += PARASITE_STACK_SIZE; - ctl->r_thread_stack = ctl->remote_map + p; + ctl->r_thread_stack = ctl->remote_map + p - PARASITE_STACK_REDZONE; } ret = arch_fetch_sas(ctl, ctl->rsigframe); diff --git a/compel/test/stack/spy.c b/compel/test/stack/spy.c index 9b7c9a7f0..184c8ab31 100644 --- a/compel/test/stack/spy.c +++ b/compel/test/stack/spy.c @@ -50,70 +50,6 @@ static void *get_parasite_rstack_start(struct parasite_ctl *ctl) return rstack_start; } -static int page_writable(struct parasite_ctl *ctl, int pid, void *page) -{ - FILE *maps; - size_t maps_line_len = 0; - char *maps_line = NULL; - char victim_maps_path[6 + 11 + 5 + 1]; - int written; - int ret = 0; - - if (((uintptr_t)page & (page_size() - 1)) != 0) { - fprintf(stderr, "Page address not aligned\n"); - ret = -1; - goto done; - } - - written = snprintf(victim_maps_path, sizeof(victim_maps_path), "/proc/%d/maps", pid); - if (written < 0 || written >= sizeof(victim_maps_path)) { - fprintf(stderr, "Failed to create path string to victim's /proc/%d/maps file\n", pid); - ret = -1; - 
goto done; - } - - maps = fopen(victim_maps_path, "r"); - if (maps == NULL) { - perror("Can't open victim's /proc/$pid/maps"); - ret = -1; - goto done; - } - - while (getline(&maps_line, &maps_line_len, maps) != -1) { - unsigned long vmstart, vmend; - char r, w; - - if (sscanf(maps_line, "%lx-%lx %c%c", &vmstart, &vmend, &r, &w) < 4) { - fprintf(stderr, "Can't parse victim's /proc/%d/maps; line: %s\n", pid, maps_line); - ret = -1; - goto free_linebuf; - } - - if (page >= (void *)vmstart && page < (void *)vmend) { - if (w == 'w') { - if (r != 'r') { - fprintf(stderr, "Expecting writable memory to also be readable"); - ret = -1; - goto free_linebuf; - } - ret = 1; - } - break; - } - } - - if (errno) { - perror("Can't read victim's /proc/$pid/maps"); - ret = -1; - } - -free_linebuf: - free(maps_line); - fclose(maps); -done: - return ret; -} - static void *read_proc_mem(int pid, void *offset, size_t len) { char victim_mem_path[6 + 11 + 4 + 1]; @@ -153,51 +89,6 @@ freebuf: return NULL; } -static int save_data_near_stack(struct parasite_ctl *ctl, int pid, void *stack, void **saved_data, - size_t *saved_data_size) -{ - size_t page_mask = page_size() - 1; - size_t saved_size = 0; - size_t stack_size_last_page = (uintptr_t)stack & page_mask; - void *next_page = stack; - - if (stack_size_last_page != 0) { - size_t empty_space_last_page = page_size() - stack_size_last_page; - saved_size = min(empty_space_last_page, (size_t)SAVED_DATA_MAX); - next_page += page_size() - stack_size_last_page; - } - - while (saved_size < SAVED_DATA_MAX && next_page != NULL) { - switch (page_writable(ctl, pid, next_page)) { - case 1: - saved_size = min((size_t)(saved_size + page_size()), (size_t)SAVED_DATA_MAX); - next_page += page_size(); - break; - case 0: - next_page = NULL; - break; - default: - return -1; - } - } - - if (saved_size > 0) { - void *sd; - - sd = read_proc_mem(pid, stack, saved_size); - if (sd == NULL) - return -1; - - *saved_data = sd; - } else { - *saved_data = NULL; - } - - 
*saved_data_size = saved_size; - - return 0; -} - static int check_saved_data(struct parasite_ctl *ctl, int pid, void *stack, void *saved_data, size_t saved_data_size) { if (saved_data != NULL) { @@ -221,7 +112,7 @@ static int do_infection(int pid) struct infect_ctx *ictx; int *arg; void *stack; - size_t saved_data_size; + size_t saved_data_size = PARASITE_STACK_REDZONE; int saved_data_check; compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); @@ -257,8 +148,6 @@ static int do_infection(int pid) err_and_ret("Can't register cleanup function with atexit\n"); stack = get_parasite_rstack_start(ctl); - if (save_data_near_stack(ctl, pid, stack, &saved_data, &saved_data_size)) - err_and_ret("Can't save data above stack\n"); if (compel_start_daemon(ctl)) err_and_ret("Can't start daemon in victim\n"); From 5ff52326e15b90dc59ed8ae317735201277a2377 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 27 Mar 2025 14:21:03 +0000 Subject: [PATCH 583/775] restore: use the new kernel interface to restore timers Thomas Gleixner introduced the new interface to create posix timers with specified timer IDs: https://github.com/torvalds/linux/commit/ec2d0c04624b3c8a7eb1682e006717fa20cfbe24 Previously, CRIU recreated timers by repeatedly creating and deleting them until the desired ID was reached. This approach isn't fast, especially for timers with large IDs. For example, restoring two timers with IDs 1000000 and 2000000 took approximately 1.5 seconds. The new `prctl()` based interface allows direct creation of timers with specified IDs, reducing the restoration time to around 3 microseconds for the same example.
Signed-off-by: Andrei Vagin --- criu/cr-check.c | 10 ++++++++ criu/include/kerndat.h | 1 + criu/include/prctl.h | 7 ++++++ criu/include/restorer.h | 1 + criu/kerndat.c | 20 +++++++++++++++ criu/pie/restorer.c | 54 +++++++++++++++++++++++++++++++++++++---- criu/timer.c | 2 ++ 7 files changed, 90 insertions(+), 5 deletions(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index 0388cbe7f..7b4a6415a 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1392,6 +1392,14 @@ static int check_pagemap_scan(void) return 0; } +static int check_timer_cr_ids(void) +{ + if (!kdat.has_timer_cr_ids) + return -1; + + return 0; +} + /* musl doesn't have a statx wrapper... */ struct staty { __u32 stx_dev_major; @@ -1703,6 +1711,7 @@ int cr_check(void) ret |= check_ipv6_freebind(); ret |= check_pagemap_scan(); ret |= check_overlayfs_maps(); + ret |= check_timer_cr_ids(); if (kdat.lsm == LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); @@ -1825,6 +1834,7 @@ static struct feature_list feature_list[] = { { "get_rseq_conf", check_ptrace_get_rseq_conf }, { "ipv6_freebind", check_ipv6_freebind }, { "pagemap_scan", check_pagemap_scan }, + { "timer_cr_ids", check_timer_cr_ids }, { "overlayfs_maps", check_overlayfs_maps }, { NULL, NULL }, }; diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index e03a57341..bd8744d62 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -89,6 +89,7 @@ struct kerndat_s { bool has_pagemap_scan; bool has_shstk; bool has_close_range; + bool has_timer_cr_ids; }; extern struct kerndat_s kdat; diff --git a/criu/include/prctl.h b/criu/include/prctl.h index f5f23c969..2966659da 100644 --- a/criu/include/prctl.h +++ b/criu/include/prctl.h @@ -97,4 +97,11 @@ struct prctl_mm_map { #define PR_GET_THP_DISABLE 42 #endif +#ifndef PR_TIMER_CREATE_RESTORE_IDS +#define PR_TIMER_CREATE_RESTORE_IDS 77 +# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0 +# define PR_TIMER_CREATE_RESTORE_IDS_ON 1 +# define PR_TIMER_CREATE_RESTORE_IDS_GET 2 +#endif + #endif /* 
__CR_PRCTL_H__ */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h index a4fb7ea79..56bea0fcc 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -170,6 +170,7 @@ struct task_restore_args { struct restore_posix_timer *posix_timers; unsigned int posix_timers_n; + bool posix_timer_cr_ids; struct restore_timerfd *timerfd; unsigned int timerfd_n; diff --git a/criu/kerndat.c b/criu/kerndat.c index 5939005a4..930117b0a 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1720,6 +1720,22 @@ static int kerndat_has_close_range(void) return 0; } +static int kerndat_has_timer_cr_ids(void) +{ + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_GET, 0, 0, 0) == -1) { + if (errno == EINVAL) { + pr_debug("PR_TIMER_CREATE_RESTORE_IDS isn't supported\n"); + return 0; + } + pr_perror("prctl returned unexpected error code"); + return -1; + } + + kdat.has_timer_cr_ids = true; + return 0; +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. There are cases that we cannot determine the @@ -1981,6 +1997,10 @@ int kerndat_init(void) pr_err("kerndat_has_close_range has failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_has_timer_cr_ids()) { + pr_err("kerndat_has_timer_cr_ids has failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 348ce6659..9867a3ddd 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1235,9 +1235,23 @@ static int timerfd_arm(struct task_restore_args *args) static int create_posix_timers(struct task_restore_args *args) { - int ret, i; + int ret, i, exit_code = -1; kernel_timer_t next_id = 0, timer_id; struct sigevent sev; + bool create_restore_ids = false; + + if (!args->posix_timers_n) + return 0; + + /* prctl returns EINVAL if PR_TIMER_CREATE_RESTORE_IDS isn't supported. 
*/ + ret = sys_prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_ON, 0, 0, 0); + if (ret == 0) { + create_restore_ids = true; + } else if (ret != -EINVAL) { + pr_err("Can't enabled PR_TIMER_CREATE_RESTORE_IDS: %d\n", ret); + return -1; + } for (i = 0; i < args->posix_timers_n; i++) { sev.sigev_notify = args->posix_timers[i].spt.it_sigev_notify; @@ -1249,16 +1263,36 @@ static int create_posix_timers(struct task_restore_args *args) #endif sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr; + if (create_restore_ids) { + /* + * With enabled PR_TIMER_CREATE_RESTORE_IDS, the + * timer_create syscall creates a new timer with the + * specified ID. + */ + timer_id = args->posix_timers[i].spt.it_id; + ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); + if (ret < 0) { + pr_err("Can't create posix timer - %d: %d\n", i, ret); + goto out; + } + if (timer_id != args->posix_timers[i].spt.it_id) { + pr_err("Unexpected timer id %u (expected %lu)\n", + timer_id, args->posix_timers[i].spt.it_id); + goto out; + } + continue; + } + while (1) { ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); if (ret < 0) { pr_err("Can't create posix timer - %d\n", i); - return ret; + goto out; } if (timer_id != next_id) { pr_err("Can't create timers, kernel don't give them consequently\n"); - return -1; + goto out; } next_id++; @@ -1268,12 +1302,22 @@ static int create_posix_timers(struct task_restore_args *args) ret = sys_timer_delete(timer_id); if (ret < 0) { pr_err("Can't remove temporaty posix timer 0x%x\n", timer_id); - return ret; + goto out; } } } - return 0; + exit_code = 0; +out: + if (create_restore_ids) { + ret = sys_prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_OFF, 0, 0, 0); + if (ret != 0) { + pr_err("Can't disable PR_TIMER_CREATE_RESTORE_IDS: %d\n", ret); + exit_code = -1; + } + } + return exit_code; } static void restore_posix_timers(struct task_restore_args *args) diff --git 
a/criu/timer.c b/criu/timer.c index 0413e2a72..856501be6 100644 --- a/criu/timer.c +++ b/criu/timer.c @@ -195,6 +195,7 @@ int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) if (!img) return -1; + ta->posix_timer_cr_ids = kdat.has_timer_cr_ids; ta->posix_timers_n = 0; while (1) { PosixTimerEntry *pte; @@ -234,6 +235,7 @@ int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) return prepare_posix_timers_from_fd(pid, ta); ta->posix_timers_n = tte->n_posix; + ta->posix_timer_cr_ids = kdat.has_timer_cr_ids; for (i = 0; i < ta->posix_timers_n; i++) { t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); if (!t) From e7aee3c5c723e95e1c0e787f4c57919c2fc58c60 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 10 Apr 2025 20:56:23 +0100 Subject: [PATCH 584/775] cuda: use pr_perror for libc function errors When handling errors for functions such as `ptrace()`, `pipe()`, and `fork()` it would be better to use `pr_perror` instead of `pr_err` as it would include a message describing the encountered error.
Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 99e4caf74..1aaad6842 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -93,7 +93,7 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) int fd[2], buf_off; if (pipe(fd) != 0) { - pr_err("Couldn't create pipes for reading cuda-checkpoint output\n"); + pr_perror("Couldn't create pipes for reading cuda-checkpoint output"); return -1; } @@ -101,7 +101,7 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) int child_pid = fork(); if (child_pid == -1) { - pr_err("Failed to fork to exec cuda-checkpoint\n"); + pr_perror("Failed to fork to exec cuda-checkpoint"); close(fd[READ]); close(fd[WRITE]); return -1; @@ -166,7 +166,6 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) } if (WIFSIGNALED(status)) { int sig = WTERMSIG(status); - pr_err("cuda-checkpoint unexpectedly signaled with %d: %s\n", sig, strsignal(sig)); } else if (WIFEXITED(status)) { exit_code = WEXITSTATUS(status); @@ -283,8 +282,8 @@ static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigse * a compel_interrupt_task() */ if (ptrace(PTRACE_INTERRUPT, restore_tid, NULL, 0)) { - pr_err("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state\n", - restore_tid); + pr_perror("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state", + restore_tid); return -1; } @@ -295,12 +294,12 @@ static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigse } if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD)) { - pr_err("Failed to set ptrace options on interrupt for restore tid %d\n", restore_tid); + pr_perror("Failed to set ptrace options on 
interrupt for restore tid %d", restore_tid); return -1; } if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(*restore_sigset), restore_sigset)) { - pr_err("Unable to restore original sigmask to restore tid %d\n", restore_tid); + pr_perror("Unable to restore original sigmask to restore tid %d", restore_tid); return -1; } @@ -312,7 +311,7 @@ static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset) k_rtsigset_t block; if (ptrace(PTRACE_GETSIGMASK, restore_tid, sizeof(*save_sigset), save_sigset)) { - pr_err("Failed to get current sigmask for restore tid %d\n", restore_tid); + pr_perror("Failed to get current sigmask for restore tid %d", restore_tid); return -1; } @@ -320,18 +319,18 @@ static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset) ksigdelset(&block, SIGTRAP); if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(block), &block)) { - pr_err("Failed to block signals on restore tid %d\n", restore_tid); + pr_perror("Failed to block signals on restore tid %d", restore_tid); return -1; } // Clear out PTRACE_O_SUSPEND_SECCOMP when we resume the restore thread if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, 0)) { - pr_err("Could not clear ptrace options on restore tid %d\n", restore_tid); + pr_perror("Could not clear ptrace options on restore tid %d", restore_tid); return -1; } if (ptrace(PTRACE_CONT, restore_tid, NULL, 0)) { - pr_err("Could not resume cuda restore tid %d\n", restore_tid); + pr_perror("Could not resume cuda restore tid %d", restore_tid); return -1; } From 6805841660e741eda203ef8339a895281f2095e9 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 10 Apr 2025 21:14:05 +0100 Subject: [PATCH 585/775] cuda: remove redundant goto label The `goto interrupt` label is unnecessary as the code directly returns after `cuda_process_checkpoint_action()`. 
Signed-off-by: Radostin Stoyanov --- plugins/cuda/cuda_plugin.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index 1aaad6842..9ccb04224 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -395,12 +395,9 @@ int cuda_plugin_checkpoint_devices(int pid) status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf)); if (status) { pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf); - goto interrupt; } -interrupt: int_ret = interrupt_restore_thread(restore_tid, &save_sigset); - return status != 0 ? -1 : int_ret; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices); From 74799ae023f82d99efac8d67974705087f208567 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 9 Apr 2025 13:25:44 +0000 Subject: [PATCH 586/775] aarch64: fix build with missing NT_ARM_PAC_ENABLED_KEYS On a RHEL 8 based system building CRIU fails with: criu/arch/aarch64/crtools.c: In function 'save_pac_keys': criu/arch/aarch64/crtools.c:73:39: error: 'NT_ARM_PAC_ENABLED_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_PACA_KEYS'? ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov); ^~~~~~~~~~~~~~~~~~~~~~~ NT_ARM_PACA_KEYS criu/arch/aarch64/crtools.c:73:39: note: each undeclared identifier is reported only once for each function it appears in criu/arch/aarch64/crtools.c: In function 'arch_ptrace_restore': criu/arch/aarch64/crtools.c:261:44: error: 'NT_ARM_PAC_ENABLED_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_PACA_KEYS'? if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov))) { ^~~~~~~~~~~~~~~~~~~~~~~ NT_ARM_PACA_KEYS This adds the missing define if it is undefined. 
Signed-off-by: Adrian Reber --- criu/arch/aarch64/crtools.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index 6cde03ee3..c077dd06b 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -23,6 +23,10 @@ #include "compel/infect.h" #include "pstree.h" +#ifndef NT_ARM_PAC_ENABLED_KEYS +#define NT_ARM_PAC_ENABLED_KEYS 0x40a /* AArch64 pointer authentication enabled keys. */ +#endif + extern unsigned long getauxval(unsigned long type); #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e From b9da95b0b2c5f42b24725d673bf287b3c00bbc40 Mon Sep 17 00:00:00 2001 From: Younes Manton Date: Tue, 23 Jan 2024 08:22:07 -0800 Subject: [PATCH 587/775] s390: Fix FP reg restore after parasite code runs Currently we save FP regs before parasite code runs, and restore after for --leave-running, --check-only, and in case of errors. In case of errors the error may have happened before FP regs were saved, so we should only restore them if they were actually saved. 
Signed-off-by: Younes Manton --- criu/arch/s390/crtools.c | 90 +++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 38 deletions(-) diff --git a/criu/arch/s390/crtools.c b/criu/arch/s390/crtools.c index 96cef819e..e08c83878 100644 --- a/criu/arch/s390/crtools.c +++ b/criu/arch/s390/crtools.c @@ -142,6 +142,29 @@ static void print_core_fp_regs(const char *msg, CoreEntry *core) print_core_ri_cb(core); } +/* + * Allocate floating point registers + */ +static UserS390FpregsEntry *allocate_fp_regs(void) +{ + UserS390FpregsEntry *fpregs; + + fpregs = xmalloc(sizeof(*fpregs)); + if (!fpregs) + return NULL; + user_s390_fpregs_entry__init(fpregs); + + fpregs->n_fprs = 16; + fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); + if (!fpregs->fprs) + goto fail_free_fpregs; + return fpregs; + +fail_free_fpregs: + xfree(fpregs); + return NULL; +} + /* * Allocate VxrsLow registers */ @@ -294,7 +317,13 @@ int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_stru CoreEntry *core = arg; gpregs = CORE_THREAD_ARCH_INFO(core)->gpregs; - fpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; + /* + * We delay allocating this until now because checkpointing can fail earlier. + * When it fails we need to know if we reached here or not so that the cleanup + * code doesn't restore FPRs that were never saved in the first place. 
+ */ + fpregs = allocate_fp_regs(); + CORE_THREAD_ARCH_INFO(core)->fpregs = fpregs; /* Vector registers */ if (f->flags & USER_FPREGS_VXRS) { @@ -399,36 +428,15 @@ int restore_fpu(struct rt_sigframe *f, CoreEntry *core) return 0; } -/* - * Allocate floating point registers - */ -static UserS390FpregsEntry *allocate_fp_regs(void) -{ - UserS390FpregsEntry *fpregs; - - fpregs = xmalloc(sizeof(*fpregs)); - if (!fpregs) - return NULL; - user_s390_fpregs_entry__init(fpregs); - - fpregs->n_fprs = 16; - fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); - if (!fpregs->fprs) - goto fail_free_fpregs; - return fpregs; - -fail_free_fpregs: - xfree(fpregs); - return NULL; -} - /* * Free floating point registers */ static void free_fp_regs(UserS390FpregsEntry *fpregs) { - xfree(fpregs->fprs); - xfree(fpregs); + if (fpregs) { + xfree(fpregs->fprs); + xfree(fpregs); + } } /* @@ -487,15 +495,17 @@ int arch_alloc_thread_info(CoreEntry *core) ti_s390->gpregs = allocate_gp_regs(); if (!ti_s390->gpregs) goto fail_free_ti_s390; - ti_s390->fpregs = allocate_fp_regs(); - if (!ti_s390->fpregs) - goto fail_free_gp_regs; + + /* + * Delay allocating space until needed. Checkpointing can fail before that + * and the cleanup code needs to be able to tell if FPRs were saved or not + * before trying to restore the register state. + */ + ti_s390->fpregs = NULL; CORE_THREAD_ARCH_INFO(core) = ti_s390; return 0; -fail_free_gp_regs: - free_gp_regs(ti_s390->gpregs); fail_free_ti_s390: xfree(ti_s390); return -1; @@ -678,14 +688,18 @@ static int set_task_regs(pid_t pid, CoreEntry *core) user_fpregs_struct_t fpregs; memset(&fpregs, 0, sizeof(fpregs)); - /* Floating point registers */ + /* + * Floating point registers + * Optional on checkpoint; checkpoint may have failed and we may reach here as part of cleanup + * so there's no guarantee that we saved FPRs for this thread. 
+ */ cfpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; - if (!cfpregs) - return -1; - fpregs.prfpreg.fpc = cfpregs->fpc; - memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); - if (set_fp_regs(pid, &fpregs) < 0) - return -1; + if (cfpregs) { + fpregs.prfpreg.fpc = cfpregs->fpc; + memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); + if (set_fp_regs(pid, &fpregs) < 0) + return -1; + } /* Vector registers (optional) */ cvxrs_low = CORE_THREAD_ARCH_INFO(core)->vxrs_low; if (cvxrs_low != NULL) { From 5de61a721fbc56de68094f19ac34466d66f7374f Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 21 Apr 2025 06:33:41 +0000 Subject: [PATCH 588/775] net: nftables: avoid restore failure if the CRIU nft table already exist CRIU locks the network during restore in an "empty" network namespace. However, "empty" in this context means CRIU isn't restoring the namespace. This network namespace can be the same namespace where processes have been dumped and so the network is already locked in it. Fixes #2650 Signed-off-by: Andrei Vagin --- criu/cr-restore.c | 2 +- criu/include/net.h | 2 +- criu/net.c | 30 +++++++++++++++++------------- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 583b446e0..30932f60a 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2119,7 +2119,7 @@ static int restore_root_task(struct pstree_item *init) * the '--empty-ns net' mode no iptables C/R is done and we * need to return these rules by hands. 
*/ - ret = network_lock_internal(); + ret = network_lock_internal(/* restore = */ true); if (ret) goto out_kill; } diff --git a/criu/include/net.h b/criu/include/net.h index 5e8a84862..7c5ede21e 100644 --- a/criu/include/net.h +++ b/criu/include/net.h @@ -31,7 +31,7 @@ extern int collect_net_namespaces(bool for_dump); extern int network_lock(void); extern void network_unlock(void); -extern int network_lock_internal(void); +extern int network_lock_internal(bool restore); extern struct ns_desc net_ns_desc; diff --git a/criu/net.c b/criu/net.c index ee46f1c49..300df480b 100644 --- a/criu/net.c +++ b/criu/net.c @@ -3206,12 +3206,12 @@ static inline FILE *redirect_nftables_output(struct nft_ctx *nft) } #endif -static inline int nftables_lock_network_internal(void) +static inline int nftables_lock_network_internal(bool restore) { #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) cleanup_file FILE *fp = NULL; struct nft_ctx *nft; - int ret = 0; + int ret = 0, exit_code = -1; char table[32]; char buf[128]; @@ -3224,11 +3224,16 @@ static inline int nftables_lock_network_internal(void) fp = redirect_nftables_output(nft); if (!fp) - goto out; + goto err2; snprintf(buf, sizeof(buf), "create table %s", table); - if (NFT_RUN_CMD(nft, buf)) + ret = NFT_RUN_CMD(nft, buf); + if (ret) { + /* The network has been locked on dump. 
*/ + if (restore && errno == EEXIST) + return 0; goto err2; + } snprintf(buf, sizeof(buf), "add chain %s output { type filter hook output priority 0; policy drop; }", table); if (NFT_RUN_CMD(nft, buf)) @@ -3246,17 +3251,16 @@ static inline int nftables_lock_network_internal(void) if (NFT_RUN_CMD(nft, buf)) goto err1; - goto out; - + exit_code = 0; +out: + nft_ctx_free(nft); + return exit_code; err1: snprintf(buf, sizeof(buf), "delete table %s", table); NFT_RUN_CMD(nft, buf); err2: - ret = -1; pr_err("Locking network failed using nftables\n"); -out: - nft_ctx_free(nft); - return ret; + goto out; #else pr_err("CRIU was built without libnftables support\n"); return -1; @@ -3288,7 +3292,7 @@ static int iptables_network_lock_internal(void) return ret; } -int network_lock_internal(void) +int network_lock_internal(bool restore) { int ret = 0, nsret; @@ -3301,7 +3305,7 @@ int network_lock_internal(void) if (opts.network_lock_method == NETWORK_LOCK_IPTABLES) ret = iptables_network_lock_internal(); else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) - ret = nftables_lock_network_internal(); + ret = nftables_lock_network_internal(restore); if (restore_ns(nsret, &net_ns_desc)) ret = -1; @@ -3427,7 +3431,7 @@ int network_lock(void) if (run_scripts(ACT_NET_LOCK)) return -1; - return network_lock_internal(); + return network_lock_internal(false); } void network_unlock(void) From b6dca31162562385cb0657af3443666990a28c01 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 14 Apr 2025 14:12:31 +0100 Subject: [PATCH 589/775] aarch64/crtools: fix define for missing constants Building CRIU package on Debian 11 aarch64 fails with criu/arch/aarch64/crtools.c: In function 'save_pac_keys': criu/arch/aarch64/crtools.c:32:31: error: storage size of 'paca' isn't known struct user_pac_address_keys paca; ^~~~ criu/arch/aarch64/crtools.c:33:31: error: storage size of 'pacg' isn't known struct user_pac_generic_keys pacg; ^~~~ criu/arch/aarch64/crtools.c:47:15: error: 'HWCAP_PACA' 
undeclared (first use in this function); did you mean 'HWCAP_FCMA'? if (hwcaps & HWCAP_PACA) { ^~~~~~~~~~ HWCAP_FCMA criu/arch/aarch64/crtools.c:47:15: note: each undeclared identifier is reported only once for each function it appears in criu/arch/aarch64/crtools.c:53:44: error: 'NT_ARM_PACA_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_SVE'? if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { ^~~~~~~~~~~~~~~~ NT_ARM_SVE criu/arch/aarch64/crtools.c:73:39: error: 'NT_ARM_PAC_ENABLED_KEYS' undeclared (first use in this function) ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov); ^~~~~~~~~~~~~~~~~~~~~~~ criu/arch/aarch64/crtools.c:82:15: error: 'HWCAP_PACG' undeclared (first use in this function); did you mean 'HWCAP_AES'? if (hwcaps & HWCAP_PACG) { ^~~~~~~~~~ HWCAP_AES criu/arch/aarch64/crtools.c:88:44: error: 'NT_ARM_PACG_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_SVE'? if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { ^~~~~~~~~~~~~~~~ NT_ARM_SVE criu/arch/aarch64/crtools.c:33:31: error: unused variable 'pacg' [-Werror=unused-variable] struct user_pac_generic_keys pacg; ^~~~ criu/arch/aarch64/crtools.c:32:31: error: unused variable 'paca' [-Werror=unused-variable] struct user_pac_address_keys paca; ^~~~ criu/arch/aarch64/crtools.c: In function 'arch_ptrace_restore': criu/arch/aarch64/crtools.c:227:31: error: storage size of 'upaca' isn't known struct user_pac_address_keys upaca; ^~~~~ criu/arch/aarch64/crtools.c:228:31: error: storage size of 'upacg' isn't known struct user_pac_generic_keys upacg; ^~~~~ criu/arch/aarch64/crtools.c:241:18: error: 'HWCAP_PACA' undeclared (first use in this function); did you mean 'HWCAP_FCMA'? if (!(hwcaps & HWCAP_PACA)) { ^~~~~~~~~~ HWCAP_FCMA criu/arch/aarch64/crtools.c:255:44: error: 'NT_ARM_PACA_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_SVE'? 
if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { ^~~~~~~~~~~~~~~~ NT_ARM_SVE criu/arch/aarch64/crtools.c:261:44: error: 'NT_ARM_PAC_ENABLED_KEYS' undeclared (first use in this function) if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov))) { ^~~~~~~~~~~~~~~~~~~~~~~ criu/arch/aarch64/crtools.c:268:18: error: 'HWCAP_PACG' undeclared (first use in this function); did you mean 'HWCAP_AES'? if (!(hwcaps & HWCAP_PACG)) { ^~~~~~~~~~ HWCAP_AES criu/arch/aarch64/crtools.c:275:44: error: 'NT_ARM_PACG_KEYS' undeclared (first use in this function); did you mean 'NT_ARM_SVE'? if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { ^~~~~~~~~~~~~~~~ NT_ARM_SVE criu/arch/aarch64/crtools.c:233:6: error: variable 'ret' set but not used [-Werror=unused-but-set-variable] int ret; ^~~ criu/arch/aarch64/crtools.c:228:31: error: unused variable 'upacg' [-Werror=unused-variable] struct user_pac_generic_keys upacg; ^~~~~ criu/arch/aarch64/crtools.c:227:31: error: unused variable 'upaca' [-Werror=unused-variable] struct user_pac_address_keys upaca; ^~~~~ This patch adds the missing constants and structs if undefined. 
Signed-off-by: Radostin Stoyanov --- criu/arch/aarch64/crtools.c | 47 +++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index c077dd06b..3ed5c9d63 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -23,6 +23,45 @@ #include "compel/infect.h" #include "pstree.h" +/* + * cr_user_pac_* are a copy of the corresponding uapi structs + * in arch/arm64/include/uapi/asm/ptrace.h + */ +struct cr_user_pac_address_keys { + __uint128_t apiakey; + __uint128_t apibkey; + __uint128_t apdakey; + __uint128_t apdbkey; +}; + +struct cr_user_pac_generic_keys { + __uint128_t apgakey; +}; + +/* + * The following HWCAP constants are copied from + * arch/arm64/include/uapi/asm/hwcap.h + */ +#ifndef HWCAP_PACA +#define HWCAP_PACA (1 << 30) +#endif + +#ifndef HWCAP_PACG +#define HWCAP_PACG (1UL << 31) +#endif + +/* + * The following NT_ARM_PAC constants are copied from + * include/uapi/linux/elf.h + */ +#ifndef NT_ARM_PACA_KEYS +#define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ +#endif + +#ifndef NT_ARM_PACG_KEYS +#define NT_ARM_PACG_KEYS 0x408 +#endif + #ifndef NT_ARM_PAC_ENABLED_KEYS #define NT_ARM_PAC_ENABLED_KEYS 0x40a /* AArch64 pointer authentication enabled keys. 
*/ #endif @@ -33,8 +72,8 @@ extern unsigned long getauxval(unsigned long type); static int save_pac_keys(int pid, CoreEntry *core) { - struct user_pac_address_keys paca; - struct user_pac_generic_keys pacg; + struct cr_user_pac_address_keys paca; + struct cr_user_pac_generic_keys pacg; PacKeys *pac_entry; long pac_enabled_key; struct iovec iov; @@ -228,8 +267,8 @@ int restore_gpregs(struct rt_sigframe *f, UserRegsEntry *r) int arch_ptrace_restore(int pid, struct pstree_item *item) { unsigned long hwcaps = getauxval(AT_HWCAP); - struct user_pac_address_keys upaca; - struct user_pac_generic_keys upacg; + struct cr_user_pac_address_keys upaca; + struct cr_user_pac_generic_keys upacg; PacAddressKeys *paca; PacGenericKeys *pacg; long pac_enabled_keys; From 88cb552f692353983aeab6478d1779566afd154e Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 2 Apr 2025 12:02:46 +0800 Subject: [PATCH 590/775] mount: restore root mount flags Mount flags belong to mount and mount namespace of the Container, so we should preserve them, as Container user will not expect mounts switching between ro and rw over c/r. Fixes: #2632 v5: fix both mount-v1 and mount-v2 Signed-off-by: Pavel Tikhomirov --- criu/mount-v2.c | 6 ++++++ criu/mount.c | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/criu/mount-v2.c b/criu/mount-v2.c index cdebc8318..1e33ac12a 100644 --- a/criu/mount-v2.c +++ b/criu/mount-v2.c @@ -443,6 +443,7 @@ err: /* Mounts root container mount. 
*/ static int do_mount_root_v2(struct mount_info *mi) { + unsigned long mflags = mi->flags & (~MS_PROPAGATE); unsigned long flags = MS_BIND; int fd; @@ -477,6 +478,11 @@ static int do_mount_root_v2(struct mount_info *mi) return -1; } + if (mflags && mount(NULL, mi->plain_mountpoint, NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { + pr_perror("Unable to apply root mount options"); + return -1; + } + mi->mounted = true; return 0; diff --git a/criu/mount.c b/criu/mount.c index 82bbd52d6..06b959542 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -2690,9 +2690,16 @@ shared: static int do_mount_root(struct mount_info *mi) { + unsigned long mflags = mi->flags & (~MS_PROPAGATE); + if (restore_shared_options(mi, !mi->shared_id && !mi->master_id, mi->shared_id, mi->master_id)) return -1; + if (mflags && mount(NULL, service_mountpoint(mi), NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { + pr_perror("Unable to apply root mount options"); + return -1; + } + return fetch_rt_stat(mi, service_mountpoint(mi)); } From 6b3826a6fb384632dfbd6e4b90c43b15842f09f8 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 2 Apr 2025 12:47:46 +0800 Subject: [PATCH 591/775] zdtm/lib: add "bind" desc option Add {'bind': 'path/to/bindmount'} zdtm descriptor option, so that in test mount namespace a directory bindmount can be created before running the test. This is useful to leave test directory writable (e.g. for logs) while the test makes root mount readonly. note: We create this bindmount early so that all test files are opened on it initially and not on the below mount. Will be used in mnt_ro_root test. 
Signed-off-by: Pavel Tikhomirov --- test/zdtm.py | 3 +++ test/zdtm/lib/ns.c | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/test/zdtm.py b/test/zdtm.py index 37ebe63b7..e3ddc762a 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -443,6 +443,7 @@ class zdtm_test: self._bins = [name] self._env = {'TMPDIR': os.environ.get('TMPDIR', '/tmp')} self._deps = desc.get('deps', []) + self._bind = desc.get('bind') self.auto_reap = True def __make_action(self, act, env=None, root=None): @@ -513,6 +514,8 @@ class zdtm_test: if self.__flavor.ns: env['ZDTM_NEWNS'] = "1" env['ZDTM_ROOT'] = self.__flavor.root + if self._bind: + env['ZDTM_BIND'] = self._bind env['ZDTM_DEV'] = self.__flavor.devpath env['PATH'] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" diff --git a/test/zdtm/lib/ns.c b/test/zdtm/lib/ns.c index 3c0dbdeb8..5fe81561f 100644 --- a/test/zdtm/lib/ns.c +++ b/test/zdtm/lib/ns.c @@ -28,8 +28,9 @@ extern int pivot_root(const char *new_root, const char *put_old); static int prepare_mntns(void) { int dfd, ret; - char *root, *criu_path, *dev_path; + char *root, *criu_path, *dev_path, *zdtm_bind; char path[PATH_MAX]; + char bind_path[PATH_MAX]; root = getenv("ZDTM_ROOT"); if (!root) { @@ -52,6 +53,18 @@ static int prepare_mntns(void) return -1; } + zdtm_bind = getenv("ZDTM_BIND"); + if (zdtm_bind) { + /* + * Bindmount the directory to itself. + */ + snprintf(bind_path, sizeof(bind_path), "%s/%s", root, zdtm_bind); + if (mount(bind_path, bind_path, NULL, MS_BIND, NULL)) { + fprintf(stderr, "Can't bind-mount ZDTM_BIND: %m\n"); + return -1; + } + } + dev_path = getenv("ZDTM_DEV"); if (dev_path) { snprintf(path, sizeof(path), "%s/dev", root); From 5a725266ac83ab4dedbd11cc76c29a257c018fef Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 2 Apr 2025 12:47:46 +0800 Subject: [PATCH 592/775] zdtm: add mnt_ro_root test It makes root mount readonly and checks that it is still readonly after migration. 
Make zdtm/static writable for logs via "bind" desc option. v2: explain why we don't have explicit rw/ro flag check v3: use new zdtm "bind" desc option Signed-off-by: Pavel Tikhomirov --- test/zdtm/lib/ns.c | 3 +++ test/zdtm/static/Makefile | 1 + test/zdtm/static/mnt_ro_root.c | 32 +++++++++++++++++++++++++++++++ test/zdtm/static/mnt_ro_root.desc | 6 ++++++ 4 files changed, 42 insertions(+) create mode 100644 test/zdtm/static/mnt_ro_root.c create mode 100644 test/zdtm/static/mnt_ro_root.desc diff --git a/test/zdtm/lib/ns.c b/test/zdtm/lib/ns.c index 5fe81561f..822e09c92 100644 --- a/test/zdtm/lib/ns.c +++ b/test/zdtm/lib/ns.c @@ -57,6 +57,9 @@ static int prepare_mntns(void) if (zdtm_bind) { /* * Bindmount the directory to itself. + * e.g.: The mnt_ro_root test makes "/" mount readonly, but we + * still want to write logs to /zdtm/static/ so let's make it + * separate writable bind mount. */ snprintf(bind_path, sizeof(bind_path), "%s/%s", root, zdtm_bind); if (mount(bind_path, bind_path, NULL, MS_BIND, NULL)) { diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 6a19cad3c..81e44de22 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -423,6 +423,7 @@ TST_DIR = \ mntns_ghost \ mntns_ghost01 \ mntns_ro_root \ + mnt_ro_root \ mntns_link_ghost \ mntns_shared_bind \ mntns_shared_bind02 \ diff --git a/test/zdtm/static/mnt_ro_root.c b/test/zdtm/static/mnt_ro_root.c new file mode 100644 index 000000000..2d8370150 --- /dev/null +++ b/test/zdtm/static/mnt_ro_root.c @@ -0,0 +1,32 @@ +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if root mount remains read-only after c/r"; +const char *test_author = "Pavel Tikhomirov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + if (mount(NULL, "/", NULL, MS_REMOUNT | MS_RDONLY | MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* + * Note: In 
zdtm.py:check_visible_state() we already check for all + * tests, that all mounts in the test's mount namespace remain the + * same, by comparing mountinfo before and after c/r. So rw/ro mount + * option inconsistency will be detected there and we don't need to + * check it in the test itself. + */ + pass(); + return 0; +} diff --git a/test/zdtm/static/mnt_ro_root.desc b/test/zdtm/static/mnt_ro_root.desc new file mode 100644 index 000000000..c9a8e4f18 --- /dev/null +++ b/test/zdtm/static/mnt_ro_root.desc @@ -0,0 +1,6 @@ +{ + 'flavor': 'ns uns', + 'flags': 'suid', + 'feature': 'mnt_id', + 'bind': 'zdtm/static', +} From b458a5c1ad71b1081b3e1fdbc51b4581faabc4cf Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 7 May 2025 14:06:55 +0100 Subject: [PATCH 593/775] sk-inet: add message how to disable MPTCP in Go With Go version 1.24, ListenConfig now uses MPTCP by default [1]. Checkpoint/restore for this protocol is not currently supported and adding support requires kernel changes that are not trivial to implement. As a result, checkpointing of many containers that run Go programs is likely to fail with the following error [2]: (00.026522) Error (criu/sk-inet.c:130): inet: Unsupported proto 262 for socket 2f9bc5 This patch adds a message with suggested workaround for this problem. 
[1] https://go.dev/doc/go1.24#netpkgnet [2] https://github.com/checkpoint-restore/criu/issues/2655 Signed-off-by: Radostin Stoyanov --- criu/sk-inet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 92f53e569..a191e78c4 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -128,6 +128,8 @@ static int can_dump_ipproto(unsigned int ino, int proto, int type) break; default: pr_err("Unsupported proto %d for socket %x\n", proto, ino); + if (proto == IPPROTO_MPTCP) + pr_err("For Go programs, consider using \"GODEBUG=multipathtcp=0\" to disable MPTCP\n"); return 0; } From 1eaa870ccebba1e067862e45b6f8887e07d61a5c Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 6 May 2025 15:38:26 +0000 Subject: [PATCH 594/775] kerndat: check that hardware breakpoints work In some cases, they might not work in virtual machines if the hypervisor doesn't virtualize them. For example, they don't work in AMD SEV virtual machines if the Debug Virtualization extension isn't supported or isn't enabled in SEV_FEATURES. Fixes #2658 Signed-off-by: Andrei Vagin --- criu/cr-check.c | 17 +++++++++ criu/cr-restore.c | 3 +- criu/include/kerndat.h | 1 + criu/kerndat.c | 80 +++++++++++++++++++++++++++++++++++++++++ criu/parasite-syscall.c | 2 +- 5 files changed, 101 insertions(+), 2 deletions(-) diff --git a/criu/cr-check.c b/criu/cr-check.c index 7b4a6415a..9c4778490 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1589,6 +1589,17 @@ static int check_overlayfs_maps(void) return status == 0 ? 0 : -1; } +static int check_breakpoints(void) +{ + if (!kdat.has_breakpoints) { + pr_warn("Hardware breakpoints don't seem to work\n"); + return -1; + } + + return 0; +} + + static int (*chk_feature)(void); /* @@ -1616,6 +1627,7 @@ static int (*chk_feature)(void); return ret; \ } \ } while (0) + int cr_check(void) { struct ns_id *ns; @@ -1724,6 +1736,10 @@ int cr_check(void) ret |= check_autofs(); ret |= check_compat_cr(); } + /* + * Category 4 - optional. 
+ */ + check_breakpoints(); pr_msg("%s\n", ret ? CHECK_MAYBE : CHECK_GOOD); return ret; @@ -1836,6 +1852,7 @@ static struct feature_list feature_list[] = { { "pagemap_scan", check_pagemap_scan }, { "timer_cr_ids", check_timer_cr_ids }, { "overlayfs_maps", check_overlayfs_maps }, + { "breakpoints", check_breakpoints }, { NULL, NULL }, }; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 30932f60a..cabe2f464 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1820,6 +1820,7 @@ static int restore_rseq_cs(void) static int catch_tasks(bool root_seized) { struct pstree_item *item; + bool nobp = fault_injected(FI_NO_BREAKPOINTS) || !kdat.has_breakpoints; for_each_pstree_item(item) { int status, i, ret; @@ -1847,7 +1848,7 @@ static int catch_tasks(bool root_seized) return -1; } - ret = compel_stop_pie(pid, rsti(item)->breakpoint, fault_injected(FI_NO_BREAKPOINTS)); + ret = compel_stop_pie(pid, rsti(item)->breakpoint, nobp); if (ret < 0) return -1; } diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index bd8744d62..c5deb3283 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -90,6 +90,7 @@ struct kerndat_s { bool has_shstk; bool has_close_range; bool has_timer_cr_ids; + bool has_breakpoints; }; extern struct kerndat_s kdat; diff --git a/criu/kerndat.c b/criu/kerndat.c index 930117b0a..fa43f7d3f 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -1736,6 +1736,83 @@ static int kerndat_has_timer_cr_ids(void) return 0; } +static void breakpoint_func(void) +{ + if (raise(SIGSTOP)) + pr_perror("Unable to kill itself with SIGSTOP"); + exit(1); +} + +/* + * kerndat_breakpoints checks that hardware breakpoints work as they should. + * In some cases, they might not work in virtual machines if the hypervisor + * doesn't virtualize them. For example, they don't work in AMD SEV virtual + * machines if the Debug Virtualization extension isn't supported or isn't + * enabled in SEV_FEATURES. 
+ */ +static int kerndat_breakpoints(void) +{ + int status, ret, exit_code = -1; + pid_t pid; + + pid = fork(); + if (pid == -1) { + pr_perror("fork"); + return -1; + } + if (pid == 0) { + if (ptrace(PTRACE_TRACEME, 0, 0, 0)) { + pr_perror("ptrace(PTRACE_TRACEME)"); + exit(1); + } + raise(SIGSTOP); + breakpoint_func(); + exit(1); + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid for initial stop"); + goto err; + } + if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGSTOP) { + pr_err("Child didn't stop as expected: status=%x\n", status); + goto err; + } + ret = ptrace_set_breakpoint(pid, &breakpoint_func); + if (ret < 0) { + pr_err("Failed to set breakpoint\n"); + goto err; + } + if (ret == 0) { + pr_debug("Hardware breakpoints appear to be disabled\n"); + goto out; + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid for breakpoint trigger"); + goto err; + } + if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP) { + pr_warn("Hardware breakpoints don't seem to work (status=%x)\n", status); + goto out; + } + kdat.has_breakpoints = true; +out: + exit_code = 0; +err: + if (kill(pid, SIGKILL)) { + pr_perror("Failed to kill the child process"); + exit_code = -1; + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("Failed to wait for the child process"); + exit_code = -1; + } + if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL) { + pr_err("The child exited with unexpected code: %x\n", status); + exit_code = -1; + } + return exit_code; +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. 
There are cases that we cannot determine the @@ -1999,6 +2076,9 @@ int kerndat_init(void) } if (!ret && kerndat_has_timer_cr_ids()) { pr_err("kerndat_has_timer_cr_ids has failed when initializing kerndat.\n"); + } + if (!ret && kerndat_breakpoints()) { + pr_err("kerndat_breakpoints has failed when initializing kerndat.\n"); ret = -1; } diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index 6db9d21fe..e19847b37 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -421,7 +421,7 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, ictx->flags |= INFECT_NO_MEMFD; if (fault_injected(FI_PARASITE_CONNECT)) ictx->flags |= INFECT_FAIL_CONNECT; - if (fault_injected(FI_NO_BREAKPOINTS)) + if (fault_injected(FI_NO_BREAKPOINTS) || !kdat.has_breakpoints) ictx->flags |= INFECT_NO_BREAKPOINTS; if (kdat.compat_cr) ictx->flags |= INFECT_COMPATIBLE; From 366d73a4c29033665d59a23d5e0f89323b5fc2b2 Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Wed, 14 May 2025 19:02:06 +0200 Subject: [PATCH 595/775] make: remove checks and warnings for bsd strlcat and strlcpy In 0a7c5fd1bd8d1e49e273b51ff39af473d6c68cbc we swapped the BSD implementation of strlcat and strlcpy in favor of our own replacement. The checks and the predefined macros are not needed anymore. Signed-off-by: Lorenzo Fontana --- Makefile.config | 4 ++-- scripts/feature-tests.mak | 28 ---------------------------- 2 files changed, 2 insertions(+), 30 deletions(-) diff --git a/Makefile.config b/Makefile.config index 5ab689d41..5cf4b8216 100644 --- a/Makefile.config +++ b/Makefile.config @@ -9,7 +9,7 @@ ifeq ($(call try-cc,$(FEATURE_TEST_LIBBSD_DEV),-lbsd),true) LIBS_FEATURES += -lbsd FEATURE_DEFINES += -DCONFIG_HAS_LIBBSD else - $(info Note: Building without setproctitle() and strlcpy() support.) + $(info Note: Building without setproctitle() support.) $(info $S Install libbsd-devel (RPM) / libbsd-dev (DEB) to fix.) 
endif @@ -84,7 +84,7 @@ endif export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) -FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ +FEATURES_LIST := TCP_REPAIR PTRACE_PEEKSIGINFO \ SETPROCTITLE_INIT TCP_REPAIR_WINDOW MEMFD_CREATE \ OPENAT2 NO_LIBC_RSEQ_DEFS diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak index fb5d2ef7a..727e9689e 100644 --- a/scripts/feature-tests.mak +++ b/scripts/feature-tests.mak @@ -35,34 +35,6 @@ int main(void) } endef -define FEATURE_TEST_STRLCPY - -#include - -#ifdef CONFIG_HAS_LIBBSD -# include -#endif - -int main(void) -{ - return strlcpy(NULL, NULL, 0); -} -endef - -define FEATURE_TEST_STRLCAT - -#include - -#ifdef CONFIG_HAS_LIBBSD -# include -#endif - -int main(void) -{ - return strlcat(NULL, NULL, 0); -} -endef - define FEATURE_TEST_PTRACE_PEEKSIGINFO #include From fddca67cc633b28a73bbb1bb272018f5a3a7ea74 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 11 May 2025 11:33:29 +0100 Subject: [PATCH 596/775] seize: fix pause devices for frozen containers The container checkpointing procedure in Kubernetes freezes running containers to create a consistent snapshot of both the runtime state and the rootfs of the container. However, when checkpointing a GPU container, the container must be unfrozen before invoking the cuda-checkpoint tool. This is achieved in prepare_freezer_for_interrupt_only_mode(), which needs to be called before the PAUSE_DEVICES hook. The patch introducing this functionality fixes this problem for containers with multiple processes. However, if the container has a single process, prepare_freezer_for_interrupt_only_mode() must be invoked immediately before the PAUSE_DEVICES hook. 
Fixes: #2514 Signed-off-by: Radostin Stoyanov --- criu/seize.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/criu/seize.c b/criu/seize.c index f56357ac7..23f192d46 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1060,22 +1060,32 @@ int collect_pstree(void) */ alarm(opts.timeout); - ret = run_plugins(PAUSE_DEVICES, pid); - if (ret < 0 && ret != -ENOTSUP) { - goto err; - } - if (opts.freeze_cgroup && cgroup_version()) goto err; pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 2 : 1); if (opts.freeze_cgroup && !compel_interrupt_only_mode) { + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto err; + } + if (freeze_processes()) goto err; } else { if (opts.freeze_cgroup && prepare_freezer_for_interrupt_only_mode()) goto err; + + /* + * Call PAUSE_DEVICES after prepare_freezer_for_interrupt_only_mode() + * to be able to checkpoint containers in a frozen state. + */ + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto err; + } + if (compel_interrupt_task(pid)) { set_cr_errno(ESRCH); goto err; @@ -1136,4 +1146,4 @@ int checkpoint_devices(void) exit_code = 0; err: return exit_code; -} \ No newline at end of file +} From d57d40a5ad76eec0e6d09e3ad44e35922cb98ff2 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 16 May 2025 12:43:14 +0100 Subject: [PATCH 597/775] sk-inet: add MPTCP definition Building CRIU on Ubuntu 20.04 fails with the following error: criu/sk-inet.c: In function 'can_dump_ipproto': criu/sk-inet.c:131:16: error: 'IPPROTO_MPTCP' undeclared (first use in this function); did you mean 'IPPROTO_MTP'? 131 | if (proto == IPPROTO_MPTCP) | ^~~~~~~~~~~~~ | IPPROTO_MTP Add definition for MPTCP to fix this error. 
Signed-off-by: Radostin Stoyanov --- criu/sk-inet.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index a191e78c4..1238b03dc 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -44,6 +44,11 @@ #define PB_ALEN_INET 1 #define PB_ALEN_INET6 4 +/* Definition for older kernels without MPTCP support (e.g. Ubuntu 20.04) */ +#ifndef IPPROTO_MPTCP +#define IPPROTO_MPTCP 262 +#endif + static LIST_HEAD(inet_ports); struct inet_port { From 427c0dc27b473ead1367c417bee8aac2b39a2844 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Fri, 29 Nov 2024 02:07:38 +0000 Subject: [PATCH 598/775] criu: Introduce a new device plugin hook for restore Currently, in the target process, device-related restore operations and other restore operations almost run sequentially. When the target process executes the corresponding CRIU hook functions, it can't perform other restore operations. However, for GPU applications, some device restore operations have no logical dependencies on other common restore operations and can be parallelized with other operations to speed up the process. Instead of launching a thread in child processes for parallelization, this patch chooses to add a new hook, `POST_FORKING`, in the main CRIU process to handle these restore operations. This is because the restoration of memory state in the restore blob is one of the most time-consuming parts of all restore logic. The main CRIU process can easily parallelize these operations, whereas parallelizing in threads within child processes is challenging. - POST_FORKING *POST_FORKING: Hook to enable the main CRIU process to perform some restore operations of plugins. 
Signed-off-by: Yanning Yang --- criu/cr-restore.c | 3 +++ criu/include/criu-plugin.h | 4 ++++ criu/plugin.c | 1 + 3 files changed, 8 insertions(+) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index cabe2f464..9cc77b21f 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2132,6 +2132,9 @@ static int restore_root_task(struct pstree_item *init) __restore_switch_stage(CR_STATE_FORKING); skip_ns_bouncing: + ret = run_plugins(POST_FORKING); + if (ret < 0 && ret != -ENOTSUP) + goto out_kill; ret = restore_wait_inprogress_tasks(); if (ret < 0) diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index 392ea9f53..9fb21a449 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -60,6 +60,8 @@ enum { CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11, + CR_PLUGIN_HOOK__POST_FORKING = 12, + CR_PLUGIN_HOOK__MAX }; @@ -78,6 +80,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void); enum { CR_PLUGIN_STAGE__DUMP, @@ -152,5 +155,6 @@ typedef int(cr_plugin_handle_device_vma_t)(int fd, const struct stat *stat); typedef int(cr_plugin_update_vma_map_t)(const char *path, const uint64_t addr, const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd); typedef int(cr_plugin_resume_devices_late_t)(int pid); +typedef int(cr_plugin_post_forking_t)(void); #endif /* __CRIU_PLUGIN_H__ */ diff --git a/criu/plugin.c b/criu/plugin.c index 65e79a069..18da0499d 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -59,6 +59,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) __assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late"); __assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices"); __assign_hook(CHECKPOINT_DEVICES, 
"cr_plugin_checkpoint_devices"); + __assign_hook(POST_FORKING, "cr_plugin_post_forking"); #undef __assign_hook From 497109eb4e68caeb478dd3664b3ee1186c3baafd Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Fri, 29 Nov 2024 02:13:28 +0000 Subject: [PATCH 599/775] cr-restore: Move `cr_plugin_init` after `fdstore_init` Currently, when CRIU calls `cr_plugin_init`, `fdstore` is not initialized. However, during the plugin restore procedure, there may be some common file operations used in multiple hooks. This patch moves `cr_plugin_init` after `fdstore_init`, allowing `cr_plugin_init` to use `fdstore` to place these file operations. Signed-off-by: Yanning Yang --- criu/cr-restore.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9cc77b21f..c1d1f4b9d 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2366,41 +2366,47 @@ int cr_restore_tasks(void) return 1; if (check_img_inventory(/* restore = */ true) < 0) - goto err; - - if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) return -1; if (init_stats(RESTORE_STATS)) - goto err; + return -1; if (lsm_check_opts()) - goto err; + return -1; timing_start(TIME_RESTORE); if (cpu_init() < 0) - goto err; + return -1; if (vdso_init_restore()) - goto err; + return -1; if (tty_init_restore()) - goto err; + return -1; if (opts.cpu_cap & CPU_CAP_IMAGE) { if (cpu_validate_cpuinfo()) - goto err; + return -1; } if (prepare_task_entries() < 0) - goto err; + return -1; if (prepare_pstree() < 0) - goto err; + return -1; if (fdstore_init()) - goto err; + return -1; + + /* + * For the AMDGPU plugin, its parallel restore feature needs to use fdstore to store + * its socket file descriptor. This allows the main process and the target process to + * communicate with each other through this file descriptor. Therefore, cr_plugin_init + * must be initialized after fdstore_init. 
+ */ + if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) + return -1; if (inherit_fd_move_to_fdstore()) goto err; From e257d04974d7945e6e3fad52b6dae39e1e711cfc Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:34:14 +0000 Subject: [PATCH 600/775] pstree: Add `has_children` function Currently, parallel restore only focuses on the single-process situation. Therefore, it needs an interface to know if there is only one process to restore. This patch adds a `has_children` function in `pstree.h` and replaces some existing implementations with this function. Signed-off-by: Yanning Yang --- criu/cr-dump.c | 2 +- criu/include/pstree.h | 1 + criu/pstree.c | 9 +++++++-- criu/seize.c | 2 +- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 302078caa..b8cf7d64d 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -1396,7 +1396,7 @@ static int dump_zombies(void) item->sid = pps_buf.sid; item->pgid = pps_buf.pgid; - BUG_ON(!list_empty(&item->children)); + BUG_ON(has_children(item)); if (!item->sid) { pr_err("A session leader of zombie process %d(%d) is outside of its pid namespace\n", diff --git a/criu/include/pstree.h b/criu/include/pstree.h index 1137046d4..b750a919e 100644 --- a/criu/include/pstree.h +++ b/criu/include/pstree.h @@ -104,6 +104,7 @@ extern void pstree_insert_pid(struct pid *pid_node); extern struct pid *pstree_pid_by_virt(pid_t pid); extern struct pstree_item *root_item; +extern bool has_children(struct pstree_item *item); extern struct pstree_item *pstree_item_next(struct pstree_item *item); #define for_each_pstree_item(pi) for (pi = root_item; pi != NULL; pi = pstree_item_next(pi)) diff --git a/criu/pstree.c b/criu/pstree.c index 660f1b9d9..75c2fc8d0 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -182,7 +182,7 @@ void free_pstree(struct pstree_item *root_item) struct pstree_item *item = root_item, *parent; while (item) { - if (!list_empty(&item->children)) { + if (has_children(item)) { item = 
list_first_entry(&item->children, struct pstree_item, sibling); continue; } @@ -244,10 +244,15 @@ int init_pstree_helper(struct pstree_item *ret) return 0; } +bool has_children(struct pstree_item *item) +{ + return !list_empty(&item->children); +} + /* Deep first search on children */ struct pstree_item *pstree_item_next(struct pstree_item *item) { - if (!list_empty(&item->children)) + if (has_children(item)) return list_first_entry(&item->children, struct pstree_item, sibling); while (item->parent) { diff --git a/criu/seize.c b/criu/seize.c index 23f192d46..d0cf7b36c 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -1008,7 +1008,7 @@ static int collect_task(struct pstree_item *item) if (ret < 0) goto err_close; - if ((item->pid->state == TASK_DEAD) && !list_empty(&item->children)) { + if ((item->pid->state == TASK_DEAD) && has_children(item)) { pr_err("Zombie with children?! O_o Run, run, run!\n"); goto err_close; } From 1fd1b670c4c536e908abfbc01ebb76377555c2e1 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:36:33 +0000 Subject: [PATCH 601/775] plugins/amdgpu: Add socket operations When enabling parallel restore, the target process and the main CRIU process need an IPC interface to communicate and transfer restore commands. This patch adds a Unix domain stream socket and stores this socket in `fdstore`.
Signed-off-by: Yanning Yang --- plugins/amdgpu/amdgpu_socket_utils.c | 59 ++++++++++++++++++++++++++++ plugins/amdgpu/amdgpu_socket_utils.h | 6 +++ 2 files changed, 65 insertions(+) create mode 100644 plugins/amdgpu/amdgpu_socket_utils.c create mode 100644 plugins/amdgpu/amdgpu_socket_utils.h diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c new file mode 100644 index 000000000..9e957ae54 --- /dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.c @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include +#include + +#include "amdgpu_socket_utils.h" +#include "criu-log.h" +#include "common/scm.h" +#include "fdstore.h" +#include "util-pie.h" +#include "util.h" + +int parallel_socket_addr_len; +struct sockaddr_un parallel_socket_addr; +int parallel_socket_id = 0; + +static void amdgpu_socket_name_gen(struct sockaddr_un *addr, int *len) +{ + addr->sun_family = AF_UNIX; + snprintf(addr->sun_path, UNIX_PATH_MAX, "x/criu-amdgpu-parallel-%s", criu_run_id); + *len = SUN_LEN(addr); + *addr->sun_path = '\0'; +} + +int install_parallel_sock(void) +{ + int ret = 0; + int sock_fd; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("socket creation failed"); + return -1; + } + + amdgpu_socket_name_gen(&parallel_socket_addr, &parallel_socket_addr_len); + ret = bind(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("bind failed"); + goto err; + } + + ret = listen(sock_fd, SOMAXCONN); + if (ret < 0) { + pr_perror("listen failed"); + goto err; + } + + parallel_socket_id = fdstore_add(sock_fd); + if (parallel_socket_id < 0) { + ret = -1; + goto err; + } +err: + close(sock_fd); + return ret; +} \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h new file mode 100644 index 000000000..4e7aa2aa4 --- /dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.h @@ -0,0 +1,6 @@ +#ifndef
__KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ +#define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ + +int install_parallel_sock(void); + +#endif \ No newline at end of file From e8ba7c103a02c49bbb1435b9d54d1fee33e31a0c Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:38:48 +0000 Subject: [PATCH 602/775] plugins/amdgpu: Add parallel restore command Currently the restore of buffer object consumes a significant amount of time. However, this part has no logical dependencies with other restore operations. This patch introduces some structures and some helper functions for the target process to offload this task to the main CRIU process. Signed-off-by: Yanning Yang --- plugins/amdgpu/amdgpu_socket_utils.c | 261 +++++++++++++++++++++++++++ plugins/amdgpu/amdgpu_socket_utils.h | 48 +++++ 2 files changed, 309 insertions(+) diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c index 9e957ae54..c8bf6d1ba 100644 --- a/plugins/amdgpu/amdgpu_socket_utils.c +++ b/plugins/amdgpu/amdgpu_socket_utils.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "amdgpu_socket_utils.h" #include "criu-log.h" @@ -53,6 +54,266 @@ int install_parallel_sock(void) ret = -1; goto err; } +err: + close(sock_fd); + return ret; +} + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd) +{ + parallel_restore_entry *restore_entry = &restore_cmd->entries[restore_cmd->cmd_head.entry_num]; + restore_entry->gpu_id = gpu_id; + restore_entry->write_id = restore_cmd->cmd_head.fd_write_num; + restore_entry->write_offset = 0; + restore_entry->read_offset = offset; + restore_entry->size = size; + + restore_cmd->fds_write[restore_cmd->cmd_head.fd_write_num] = dmabuf_fd; + + restore_cmd->cmd_head.entry_num += 1; + restore_cmd->cmd_head.fd_write_num += 1; +} + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd) +{ +
restore_cmd->gpu_ids[restore_cmd->cmd_head.gpu_num] = (parallel_gpu_info){ gpu_id, minor }; + restore_cmd->cmd_head.gpu_num += 1; +} + +static int send_metadata(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + return -1; + } + return 0; +} + +static int send_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Send GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_cmds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Send parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_dmabuf_fds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send_fds(sock_fd, NULL, 0, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Send dmabuf fds fail"); + return -1; + } + return 0; +} + +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd; + int ret = 0; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + ret = send_metadata(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_gpu_ids(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_cmds(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_dmabuf_fds(sock_fd, restore_cmd); + +err: + close(sock_fd); + return ret; +} + +int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd
*restore_cmd) +{ + restore_cmd->cmd_head.id = id; + restore_cmd->cmd_head.fd_write_num = 0; + restore_cmd->cmd_head.entry_num = 0; + restore_cmd->cmd_head.gpu_num = 0; + + restore_cmd->gpu_ids = xzalloc(gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + if (restore_cmd->gpu_ids) + xfree(restore_cmd->gpu_ids); + if (restore_cmd->fds_write) + xfree(restore_cmd->fds_write); + if (restore_cmd->entries) + xfree(restore_cmd->entries); +} + +static int init_parallel_restore_cmd_by_head(parallel_restore_cmd *restore_cmd) +{ + restore_cmd->gpu_ids = xzalloc(restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(restore_cmd->cmd_head.fd_write_num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +static int check_quit_cmd(parallel_restore_cmd *restore_cmd) +{ + return restore_cmd->cmd_head.fd_write_num == 0; +} + +static int recv_metadata(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Recv parallel restore command head fail"); + return -1; + } + return 0; +} + +static int recv_cmds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Recv parallel restore command fail"); + return -1; + } + return 0; +} + +static int recv_gpu_ids(int 
sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Send GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int recv_dmabuf_fds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv_fds(client_fd, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Recv dmabuf fds fail"); + return -1; + } + return 0; +} + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd, client_fd; + int ret = 0; + + sock_fd = fdstore_get(parallel_socket_id); + if (sock_fd < 0) + return -1; + + client_fd = accept(sock_fd, NULL, NULL); + if (client_fd < 0) { + ret = client_fd; + goto err_accept; + } + + ret = recv_metadata(client_fd, restore_cmd); + if (ret) { + goto err; + } + + // Return 1 to quit + if (check_quit_cmd(restore_cmd)) { + ret = 1; + goto err; + } + + ret = init_parallel_restore_cmd_by_head(restore_cmd); + if (ret) { + goto err; + } + + ret = recv_gpu_ids(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_cmds(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_dmabuf_fds(client_fd, restore_cmd); + +err: + close(client_fd); +err_accept: + close(sock_fd); + return ret; +} + +int close_parallel_restore_server(void) +{ + int sock_fd; + int ret = 0; + parallel_restore_cmd_head cmd_head; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + memset(&cmd_head, 0, sizeof(parallel_restore_cmd_head)); + if (send(sock_fd, &cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + return -1; + } + err: close(sock_fd); return ret; diff --git
a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h index 4e7aa2aa4..d7200c6bd 100644 --- a/plugins/amdgpu/amdgpu_socket_utils.h +++ b/plugins/amdgpu/amdgpu_socket_utils.h @@ -1,6 +1,54 @@ #ifndef __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ #define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ +typedef struct { + int id; + int fd_write_num; /* The number of buffer objects to be restored. */ + int entry_num; /* The number of restore commands.*/ + int gpu_num; +} parallel_restore_cmd_head; + +typedef struct { + int gpu_id; + int minor; +} parallel_gpu_info; + +typedef struct { + int gpu_id; + int write_id; + uint64_t read_offset; + uint64_t write_offset; + uint64_t size; +} parallel_restore_entry; + +typedef struct { + parallel_restore_cmd_head cmd_head; + int *fds_write; + parallel_gpu_info *gpu_ids; + parallel_restore_entry *entries; +} parallel_restore_cmd; + +/* + * For parallel_restore, a background thread in the main CRIU process is used to restore the GPU + * buffer object. However, initially, the ownership of these buffer objects and the metadata for + * restoration are all with the target process. Therefore, we introduce a series of functions to + * help the target process send these tasks to the main CRIU process. 
+ */ +int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd); + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + int install_parallel_sock(void); +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd); + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd); + +int close_parallel_restore_server(void); + #endif \ No newline at end of file From a61116fd934868fbefb4db4edc565d117389e511 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Wed, 15 Jan 2025 06:38:27 +0000 Subject: [PATCH 603/775] plugins/amdgpu: Implement parallel restore This patch implements the entire logic to enable the offloading of buffer object content restoration. The goal of this patch is to offload the buffer object content restoration to the main CRIU process so that this restoration can occur in parallel with other restoration logic (mainly the restoration of memory state in the restore blob, which is time-consuming) to speed up the restore phase. The restoration of buffer object content usually takes a significant amount of time for GPU applications, so parallelizing it with other operations can reduce the overall restore time. It has three parts: the first replaces the restoration of buffer objects in the target process by sending a parallel restore command to the main CRIU process; the second implements the POST_FORKING hook in the amdgpu plugin to enable buffer object content restoration in the main CRIU process; the third stops the parallel thread in the RESUME_DEVICES_LATE hook. This optimization only focuses on the single-process situation (common case). In other scenarios, it will turn to the original method. This is achieved with the new `parallel_disabled` flag. 
Signed-off-by: Yanning Yang --- plugins/amdgpu/Makefile | 2 +- plugins/amdgpu/amdgpu_plugin.c | 420 +++++++++++++++++++++--- plugins/amdgpu/amdgpu_plugin_topology.c | 2 +- plugins/amdgpu/amdgpu_plugin_topology.h | 1 + 4 files changed, 374 insertions(+), 51 deletions(-) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index a20d1d163..4bf5e499f 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -27,7 +27,7 @@ endif criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc-c --proto_path=. --c_out=. criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 96c086162..69194fbc7 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -28,11 +28,13 @@ #include "xmalloc.h" #include "criu-log.h" #include "files.h" +#include "pstree.h" #include "common/list.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" +#include "amdgpu_socket_utils.h" #include "img-streamer.h" #include "image.h" @@ -64,6 +66,18 @@ bool plugin_added_to_inventory = false; bool plugin_disabled = false; +/* + * In the case of a single process (common case), this optimization can effectively + * reduce the restore latency with parallel restore. In the case of multiple processes, + * states are already restored in parallel within different processes. Therefore, this + * optimization does not introduce further improvement and will be disabled by default + * in this case. The flag, parallel_disabled, is used to control whether the + * optimization is enabled or disabled. 
+ */ +bool parallel_disabled = false; + +pthread_t parallel_thread = 0; +int parallel_thread_result = 0; /**************************************************************************************************/ /* Call ioctl, restarting if it is interrupted */ @@ -351,6 +365,15 @@ int amdgpu_plugin_init(int stage) maps_init(&restore_maps); if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (has_children(root_item)) { + pr_info("Parallel restore disabled\n"); + parallel_disabled = true; + } else { + if (install_parallel_sock() < 0) { + pr_err("Failed to install parallel socket\n"); + return -1; + } + } /* Default Values */ kfd_fw_version_check = true; kfd_sdma_fw_version_check = true; @@ -1439,14 +1462,9 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e) { - struct thread_data *thread_datas; + struct thread_data *thread_datas = NULL; int thread_i, ret = 0; - - thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); - if (!thread_datas) { - ret = -ENOMEM; - goto exit; - } + int offset = 0; for (int i = 0; i < e->num_of_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; @@ -1489,56 +1507,101 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf } } - thread_i = 0; - for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { - struct tp_node *dev; - int ret_thread = 0; - uint32_t target_gpu_id; + if (!parallel_disabled) { + parallel_restore_cmd restore_cmd; + pr_info("Begin to send parallel restore cmd\n"); + ret = init_parallel_restore_cmd(e->num_of_bos, id, e->num_of_gpus, &restore_cmd); + if (ret) + goto exit_parallel; - if (!e->device_entries[i]->gpu_id) - continue; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + uint32_t target_gpu_id; + struct tp_node *dev; - /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ - target_gpu_id = maps_get_dest_gpu(&restore_maps, 
e->device_entries[i]->gpu_id); + if (!e->device_entries[i]->gpu_id) + continue; - /* We need the fd for actual_gpu_id */ - dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); - if (!dev) { - pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); - ret = -ENODEV; + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit_parallel; + } + parallel_restore_gpu_id_add(e->device_entries[i]->gpu_id, dev->drm_render_minor, &restore_cmd); + + for (int j = 0; j < e->num_of_bos; j++) { + if (bo_buckets[j].gpu_id != e->device_entries[i]->gpu_id) + continue; + if (bo_buckets[j].alloc_flags & + (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { + parallel_restore_bo_add(bo_buckets[j].dmabuf_fd, bo_buckets[j].gpu_id, + bo_buckets[j].size, offset, &restore_cmd); + offset += bo_buckets[j].size; + } + } + } + ret = send_parallel_restore_cmd(&restore_cmd); +exit_parallel: + free_parallel_restore_cmd(&restore_cmd); + } else { + thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); + if (!thread_datas) { + ret = -ENOMEM; goto exit; } - thread_datas[thread_i].id = id; - thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; - thread_datas[thread_i].bo_buckets = bo_buckets; - thread_datas[thread_i].bo_entries = e->bo_entries; - thread_datas[thread_i].pid = e->pid; - thread_datas[thread_i].num_of_bos = e->num_of_bos; + thread_i = 0; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + struct tp_node *dev; + int ret_thread = 0; + uint32_t target_gpu_id; - thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); - if (thread_datas[thread_i].drm_fd < 0) { - ret = -thread_datas[thread_i].drm_fd; - goto exit; + if (!e->device_entries[i]->gpu_id) + continue; + + /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is 
actual_gpu_id */ + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + + /* We need the fd for actual_gpu_id */ + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit; + } + + thread_datas[thread_i].id = id; + thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; + thread_datas[thread_i].bo_buckets = bo_buckets; + thread_datas[thread_i].bo_entries = e->bo_entries; + thread_datas[thread_i].pid = e->pid; + thread_datas[thread_i].num_of_bos = e->num_of_bos; + + thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); + if (thread_datas[thread_i].drm_fd < 0) { + ret = -thread_datas[thread_i].drm_fd; + goto exit; + } + + ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, + (void *)&thread_datas[thread_i]); + if (ret_thread) { + pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); + ret = -ret_thread; + goto exit; + } + thread_i++; } - ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, - (void *)&thread_datas[thread_i]); - if (ret_thread) { - pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); - ret = -ret_thread; - goto exit; - } - thread_i++; - } + for (int i = 0; i < e->num_of_gpus; i++) { + pthread_join(thread_datas[i].thread, NULL); + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - for (int i = 0; i < e->num_of_gpus; i++) { - pthread_join(thread_datas[i].thread, NULL); - pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - - if (thread_datas[i].ret) { - ret = thread_datas[i].ret; - goto exit; + if (thread_datas[i].ret) { + ret = thread_datas[i].ret; + goto exit; + } } } exit: @@ -1546,8 +1609,8 @@ exit: if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD) close(bo_buckets[i].dmabuf_fd); } - - xfree(thread_datas); + if 
(thread_datas) + xfree(thread_datas); return ret; } @@ -1836,6 +1899,24 @@ int amdgpu_plugin_resume_devices_late(int target_pid) if (plugin_disabled) return -ENOTSUP; + if (!parallel_disabled) { + pr_info("Close parallel restore server\n"); + if (close_parallel_restore_server()) { + pr_err("Close parallel restore server fail\n"); + return -1; + } + + exit_code = pthread_join(parallel_thread, NULL); + if (exit_code) { + pr_err("Failed to join parallel thread ret:%d\n", exit_code); + return -1; + } + if (parallel_thread_result) { + pr_err("Parallel restore fail\n"); + return parallel_thread_result; + } + } + pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); @@ -1862,3 +1943,244 @@ int amdgpu_plugin_resume_devices_late(int target_pid) } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) + +int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size, + amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) +{ + return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer, + buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); +} + +int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size) +{ + int ret = 0; + int drm_fd = -1; + uint32_t major, minor; + + struct amdgpu_gpu_info gpu_info = { 0 }; + + drm_fd = open_drm_render_device(dev_minor); + if (drm_fd < 0) { + return drm_fd; + } + + ret = amdgpu_device_initialize(drm_fd, &major, &minor, h_dev); + if (ret) { + pr_perror("Failed to initialize device"); + goto err; + } + + ret = amdgpu_query_gpu_info(*h_dev, &gpu_info); + if (ret) { + pr_perror("failed to query gpuinfo via libdrm"); + goto err; + } + *max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? 
SDMA_LINEAR_COPY_MAX_SIZE : + SDMA_LINEAR_COPY_MAX_SIZE - 1; + return 0; +err: + amdgpu_device_deinitialize(*h_dev); + return ret; +} + +FILE *get_bo_contents_fp(int id, int gpu_id, size_t tot_size) +{ + char img_path[PATH_MAX]; + size_t image_size = 0; + FILE *bo_contents_fp = NULL; + + snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, id, gpu_id); + bo_contents_fp = open_img_file(img_path, false, &image_size); + if (!bo_contents_fp) { + pr_perror("Cannot fopen %s", img_path); + return NULL; + } + + if (tot_size != image_size) { + pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, tot_size); + fclose(bo_contents_fp); + return NULL; + } + return bo_contents_fp; +} + +struct parallel_thread_data { + pthread_t thread; + uint32_t gpu_id; + int minor; + parallel_restore_cmd *restore_cmd; + int ret; +}; + +void *parallel_restore_bo_contents(void *_thread_data) +{ + struct parallel_thread_data *thread_data = (struct parallel_thread_data *)_thread_data; + amdgpu_device_handle h_dev; + uint64_t max_copy_size; + size_t total_bo_size = 0, max_bo_size = 0, buffer_size = 0; + FILE *bo_contents_fp = NULL; + parallel_restore_entry *entry; + parallel_restore_cmd *restore_cmd = thread_data->restore_cmd; + int ret = 0; + int offset = 0; + void *buffer = NULL; + + ret = init_dev(thread_data->minor, &h_dev, &max_copy_size); + if (ret) { + goto err; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id == thread_data->gpu_id) { + total_bo_size += restore_cmd->entries[i].size; + max_bo_size = max(restore_cmd->entries[i].size, max_bo_size); + } + } + + buffer_size = kfd_max_buffer_size > 0 ? 
min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + bo_contents_fp = get_bo_contents_fp(restore_cmd->cmd_head.id, thread_data->gpu_id, total_bo_size); + if (bo_contents_fp == NULL) { + ret = -1; + goto err_sdma; + } + offset = ftell(bo_contents_fp); + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); + if (!buffer) { + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); + ret = -ENOMEM; + goto err_sdma; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id != thread_data->gpu_id) + continue; + + entry = &restore_cmd->entries[i]; + fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET); + ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer, + buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); + if (ret) { + pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); + goto err_sdma; + } + } + +err_sdma: + if (bo_contents_fp) + fclose(bo_contents_fp); + if (buffer) + xfree(buffer); + amdgpu_device_deinitialize(h_dev); +err: + thread_data->ret = ret; + return NULL; +} + +void *restore_device_parallel_worker(void *arg) +{ + while (1) { + parallel_restore_cmd restore_cmd = { 0 }; + struct parallel_thread_data *thread_datas = NULL; + int ret; + int error_occurred = 0, join_ret = 0, created_threads = 0; + + ret = recv_parallel_restore_cmd(&restore_cmd); + if (ret) { + if (ret == 1) { + *(int *)arg = 0; + goto exit; + } + goto err; + } + + thread_datas = xzalloc(sizeof(*thread_datas) * restore_cmd.cmd_head.gpu_num); + if (!thread_datas) { + ret = -ENOMEM; + goto err; + } + + for (; created_threads < restore_cmd.cmd_head.gpu_num; created_threads++) { + thread_datas[created_threads].gpu_id = restore_cmd.gpu_ids[created_threads].gpu_id; + thread_datas[created_threads].minor = restore_cmd.gpu_ids[created_threads].minor; + thread_datas[created_threads].restore_cmd = &restore_cmd; + + ret = 
pthread_create(&thread_datas[created_threads].thread, NULL, parallel_restore_bo_contents, + (void *)&thread_datas[created_threads]); + if (ret) { + pr_err("Failed to create thread[0x%x] ret:%d\n", thread_datas[created_threads].gpu_id, ret); + error_occurred = 1; + break; + } + } + + for (int i = 0; i < created_threads; i++) { + join_ret = pthread_join(thread_datas[i].thread, NULL); + if (join_ret != 0) { + pr_err("pthread_join failed for Thread[0x%x] ret:%d\n", + thread_datas[i].gpu_id, join_ret); + if (!error_occurred) { + ret = join_ret; + error_occurred = 1; + } + } + + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); + + /* Check thread return value */ + if (thread_datas[i].ret && !error_occurred) { + ret = thread_datas[i].ret; + error_occurred = 1; + } + } + + if (thread_datas) + xfree(thread_datas); +err: + free_parallel_restore_cmd(&restore_cmd); + + if (ret) { + *(int *)arg = ret; + return NULL; + } + } +exit: + return NULL; +} + +/* + * While the background thread is running, some processing functions (e.g., stop_cgroupd) + * in the main thread need to block SIGCHLD. To prevent interference from this background + * thread, SIGCHLD is blocked in this thread. 
+ */ +static int back_thread_create(pthread_t *newthread, void *(*f)(void *), void *arg) +{ + int ret = 0; + sigset_t blockmask, oldmask; + + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + sigprocmask(SIG_BLOCK, &blockmask, &oldmask); + + ret = pthread_create(newthread, NULL, f, arg); + if (ret) { + pr_err("Create worker thread fail: %d\n", ret); + return -1; + } + + sigprocmask(SIG_SETMASK, &oldmask, NULL); + return 0; +} + +int amdgpu_plugin_post_forking(void) +{ + if (plugin_disabled) + return -ENOTSUP; + + if (parallel_disabled) + return 0; + + return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result); +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking) \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index 5b4396a0c..730f2e028 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -45,7 +45,7 @@ bool kfd_capability_check = true; */ int fd_next = -1; -static int open_drm_render_device(int minor) +int open_drm_render_device(int minor) { char path[128]; int fd, ret_fd; diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h index c890e3dda..e19f8e7ce 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.h +++ b/plugins/amdgpu/amdgpu_plugin_topology.h @@ -118,6 +118,7 @@ struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32 struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor); struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index); +int open_drm_render_device(int minor); int node_get_drm_render_device(struct tp_node *node); void sys_close_drm_render_devices(struct tp_system *sys); From 7a5b3d1f41cdc1f1f8960bf8037c1f646aada229 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:44:35 +0000 Subject: [PATCH
604/775] plugins/amdgpu: Update `README.md` and `criu-amdgpu-plugin.txt` Signed-off-by: Yanning Yang --- Documentation/criu-amdgpu-plugin.txt | 1 + plugins/amdgpu/README.md | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/Documentation/criu-amdgpu-plugin.txt b/Documentation/criu-amdgpu-plugin.txt index 68803f3db..fe76fc3bc 100644 --- a/Documentation/criu-amdgpu-plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -15,6 +15,7 @@ Checkpoint / Restore inside a docker container Pytorch Tensorflow Using CRIU Image Streamer +Parallel Restore DESCRIPTION ----------- diff --git a/plugins/amdgpu/README.md b/plugins/amdgpu/README.md index 1078eafe6..b808fbc4f 100644 --- a/plugins/amdgpu/README.md +++ b/plugins/amdgpu/README.md @@ -3,7 +3,8 @@ Supporting ROCm with CRIU _Felix Kuehling _
_Rajneesh Bardwaj _
-_David Yat Sin _ +_David Yat Sin _
+_Yanning Yang _ # Introduction @@ -224,6 +225,26 @@ to resume execution on the GPUs. *This new plugin is enabled by the new hook `__RESUME_DEVICES_LATE` in our RFC patch series.* +## Restoring BO content in parallel + +Restoring the BO content is an important part in the restore of GPU state and +usually takes a significant amount of time. A possible location for this +procedure is the `cr_plugin_restore_file` hook. However, restoring in this hook +blocks the target process from performing other restore operations, which +hinders further optimization of the restore process. + +Therefore, a new plugin hook that runs in the master restore process is +introduced, and it interacts with the `cr_plugin_restore_file` hook to complete +the restore of BO content. Specifically, the target process only needs to send +the relevant BOs to the master restore process, while this new hook handles all +the restore of buffer objects. Through this method, during the restore of the BO +content, the target process can perform other restore operations, thus +accelerating the restore procedure. This is an implementation of the gCROP +method proposed in the ACM SoCC'24 paper: [On-demand and Parallel +Checkpoint/Restore for GPU Applications](https://dl.acm.org/doi/10.1145/3698038.3698510). + +*This optimization technique is enabled by the `__POST_FORKING` hook.* + ## Other CRIU changes In addition to the new plugins, we need to make some changes to CRIU itself to From ae1395de184976250b4ddd9f7213fd129b6f2e74 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 30 Apr 2025 11:39:18 +0800 Subject: [PATCH 605/775] zdtm.py: add an option to change pycriu import path By default zdtm expects that criu is built from source first and only then you can run zdtm tests against it. But what if you really want to run tests against a criu version installed on the system? 
Yes there is already a nice option for zdtm to change the criu binary it uses "--criu-bin", but it would still end up using the pycriu module from source and you would still have to build everything beforehand. Let's add an option to change the path where zdtm searches for pycriu module "--pycriu-search-path". This way we can run zdtm tests on the criu installed on the system directly without building criu from source, e.g. on Fedora it works like: test/zdtm.py run --criu-bin /usr/sbin/criu \ --pycriu-search-path /usr/lib/python3.13/site-packages \ -t zdtm/static/env00 Signed-off-by: Pavel Tikhomirov --- test/zdtm.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index e3ddc762a..d5514af71 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -22,11 +22,11 @@ import sys import tempfile import time import uuid +import site from builtins import input, int, open, range, str, zip import yaml -import pycriu as crpc from zdtm.criu_config import criu_config # File to store content of streamed images @@ -1142,6 +1142,24 @@ class criu: self.__img_streamer_process = None self.__tls = self.__tls_options() if opts['tls'] else [] self.__criu_bin = opts['criu_bin'] + + global crpc + pycriu_search_path = opts.get('pycriu_search_path') + if pycriu_search_path: + sys.path.insert(0, pycriu_search_path) + + try: + import pycriu as crpc + if pycriu_search_path: + print(f"pycriu loaded from: {crpc.__file__}") + except ImportError: + if not pycriu_search_path: + print("Consider building CRIU or using '--pycriu-search-path' option.") + raise + finally: + if pycriu_search_path: + sys.path.pop(0) + self.__crit_bin = opts['crit_bin'] self.__pre_dump_mode = opts['pre_dump_mode'] self.__preload_libfault = bool(opts['preload_libfault']) @@ -2169,7 +2187,8 @@ class Launcher: 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', 'tls', 'criu_bin', 'crit_bin', 
'pre_dump_mode', 'mntns_compat_mode', - 'rootless', 'preload_libfault', 'mocked_cuda_checkpoint') + 'rootless', 'preload_libfault', 'mocked_cuda_checkpoint', + 'pycriu_search_path') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2860,6 +2879,9 @@ def get_cli_args(): rp.add_argument("--criu-bin", help="Path to criu binary", default='../criu/criu') + rp.add_argument("--pycriu-search-path", + help=f"Path to search for pycriu module first (e.g., {site.getsitepackages()[0]})", + default=None) rp.add_argument("--crit-bin", help="Path to crit binary", default='../crit/crit') From 1fdff7c7a6f12627212b2704db48929204f6a397 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 19 May 2025 11:53:18 +0800 Subject: [PATCH 606/775] zdtm: fix check for criu binary The opts['action'] contains actor function and not the action name, so we should compare it with a function. While on it let's also add a comment about --criu-bin option if CRIU binary is missing. Signed-off-by: Pavel Tikhomirov --- test/zdtm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/zdtm.py b/test/zdtm.py index d5514af71..3339dd816 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -1611,6 +1611,7 @@ class criu: def available(): if not os.access(opts['criu_bin'], os.X_OK): print("CRIU binary not found at %s" % opts['criu_bin']) + print("Consider building CRIU or using '--criu-bin' option.") sys.exit(1) def kill(self): @@ -2972,7 +2973,7 @@ if __name__ == '__main__': if opts['debug']: sys.settrace(traceit) - if opts['action'] == 'run': + if opts['action'] == run_tests: criu.available() for tst in test_classes.values(): tst.available() From 2b8951a9cf22c587d3ba397f9f2adc1863bd5dd3 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 16 May 2025 19:26:01 +0000 Subject: [PATCH 607/775] image: use `protoc` instead of `protoc-c` The new protoc 1.5.2 reports warnings: `protoc-c` is deprecated. Please use `protoc` instead! 
Signed-off-by: Andrei Vagin --- images/Makefile | 4 ++-- plugins/amdgpu/Makefile | 2 +- test/others/rpc/Makefile | 2 +- test/others/unix-callback/Makefile | 2 +- test/zdtm/static/Makefile | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/images/Makefile b/images/Makefile index 1e40b8a8f..d966fbfca 100644 --- a/images/Makefile +++ b/images/Makefile @@ -58,7 +58,7 @@ proto-obj-y += ext-file.o proto-obj-y += cgroup.o proto-obj-y += userns.o proto-obj-y += pidns.o -proto-obj-y += google/protobuf/descriptor.o # To make protoc-c happy and compile opts.proto +proto-obj-y += google/protobuf/descriptor.o # To make protoc happy and compile opts.proto proto-obj-y += opts.o proto-obj-y += seccomp.o proto-obj-y += binfmt-misc.o @@ -96,7 +96,7 @@ makefile-deps := Makefile $(obj)/Makefile define gen-proto-rules $(obj)/$(1).pb-c.c $(obj)/$(1).pb-c.h: $(obj)/$(1).proto $(addsuffix .pb-c.c,$(addprefix $(obj)/,$(2))) $(makefile-deps) $$(E) " PBCC " $$@ - $$(Q) protoc-c --proto_path=$(obj)/ --c_out=$(obj)/ $$< + $$(Q) protoc --proto_path=$(obj)/ --c_out=$(obj)/ $$< ifeq ($(PROTOUFIX),y) $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$@ $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$(patsubst %.c,%.h,$$@) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 4bf5e499f..870a039cd 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -25,7 +25,7 @@ else endif criu-amdgpu.pb-c.c: criu-amdgpu.proto - protoc-c --proto_path=. --c_out=. criu-amdgpu.proto + protoc --proto_path=. --c_out=. 
criu-amdgpu.proto amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index b2f907abe..384eb0539 100644 --- a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -47,7 +47,7 @@ rpc_pb2.py: rpc.proto protoc --proto_path=. --python_out=. rpc.proto rpc.pb-c.c: rpc.proto - protoc-c --proto_path=. --c_out=. rpc.proto + protoc --proto_path=. --c_out=. rpc.proto clean: rm -rf build rpc.pb-c.o test-c.o test-c rpc.pb-c.c rpc.pb-c.h rpc_pb2.py rpc_pb2.pyc criu diff --git a/test/others/unix-callback/Makefile b/test/others/unix-callback/Makefile index 25bcf228b..984044077 100644 --- a/test/others/unix-callback/Makefile +++ b/test/others/unix-callback/Makefile @@ -4,7 +4,7 @@ run: all ./run.sh unix.pb-c.c: unix.proto - protoc-c --proto_path=. --c_out=. unix.proto + protoc --proto_path=. --c_out=. unix.proto unix-lib.so: unix-lib.c unix.pb-c.c gcc -g -Werror -Wall -shared -nostartfiles unix-lib.c unix.pb-c.c -o unix-lib.so -iquote ../../../criu/include -fPIC diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 81e44de22..61cacbb4e 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -734,7 +734,7 @@ criu-rtc.pb-c.c: criu-rtc.proto $(Q)echo $@ >> .gitignore $(Q)echo $(@:%.c=%.h) >> .gitignore $(E) " PBCC " $@ - $(Q)protoc-c --proto_path=. --c_out=. criu-rtc.proto + $(Q)protoc --proto_path=. --c_out=. 
criu-rtc.proto criu-rtc.so: criu-rtc.c criu-rtc.pb-c.c $(E) " LD " $@ From af5412a433c6456071b45d7753132b84a942891b Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 16:27:32 +0200 Subject: [PATCH 608/775] criu/proc_parse: support MADV_WIPEONFORK/VM_WIPEONFORK Support VM_WIPEONFORK [1] by detecting it from /proc//smaps and setting a corresponding MADV_WIPEONFORK flag on vma. [1] https://github.com/torvalds/linux/commit/d2cd9ede6e193dd7d88b6d27399e96229a551b19 Signed-off-by: Alexander Mikhalitsyn --- criu/include/mman.h | 3 +++ criu/proc_parse.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/criu/include/mman.h b/criu/include/mman.h index 8ca71fadf..a55356490 100644 --- a/criu/include/mman.h +++ b/criu/include/mman.h @@ -13,5 +13,8 @@ #ifndef MADV_DONTDUMP #define MADV_DONTDUMP 16 #endif +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif #endif /* __CR_MMAN_H__ */ diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 99dc518a5..a97ee11d1 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -160,6 +160,8 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, *madv |= (1ul << MADV_HUGEPAGE); else if (_vmflag_match(tok, "nh")) *madv |= (1ul << MADV_NOHUGEPAGE); + else if (_vmflag_match(tok, "wf")) + *madv |= (1ul << MADV_WIPEONFORK); /* vmsplice doesn't work for VM_IO and VM_PFNMAP mappings. */ if (_vmflag_match(tok, "io") || _vmflag_match(tok, "pf")) From 6476488a510264f922568d94e1f2be2208c8b2be Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 16:32:01 +0200 Subject: [PATCH 609/775] test/zdtm/static/maps02: add MADV_WIPEONFORK testcase In addition to that I did small non-functional corrections. 
Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/get_smaps_bits.c | 6 ++++++ test/zdtm/static/maps02.c | 16 ++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/test/zdtm/static/get_smaps_bits.c b/test/zdtm/static/get_smaps_bits.c index 31d0d92b2..d9ac8b1ce 100644 --- a/test/zdtm/static/get_smaps_bits.c +++ b/test/zdtm/static/get_smaps_bits.c @@ -18,6 +18,10 @@ #define MADV_DONTDUMP 16 #endif +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) { char *tok; @@ -57,6 +61,8 @@ static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) *madv |= (1ul << MADV_HUGEPAGE); else if (_vmflag_match(tok, "nh")) *madv |= (1ul << MADV_NOHUGEPAGE); + else if (_vmflag_match(tok, "wf")) + *madv |= (1ul << MADV_WIPEONFORK); /* * Anything else is just ignored. diff --git a/test/zdtm/static/maps02.c b/test/zdtm/static/maps02.c index 29f1372c9..37c09dc71 100644 --- a/test/zdtm/static/maps02.c +++ b/test/zdtm/static/maps02.c @@ -6,7 +6,11 @@ #define MADV_DONTDUMP 16 #endif -const char *test_doc = "Test shared memory with advises"; +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + +const char *test_doc = "Test private memory with advises"; const char *test_author = "Cyrill Gorcunov "; struct mmap_data { @@ -43,12 +47,12 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) int main(int argc, char **argv) { - struct mmap_data m[5] = {}; + struct mmap_data m[6] = {}; size_t i; test_init(argc, argv); - test_msg("Alloc growsdown\n"); + test_msg("Alloc dontfork\n"); if (alloc_anon_mmap(&m[0], MAP_PRIVATE | MAP_ANONYMOUS, MADV_DONTFORK)) return -1; @@ -64,10 +68,14 @@ int main(int argc, char **argv) if (alloc_anon_mmap(&m[3], MAP_PRIVATE | MAP_ANONYMOUS, MADV_HUGEPAGE)) return -1; - test_msg("Alloc dontfork/random|mergeable\n"); + test_msg("Alloc mergeable\n"); if (alloc_anon_mmap(&m[4], MAP_PRIVATE | MAP_ANONYMOUS, 
MADV_MERGEABLE)) return -1; + test_msg("Alloc wipeonfork\n"); + if (alloc_anon_mmap(&m[5], MAP_PRIVATE | MAP_ANONYMOUS, MADV_WIPEONFORK)) + return -1; + test_msg("Fetch existing flags/adv\n"); for (i = 0; i < sizeof(m) / sizeof(m[0]); i++) { if (get_smaps_bits((unsigned long)m[i].start, &m[i].orig_flags, &m[i].orig_madv)) From b90cfc1a80f69ff1fc7595c349c81d73e7f7ccc0 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 17:11:28 +0200 Subject: [PATCH 610/775] criu/proc_parse: support MAP_DROPPABLE mappings Support MAP_DROPPABLE [1] by detecting it from /proc//smaps and restoring it as a normal private mapping flag on vma with only difference that instead of MAP_PRIVATE we should use MAP_DROPPABLE. [1] https://github.com/torvalds/linux/commit/9651fcedf7b92d3f7f1ab179e8ab55b85ee10fc1 Signed-off-by: Alexander Mikhalitsyn --- criu/include/mman.h | 3 +++ criu/mem.c | 12 ++++++++++++ criu/proc_parse.c | 16 ++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/criu/include/mman.h b/criu/include/mman.h index a55356490..086753bcf 100644 --- a/criu/include/mman.h +++ b/criu/include/mman.h @@ -4,6 +4,9 @@ #ifndef MAP_HUGETLB #define MAP_HUGETLB 0x40000 #endif +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif #ifndef MADV_HUGEPAGE #define MADV_HUGEPAGE 14 #endif diff --git a/criu/mem.c b/criu/mem.c index c9578ef44..803cb545b 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -10,6 +10,7 @@ #include "cr_options.h" #include "servicefd.h" #include "mem.h" +#include "mman.h" #include "parasite-syscall.h" #include "parasite.h" #include "page-pipe.h" @@ -398,6 +399,17 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str if (vma_entry_is(vma->e, VMA_AREA_VVAR)) return 0; + /* + * 9651fcedf7b9 ("mm: add MAP_DROPPABLE for designating always lazily freeable mappings") + * tells us that: + * Under memory pressure, mm can just drop the pages (so that they're + * zero when read back again). 
+ * + * Let's just skip MAP_DROPPABLE mappings pages dump logic. + */ + if (vma->e->flags & MAP_DROPPABLE) + return 0; + /* * To facilitate any combination of pre-dump modes to run after * one another, we need to take extra care as discussed below. diff --git a/criu/proc_parse.c b/criu/proc_parse.c index a97ee11d1..d7eb25662 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -144,6 +144,8 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, *flags |= MAP_NORESERVE; else if (_vmflag_match(tok, "ht")) *flags |= MAP_HUGETLB; + else if (_vmflag_match(tok, "dp")) + *flags |= MAP_DROPPABLE; /* madvise() block */ if (_vmflag_match(tok, "sr")) @@ -206,6 +208,20 @@ static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) if (vma_area->e->madv) vma_area->e->has_madv = true; + + /* + * We set MAP_PRIVATE flag on vma_area->e->flags right after parsing + * a first line of VMA entry in /proc//smaps file: + * 7fa84fa70000-7fa84fa95000 rw-p 00000000 00:00 0 + * but it's too early and we can't distinguish between MAP_DROPPABLE + * and MAP_PRIVATE mappings yet, as they both private mappings in nature + * and at this point we haven't yet read "VmFlags:" line in smaps. + * + * Let's detect this situation and drop MAP_PRIVATE flag while keep + * MAP_DROPPABLE, otherwise restorer's restore_mapping() helper will fail. 
+ */ + if ((vma_area->e->flags & MAP_PRIVATE) && (vma_area->e->flags & MAP_DROPPABLE)) + vma_area->e->flags &= ~MAP_PRIVATE; } static inline int is_anon_shmem_map(dev_t dev) From 4f9dcfb9c8dc1d2c6bb07ffc63722c70b8b50796 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 18:55:46 +0200 Subject: [PATCH 611/775] pycriu/images/pb2dict: add MAP_DROPPABLE flag Signed-off-by: Alexander Mikhalitsyn --- lib/pycriu/images/pb2dict.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index e3dd95ac0..6c4f68889 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -83,6 +83,7 @@ mmap_prot_map = [ mmap_flags_map = [ ('MAP_SHARED', 0x1), ('MAP_PRIVATE', 0x2), + ('MAP_DROPPABLE', 0x08), ('MAP_ANON', 0x20), ('MAP_GROWSDOWN', 0x0100), ] From dfa0ce1808fb1e3a1439392a6aec8071643ad2c0 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 17:21:23 +0200 Subject: [PATCH 612/775] test/zdtm/static/maps02: add MAP_DROPPABLE testcase Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/get_smaps_bits.c | 6 ++++++ test/zdtm/static/maps02.c | 20 +++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/test/zdtm/static/get_smaps_bits.c b/test/zdtm/static/get_smaps_bits.c index d9ac8b1ce..3d952ac95 100644 --- a/test/zdtm/static/get_smaps_bits.c +++ b/test/zdtm/static/get_smaps_bits.c @@ -6,6 +6,10 @@ #define MAP_HUGETLB 0x40000 #endif +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + #ifndef MADV_HUGEPAGE #define MADV_HUGEPAGE 14 #endif @@ -45,6 +49,8 @@ static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) *flags |= MAP_NORESERVE; else if (_vmflag_match(tok, "ht")) *flags |= MAP_HUGETLB; + else if (_vmflag_match(tok, "dp")) + *flags |= MAP_DROPPABLE; /* madvise() block */ if (_vmflag_match(tok, "sr")) diff --git a/test/zdtm/static/maps02.c b/test/zdtm/static/maps02.c index 37c09dc71..38244f020 
100644 --- a/test/zdtm/static/maps02.c +++ b/test/zdtm/static/maps02.c @@ -2,6 +2,10 @@ #include "zdtmtst.h" #include "get_smaps_bits.h" +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + #ifndef MADV_DONTDUMP #define MADV_DONTDUMP 16 #endif @@ -27,8 +31,14 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) { m->start = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, flags, -1, 0); if (m->start == MAP_FAILED) { - pr_perror("mmap failed"); - return -1; + if (errno == EINVAL) { + test_msg("mmap failed, no kernel support\n"); + *m = (struct mmap_data){}; + return 0; + } else { + pr_perror("mmap failed"); + return -1; + } } if (madvise(m->start, MEM_SIZE, adv)) { @@ -47,7 +57,7 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) int main(int argc, char **argv) { - struct mmap_data m[6] = {}; + struct mmap_data m[7] = {}; size_t i; test_init(argc, argv); @@ -76,6 +86,10 @@ int main(int argc, char **argv) if (alloc_anon_mmap(&m[5], MAP_PRIVATE | MAP_ANONYMOUS, MADV_WIPEONFORK)) return -1; + test_msg("Alloc droppable\n"); + if (alloc_anon_mmap(&m[6], MAP_DROPPABLE | MAP_ANONYMOUS, MADV_NORMAL)) + return -1; + test_msg("Fetch existing flags/adv\n"); for (i = 0; i < sizeof(m) / sizeof(m[0]); i++) { if (get_smaps_bits((unsigned long)m[i].start, &m[i].orig_flags, &m[i].orig_madv)) From 5f18ca1bbe34a287af4dc4b0e7900253c3c71d51 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 4 May 2025 21:11:29 +0200 Subject: [PATCH 613/775] test/zdtm/static: add maps11 test for MAP_DROPPABLE/MADV_WIPEONFORK In this test we want to ensure that contents of droppable mappings and mappings with MADV_WIPEONFORK is properly restored in parent/child processes. 
Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/Makefile | 1 + test/zdtm/static/maps11.c | 205 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 test/zdtm/static/maps11.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 61cacbb4e..34fc90513 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -150,6 +150,7 @@ TST_NOFILE := \ maps05 \ maps09 \ maps10 \ + maps11 \ mlock_setuid \ xids00 \ groups \ diff --git a/test/zdtm/static/maps11.c b/test/zdtm/static/maps11.c new file mode 100644 index 000000000..df309714b --- /dev/null +++ b/test/zdtm/static/maps11.c @@ -0,0 +1,205 @@ +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + +const char *test_doc = "Test MAP_DROPPABLE/MADV_WIPEONFORK mappings with 2 processes"; +const char *test_author = "Alexander Mikhalitsyn "; + +bool mem_is_zero(const uint8_t *buffer, size_t length) +{ + size_t i; + + for (i = 0; i < length; i++) + if (buffer[i] != 0) + return false; + + return true; +} + +int main(int argc, char **argv) +{ + uint8_t *p1, *p2; + pid_t pid; + int status; + const char data[] = "MADV_WIPEONFORK vma data"; + bool criu_was_there = false; + struct stat st1, st2; + + test_init(argc, argv); + + p1 = mmap(NULL, sizeof(data), PROT_READ | PROT_WRITE, + MAP_DROPPABLE | MAP_ANONYMOUS, 0, 0); + if (p1 == MAP_FAILED) { + if (errno == EINVAL) { + skip("mmap failed, no kernel support for MAP_DROPPABLE\n"); + goto skip; + } else { + pr_perror("mmap failed"); + return -1; + } + } + + p2 = mmap(NULL, sizeof(data), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (p2 == MAP_FAILED) { + pr_perror("mmap failed"); + return 1; + } + + if (madvise(p2, sizeof(data), MADV_WIPEONFORK)) { + pr_perror("madvise failed"); + return -1; + } + + /* contents of this mapping is 
supposed to be dropped after C/R */ + memcpy(p1, data, sizeof(data)); + + /* contents of this mapping is supposed to be dropped after fork() */ + memcpy(p2, data, sizeof(data)); + + /* + * Let's spawn a process before C/R so our mappings get inherited + * then, after C/R we need to ensure that CRIU memory premapping + * machinery works properly. + * + * It is important, because we restore MADV_WIPEONFORK on a later + * stages (after vma premapping happens) and we need to ensure that + * CRIU handles everything in a right way. + */ + pid = test_fork(); + if (pid < 0) { + pr_perror("fork failed"); + return 1; + } + + if (pid == 0) { + test_waitsig(); + + /* + * Both mappings have VM_WIPEONFORK flag set, + * so we expect to have it null-ified after fork(). + */ + if (!mem_is_zero(p1, sizeof(data)) || + !mem_is_zero(p2, sizeof(data))) { + pr_err("1st child: memory check failed\n"); + return 1; + } + + return 0; + } + + /* + * A simple way to detect if C/R happened is to compare st_ino + * fields of stat() on the procfs files of the current task. + * + * Hopefully, this terrible hack is never used in real-world + * applications ;-) Here, we only need this to make test + * to pass with/without --nocr option. + */ + if (stat("/proc/self/status", &st1)) { + pr_perror("stat"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* signal a child process to continue */ + if (kill(pid, SIGTERM)) { + pr_perror("kill"); + goto err; + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("1st waitpid"); + goto err; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + fail("1st process didn't exit cleanly: status=%d", status); + goto err; + } + + if (stat("/proc/self/status", &st2)) { + pr_perror("stat"); + return 1; + } + + /* detect CRIU */ + criu_was_there = st1.st_ino != st2.st_ino; + + /* + * We should mark failure if one of the following happens: + * 1. MAP_DROPPABLE memory is not zero after C/R + * 2. 
MAP_DROPPABLE memory somehow changed without C/R + * (kernel issue? memory pressure?) + * 3. MADV_WIPEONFORK memory is not preserved + * + * We care about 2nd case only because we would like test + * to pass even with --nocr zdtm.py option. + */ + if ((criu_was_there && !mem_is_zero(p1, sizeof(data))) || + (!criu_was_there && memcmp(p1, data, sizeof(data))) || + memcmp(p2, data, sizeof(data))) { + fail("Data mismatch"); + return 1; + } + + /* contents of these mappings is supposed to be dropped after fork() */ + memcpy(p1, data, sizeof(data)); + memcpy(p2, data, sizeof(data)); + + pid = test_fork(); + if (pid < 0) { + pr_perror("fork failed"); + return 1; + } + + if (pid == 0) { + if (!mem_is_zero(p1, sizeof(data)) || + !mem_is_zero(p2, sizeof(data))) { + pr_err("2nd child: memory check failed\n"); + return 1; + } + + return 0; + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("2nd waitpid"); + goto err; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + fail("2nd process didn't exit cleanly: status=%d", status); + goto err; + } + + pass(); + + return 0; +err: + if (waitpid(-1, NULL, WNOHANG) == 0) { + kill(pid, SIGTERM); + wait(NULL); + } + return 1; + +skip: + test_daemon(); + test_waitsig(); + pass(); + return 0; +} From fbfed312e086b79bcddd30dbc368c16f2ca43310 Mon Sep 17 00:00:00 2001 From: Prajwal S N Date: Mon, 14 Apr 2025 14:06:40 +0530 Subject: [PATCH 614/775] feat: introduce Nix flake CRIU currently requires a number of dependencies in order to build from source. The package names vary across distributions and package managers. A Nix flake allows developers to spin up a dev environment with `nix develop`, eliminating the hassle of manual dependency management. It also prevents polluting the global package set on the machine. 
Signed-off-by: Prajwal S N --- CONTRIBUTING.md | 2 +- flake.lock | 61 +++++++++++++++++++++++++++++++++++++++ flake.nix | 77 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 flake.lock create mode 100644 flake.nix diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 37965e5fb..712e7b813 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -34,7 +34,7 @@ To clone CRIU repo and switch to the proper branch, run: ### Compile -First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. +First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. Alternatively, you can use the Nix flake to set up a development environment by running `nix develop`. To compile CRIU, run: diff --git a/flake.lock b/flake.lock new file mode 100644 index 000000000..90c914452 --- /dev/null +++ b/flake.lock @@ -0,0 +1,61 @@ +{ + "nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1731533236, + "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1744463964, + "narHash": "sha256-LWqduOgLHCFxiTNYi3Uj5Lgz0SR+Xhw3kr/3Xd0GPTM=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "2631b0b7abcea6e640ce31cd78ea58910d31e650", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": 
"sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 000000000..dc2429ffc --- /dev/null +++ b/flake.nix @@ -0,0 +1,77 @@ +{ + description = "CRIU development environment"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = nixpkgs.legacyPackages.${system}; + + # Dependencies for CRIU + criuDeps = with pkgs; [ + # Compiler and build essentials + gcc + gnumake + pkg-config + + # Protocol Buffers + protobuf + protobufc + python3Packages.protobuf + + # Other required libraries + libuuid + libbsd + iproute2 + nftables + libcap + libnet + libnl + libaio + gnutls + libdrm + + # ZDTM + python3Packages.pyyaml + ]; + + # Multilib support for 32-bit compatibility + # criuDeps32bit = with pkgs; [ + # glibc.dev + # glibc + # gcc-unwrapped + # ]; + + devShell = pkgs.mkShell { + buildInputs = criuDeps; # ++ (if pkgs.stdenv.isx86_64 then criuDeps32bit else []); + + shellHook = '' + echo "CRIU development environment" + echo "==============================" + echo "" + echo "Useful commands:" + echo " make - Build CRIU" + echo " make test - Run tests (requires ZDTM dependencies)" + echo "" + ''; + + # Add proper flags for multilib support + # NIX_CFLAGS_COMPILE = pkgs.lib.optional pkgs.stdenv.isx86_64 "-m32"; + + # Make sure the shell can find headers for multilib + # PKG_CONFIG_PATH = pkgs.lib.makeSearchPath "lib/pkgconfig" criuDeps; + }; + in + { + # Export the development shell + devShells.default = devShell; + + # Build CRIU package as well + packages.default = pkgs.criu; + } + ); +} From 
fcbaac0598e5be2cb87bf19341a0705fcae98259 Mon Sep 17 00:00:00 2001 From: Liana Koleva <43767763+lianakoleva@users.noreply.github.com> Date: Wed, 26 Mar 2025 17:41:51 +0000 Subject: [PATCH 615/775] crtools: simplify check for cpuinfo subcommands The cpuinfo command requires a "dump" or "check" subcommand. Thus, we replace `CR_CPUINFO` with `CR_CPUINFO_DUMP` and `CR_CPUINFO_CHECK`. This allows us to remove unnecessary subcommand check in `image_dir_mode()` and perform all parsing in `parse_criu_mode()`. With this change the check for validating the cpuinfo subcommand is now done only once with `CR_CPUINFO_DUMP` or `CR_CPUINFO_CHECK` enum. Signed-off-by: Liana Koleva <43767763+lianakoleva@users.noreply.github.com> Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 2 +- criu/crtools.c | 57 ++++++++++++++++++++------------------- criu/include/cr_options.h | 3 ++- 3 files changed, 32 insertions(+), 30 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index b9d11ced2..d8c5967bc 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -1261,7 +1261,7 @@ static int handle_cpuinfo(int sk, CriuReq *msg) if (pid == 0) { int ret = 1; - opts.mode = CR_CPUINFO; + opts.mode = (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ? 
CR_CPUINFO_DUMP : CR_CPUINFO_CHECK; if (setup_opts_from_req(sk, msg->opts)) goto cout; diff --git a/criu/crtools.c b/criu/crtools.c index 6f493850b..4734c90f2 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -54,19 +54,17 @@ void flush_early_log_to_stderr(void) flush_early_log_buffer(STDERR_FILENO); } -static int image_dir_mode(char *argv[], int optind) +static int image_dir_mode(void) { switch (opts.mode) { case CR_DUMP: /* fallthrough */ + case CR_CPUINFO_DUMP: + /* fallthrough */ case CR_PRE_DUMP: return O_DUMP; case CR_RESTORE: return O_RSTR; - case CR_CPUINFO: - if (!strcmp(argv[optind + 1], "dump")) - return O_DUMP; - /* fallthrough */ default: return -1; } @@ -76,7 +74,7 @@ static int image_dir_mode(char *argv[], int optind) return -1; } -static int parse_criu_mode(char *mode) +static int parse_criu_mode(char *mode, char *subcommand) { if (!strcmp(mode, "dump")) opts.mode = CR_DUMP; @@ -96,8 +94,12 @@ static int parse_criu_mode(char *mode) opts.mode = CR_SWRK; else if (!strcmp(mode, "dedup")) opts.mode = CR_DEDUP; - else if (!strcmp(mode, "cpuinfo")) - opts.mode = CR_CPUINFO; + else if (!strcmp(mode, "cpuinfo") && subcommand == NULL) + return -2; + else if (!strcmp(mode, "cpuinfo") && !strcmp(subcommand, "dump")) + opts.mode = CR_CPUINFO_DUMP; + else if (!strcmp(mode, "cpuinfo") && !strcmp(subcommand, "check")) + opts.mode = CR_CPUINFO_CHECK; else if (!strcmp(mode, "exec")) opts.mode = CR_EXEC_DEPRECATED; else if (!strcmp(mode, "show")) @@ -115,6 +117,7 @@ int main(int argc, char *argv[], char *envp[]) bool has_exec_cmd = false; bool has_sub_command; int state = PARSING_GLOBAL_CONF; + char *subcommand; BUILD_BUG_ON(CTL_32 != SYSCTL_TYPE__CTL_32); BUILD_BUG_ON(__CTL_STR != SYSCTL_TYPE__CTL_STR); @@ -165,9 +168,15 @@ int main(int argc, char *argv[], char *envp[]) return 1; } - if (parse_criu_mode(argv[optind])) { + has_sub_command = (argc - optind) > 1; + subcommand = has_sub_command ? 
argv[optind + 1] : NULL; + ret = parse_criu_mode(argv[optind], subcommand); + if (ret == -1) { pr_err("unknown command: %s\n", argv[optind]); goto usage; + } else if (ret == -2) { + pr_err("cpuinfo requires an action: dump or check\n"); + goto usage; } /* * util_init initializes criu_run_id and compel_run_id so that sockets @@ -223,25 +232,20 @@ int main(int argc, char *argv[], char *envp[]) return 1; memcpy(opts.exec_cmd, &argv[optind + 1], (argc - optind - 1) * sizeof(char *)); opts.exec_cmd[argc - optind - 1] = NULL; - } else { + } else if (opts.mode != CR_CPUINFO_DUMP && opts.mode != CR_CPUINFO_CHECK && has_sub_command) { /* No subcommands except for cpuinfo and restore --exec-cmd */ - if (opts.mode != CR_CPUINFO && has_sub_command) { - pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? "s" : "", argv[optind]); - goto usage; - } else if (opts.mode == CR_CPUINFO && !has_sub_command) { - pr_err("cpuinfo requires an action: dump or check\n"); - goto usage; - } + pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? 
"s" : "", argv[optind]); + goto usage; } - if (opts.stream && image_dir_mode(argv, optind) == -1) { + if (opts.stream && image_dir_mode() == -1) { pr_err("--stream cannot be used with the %s command\n", argv[optind]); goto usage; } /* We must not open imgs dir, if service is called */ if (opts.mode != CR_SERVICE) { - ret = open_image_dir(opts.imgs_dir, image_dir_mode(argv, optind)); + ret = open_image_dir(opts.imgs_dir, image_dir_mode()); if (ret < 0) { pr_err("Couldn't open image dir %s\n", opts.imgs_dir); return 1; @@ -335,15 +339,12 @@ int main(int argc, char *argv[], char *envp[]) if (opts.mode == CR_DEDUP) return cr_dedup() != 0; - if (opts.mode == CR_CPUINFO) { - if (!argv[optind + 1]) { - pr_err("cpuinfo requires an action: dump or check\n"); - goto usage; - } - if (!strcmp(argv[optind + 1], "dump")) - return cpuinfo_dump(); - else if (!strcmp(argv[optind + 1], "check")) - return cpuinfo_check(); + if (opts.mode == CR_CPUINFO_DUMP) { + return cpuinfo_dump(); + } + + if (opts.mode == CR_CPUINFO_CHECK) { + return cpuinfo_check(); } if (opts.mode == CR_EXEC_DEPRECATED) { diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index ab0bd8fa3..4df8056b7 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -125,7 +125,8 @@ enum criu_mode { CR_SERVICE, CR_SWRK, CR_DEDUP, - CR_CPUINFO, + CR_CPUINFO_DUMP, + CR_CPUINFO_CHECK, CR_EXEC_DEPRECATED, CR_SHOW_DEPRECATED, }; From 99ba6db89b288b81beff3bfeace72552dedf5579 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 20 May 2025 14:47:55 +0000 Subject: [PATCH 616/775] crtools: do a few minor cleanups Signed-off-by: Andrei Vagin --- criu/crtools.c | 142 +++++++++++++++++++++++++------------------------ 1 file changed, 73 insertions(+), 69 deletions(-) diff --git a/criu/crtools.c b/criu/crtools.c index 4734c90f2..509e73d74 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -74,40 +74,55 @@ static int image_dir_mode(void) return -1; } -static int parse_criu_mode(char *mode, char 
*subcommand) -{ - if (!strcmp(mode, "dump")) - opts.mode = CR_DUMP; - else if (!strcmp(mode, "pre-dump")) - opts.mode = CR_PRE_DUMP; - else if (!strcmp(mode, "restore")) - opts.mode = CR_RESTORE; - else if (!strcmp(mode, "lazy-pages")) - opts.mode = CR_LAZY_PAGES; - else if (!strcmp(mode, "check")) - opts.mode = CR_CHECK; - else if (!strcmp(mode, "page-server")) - opts.mode = CR_PAGE_SERVER; - else if (!strcmp(mode, "service")) - opts.mode = CR_SERVICE; - else if (!strcmp(mode, "swrk")) - opts.mode = CR_SWRK; - else if (!strcmp(mode, "dedup")) - opts.mode = CR_DEDUP; - else if (!strcmp(mode, "cpuinfo") && subcommand == NULL) - return -2; - else if (!strcmp(mode, "cpuinfo") && !strcmp(subcommand, "dump")) - opts.mode = CR_CPUINFO_DUMP; - else if (!strcmp(mode, "cpuinfo") && !strcmp(subcommand, "check")) - opts.mode = CR_CPUINFO_CHECK; - else if (!strcmp(mode, "exec")) - opts.mode = CR_EXEC_DEPRECATED; - else if (!strcmp(mode, "show")) - opts.mode = CR_SHOW_DEPRECATED; - else - return -1; +struct { + char *cmd; + int mode; +} commands[] = { + { "dump", CR_DUMP }, + { "pre-dump", CR_PRE_DUMP }, + { "restore", CR_RESTORE }, + { "lazy-pages", CR_LAZY_PAGES }, + { "check", CR_CHECK }, + { "page-server", CR_PAGE_SERVER }, + { "service", CR_SERVICE }, + { "swrk", CR_SWRK }, + { "dedup", CR_DEDUP }, + { "exec", CR_EXEC_DEPRECATED }, + { "show", CR_SHOW_DEPRECATED }, +}; - return 0; +static int parse_criu_mode(int argc, char **argv, int *optind) +{ + char *cmd = argv[*optind]; + bool has_sub_command = (argc - *optind) > 1; + char *subcommand = has_sub_command ? 
argv[*optind + 1] : NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(commands); i++) { + if (strcmp(cmd, commands[i].cmd)) + continue; + opts.mode = commands[i].mode; + return 0; + } + + if (!strcmp(cmd, "cpuinfo")) { + if (subcommand == NULL) { + pr_err("cpuinfo requires an action: dump or check\n"); + return -1; + } + if (!strcmp(subcommand, "dump")) + opts.mode = CR_CPUINFO_DUMP; + else if (!strcmp(subcommand, "check")) + opts.mode = CR_CPUINFO_CHECK; + else { + pr_err("unknown cpuinfo sub-command: %s\n", subcommand); + return -1; + } + (*optind)++; + return 0; + } + pr_err("unknown command: %s\n", argv[*optind]); + return -1; } int main(int argc, char *argv[], char *envp[]) @@ -117,7 +132,7 @@ int main(int argc, char *argv[], char *envp[]) bool has_exec_cmd = false; bool has_sub_command; int state = PARSING_GLOBAL_CONF; - char *subcommand; + char *cmd; BUILD_BUG_ON(CTL_32 != SYSCTL_TYPE__CTL_32); BUILD_BUG_ON(__CTL_STR != SYSCTL_TYPE__CTL_STR); @@ -168,16 +183,11 @@ int main(int argc, char *argv[], char *envp[]) return 1; } - has_sub_command = (argc - optind) > 1; - subcommand = has_sub_command ? 
argv[optind + 1] : NULL; - ret = parse_criu_mode(argv[optind], subcommand); - if (ret == -1) { - pr_err("unknown command: %s\n", argv[optind]); + cmd = argv[optind]; + ret = parse_criu_mode(argc, argv, &optind); + if (ret) goto usage; - } else if (ret == -2) { - pr_err("cpuinfo requires an action: dump or check\n"); - goto usage; - } + /* * util_init initializes criu_run_id and compel_run_id so that sockets * are generated with an unique name identifying the specific process @@ -232,14 +242,13 @@ int main(int argc, char *argv[], char *envp[]) return 1; memcpy(opts.exec_cmd, &argv[optind + 1], (argc - optind - 1) * sizeof(char *)); opts.exec_cmd[argc - optind - 1] = NULL; - } else if (opts.mode != CR_CPUINFO_DUMP && opts.mode != CR_CPUINFO_CHECK && has_sub_command) { - /* No subcommands except for cpuinfo and restore --exec-cmd */ - pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? "s" : "", argv[optind]); + } else if (has_sub_command) { + pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? 
"s" : "", cmd); goto usage; } if (opts.stream && image_dir_mode() == -1) { - pr_err("--stream cannot be used with the %s command\n", argv[optind]); + pr_err("--stream cannot be used with the %s command\n", cmd); goto usage; } @@ -290,14 +299,13 @@ int main(int argc, char *argv[], char *envp[]) if (opts.img_parent) pr_info("Will do snapshot from %s\n", opts.img_parent); - if (opts.mode == CR_DUMP) { + switch (opts.mode) { + case CR_DUMP: if (!opts.tree_id) goto opt_pid_missing; return cr_dump_tasks(opts.tree_id); - } - - if (opts.mode == CR_PRE_DUMP) { + case CR_PRE_DUMP: if (!opts.tree_id) goto opt_pid_missing; @@ -307,9 +315,7 @@ int main(int argc, char *argv[], char *envp[]) } return cr_pre_dump_tasks(opts.tree_id) != 0; - } - - if (opts.mode == CR_RESTORE) { + case CR_RESTORE: if (opts.tree_id) pr_warn("Using -t with criu restore is obsoleted\n"); @@ -322,43 +328,41 @@ int main(int argc, char *argv[], char *envp[]) } return ret != 0; - } - if (opts.mode == CR_LAZY_PAGES) + case CR_LAZY_PAGES: return cr_lazy_pages(opts.daemon_mode) != 0; - if (opts.mode == CR_CHECK) + case CR_CHECK: return cr_check() != 0; - if (opts.mode == CR_PAGE_SERVER) + case CR_PAGE_SERVER: return cr_page_server(opts.daemon_mode, false, -1) != 0; - if (opts.mode == CR_SERVICE) + case CR_SERVICE: return cr_service(opts.daemon_mode); - if (opts.mode == CR_DEDUP) + case CR_DEDUP: return cr_dedup() != 0; - if (opts.mode == CR_CPUINFO_DUMP) { + case CR_CPUINFO_DUMP: return cpuinfo_dump(); - } - if (opts.mode == CR_CPUINFO_CHECK) { + case CR_CPUINFO_CHECK: return cpuinfo_check(); - } - if (opts.mode == CR_EXEC_DEPRECATED) { + case CR_EXEC_DEPRECATED: pr_err("The \"exec\" action is deprecated by the Compel library.\n"); return -1; - } - if (opts.mode == CR_SHOW_DEPRECATED) { + case CR_SHOW_DEPRECATED: pr_err("The \"show\" action is deprecated by the CRIT utility.\n"); pr_err("To view an image use the \"crit decode -i $name --pretty\" command.\n"); return -1; - } - pr_err("unknown command: %s\n", 
argv[optind]); + case CR_UNSET: + default: + pr_err("unknown command: %s\n", cmd); + } usage: pr_msg("\n" "Usage:\n" From a79b33d0c5f5a56c58cb1201f2b5dfa9aed159bc Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 23 May 2025 08:33:20 +0100 Subject: [PATCH 617/775] cpuinfo: show error when image is missing The `criu cpuinfo check` command calls cpu_validate_cpuinfo(), which attempts to open the cpuinfo.img file using `open_image()`. If the image file is not found, `open_image()` returns an "empty image" object. As a result, `cpu_validate_cpuinfo()` tries to read from it and fails with the following error: (00.002473) Error (criu/protobuf.c:72): Unexpected EOF on (empty-image) This patch adds a check for an empty image and appropriate error message. Signed-off-by: Radostin Stoyanov --- criu/arch/ppc64/cpu.c | 6 ++++++ criu/arch/s390/cpu.c | 6 ++++++ criu/arch/x86/cpu.c | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/criu/arch/ppc64/cpu.c b/criu/arch/ppc64/cpu.c index bb5b7256e..b87230f40 100644 --- a/criu/arch/ppc64/cpu.c +++ b/criu/arch/ppc64/cpu.c @@ -64,6 +64,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return -1; + } + if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; diff --git a/criu/arch/s390/cpu.c b/criu/arch/s390/cpu.c index 3f430f455..e227fad5e 100644 --- a/criu/arch/s390/cpu.c +++ b/criu/arch/s390/cpu.c @@ -87,6 +87,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return -1; + } + ret = 0; if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; diff --git a/criu/arch/x86/cpu.c b/criu/arch/x86/cpu.c index dfa31569f..2e1f2de9a 100644 --- a/criu/arch/x86/cpu.c +++ b/criu/arch/x86/cpu.c @@ -407,6 +407,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return 
-1; + } + if (pb_read_one(img, &img_cpu_info, PB_CPUINFO) < 0) goto err; From 922754dffd9efd99b051215c477e0bf6d70562aa Mon Sep 17 00:00:00 2001 From: Ivan Pravdin Date: Tue, 6 May 2025 22:40:25 -0400 Subject: [PATCH 618/775] rpc/log: return first error always Use shared first error buffer to return correct first error in rpc. Fixes: #338 Signed-off-by: Ivan Pravdin --- criu/cr-service.c | 24 +++++++++++++++++++++++- criu/log.c | 4 ++++ test/others/rpc/errno.py | 22 +++++++++++++++++++++- 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index d8c5967bc..a1089ad5c 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -895,6 +895,11 @@ static int check(int sk, CriuOpts *req) resp.type = CRIU_REQ_TYPE__CHECK; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -919,6 +924,7 @@ static int check(int sk, CriuOpts *req) resp.success = true; out: + set_resp_err(&resp); return send_criu_msg(sk, &resp); } @@ -927,6 +933,11 @@ static int pre_dump_using_req(int sk, CriuOpts *req, bool single) int pid, status; bool success = false; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -1005,6 +1016,11 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) CriuPageServerInfo ps = CRIU_PAGE_SERVER_INFO__INIT; struct ps_info info; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + if (pipe(start_pipe)) { pr_perror("No start pipe"); goto out; @@ -1078,6 +1094,7 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) out: resp.type = CRIU_REQ_TYPE__PAGE_SERVER; resp.success = success; + set_resp_err(&resp); return send_criu_msg(sk, &resp); } @@ -1252,6 +1269,11 @@ static int handle_cpuinfo(int sk, CriuReq *msg) bool success = false; int pid, status; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto 
out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -1301,7 +1323,7 @@ static int handle_cpuinfo(int sk, CriuReq *msg) out: resp.type = msg->type; resp.success = success; - + set_resp_err(&resp); return send_criu_msg(sk, &resp); } diff --git a/criu/log.c b/criu/log.c index 70e267fd6..a02a8df20 100644 --- a/criu/log.c +++ b/criu/log.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -114,6 +115,9 @@ static struct str_and_lock *first_err; int log_keep_err(void) { + if (first_err) + return 0; + first_err = shmalloc(sizeof(struct str_and_lock)); if (first_err == NULL) return -1; diff --git a/test/others/rpc/errno.py b/test/others/rpc/errno.py index 4ea6c9d44..a5a3eb54d 100755 --- a/test/others/rpc/errno.py +++ b/test/others/rpc/errno.py @@ -40,7 +40,7 @@ class test: resp.ParseFromString(self.s.recv(self._MAX_MSG_SIZE)) return resp - def check_resp(self, resp, typ, err): + def check_resp(self, resp, typ, err, errmsg = None): if resp.type != typ: raise Exception('Unexpected response type ' + str(resp.type)) @@ -49,6 +49,9 @@ class test: if err and resp.cr_errno != err: raise Exception('Unexpected cr_errno ' + str(resp.cr_errno)) + + if errmsg and errmsg not in resp.cr_errmsg: + raise Exception('Unexpected cr_msg \'' + str(resp.cr_errmsg) + '\'') def no_process(self): print('Try to dump unexisting process') @@ -131,12 +134,29 @@ class test: self.check_resp(resp, rpc.EMPTY, None) print('Success') + + def child_first_err(self): + print('Receive correct first error message') + + req = self.get_base_req() + req.type = rpc.CHECK + + # mntns_compat_mode options is only allowed on restore + req.opts.mntns_compat_mode = True + + self.send_req(req) + resp = self.recv_resp() + + self.check_resp(resp, rpc.CHECK, None, "Option --mntns-compat-mode is only valid on restore\n") + + print('Success') def run(self): self.no_process() self.process_exists() self.bad_options() self.bad_request() + self.child_first_err() t = test() From 
4c7d42f67a0da8fbd60b811c58f2d18950d88d1a Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Sun, 8 Jun 2025 13:24:11 +0800 Subject: [PATCH 619/775] ipc/sysctl: fix CTL_FLAGS_IPC_EACCES_SKIP by making it a flag Having CTL_FLAGS_IPC_EACCES_SKIP == (CTL_FLAGS_OPTIONAL | CTL_FLAGS_READ_EIO_SKIP) is probably not what we want. So let's make it a real distinct flag. Fixes: 840735aa0 ("ipc_sysctl: Prioritize restoring IPC variables using non usernsd approach") Signed-off-by: Pavel Tikhomirov --- criu/include/sysctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/include/sysctl.h b/criu/include/sysctl.h index cb3eba817..2d689a9a0 100644 --- a/criu/include/sysctl.h +++ b/criu/include/sysctl.h @@ -37,6 +37,6 @@ enum { #define CTL_FLAGS_OPTIONAL 1 #define CTL_FLAGS_HAS 2 #define CTL_FLAGS_READ_EIO_SKIP 4 -#define CTL_FLAGS_IPC_EACCES_SKIP 5 +#define CTL_FLAGS_IPC_EACCES_SKIP 8 #endif /* __CR_SYSCTL_H__ */ From 4f057a6aeb6ced50ec412e425cd214975ceea42b Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Sun, 8 Jun 2025 13:34:19 +0800 Subject: [PATCH 620/775] net/sysctl: fix misprint in an error message Fixes: f38e58836 ("net/sysctl: c/r ipv4/ping_group_range value") Signed-off-by: Pavel Tikhomirov --- criu/net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/net.c b/criu/net.c index 300df480b..e5d2f1c4d 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2147,7 +2147,7 @@ static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) size_t n = *pn; if (n != ARRAY_SIZE(ipv4_sysctl_entries)) { - pr_err("unix: Unexpected entries in sysctlig (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); + pr_err("unix: Unexpected entries in sysctl (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); return -EINVAL; } From 45d09ae17e9524250e31750abdc34e9a34710e94 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Tue, 10 Jun 2025 11:33:59 +0800 Subject: [PATCH 621/775] net/sysctl: fix broken ipv4_sysctls_op We have ability to skip sysctl 
if there is no value, but we still give n requests to sysctl_op, that is not correct and probably can segfault on nullptr access. Fix it by adding ri to count non skipped requests. To be on the safe side, let's add a check that ri == n on read, as we should not do any skips there. While on it lets fix bad error message prefix: s/unix/ipv4/. Remove excess has_iarg set, and add sarg reset to NULL for the case sysctl_op skipped it. Signed-off-by: Andrei Vagin Signed-off-by: Pavel Tikhomirov --- criu/net.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/criu/net.c b/criu/net.c index e5d2f1c4d..2c018ef7b 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2144,51 +2144,53 @@ static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) char path[ARRAY_SIZE(ipv4_sysctl_entries)][MAX_IPV4_SYSCTL_PATH] = {}; struct sysctl_req req[ARRAY_SIZE(ipv4_sysctl_entries)] = {}; SysctlEntry **sysctl = *rsysctl; - size_t n = *pn; + size_t n = *pn, ri; if (n != ARRAY_SIZE(ipv4_sysctl_entries)) { - pr_err("unix: Unexpected entries in sysctl (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); + pr_err("ipv4: Unexpected entries in sysctl (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); return -EINVAL; } if (opts.weak_sysctls || op == CTL_READ) flags = CTL_FLAGS_OPTIONAL; - for (i = 0; i < n; i++) { - snprintf(path[i], MAX_IPV4_SYSCTL_PATH, IPV4_SYSCTL_FMT, ipv4_sysctl_entries[i]); - req[i].name = path[i]; - req[i].flags = flags; + for (i = 0, ri = 0; i < n; i++) { + snprintf(path[ri], MAX_IPV4_SYSCTL_PATH, IPV4_SYSCTL_FMT, ipv4_sysctl_entries[i]); + req[ri].name = path[ri]; + req[ri].flags = flags; switch (sysctl[i]->type) { case SYSCTL_TYPE__CTL_STR: - req[i].type = CTL_STR(MAX_STR_IPV4_SYSCTL_LEN); + req[ri].type = CTL_STR(MAX_STR_IPV4_SYSCTL_LEN); /* skip write if have no value */ if (op == CTL_WRITE && !sysctl[i]->sarg) continue; - req[i].arg = sysctl[i]->sarg; + req[ri].arg = sysctl[i]->sarg; break; default: pr_err("ipv4: Unknown 
sysctl type %d\n", sysctl[i]->type); return -1; } + ri++; } - ret = sysctl_op(req, n, op, CLONE_NEWNET); + ret = sysctl_op(req, ri, op, CLONE_NEWNET); if (ret < 0) { - pr_err("unix: Failed to %s %s/\n", (op == CTL_READ) ? "read" : "write", IPV4_SYSCTL_BASE); + pr_err("ipv4: Failed to %s %s/\n", (op == CTL_READ) ? "read" : "write", IPV4_SYSCTL_BASE); return -1; } if (op == CTL_READ) { bool has_entries = false; + BUG_ON(ri != n); for (i = 0; i < n; i++) { if (req[i].flags & CTL_FLAGS_HAS) { - sysctl[i]->has_iarg = true; - if (!has_entries) - has_entries = true; + has_entries = true; + } else { + sysctl[i]->sarg = NULL; } } From 87bd09a0d18b1388dde831ee2ff6bef7bc9f0845 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Sun, 8 Jun 2025 14:07:13 +0800 Subject: [PATCH 622/775] net/sysctl: make ipv4/ping_group_range work in user namespaces We dump sysctls from criu user namespace, but restore from restored user namespace. So group id values should be mapped to the restored user namespace gid space to restore correctly. Signed-off-by: Andrei Vagin Signed-off-by: Pavel Tikhomirov --- criu/net.c | 44 ++++++++++++++++++++++++++ test/zdtm/static/netns_sub_sysctl.desc | 2 +- 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/criu/net.c b/criu/net.c index 2c018ef7b..e5775a328 100644 --- a/criu/net.c +++ b/criu/net.c @@ -2203,6 +2203,42 @@ static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) return 0; } +static int ipv4_sysctls_ping_group_range_map_gid(SysctlEntry *ent, size_t size) +{ + int start, end, ustart, uend, ret; + + if (sscanf(ent->sarg, "%d %d", &start, &end) != 2) { + pr_err("Failed to parse ping_group_range: %s\n", ent->sarg); + return -1; + } + + /* + * The default is "1 0", which means no group + * is allowed to create ICMP Echo sockets. 
+ */ + if (start == 1 && end == 0) { + pr_debug("The ping_group_range is set to default, skipping it.\n"); + ent->sarg = NULL; + return 0; + } + + if (!(root_ns_mask & CLONE_NEWUSER)) + return 0; + + ustart = userns_gid(start); + uend = userns_gid(end); + pr_debug("Mapping ping_group_range %d %d to userns -> %d %d\n", + start, end, ustart, uend); + + ret = snprintf(ent->sarg, size, "%d\t%d\n", ustart, uend); + if (ret < 0 || ret >= size) { + pr_err("Failed to map ping_group_range: %d\t%d\n", ustart, uend); + return -1; + } + + return 0; +} + static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) { void *buf, *o_buf; @@ -2220,6 +2256,7 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) SysctlEntry *ipv4_sysctls = NULL; size_t ipv4_sysctl_size = ARRAY_SIZE(ipv4_sysctl_entries); char ping_group_range[MAX_STR_IPV4_SYSCTL_LEN + 1] = {}; + int ping_group_range_id = -1; NetnsId *ids; struct netns_id *p; @@ -2310,6 +2347,7 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) if (!strcmp(ipv4_sysctl_entries[i], "ping_group_range")) { netns.ipv4_sysctl[i]->type = SYSCTL_TYPE__CTL_STR; netns.ipv4_sysctl[i]->sarg = ping_group_range; + ping_group_range_id = i; } else { /* Need to handle this case when we have more sysctls */ BUG(); @@ -2338,6 +2376,12 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) if (ret < 0) goto err_free; + BUG_ON(ping_group_range_id == -1); + ret = ipv4_sysctls_ping_group_range_map_gid(netns.ipv4_sysctl[ping_group_range_id], + MAX_STR_IPV4_SYSCTL_LEN + 1); + if (ret < 0) + goto err_free; + ret = pb_write_one(img_from_set(fds, CR_FD_NETNS), &netns, PB_NETNS); err_free: xfree(o_buf); diff --git a/test/zdtm/static/netns_sub_sysctl.desc b/test/zdtm/static/netns_sub_sysctl.desc index 535842668..0c357aefe 100644 --- a/test/zdtm/static/netns_sub_sysctl.desc +++ b/test/zdtm/static/netns_sub_sysctl.desc @@ -1,4 +1,4 @@ { - 'flavor': 'ns', + 'flavor': 'ns uns', 'flags': 'suid' } From 
677a56891917b873dc30278be0063730d901717d Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 9 Jun 2025 21:17:57 -0700 Subject: [PATCH 623/775] zdtm/netns_sub_sysctl: skip unsupported sysctls net/unix/max_dgram_qlen can't be tuned from non-root userns before: v5.17-rc1~170^2~215 ("net: Enable max_dgram_qlen unix sysctl to be configurable by non-init user namespaces") Signed-off-by: Andrei Vagin --- test/zdtm/static/netns_sub_sysctl.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/zdtm/static/netns_sub_sysctl.c b/test/zdtm/static/netns_sub_sysctl.c index 0f94c40a7..03b478b7d 100644 --- a/test/zdtm/static/netns_sub_sysctl.c +++ b/test/zdtm/static/netns_sub_sysctl.c @@ -1,4 +1,6 @@ #include +#include +#include #include "zdtmtst.h" #include "sysctl.h" @@ -20,6 +22,7 @@ typedef struct { int new; char s_old[MAX_STR_SYSCTL_LEN]; char s_new[MAX_STR_SYSCTL_LEN]; + bool set; } sysctl_opt_t; #define CONF_UNIX_BASE "/proc/sys/net/unix" @@ -38,6 +41,11 @@ int main(int argc, char **argv) test_init(argc, argv); for (p = net_unix_params; p->path != NULL; p++) { + if (access(p->path, W_OK) != 0) { + test_msg("%s doesn't exist\n", p->path); + continue; + } + p->set = true; if (p->type == SYSCTL_INT) { p->old = (((unsigned)lrand48()) % 1023) + 1; if (sysctl_write_int(p->path, p->old)) { @@ -56,6 +64,8 @@ int main(int argc, char **argv) test_waitsig(); for (p = net_unix_params; p->path != NULL; p++) { + if (!p->set) + continue; if (p->type == SYSCTL_INT) { if (sysctl_read_int(p->path, &p->new)) ret = 1; From a80c54484559a8bf7670f246db897b2710bb8688 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=A4=B8=E0=A4=AE=E0=A5=80=E0=A4=B0=20=E0=A4=B8=E0=A4=BF?= =?UTF-8?q?=E0=A4=82=E0=A4=B9=20Sameer=20Singh?= Date: Fri, 27 Dec 2024 03:47:35 +0530 Subject: [PATCH 624/775] sk-inet: Add support for checkpoint/restore of ICMP sockets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently there is no option to checkpoint/restore programs 
that use ICMP sockets, such as `ping`. This patch adds support for the same. Fixes #2557 Signed-off-by: समीर सिंह Sameer Singh --- criu/sk-inet.c | 7 +++++-- criu/sockets.c | 34 ++++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 1238b03dc..6e0acf2ce 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -130,6 +130,8 @@ static int can_dump_ipproto(unsigned int ino, int proto, int type) case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: break; default: pr_err("Unsupported proto %d for socket %x\n", proto, ino); @@ -922,8 +924,9 @@ static int open_inet_sk(struct file_desc *d, int *new_fd) } if (ie->src_port) { - if (inet_bind(sk, ii)) - goto err; + if (ie->proto != IPPROTO_ICMP && ie->proto != IPPROTO_ICMPV6) + if (inet_bind(sk, ii)) + goto err; } /* diff --git a/criu/sockets.c b/criu/sockets.c index f9ce999be..0affccad0 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -65,7 +65,7 @@ const char *socket_proto_name(unsigned int proto, char *nm, size_t size) [IPPROTO_IPV6] = __stringify_1(IPPROTO_IPV6), [IPPROTO_RSVP] = __stringify_1(IPPROTO_RSVP), [IPPROTO_GRE] = __stringify_1(IPPROTO_GRE), [IPPROTO_ESP] = __stringify_1(IPPROTO_ESP), [IPPROTO_AH] = __stringify_1(IPPROTO_AH), [IPPROTO_UDPLITE] = __stringify_1(IPPROTO_UDPLITE), - [IPPROTO_RAW] = __stringify_1(IPPROTO_RAW), + [IPPROTO_RAW] = __stringify_1(IPPROTO_RAW), [IPPROTO_ICMPV6] = __stringify_1(IPPROTO_ICMPV6), }; return __socket_const_name(nm, size, protos, ARRAY_SIZE(protos), proto); } @@ -131,10 +131,12 @@ enum socket_cl_bits { INET_UDP_CL_BIT, INET_UDPLITE_CL_BIT, INET_RAW_CL_BIT, + INET_ICMP_CL_BIT, INET6_TCP_CL_BIT, INET6_UDP_CL_BIT, INET6_UDPLITE_CL_BIT, INET6_RAW_CL_BIT, + INET6_ICMP_CL_BIT, UNIX_CL_BIT, PACKET_CL_BIT, _MAX_CL_BIT, @@ -161,6 +163,8 @@ static inline enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsign return INET_UDPLITE_CL_BIT; if (proto == 
IPPROTO_RAW) return INET_RAW_CL_BIT; + if (proto == IPPROTO_ICMP) + return INET_ICMP_CL_BIT; } if (family == AF_INET6) { if (proto == IPPROTO_TCP) @@ -171,6 +175,8 @@ static inline enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsign return INET6_UDPLITE_CL_BIT; if (proto == IPPROTO_RAW) return INET6_RAW_CL_BIT; + if (proto == IPPROTO_ICMPV6) + return INET6_ICMP_CL_BIT; } pr_err("Unknown pair family %d proto %d\n", family, proto); @@ -282,6 +288,12 @@ void preload_socket_modules(void) req.r.i.sdiag_protocol = IPPROTO_RAW; probe_diag(nl, &req, -ENOENT); + req.r.i.sdiag_protocol = IPPROTO_ICMP; + probe_diag(nl, &req, -ENOENT); + + req.r.i.sdiag_protocol = IPPROTO_ICMPV6; + probe_diag(nl, &req, -ENOENT); + close(nl); pr_info("Done probing\n"); } @@ -773,6 +785,10 @@ static int inet_receive_one(struct nlmsghdr *h, struct ns_id *ns, void *arg) case IPPROTO_RAW: type = SOCK_RAW; break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + type = SOCK_DGRAM; + break; default: BUG_ON(1); return -1; @@ -797,7 +813,7 @@ static int collect_err(int err, struct ns_id *ns, void *arg) char family[32], proto[32]; char msg[256]; - snprintf(msg, sizeof(msg), "Sockects collect procedure family %s proto %s", + snprintf(msg, sizeof(msg), "Sockets collect procedure family %s proto %s", socket_family_name(gr->family, family, sizeof(family)), socket_proto_name(gr->protocol, proto, sizeof(proto))); @@ -905,6 +921,13 @@ int collect_sockets(struct ns_id *ns) if (tmp) err = tmp; + /* Collect IPv4 ICMP sockets */ + req.r.i.sdiag_family = AF_INET; + req.r.i.sdiag_protocol = IPPROTO_ICMP; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + set_collect_bit(req.r.n.sdiag_family, req.r.n.sdiag_protocol); + /* Collect IPv6 TCP sockets */ req.r.i.sdiag_family = AF_INET6; req.r.i.sdiag_protocol = IPPROTO_TCP; @@ -944,6 +967,13 @@ int collect_sockets(struct ns_id *ns) if (tmp) err = tmp; + /* Collect IPv6 ICMP sockets */ + req.r.i.sdiag_family = AF_INET6; + req.r.i.sdiag_protocol = 
IPPROTO_ICMPV6; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + set_collect_bit(req.r.n.sdiag_family, req.r.n.sdiag_protocol); + req.r.p.sdiag_family = AF_PACKET; req.r.p.sdiag_protocol = 0; req.r.p.pdiag_show = PACKET_SHOW_INFO | PACKET_SHOW_MCLIST | PACKET_SHOW_FANOUT | PACKET_SHOW_RING_CFG; From 3dc865bc80a4dfa5378bed9fe0434433d65379e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=A4=B8=E0=A4=AE=E0=A5=80=E0=A4=B0=20=E0=A4=B8=E0=A4=BF?= =?UTF-8?q?=E0=A4=82=E0=A4=B9=20Sameer=20Singh?= Date: Sat, 28 Dec 2024 09:35:11 +0530 Subject: [PATCH 625/775] test: add static tests for ICMP socket MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ZDTM static tests for IP4/ICMP and IP6/ICMP socket feature. Signed-off-by: समीर सिंह Sameer Singh Signed-off-by: Andrei Vagin --- test/zdtm/static/Makefile | 3 + test/zdtm/static/socket6_icmp.c | 1 + test/zdtm/static/socket_icmp.c | 128 ++++++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+) create mode 120000 test/zdtm/static/socket6_icmp.c create mode 100644 test/zdtm/static/socket_icmp.c diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 34fc90513..d427659e0 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -37,6 +37,8 @@ TST_NOFILE := \ socket_udp-corked \ socket6_udp \ socket_udp_shutdown \ + socket_icmp \ + socket6_icmp \ sk-freebind \ sk-freebind-false \ socket_udplite \ @@ -630,6 +632,7 @@ socket-tcp6-closed: CFLAGS += -D ZDTM_IPV6 socket-tcp6-closed: CFLAGS += -D ZDTM_IPV4V6 socket-tcp-closed-last-ack: CFLAGS += -D ZDTM_TCP_LAST_ACK socket-tcp-skip-in-flight: CFLAGS += -D ZDTM_IPV4V6 +socket6-icmp: CFLAGS += -DZDTM_IPV6 sock_ip_opts01: CFLAGS += -DZDTM_VAL_ZERO sock_tcp_opts01: CFLAGS += -DZDTM_VAL_ZERO tun_ns: CFLAGS += -DTUN_NS diff --git a/test/zdtm/static/socket6_icmp.c b/test/zdtm/static/socket6_icmp.c new file mode 120000 index 000000000..24d8fd806 --- /dev/null +++ b/test/zdtm/static/socket6_icmp.c @@ 
-0,0 +1 @@ +socket_icmp.c \ No newline at end of file diff --git a/test/zdtm/static/socket_icmp.c b/test/zdtm/static/socket_icmp.c new file mode 100644 index 000000000..f72e348bf --- /dev/null +++ b/test/zdtm/static/socket_icmp.c @@ -0,0 +1,128 @@ +#include "zdtmtst.h" + +const char *test_doc = "static test for ICMP socket\n"; +const char *test_author = "समीर सिंह Sameer Singh \n"; + +/* Description: + * Send a ping to localhost using ICMP socket + */ + +#include +#include +#include +#include +#if defined(ZDTM_IPV6) +#include +#else +#include +#endif +#include +#include +#include + +#include "sysctl.h" + +#define PACKET_SIZE 64 +#define RECV_TIMEOUT 1 + +static int echo_id = 1234; + +#if defined(ZDTM_IPV6) +#define TEST_ICMP_ECHOREPLY ICMP6_ECHOREPLY +#else +#define TEST_ICMP_ECHOREPLY ICMP_ECHOREPLY +#endif +int main(int argc, char **argv) +{ + int ret, sock, seq = 0; + char packet[PACKET_SIZE], recv_packet[PACKET_SIZE]; + + struct timeval tv; +#if defined(ZDTM_IPV6) + struct sockaddr_in6 addr, recv_addr; +#else + struct icmphdr icmp_header, *icmp_reply; +#endif + struct sockaddr_in addr, recv_addr; + socklen_t addr_len; + + // Allow GIDs 0-58468 to open an unprivileged ICMP socket + if (sysctl_write_str("/proc/sys/net/ipv4/ping_group_range", "0 58468")) + return -1; + + test_init(argc, argv); + +#if defined(ZDTM_IPV6) + sock = socket(PF_INET6, SOCK_DGRAM, IPPROTO_ICMPV6); +#else + sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP); +#endif + if (sock < 0) { + pr_perror("Can't create socket"); + return 1; + } + + tv.tv_sec = RECV_TIMEOUT; + tv.tv_usec = 0; + if (setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0) { + pr_perror("Can't set socket option"); + return 1; + } + + memset(&addr, 0, sizeof(addr)); + memset(&icmp_header, 0, sizeof(icmp_header)); +#if defined(ZDTM_IPV6) + addr.sin6_family = AF_INET6; + inet_pton(AF_INET6, "::1", &addr.sin6_addr); + + icmp_header.icmp6_type = ICMP6_ECHO_REQUEST; + icmp_header.icmp6_code = 0; + icmp_header.icmp6_id = 
echo_id; + icmp_header.icmp6_seq = seq; +#else + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr("127.0.0.1"); + + icmp_header.type = ICMP_ECHO; + icmp_header.code = 0; + icmp_header.un.echo.id = echo_id; + icmp_header.un.echo.sequence = seq; +#endif + + memcpy(packet, &icmp_header, sizeof(icmp_header)); + memset(packet + sizeof(icmp_header), 0xa5, + PACKET_SIZE - sizeof(icmp_header)); + + test_daemon(); + test_waitsig(); + + ret = sendto(sock, packet, PACKET_SIZE, 0, + (struct sockaddr *)&addr, sizeof(addr)); + + if (ret < 0) { + fail("Can't send"); + return 1; + } + + addr_len = sizeof(recv_addr); + + ret = recvfrom(sock, recv_packet, sizeof(recv_packet), 0, + (struct sockaddr *)&recv_addr, &addr_len); + + if (ret < 0) { + fail("Can't recv"); + return 1; + } + + icmp_reply = (struct icmphdr *)recv_packet; + + if (icmp_reply->type != ICMP_ECHOREPLY) { + fail("Got no ICMP_ECHO_REPLY"); + return 1; + } + + close(sock); + + pass(); + return 0; +} From e31828ed8ce4fc7cbac4d412f7e33911f3a63b17 Mon Sep 17 00:00:00 2001 From: Chuan Qiu Date: Thu, 12 Jun 2025 22:49:26 -0700 Subject: [PATCH 626/775] mount: Fix trailing / when a file is bind-mounted E.g. I have a /etc/hosts in workspace mounted from the host, and get the following message. (00.141008) 1: mnt-v2: Create plain mountpoint /tmp/.criu.mntns.K1biY1/mnt-0000000938 for 938 (00.141546) 1: mnt-v2: Mounting unsupported @938 (0) (00.141887) 1: mnt-v2: Bind /tmp/agent/1-d8c746c6fda3a8b2/workspace/etc/hosts/ to /tmp/.criu.mntns.K1biY1/mnt-0000000938 (00.142179) 1: Error (criu/mount-v2.c:319): mnt-v2: Failed to open_tree /tmp/agent/1-d8c746c6fda3a8b2/workspace/etc/hosts/: Not a directory (00.143774) Error (criu/cr-restore.c:2320): Restoring FAILED. 
Signed-off-by: Chuan Qiu --- criu/mount.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/criu/mount.c b/criu/mount.c index 06b959542..b643a7f26 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -888,7 +888,11 @@ static int resolve_external_mounts(struct mount_info *info) cut_root = cut_root_for_bind(m->root, match->root); - p = xsprintf("%s/%s", match->ns_mountpoint + 1, cut_root); + if (cut_root[0] == '\0') { + p = xstrdup(match->ns_mountpoint + 1); + } else { + p = xsprintf("%s/%s", match->ns_mountpoint + 1, cut_root); + } if (!p) return -1; From 455c67739914a6d504733605e6d70538204aa3cf Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Fri, 20 Jun 2025 13:44:32 +0800 Subject: [PATCH 627/775] zdtm: Add static/mnt_ext_file_bind_auto test The test creates a file bindmount in criu mntns and binds it into test mntns, this external file bindmount is autodetected and restored via "--external mnt[]" criu option. Note: In the previous patch we fixed the problem on this code path where file bindmount restore fails as there is an excess "/" in the source path. 
Signed-off-by: Pavel Tikhomirov --- test/zdtm/static/Makefile | 1 + test/zdtm/static/mnt_ext_file_bind_auto.c | 104 +++++++++++++++++++ test/zdtm/static/mnt_ext_file_bind_auto.desc | 4 + 3 files changed, 109 insertions(+) create mode 100644 test/zdtm/static/mnt_ext_file_bind_auto.c create mode 100644 test/zdtm/static/mnt_ext_file_bind_auto.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index d427659e0..ab69f389e 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -381,6 +381,7 @@ TST_FILE = \ sk-unix-listen02 \ sk-unix-listen03 \ sk-unix-listen04 \ + mnt_ext_file_bind_auto \ TST_DIR = \ cwd00 \ diff --git a/test/zdtm/static/mnt_ext_file_bind_auto.c b/test/zdtm/static/mnt_ext_file_bind_auto.c new file mode 100644 index 000000000..0c3b9f5fb --- /dev/null +++ b/test/zdtm/static/mnt_ext_file_bind_auto.c @@ -0,0 +1,104 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if external file mount works"; +const char *test_author = "Pavel Tikhomirov "; + +char *filename = "mnt_ext_file_bind_auto_bind_auto.file"; +TEST_OPTION(filename, string, "file name", 1); + +char *source = "mnt_ext_file_bind_auto_bind_auto.source"; + +int create_file(const char *path) +{ + int fd; + + fd = open(path, O_CREAT | O_RDWR, 0644); + if (fd < 0) { + pr_perror("open"); + return -1; + } + + close(fd); + return 0; +} + +int main(int argc, char **argv) +{ + char *zdtm_newns = getenv("ZDTM_NEWNS"); + char *tmp = "/tmp/zdtm_ext_file_bind_auto.tmp"; + char *sourcefile = "/tmp/zdtm_ext_file_bind_auto.file"; + char *root, tmpfile[PATH_MAX], testfile[PATH_MAX]; + + root = getenv("ZDTM_ROOT"); + if (root == NULL) { + pr_perror("root"); + return 1; + } + + if (!zdtm_newns) { + pr_perror("ZDTM_NEWNS is not set"); + return 1; + } else if (strcmp(zdtm_newns, "1")) { + goto test; + } + + /* Prepare file bindmount in criu root (source for external file bindmount) */ + mkdir(tmp, 0755); + 
if (mount(source, tmp, "tmpfs", 0, NULL)) { + pr_perror("mount tmpfs"); + return 1; + } + if (mount(NULL, tmp, NULL, MS_PRIVATE, NULL)) { + pr_perror("make private"); + return 1; + } + + sprintf(tmpfile, "%s/%s", tmp, filename); + if (create_file(tmpfile)) + return 1; + + if (create_file(sourcefile)) + return 1; + + if (mount(tmpfile, sourcefile, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } + + umount2(tmp, MNT_DETACH); + + /* Prepare file in test root (mount point for external file bindmount) */ + sprintf(testfile, "%s/%s", root, filename); + if (create_file(testfile)) + return 1; + + /* + * Create temporary mntns, next mounts will not show up in criu mntns + * and will be inherited into test mntns + */ + if (unshare(CLONE_NEWNS)) { + pr_perror("unshare"); + return 1; + } + + if (mount(sourcefile, testfile, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } +test: + test_init(argc, argv); + + test_daemon(); + test_waitsig(); + + pass(); + return 0; +} diff --git a/test/zdtm/static/mnt_ext_file_bind_auto.desc b/test/zdtm/static/mnt_ext_file_bind_auto.desc new file mode 100644 index 000000000..825b08127 --- /dev/null +++ b/test/zdtm/static/mnt_ext_file_bind_auto.desc @@ -0,0 +1,4 @@ +{ 'opts': '--external mnt[]', + 'feature': 'mnt_id', + 'flavor': 'ns uns', + 'flags': 'suid'} From 7fbf7b2be4afd5768599cf104b8e05e52c671479 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 11 Jul 2025 22:16:49 +0100 Subject: [PATCH 628/775] images: remove symlink for descriptor.proto Currently the build scripts create the following symlink: criu-4.1/images/google/protobuf/descriptor.proto -> /usr/include/google/protobuf/descriptor.proto This symlink points to a system-wide absolute-path target. Also, this symlink ends up in the release tarball. The tarball may later be downloaded and unpacked by e.g. OS distributions. If unpacking is done using Python 3.14+, it will fail. 
This happens because Python 3.14 will switch the default behavior of extractall() from "fully trusting the content of archive" to "disallow common attack vectors while extracting the archive". With this new behavior, extractall() raises an exception when at least one file in the archive extracts or points to outside of the extraction directory (these are called path traversal attacks and zip slip attacks). Reported-by: Dmitrii Kuvaiskii Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 7 ------- .lgtm.yml | 5 ----- images/Makefile | 17 ++++++++++++++++- images/google/protobuf/descriptor.proto | 1 - 4 files changed, 16 insertions(+), 14 deletions(-) delete mode 120000 images/google/protobuf/descriptor.proto diff --git a/.cirrus.yml b/.cirrus.yml index a4b53a54b..bddd5a3f1 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -15,7 +15,6 @@ task: setup_script: | scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-no-vdso @@ -33,7 +32,6 @@ task: memory: 8G setup_script: | - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel libuuid-devel @@ -67,7 +65,6 @@ task: setup_script: | scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-rawhide @@ -88,7 +85,6 
@@ task: setup_script: | scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-non-root @@ -101,7 +97,6 @@ task: script: uname -a build_script: | scripts/ci/apt-install make - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto make -C scripts/ci local task: @@ -113,7 +108,6 @@ task: script: uname -a build_script: | scripts/ci/apt-install make - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto make -C scripts/ci local CLANG=1 task: @@ -125,6 +119,5 @@ task: script: uname -a build_script: | scripts/ci/prepare-for-fedora-rawhide.sh - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto make -C scripts/ci/ local CC=gcc SKIP_CI_PREP=1 SKIP_CI_TEST=1 CD_TO_TOP=1 make -C test/zdtm -j 4 diff --git a/.lgtm.yml b/.lgtm.yml index 0dd49cda4..4beadcc63 100644 --- a/.lgtm.yml +++ b/.lgtm.yml @@ -23,8 +23,3 @@ extraction: - "python3-yaml" - "libnl-route-3-dev" - "gnutls-dev" - configure: - command: - - "ls -laR images/google" - - "ln -s /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto" - - "ls -laR images/google" diff --git a/images/Makefile b/images/Makefile index d966fbfca..e94346eee 100644 --- a/images/Makefile +++ b/images/Makefile @@ -58,7 +58,6 @@ proto-obj-y += ext-file.o proto-obj-y += cgroup.o proto-obj-y += userns.o proto-obj-y += pidns.o -proto-obj-y += google/protobuf/descriptor.o # To make protoc happy and compile opts.proto proto-obj-y += opts.o proto-obj-y += seccomp.o proto-obj-y += binfmt-misc.o @@ -91,6 +90,22 @@ endef makefile-deps := Makefile $(obj)/Makefile +# +# Generate descriptor.pb-c.c and descriptor.pb-c.h to compile opts.proto. 
+PROTOBUF_DIR := images/google +DESCRIPTOR_DIR := $(PROTOBUF_DIR)/protobuf +GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf +$(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto + $$(Q) echo "Generating descriptor.pb-c.c" + $$(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< + +cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d + +submrproper: + $$(Q) rm -rf $(PROTOBUF_DIR) +.PHONY: submrproper +mrproper: submrproper + # # Generates rules needed to compile protobuf files. define gen-proto-rules diff --git a/images/google/protobuf/descriptor.proto b/images/google/protobuf/descriptor.proto deleted file mode 120000 index 07a4c9add..000000000 --- a/images/google/protobuf/descriptor.proto +++ /dev/null @@ -1 +0,0 @@ -/usr/include/google/protobuf/descriptor.proto \ No newline at end of file From 21c3b9c005d64db9bb6f998951384919f28d957f Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 22 Jul 2025 20:14:45 -0700 Subject: [PATCH 629/775] images/Makefile: fix using $(Q) Commit 68f92b551 used `$$(Q)` instead of `$(Q)` in the Makefile target, which resulted in the following error: $(Q) echo "Generating descriptor.pb-c.c" /bin/sh: 1: Q: not found Generating descriptor.pb-c.c $(Q) protoc --proto_path=/usr/include --proto_path=images/ --c_out=images/ /usr/include/google/protobuf/descriptor.proto /bin/sh: 1: Q: not found as well as: $(Q) rm -rf images/google /bin/sh: line 1: Q: command not found Fix it. 
Signed-off-by: Kir Kolyshkin --- images/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/images/Makefile b/images/Makefile index e94346eee..cb30a5126 100644 --- a/images/Makefile +++ b/images/Makefile @@ -96,13 +96,13 @@ PROTOBUF_DIR := images/google DESCRIPTOR_DIR := $(PROTOBUF_DIR)/protobuf GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf $(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto - $$(Q) echo "Generating descriptor.pb-c.c" - $$(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< + $(Q) echo "Generating descriptor.pb-c.c" + $(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d submrproper: - $$(Q) rm -rf $(PROTOBUF_DIR) + $(Q) rm -rf $(PROTOBUF_DIR) .PHONY: submrproper mrproper: submrproper From 066bf7bf3c68c899644aab9ecfa9d7c8d551ea8f Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 22 Jul 2025 22:44:50 -0700 Subject: [PATCH 630/775] Keep images/google/protobuf directory Commit 68f92b551 removed images/google/protobuf directory, so it is re-created each time during the build process. This resulted in a weird behavior change. Previously, one could do something like this: git clone $CRURL criu (cd criu && sudo make install-criu) rm -rf criu This worked fine, including running rm -rf as a non-root user, since no new directories were created under criu -- all directories were still owned by the original user. Since commit 68f92b551 the same sequence fails: rm: cannot remove '/home/runner/criu/images/google/protobuf/descriptor.pb-c.c': Permission denied rm: cannot remove '/home/runner/criu/images/google/protobuf/descriptor.pb-c.d': Permission denied rm: cannot remove '/home/runner/criu/images/google/protobuf/descriptor.pb-c.h': Permission denied A workaround is to keep empty images/google/protobuf directory, which is what this commit does. 
Signed-off-by: Kir Kolyshkin --- .gitignore | 2 -- images/Makefile | 5 ++--- images/google/protobuf/.gitignore | 2 ++ 3 files changed, 4 insertions(+), 5 deletions(-) create mode 100644 images/google/protobuf/.gitignore diff --git a/.gitignore b/.gitignore index 854657d1c..94daa13ea 100644 --- a/.gitignore +++ b/.gitignore @@ -20,8 +20,6 @@ compel/compel compel/compel-host-bin images/*.c images/*.h -images/google/protobuf/*.c -images/google/protobuf/*.h .gitid criu/criu criu/unittest/unittest diff --git a/images/Makefile b/images/Makefile index cb30a5126..6f310e553 100644 --- a/images/Makefile +++ b/images/Makefile @@ -92,8 +92,7 @@ makefile-deps := Makefile $(obj)/Makefile # # Generate descriptor.pb-c.c and descriptor.pb-c.h to compile opts.proto. -PROTOBUF_DIR := images/google -DESCRIPTOR_DIR := $(PROTOBUF_DIR)/protobuf +DESCRIPTOR_DIR := images/google/protobuf GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf $(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto $(Q) echo "Generating descriptor.pb-c.c" @@ -102,7 +101,7 @@ $(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d submrproper: - $(Q) rm -rf $(PROTOBUF_DIR) + $(Q) rm -f $(DESCRIPTOR_DIR)/* .PHONY: submrproper mrproper: submrproper diff --git a/images/google/protobuf/.gitignore b/images/google/protobuf/.gitignore new file mode 100644 index 000000000..68359a786 --- /dev/null +++ b/images/google/protobuf/.gitignore @@ -0,0 +1,2 @@ +*.c +*.h From 22c83e3eba403c1402826dc9edd770d74965879c Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Tue, 22 Jul 2025 23:07:37 -0700 Subject: [PATCH 631/775] images/Makefile: use msg-gen In general, we use "$(E)" instead of "$(Q) echo", but we also have a msg-gen macro which can be used here. 
Signed-off-by: Kir Kolyshkin --- images/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images/Makefile b/images/Makefile index 6f310e553..2c33152e9 100644 --- a/images/Makefile +++ b/images/Makefile @@ -95,7 +95,7 @@ makefile-deps := Makefile $(obj)/Makefile DESCRIPTOR_DIR := images/google/protobuf GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf $(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto - $(Q) echo "Generating descriptor.pb-c.c" + $(call msg-gen, $@) $(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d From 95d5e2e59b1b83ba5400e7eea6db57f77424fb80 Mon Sep 17 00:00:00 2001 From: Ignacio Moreno Gonzalez Date: Wed, 16 Jul 2025 16:32:25 +0200 Subject: [PATCH 632/775] compel: flush caches after parasite injection After the CRIU process saves the parasite code for the target thread in the shared mmap, it is necessary to call __clear_cache before the target thread executes the code. Without this step, the target thread may not see the correct code to execute, which can result in a SIGILL signal. For the specific arm64 case, this is important so that the newly copied code is flushed from d-cache to RAM, so that the target thread sees the new code. The change is based on commit 6be10a2 by @fu.lin and on input received from @adrianreber. 
[ avagin: tweak code comment ] Signed-off-by: Ignacio Moreno Gonzalez Signed-off-by: Andrei Vagin --- compel/src/lib/infect.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 4ea27bc63..22fcf24fa 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -1054,6 +1054,16 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, memcpy(ctl->local_map, ctl->pblob.hdr.mem, ctl->pblob.hdr.bsize); compel_relocs_apply(ctl->local_map, ctl->remote_map, &ctl->pblob); + /* + * Ensure the infected thread sees the updated code. + * + * On architectures like ARM64, the Data Cache (D-cache) and + * Instruction Cache (I-cache) are not automatically coherent. + * Modifications land in the D-cache, so we must flush (clean) the + * D-cache to push changes to RAM to ensure the CPU fetches the updated + * instructions. + */ + __builtin___clear_cache(ctl->local_map, ctl->local_map + ctl->pblob.hdr.bsize); p = parasite_size; From 64276874d89825452baee6c756046e1277a41c48 Mon Sep 17 00:00:00 2001 From: Ignacio Moreno Gonzalez Date: Wed, 16 Jul 2025 16:38:13 +0200 Subject: [PATCH 633/775] restore: flush caches during restore See the previous commit for rationale and architecture-specific details. [ avagin: tweak code comment ] Signed-off-by: Ignacio Moreno Gonzalez Signed-off-by: Andrei Vagin --- criu/cr-restore.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index c1d1f4b9d..b37603563 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2569,6 +2569,17 @@ static int remap_restorer_blob(void *addr) restorer_setup_c_header_desc(&pbd, true); compel_relocs_apply(addr, addr, &pbd); + /* + * Ensure the infected thread sees the updated code. + * + * On architectures like ARM64, the Data Cache (D-cache) and + * Instruction Cache (I-cache) are not automatically coherent. 
+ * Modifications land in the D-cache, so we must flush (clean) the + * D-cache to push changes to RAM to ensure the CPU fetches the updated + * instructions. + */ + __builtin___clear_cache(addr, addr + pbd.hdr.bsize); + return 0; } From 0d1e280d09d1a7422f9706cadb332586d520c352 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 25 Jul 2025 07:53:55 +0100 Subject: [PATCH 634/775] vagrant: fix 'qemu' install Installing this package currently fails with the following message: Package qemu is not available, but is referred to by another package. This may mean that the package is missing, has been obsoleted, or is only available from another source E: Package 'qemu' has no installation candidate Signed-off-by: Radostin Stoyanov --- scripts/ci/vagrant.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index ed5a01178..c3e15007c 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -22,7 +22,7 @@ setup() { wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb - ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu \ + ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu-system \ ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base \ openssh-client systemctl restart libvirtd From 2762b21e4a529f14b845f5bfe5153864d59b3e02 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 25 Jul 2025 08:34:31 +0100 Subject: [PATCH 635/775] vagrant: update image to fedora 42 Signed-off-by: Radostin Stoyanov --- scripts/ci/vagrant.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index c3e15007c..81af5d2e5 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -6,9 +6,9 @@ set -e set -x 
-VAGRANT_VERSION=2.4.1 -FEDORA_VERSION=40 -FEDORA_BOX_VERSION=40.20240414.0 +VAGRANT_VERSION=2.4.7 +FEDORA_VERSION=42 +FEDORA_BOX_VERSION=1.1.0 setup() { if [ -n "$TRAVIS" ]; then @@ -27,7 +27,7 @@ setup() { openssh-client systemctl restart libvirtd vagrant plugin install vagrant-libvirt - vagrant init fedora/${FEDORA_VERSION}-cloud-base --box-version ${FEDORA_BOX_VERSION} + vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} # The default libvirt Vagrant VM uses 512MB. # Travis VMs should have around 7.5GB. # Increasing it to 4GB should work. From d586b30c6bede3767f86ef40217d462085b734e7 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 25 Jul 2025 08:50:29 +0100 Subject: [PATCH 636/775] vagrant: fix tar including archive in itself The tar command was failing with the following message: $ tar cf criu.tar ../../../criu tar: Removing leading `../../../' from member names tar: ../../../criu/scripts/ci/criu.tar: archive cannot contain itself; not dumped In addition, the /vagrant directory no longer exists in the new Fedora images. bash: line 1: cd: /vagrant: No such file or directory Signed-off-by: Radostin Stoyanov --- scripts/ci/vagrant.sh | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 81af5d2e5..008a01fb3 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -17,7 +17,7 @@ setup() { fi # Tar up the git checkout to have vagrant rsync it to the VM - tar cf criu.tar ../../../criu + tar cf /tmp/criu.tar -C ../../../ criu # Cirrus has problems with the following certificate. 
wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb @@ -28,10 +28,16 @@ setup() { systemctl restart libvirtd vagrant plugin install vagrant-libvirt vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} + # The default libvirt Vagrant VM uses 512MB. # Travis VMs should have around 7.5GB. # Increasing it to 4GB should work. sed -i Vagrantfile -e 's,^end$, config.vm.provider :libvirt do |libvirt|'"\n"' libvirt.memory = 4096;end'"\n"'end,g' + # Sync /tmp/criu.tar into the VM + # We want to use $HOME without expansion + # shellcheck disable=SC2016 + sed -i Vagrantfile -e 's|^end$| config.vm.provision "file", source: "/tmp/criu.tar", destination: "$HOME/criu.tar"'"\n"'end|g' + vagrant up --provider=libvirt --no-tty mkdir -p /root/.ssh vagrant ssh-config >> /root/.ssh/config @@ -40,8 +46,11 @@ setup() { libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ protobuf-devel python3-protobuf python3-importlib-metadata python3-junit_xml \ rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml libuuid-devel + # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd + + ssh default 'sudo mkdir -p --mode=777 /vagrant && mv $HOME/criu.tar /vagrant && cd /vagrant && tar xf criu.tar' ssh default cat /proc/cmdline } @@ -49,7 +58,7 @@ fedora-no-vdso() { ssh default sudo grubby --update-kernel ALL --args="vdso=0" vagrant reload ssh default cat /proc/cmdline - ssh default 'cd /vagrant; tar xf criu.tar; cd criu; make -j 4' + ssh default 'cd /vagrant/criu; make -j' ssh default 'cd /vagrant/criu/test; sudo ./zdtm.py run -a --keep-going' # This test (pidfd_store_sk) requires pidfd_getfd syscall which is guaranteed in Fedora 33. 
# It is also skipped from -a because it runs in RPC mode only @@ -74,12 +83,12 @@ fedora-rawhide() { # In the container it is not possible to change the state of selinux. # Let's just disable it for this test run completely. ssh default 'sudo setenforce Permissive' - ssh default 'cd /vagrant; tar xf criu.tar; cd criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' + ssh default 'cd /vagrant/criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' } fedora-non-root() { ssh default uname -a - ssh default 'cd /vagrant; tar xf criu.tar; cd criu; make -j 4' + ssh default 'cd /vagrant/criu; make -j' # Setting the capability should be the only line needed to run as non-root on Fedora # In other environments either set /proc/sys/kernel/yama/ptrace_scope to 0 or grant cap_sys_ptrace to criu ssh default 'sudo setcap cap_checkpoint_restore+eip /vagrant/criu/criu/criu' From c6c6f6f231c8142ea8ee562e92c0bd4b6984f113 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 10 Aug 2025 21:46:39 +0000 Subject: [PATCH 637/775] zdtm/socket-tcp-closing: fill socket buffers effectively Send large chunks to fill socket buffers. 
Signed-off-by: Andrei Vagin --- test/zdtm/static/socket-tcp-closing.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/zdtm/static/socket-tcp-closing.c b/test/zdtm/static/socket-tcp-closing.c index 87e1d7533..df291d446 100644 --- a/test/zdtm/static/socket-tcp-closing.c +++ b/test/zdtm/static/socket-tcp-closing.c @@ -31,10 +31,13 @@ static int port = 8880; int fill_sock_buf(int fd) { + char zdtm[512]; int flags; int size; int ret; + memset(zdtm, 5, sizeof(zdtm)); + flags = fcntl(fd, F_GETFL, 0); if (flags == -1) { pr_perror("Can't get flags"); @@ -47,7 +50,6 @@ int fill_sock_buf(int fd) size = 0; while (1) { - char zdtm[] = "zdtm test packet"; ret = write(fd, zdtm, sizeof(zdtm)); if (ret == -1) { if (errno == EAGAIN) From 5f94dd71e7fc59f31633faab57d59b924c3f0273 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 10 Aug 2025 21:50:41 +0000 Subject: [PATCH 638/775] CI: Consolidate arm64 tests on GitHub runners The arm64 tests are currently being executed on both actuated and GitHub runners. This change removes the actuated runner to avoid redundancy and streamline our CI process. Signed-off-by: Andrei Vagin --- .github/workflows/aarch64-test.yaml | 32 +++++++++++ .github/workflows/actuated-aarch64-test.yaml | 58 -------------------- 2 files changed, 32 insertions(+), 58 deletions(-) create mode 100644 .github/workflows/aarch64-test.yaml delete mode 100644 .github/workflows/actuated-aarch64-test.yaml diff --git a/.github/workflows/aarch64-test.yaml b/.github/workflows/aarch64-test.yaml new file mode 100644 index 000000000..32b19e176 --- /dev/null +++ b/.github/workflows/aarch64-test.yaml @@ -0,0 +1,32 @@ +name: aarch64 test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. 
+concurrency: + group: aarch64-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-24.04-arm + strategy: + matrix: + target: [GCC=1, CLANG=1] + + steps: + - uses: actions/checkout@v4 + - name: Run Tests ${{ matrix.target }} + # Following tests are failing on the VMs: + # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out + # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) + # + # In combination with '--remote-lazy-pages' following error occurs: + # 138: FAIL: maps05.c:84: Data corrupted at page 1639 (errno = 11 (Resource temporarily unavailable)) + run: | + # The 'sched_policy00' needs the following: + sudo sysctl -w kernel.sched_rt_runtime_us=-1 + # etc/hosts entry is needed for netns_lock_iptables + echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts + sudo -E make -C scripts/ci local ${{ matrix.target }} RUN_TESTS=1 \ + ZDTM_OPTS="-x zdtm/static/change_mnt_context -x zdtm/static/maps05" diff --git a/.github/workflows/actuated-aarch64-test.yaml b/.github/workflows/actuated-aarch64-test.yaml deleted file mode 100644 index 567746a5f..000000000 --- a/.github/workflows/actuated-aarch64-test.yaml +++ /dev/null @@ -1,58 +0,0 @@ -name: aarch64 test - -on: [push, pull_request] - -# Cancel any preceding run on the pull request. -concurrency: - group: actuated-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - -jobs: - build: - # Actuated runners are not available in all repositories. - if: ${{ github.repository == 'checkpoint-restore/criu' }} - # The memory size and the number of CPUs can be freely selected for - # the actuated runners. 3GB and 4 CPUs seems to be enough according to the - # result from 'vmmeter'. 
- runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [actuated-arm64-4cpu-3gb, ubuntu-24.04-arm] - target: [GCC=1, CLANG=1] - - steps: - # https://gist.github.com/alexellis/1f33e581c75e11e161fe613c46180771#file-metering-gha-md - # vmmeter start - - name: Prepare arkade - if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} - uses: alexellis/arkade-get@master - with: - crane: latest - print-summary: false - - - name: Install vmmeter - if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} - run: | - crane export --platform linux/arm64 ghcr.io/openfaasltd/vmmeter:latest | sudo tar -xvf - -C /usr/local/bin - - - name: Run vmmeter - if: ${{ matrix.os == 'actuated-arm64-4cpu-3gb' }} - uses: self-actuated/vmmeter-action@master - # vmmeter end - - - uses: actions/checkout@v4 - - name: Run Tests ${{ matrix.target }}/${{ matrix.os }} - # Following tests are failing on the actuated VMs: - # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out - # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) - # - # In combination with '--remote-lazy-pages' following error occurs: - # 138: FAIL: maps05.c:84: Data corrupted at page 1639 (errno = 11 (Resource temporarily unavailable)) - run: | - # The 'sched_policy00' needs the following: - sudo sysctl -w kernel.sched_rt_runtime_us=-1 - # etc/hosts entry is needed for netns_lock_iptables - echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts - sudo -E make -C scripts/ci local ${{ matrix.target }} RUN_TESTS=1 \ - ZDTM_OPTS="-x zdtm/static/change_mnt_context -x zdtm/static/maps05" From fce491113bcb5bfe95e078ba92e2601b7f671c23 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 6 Apr 2025 20:22:57 +0200 Subject: [PATCH 639/775] criu/include/mman: define MADV_GUARD_INSTALL Signed-off-by: Alexander Mikhalitsyn --- criu/include/mman.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/criu/include/mman.h b/criu/include/mman.h index 086753bcf..43e0b6cc7 100644 --- 
a/criu/include/mman.h +++ b/criu/include/mman.h @@ -19,5 +19,8 @@ #ifndef MADV_WIPEONFORK #define MADV_WIPEONFORK 18 #endif +#ifndef MADV_GUARD_INSTALL +#define MADV_GUARD_INSTALL 102 +#endif #endif /* __CR_MMAN_H__ */ From 2bb77daa92d26266d32e08ac21c0ed91f438a945 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sat, 19 Apr 2025 20:37:48 +0200 Subject: [PATCH 640/775] kerndat: add madvise(MADV_GUARD_INSTALL) feature-detection Signed-off-by: Alexander Mikhalitsyn --- criu/include/kerndat.h | 1 + criu/kerndat.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index c5deb3283..66db75649 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -91,6 +91,7 @@ struct kerndat_s { bool has_close_range; bool has_timer_cr_ids; bool has_breakpoints; + bool has_madv_guard; }; extern struct kerndat_s kdat; diff --git a/criu/kerndat.c b/criu/kerndat.c index fa43f7d3f..7e2edb72d 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -31,6 +31,7 @@ #include "kerndat.h" #include "fs-magic.h" #include "mem.h" +#include "mman.h" #include "common/compiler.h" #include "sysctl.h" #include "cr_options.h" @@ -1813,6 +1814,33 @@ err: return exit_code; } +static int kerndat_has_madv_guard(void) +{ + void *map; + + map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (map == MAP_FAILED) { + pr_perror("Can't mmap a page for has_madv_guard feature test"); + return -1; + } + + if (madvise(map, PAGE_SIZE, MADV_GUARD_INSTALL)) { + if (errno != EINVAL) { + pr_perror("madvise failed (has_madv_guard check)"); + goto mmap_cleanup; + } + } else { + kdat.has_madv_guard = true; + } + + munmap(map, PAGE_SIZE); + return 0; + +mmap_cleanup: + munmap(map, PAGE_SIZE); + return -1; +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. 
There are cases that we cannot determine the @@ -2081,6 +2109,10 @@ int kerndat_init(void) pr_err("kerndat_breakpoints has failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_has_madv_guard()) { + pr_err("kerndat_has_madv_guard has failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); From 4fc07a8a41f468b72e912fe38c96be18d37518d6 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sat, 19 Apr 2025 20:36:45 +0200 Subject: [PATCH 641/775] kerndat: add pagemap_scan_guard_pages feature check logic Signed-off-by: Alexander Mikhalitsyn --- criu/cr-check.c | 8 ++++++++ criu/include/kerndat.h | 3 +++ criu/include/pagemap_scan.h | 1 + criu/kerndat.c | 12 ++++++++++++ 4 files changed, 24 insertions(+) diff --git a/criu/cr-check.c b/criu/cr-check.c index 9c4778490..7c3dc76dd 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -1599,6 +1599,12 @@ static int check_breakpoints(void) return 0; } +static int check_pagemap_scan_guard_pages(void) +{ + kerndat_warn_about_madv_guards(); + + return kdat.has_pagemap_scan_guard_pages ? 
0 : -1; +} static int (*chk_feature)(void); @@ -1724,6 +1730,7 @@ int cr_check(void) ret |= check_pagemap_scan(); ret |= check_overlayfs_maps(); ret |= check_timer_cr_ids(); + ret |= check_pagemap_scan_guard_pages(); if (kdat.lsm == LSMTYPE__APPARMOR) ret |= check_apparmor_stacking(); @@ -1853,6 +1860,7 @@ static struct feature_list feature_list[] = { { "timer_cr_ids", check_timer_cr_ids }, { "overlayfs_maps", check_overlayfs_maps }, { "breakpoints", check_breakpoints }, + { "pagemap_scan_guard_pages", check_pagemap_scan_guard_pages }, { NULL, NULL }, }; diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 66db75649..e4922f401 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -92,6 +92,7 @@ struct kerndat_s { bool has_timer_cr_ids; bool has_breakpoints; bool has_madv_guard; + bool has_pagemap_scan_guard_pages; }; extern struct kerndat_s kdat; @@ -114,4 +115,6 @@ extern int kerndat_fs_virtualized(unsigned int which, u32 kdev); extern int kerndat_has_nspid(void); +extern void kerndat_warn_about_madv_guards(void); + #endif /* __CR_KERNDAT_H__ */ diff --git a/criu/include/pagemap_scan.h b/criu/include/pagemap_scan.h index 0ad4c9bc0..9046e01ed 100644 --- a/criu/include/pagemap_scan.h +++ b/criu/include/pagemap_scan.h @@ -14,6 +14,7 @@ #define PAGE_IS_PFNZERO (1 << 5) #define PAGE_IS_HUGE (1 << 6) #define PAGE_IS_SOFT_DIRTY (1 << 7) +#define PAGE_IS_GUARD (1 << 8) /* * struct page_region - Page region with flags diff --git a/criu/kerndat.c b/criu/kerndat.c index 7e2edb72d..997181ce7 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -87,6 +87,10 @@ static int check_pagemap(void) if (ioctl(fd, PAGEMAP_SCAN, &args) == 0) { pr_debug("PAGEMAP_SCAN is supported\n"); kdat.has_pagemap_scan = true; + + args.return_mask |= PAGE_IS_GUARD; + if (ioctl(fd, PAGEMAP_SCAN, &args) == 0) + kdat.has_pagemap_scan_guard_pages = true; } else { switch (errno) { case EINVAL: @@ -1841,6 +1845,14 @@ mmap_cleanup: return -1; } +void 
kerndat_warn_about_madv_guards(void) +{ + if (kdat.has_madv_guard && !kdat.has_pagemap_scan_guard_pages) + pr_warn("ioctl(PAGEMAP_SCAN) doesn't support PAGE_IS_GUARD flag. " + "CRIU dump will fail if dumped processes use madvise(MADV_GUARD_INSTALL). " + "Please, consider updating your kernel.\n"); +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. There are cases that we cannot determine the From 1873e8f502f2495d8792716df277664f6e3c4852 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sat, 19 Apr 2025 20:42:43 +0200 Subject: [PATCH 642/775] cr-dump: warn if MADV_GUARD is supported but isn't shown in pagemap Signed-off-by: Alexander Mikhalitsyn --- criu/cr-dump.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index b8cf7d64d..f02db1a57 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2124,6 +2124,8 @@ int cr_dump_tasks(pid_t pid) int pre_dump_ret = 0; int ret = -1; + kerndat_warn_about_madv_guards(); + pr_info("========================================\n"); pr_info("Dumping processes (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); pr_info("========================================\n"); From 42580fcb1614a002c54ab0115e81a77a81871418 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 6 Apr 2025 20:51:24 +0200 Subject: [PATCH 643/775] criu/pagemap-cache: pagescan: look for PAGE_IS_GUARD pages Signed-off-by: Alexander Mikhalitsyn --- criu/pagemap-cache.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c index f04a517de..457c0d649 100644 --- a/criu/pagemap-cache.c +++ b/criu/pagemap-cache.c @@ -194,6 +194,9 @@ int pmc_fill(pmc_t *pmc, u64 start, u64 end) }; long ret; + if (kdat.has_pagemap_scan_guard_pages) + args.return_mask |= PAGE_IS_GUARD; + ret = ioctl(pmc->fd, PAGEMAP_SCAN, &args); if (ret == -1) { pr_perror("PAGEMAP_SCAN"); From 5843cbf97552f8ddb794931a8daf179aae71d78d Mon Sep 17 00:00:00 2001 From: Alexander 
Mikhalitsyn Date: Thu, 1 May 2025 20:02:37 +0200 Subject: [PATCH 644/775] criu/mem: refactor should_dump_page helper Make should_dump_page to return int to indicate failure, also return useful data back through the struct page_info structure passed as a pointer. Also, correspondingly convert all call sites. No functional changes intended, except fixing a bug in should_dump_page() as it could return (-1) when pmc_fill() fails, while caller didn't expect that before. Signed-off-by: Alexander Mikhalitsyn --- criu/include/mem.h | 8 +++++- criu/mem.c | 69 ++++++++++++++++++++++++++++++---------------- criu/shmem.c | 27 +++++++++++------- 3 files changed, 69 insertions(+), 35 deletions(-) diff --git a/criu/include/mem.h b/criu/include/mem.h index 3618c9cc3..0ce97822b 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -49,5 +49,11 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); int unmap_guard_pages(struct pstree_item *t); int prepare_mappings(struct pstree_item *t); -u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty); +struct page_info { + u64 next; + bool softdirty; +}; + +int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *page_info); + #endif /* __CR_MEM_H__ */ diff --git a/criu/mem.c b/criu/mem.c index 803cb545b..9fcf7a44c 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -115,27 +115,37 @@ static bool should_dump_entire_vma(VmaEntry *vmae) } /* - * should_dump_page returns vaddr if an addressed page has to be dumped. - * Otherwise, it returns an address that has to be inspected next. + * should_dump_page writes vaddr in page_info->next if an addressed page has to be dumped. + * Otherwise, it writes an address that has to be inspected next. 
*/ -u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty) +int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *page_info) { + if (!page_info) + goto err; + if (vaddr >= pmc->end && pmc_fill(pmc, vaddr, vmae->end)) - return -1; + goto err; if (pmc->regs) { while (1) { - if (pmc->regs_idx == pmc->regs_len) - return pmc->end; + if (pmc->regs_idx == pmc->regs_len) { + page_info->next = pmc->end; + return 0; + } + if (vaddr < pmc->regs[pmc->regs_idx].end) break; pmc->regs_idx++; } - if (vaddr < pmc->regs[pmc->regs_idx].start) - return pmc->regs[pmc->regs_idx].start; - if (softdirty) - *softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; - return vaddr; + + if (vaddr < pmc->regs[pmc->regs_idx].start) { + page_info->next = pmc->regs[pmc->regs_idx].start; + return 0; + } + + page_info->softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; + page_info->next = vaddr; + return 0; } else { u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)]; @@ -143,16 +153,26 @@ u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, bool *softdirty) * Optimisation for private mapping pages, that haven't * yet being COW-ed */ - if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) - return vaddr + PAGE_SIZE; - if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) { - if (softdirty) - *softdirty = pme & PME_SOFT_DIRTY; - return vaddr; + if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) { + page_info->next = vaddr + PAGE_SIZE; + return 0; } - return vaddr + PAGE_SIZE; + if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) { + page_info->softdirty = pme & PME_SOFT_DIRTY; + page_info->next = vaddr; + return 0; + } + + page_info->next = vaddr + PAGE_SIZE; + return 0; } + +err: + pr_err("should_dump_page failed on vma " + "%#016" PRIx64 "-%#016" PRIx64 " vaddr=%#016" PRIx64 "\n", + vmae->start, vmae->end, vaddr); + return -1; } bool page_is_zero(u64 pme) @@ -202,14 +222,15 @@ 
static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct nr_scanned = 0; for (vaddr = *pvaddr; vaddr < vma->e->end; vaddr += PAGE_SIZE, nr_scanned++) { unsigned int ppb_flags = 0; - bool softdirty = false; - u64 next; + struct page_info page_info = {}; int st; /* If dump_all_pages is true, should_dump_page is called to get pme. */ - next = should_dump_page(pmc, vma->e, vaddr, &softdirty); - if (!dump_all_pages && next != vaddr) { - vaddr = next - PAGE_SIZE; + if (should_dump_page(pmc, vma->e, vaddr, &page_info)) + return -1; + + if (!dump_all_pages && page_info.next != vaddr) { + vaddr = page_info.next - PAGE_SIZE; continue; } @@ -223,7 +244,7 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct * page. The latter would be checked in page-xfer. */ - if (has_parent && page_in_parent(softdirty)) { + if (has_parent && page_in_parent(page_info.softdirty)) { ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT); st = 0; } else { diff --git a/criu/shmem.c b/criu/shmem.c index 9e3178352..bc7aa3669 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -206,31 +206,34 @@ static int expand_shmem(struct shmem_info *si, unsigned long new_size) return 0; } -static void update_shmem_pmaps(struct shmem_info *si, pmc_t *pmc, VmaEntry *vma) +static int update_shmem_pmaps(struct shmem_info *si, pmc_t *pmc, VmaEntry *vma) { unsigned long shmem_pfn, vma_pfn, vma_pgcnt; u64 vaddr; if (!is_shmem_tracking_en()) - return; + return 0; vma_pgcnt = DIV_ROUND_UP(si->size - vma->pgoff, PAGE_SIZE); for (vma_pfn = 0, vaddr = vma->start; vma_pfn < vma_pgcnt; ++vma_pfn, vaddr += PAGE_SIZE) { - bool softdirty = false; - u64 next; + struct page_info page_info = {}; - next = should_dump_page(pmc, vma, vaddr, &softdirty); - if (next != vaddr) { - vaddr = next - PAGE_SIZE; + if (should_dump_page(pmc, vma, vaddr, &page_info)) + return -1; + + if (page_info.next != vaddr) { + vaddr = page_info.next - PAGE_SIZE; continue; } shmem_pfn = vma_pfn + 
DIV_ROUND_UP(vma->pgoff, PAGE_SIZE); - if (softdirty) + if (page_info.softdirty) set_pstate(si->pstate_map, shmem_pfn, PST_DIRTY); else set_pstate(si->pstate_map, shmem_pfn, PST_DUMP); } + + return 0; } int collect_sysv_shmem(unsigned long shmid, unsigned long size) @@ -667,7 +670,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) if (expand_shmem(si, size)) return -1; } - update_shmem_pmaps(si, pmc, vma); + + if (update_shmem_pmaps(si, pmc, vma)) + return -1; return 0; } @@ -684,7 +689,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) if (expand_shmem(si, size)) return -1; - update_shmem_pmaps(si, pmc, vma); + + if (update_shmem_pmaps(si, pmc, vma)) + return -1; return 0; } From cc047d595f742e416220d2d7740334500eb96a85 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 6 Apr 2025 20:42:26 +0200 Subject: [PATCH 645/775] criu/mem: dump: skip MADV_GUARD pages content dump 1. get info about MADV_GUARD_INSTALL-protected pages with the help of pagemap by looking for PME_GUARD_REGION flag if /proc/<pid>/pagemap is used or by looking for PAGE_IS_GUARD flag if ioctl(PAGEMAP_SCAN) is used 2.
skip those pages Signed-off-by: Alexander Mikhalitsyn --- criu/include/mem.h | 1 + criu/mem.c | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/criu/include/mem.h b/criu/include/mem.h index 0ce97822b..b2cbd4b64 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -35,6 +35,7 @@ extern int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_l #define PME_PRESENT (1ULL << 63) #define PME_SWAP (1ULL << 62) #define PME_FILE (1ULL << 61) +#define PME_GUARD_REGION (1ULL << 58) #define PME_SOFT_DIRTY (1ULL << 55) #define PME_PSHIFT_BITS (6) #define PME_STATUS_BITS (3) diff --git a/criu/mem.c b/criu/mem.c index 9fcf7a44c..58c4130c6 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -143,12 +143,18 @@ int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *pa return 0; } + if (pmc->regs[pmc->regs_idx].categories & PAGE_IS_GUARD) + goto skip_guard_page; + page_info->softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; page_info->next = vaddr; return 0; } else { u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)]; + if (pme & PME_GUARD_REGION) + goto skip_guard_page; + /* * Optimisation for private mapping pages, that haven't * yet being COW-ed @@ -173,6 +179,10 @@ err: "%#016" PRIx64 "-%#016" PRIx64 " vaddr=%#016" PRIx64 "\n", vmae->start, vmae->end, vaddr); return -1; + +skip_guard_page: + page_info->next = vaddr + PAGE_SIZE; + return 0; } bool page_is_zero(u64 pme) From 63c7029686ea90c649b2909f37ae93c111f11418 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Mon, 4 Aug 2025 09:42:10 +0200 Subject: [PATCH 646/775] criu/{mem, vdso, cr-restore}: introduce VMA_AREA_GUARD fake VMAs Introduce a new kind of VMA - VMA_AREA_GUARD. In fact, it is not a real VMA as it is not represented as struct vm_area_struct in the kernel. We want to reuse an existing vma infrastructure in CRIU to dump an information about MADV_GUARD_INSTALL-covered address space ranges as VMAs. 
Then, on restore, we need to carefully skip those fake VMAs everywhere we expect normal VMAs to be processed. Only in the restorer do we use these VMAs to get information about where to call MADV_GUARD_INSTALL. Suggested-by: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- criu/cr-restore.c | 6 ++++-- criu/include/image.h | 7 +++++++ criu/mem.c | 13 +++++++++++-- criu/vdso.c | 6 ++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index b37603563..1c3b36451 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2447,7 +2447,8 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he while (1) { if (prev_vma_end + vma_len > s_vma->e->start) { - if (s_vma->list.next == self_vma_list) { + if ((s_vma->list.next == self_vma_list) || + vma_area_is(vma_next(s_vma), VMA_AREA_GUARD)) { s_vma = &end_vma; continue; } @@ -2460,7 +2461,8 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he } if (prev_vma_end + vma_len > t_vma->e->start) { - if (t_vma->list.next == tgt_vma_list) { + if ((t_vma->list.next == tgt_vma_list) || + vma_area_is(vma_next(t_vma), VMA_AREA_GUARD)) { t_vma = &end_vma; continue; } diff --git a/criu/include/image.h b/criu/include/image.h index afa7d5e12..934f7d4e9 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -68,6 +68,12 @@ * processing exiting with error; while the rest of bits * are part of image ABI, this particular one must never * be used in image. + * - guard + * stands for a fake VMA (not represented in the kernel + * by a struct vm_area_struct). Used to keep an information + * about virtual address space ranges covered by + * MADV_GUARD_INSTALL guards. These ones must be always at + * the end of the vma_area_list and properly skipped a.e.
*/ #define VMA_AREA_NONE (0 << 0) #define VMA_AREA_REGULAR (1 << 0) @@ -87,6 +93,7 @@ #define VMA_AREA_AIORING (1 << 13) #define VMA_AREA_MEMFD (1 << 14) #define VMA_AREA_SHSTK (1 << 15) +#define VMA_AREA_GUARD (1 << 16) #define VMA_EXT_PLUGIN (1 << 27) #define VMA_CLOSE (1 << 28) diff --git a/criu/mem.c b/criu/mem.c index 58c4130c6..ee841aca2 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -599,6 +599,9 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit parent_predump_mode = mdc->parent_ie->pre_dump_mode; list_for_each_entry(vma_area, &vma_area_list->h, list) { + if (vma_area_is(vma_area, VMA_AREA_GUARD)) + continue; + ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, &pmc, has_parent, mdc->pre_dump, parent_predump_mode); if (ret < 0) @@ -861,14 +864,14 @@ static void prepare_cow_vmas_for(struct vm_area_list *vmas, struct vm_area_list /* <= here to shift from matching VMAs and ... */ while (vma->e->start <= pvma->e->start) { vma = vma_next(vma); - if (&vma->list == &vmas->h) + if ((&vma->list == &vmas->h) || vma_area_is(vma, VMA_AREA_GUARD)) return; } /* ... 
no == here since we must stop on matching pair */ while (pvma->e->start < vma->e->start) { pvma = vma_next(pvma); - if (&pvma->list == &pvmas->h) + if ((&pvma->list == &pvmas->h) || vma_area_is(pvma, VMA_AREA_GUARD)) return; } } @@ -1069,6 +1072,9 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, vo filemap_ctx_init(true); list_for_each_entry(vma, &vmas->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (task_size_check(vpid(t), vma->e)) { ret = -1; break; @@ -1276,6 +1282,9 @@ err_read: unsigned long size, i = 0; void *addr = decode_pointer(vma->premmaped_addr); + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (!vma_inherited(vma)) continue; diff --git a/criu/vdso.c b/criu/vdso.c index d4d351131..2d9e57c4d 100644 --- a/criu/vdso.c +++ b/criu/vdso.c @@ -145,6 +145,9 @@ static void drop_rt_vdso(struct vm_area_list *vma_area_list, struct vdso_quarter * Also BTW search for rt-vvar to remove it later. */ list_for_each_entry(vma, &vma_area_list->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (vma->e->start == addr->orig_vdso) { vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VDSO; pr_debug("vdso: Restore orig vDSO status at %lx\n", (long)vma->e->start); @@ -276,6 +279,9 @@ int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid, struct vm_area_list } list_for_each_entry(vma, &vma_area_list->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + /* * Defer handling marked vdso until we walked over * all vmas and restore potentially remapped vDSO From 59b4d662ae8fd704dfc16f47628a676852f4c886 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 6 Apr 2025 20:10:10 +0200 Subject: [PATCH 647/775] criu/pie/restorer: add madvise(MADV_GUARD_INSTALL) restore logic Signed-off-by: Alexander Mikhalitsyn --- criu/pie/restorer.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 9867a3ddd..394d3dea0 
100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -28,6 +28,7 @@ #include #include #include +#include "mman.h" #include "signal.h" #include "prctl.h" #include "criu-log.h" @@ -1665,6 +1666,30 @@ static int restore_membarrier_registrations(int mask) return ret; } +static int restore_madv_guard_regions(struct task_restore_args *args) +{ + int i, ret; + + for (i = 0; i < args->vmas_n; i++) { + VmaEntry *vma_entry = args->vmas + i; + size_t len; + + if (!vma_entry_is(vma_entry, VMA_AREA_GUARD)) + continue; + + len = vma_entry->end - vma_entry->start; + ret = sys_madvise(vma_entry->start, len, MADV_GUARD_INSTALL); + if (ret) { + pr_err("madvise(%" PRIx64 ", %zu, MADV_GUARD_INSTALL) " + "failed with %d\n", + vma_entry->start, len, ret); + return -1; + } + } + + return 0; +} + /* * The main routine to restore task via sigreturn. * This one is very special, we never return there @@ -1972,6 +1997,13 @@ __visible long __export_restore_task(struct task_restore_args *args) } } + /* + * Restore madvise(MADV_GUARD_INSTALL) + */ + ret = restore_madv_guard_regions(args); + if (ret) + goto core_restore_end; + /* * Tune up the task fields. */ From 9c0f725a625126063e09d01ebc087d1e36a0dcc5 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Mon, 4 Aug 2025 10:48:47 +0200 Subject: [PATCH 648/775] criu/mem: dump: note MADV_GUARD pages as VMA_AREA_GUARD VMAs Signed-off-by: Alexander Mikhalitsyn --- criu/cr-dump.c | 17 ++++++++++++ criu/include/mem.h | 1 + criu/mem.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index f02db1a57..10c485cbe 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -130,6 +130,23 @@ int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap if (ret < 0) goto err; + /* + * In addition to real process VMAs we should keep an info about + * madvise(MADV_GUARD_INSTALL) pages. 
While these are not represented + * as a struct vm_area_struct in the kernel, it is convenient to treat + * them as mappings in CRIU and reuse the same VMA images but with only + * VMA_AREA_GUARD flag set. + * + * Also, we don't need to dump them during pre-dump. + */ + if (dump_file) { + ret = collect_madv_guards(pid, vma_area_list); + if (ret < 0) { + pr_err("Collect MADV_GUARD_INSTALL pages (pid: %d) failed with %d\n", pid, ret); + goto err; + } + } + pr_info("Collected, longest area occupies %lu pages\n", vma_area_list->nr_priv_pages_longest); pr_info_vma_list(&vma_area_list->h); diff --git a/criu/include/mem.h b/criu/include/mem.h index b2cbd4b64..e9ce3518a 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -31,6 +31,7 @@ extern int do_task_reset_dirty_track(int pid); extern unsigned long dump_pages_args_size(struct vm_area_list *vmas); extern int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vma_area_list, struct mem_dump_ctl *mdc, struct parasite_ctl *ctl); +extern int collect_madv_guards(pid_t pid, struct vm_area_list *vma_area_list); #define PME_PRESENT (1ULL << 63) #define PME_SWAP (1ULL << 62) diff --git a/criu/mem.c b/criu/mem.c index ee841aca2..0636273cb 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -1548,3 +1548,72 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta) return prepare_vma_ios(t, ta); } + +int collect_madv_guards(pid_t pid, struct vm_area_list *vma_area_list) +{ + int pagemap_fd = -1; + struct page_region *regs = NULL; + long regs_len = 0; + int i, ret = -1; + + struct pm_scan_arg args = { + .size = sizeof(struct pm_scan_arg), + .flags = 0, + .start = 0, + .end = kdat.task_size, + .walk_end = 0, + .vec_len = 1000, /* this should be enough for most cases */ + .max_pages = 0, + .category_mask = PAGE_IS_GUARD, + .return_mask = PAGE_IS_GUARD, + }; + + if (!kdat.has_pagemap_scan_guard_pages) { + ret = 0; + goto out; + } + + pagemap_fd = open_proc(pid, "pagemap"); + if (pagemap_fd < 0) + 
goto out; + + regs = xmalloc(args.vec_len * sizeof(struct page_region)); + if (!regs) + goto out; + args.vec = (long)regs; + + do { + /* start from where we finished the last time */ + args.start = args.walk_end; + regs_len = ioctl(pagemap_fd, PAGEMAP_SCAN, &args); + if (regs_len == -1) { + pr_perror("PAGEMAP_SCAN"); + goto out; + } + + for (i = 0; i < regs_len; i++) { + struct vma_area *vma; + + BUG_ON(!(regs[i].categories & PAGE_IS_GUARD)); + + vma = alloc_vma_area(); + if (!vma) + goto out; + + vma->e->start = regs[i].start; + vma->e->end = regs[i].end; + vma->e->status = VMA_AREA_GUARD; + + list_add_tail(&vma->list, &vma_area_list->h); + vma_area_list->nr++; + } + } while (args.walk_end != kdat.task_size); + + ret = 0; + +out: + xfree(regs); + if (pagemap_fd >= 0) + close(pagemap_fd); + return ret; +} From 01265cfc69e178ca5cb1ae691e1b615c2ddc7eb1 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 20 Apr 2025 20:20:20 +0200 Subject: [PATCH 649/775] test/zdtm/static/maps12: add madv guards test Test for madvise(MADV_GUARD_INSTALL). 
Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/static/Makefile | 1 + test/zdtm/static/maps12.c | 350 +++++++++++++++++++++++++++++++++++ test/zdtm/static/maps12.desc | 1 + 3 files changed, 352 insertions(+) create mode 100644 test/zdtm/static/maps12.c create mode 100644 test/zdtm/static/maps12.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index ab69f389e..e73f964be 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -315,6 +315,7 @@ TST_FILE = \ write_read02 \ write_read10 \ maps00 \ + maps12 \ link10 \ file_attr \ deleted_unix_sock \ diff --git a/test/zdtm/static/maps12.c b/test/zdtm/static/maps12.c new file mode 100644 index 000000000..b645595be --- /dev/null +++ b/test/zdtm/static/maps12.c @@ -0,0 +1,350 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Test madvise(MADV_GUARD_INSTALL)"; +const char *test_author = "Alexander Mikhalitsyn "; +/* some parts of code were taken from Linux kernel's kselftest guard-pages.c + written by Lorenzo Stoakes */ + +char *filename; +int fd; +TEST_OPTION(filename, string, "file name", 1); + +#ifndef MADV_GUARD_INSTALL +#define MADV_GUARD_INSTALL 102 +#endif + +uint8_t *map_base; + +struct { + unsigned int pages_num; + bool filemap; +} vmas[] = { + { 2, false }, + { 2, false }, + { 2, false }, + { 2, true }, + { 2, true }, + { 2, true }, +}; + +struct { + bool guarded; + bool wipeonfork; +} pages[] = { + { false, false }, /* vmas[0] */ + { true, false }, + { true, false }, /* vmas[1] */ + { false, false }, + { false, false }, /* vmas[2] */ + { true, true }, + { true, false }, /* vmas[3] */ + { false, false }, + { true, false }, /* vmas[4] */ + { true, false }, + { false, false }, /* vmas[5] */ + { true, false }, +}; + +static volatile sig_atomic_t signal_jump_set; +static sigjmp_buf signal_jmp_buf; + +static void handle_sigsegv(int signo) +{ + if (!signal_jump_set) + return; + + 
siglongjmp(signal_jmp_buf, 1); +} + +static bool try_write_to_addr(uint8_t *ptr) +{ + bool failed; + + /* Tell signal handler to jump back here on fatal signal. */ + signal_jump_set = true; + /* If a fatal signal arose, we will jump back here and failed is set. */ + failed = sigsetjmp(signal_jmp_buf, 1) != 0; + + if (!failed) + *ptr = 'x'; + + signal_jump_set = false; + return !failed; +} + +static int setup_sigsegv_handler(void) +{ + uint8_t write_me; + + if (signal(SIGSEGV, handle_sigsegv) == SIG_ERR) { + pr_perror("setting SIGSEGV handler failed"); + return 1; + } + + /* ensure that try_write_to_addr() works properly */ + if (!try_write_to_addr(&write_me)) { + pr_err("Failed to write at valid addr. Buggy try_write_to_addr()?\n"); + return 1; + } + + if (try_write_to_addr(NULL)) { + pr_err("Failed to detect an invalid write. Buggy try_write_to_addr()?\n"); + return 1; + } + + return 0; +} + +static inline void *mmap_pages(void *addr_hint, unsigned int count, bool filemap) +{ + char *map; + + map = mmap(addr_hint, count * PAGE_SIZE, PROT_WRITE | PROT_READ, + MAP_PRIVATE | (filemap ? 0 : MAP_ANONYMOUS) | (addr_hint ? MAP_FIXED : 0), + filemap ? fd : -1, filemap ? ((off_t)addr_hint - (off_t)map_base) : 0); + if (map == MAP_FAILED || (addr_hint && (map != addr_hint))) + return MAP_FAILED; + + return map; +} + +static int __check_guards(const char *when, bool in_child) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + /* + * Skip pages that were never guarded, and also those + * that were, but have MADV_WIPEONFORK which means that + * guards were removed on fork. 
+ */ + if (!pages[i].guarded || (in_child && pages[i].wipeonfork)) + continue; + + if (try_write_to_addr(&map_base[i * PAGE_SIZE])) { + pr_err("successful write to a guarded area %d %s C/R\n", + i, when); + return 1; + } + } + + return 0; +} + +static int check_guards(const char *when) +{ + int status; + pid_t pid; + + /* + * First of all, check that guards are on their places + * in a main test process. + */ + if (__check_guards(when, false)) { + return 1; + } + + /* + * Now, check that guards are on their places + * after fork(). This allows to ensure that + * combo MADV_WIPEONFORK + MADV_GUARD_INSTALL + * is restored properly too. + */ + + pid = test_fork(); + if (pid < 0) { + pr_perror("check_guards: fork failed"); + return 1; + } + + if (pid == 0) { + if (__check_guards(when, true)) { + pr_err("check_guards(\"%s\") failed in child\n", when); + exit(1); + } + + exit(0); + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("check_guards: waitpid"); + return 1; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + pr_err("check_guards: process didn't exit cleanly: status=%d\n", status); + return 1; + } + + return 0; +} + +static void gen_pages_data(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + uint32_t crc; + + if (pages[i].guarded) + continue; + + crc = ~0; + datagen(&map_base[i * PAGE_SIZE], PAGE_SIZE, &crc); + } +} + +static int set_pages_madvs(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + if (pages[i].guarded) { + if (madvise(&map_base[i * PAGE_SIZE], PAGE_SIZE, + MADV_GUARD_INSTALL)) { + pr_perror("MADV_GUARD_INSTALL failed on page %d", i); + return 1; + } + } + + if (pages[i].wipeonfork) { + if (madvise(&map_base[i * PAGE_SIZE], PAGE_SIZE, + MADV_WIPEONFORK)) { + pr_perror("MADV_WIPEONFORK failed on page %d", i); + return 1; + } + } + } + + return 0; +} + +static int check_pages_data(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + uint32_t crc; + + if (pages[i].guarded) + continue; + 
+ crc = ~0; + if (datachk(&map_base[i * PAGE_SIZE], PAGE_SIZE, &crc)) { + pr_err("Page %d is corrupted\n", i); + return 1; + } + } + + return 0; +} + +static int prepare_vmas(void) +{ + char *map; + int i, shift; + + shift = 0; + for (i = 0; i < ARRAY_SIZE(vmas); i++) { + map = mmap_pages(&map_base[shift * PAGE_SIZE], + vmas[i].pages_num, vmas[i].filemap); + if (map == MAP_FAILED) { + pr_err("mmap of [%d,%d] pages failed\n", + shift, shift + vmas[i].pages_num); + return 1; + } + + shift += vmas[i].pages_num; + } + + if (shift != ARRAY_SIZE(pages)) { + pr_err("Different number of pages in vmas and pages arrays.\n"); + return 1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + unsigned int pages_num = ARRAY_SIZE(pages); + + test_init(argc, argv); + + fd = open(filename, O_TRUNC | O_CREAT | O_RDWR, 0600); + if (fd < 0) { + pr_perror("Unable to create a test file"); + return -1; + } + + if (ftruncate(fd, pages_num * PAGE_SIZE)) { + pr_perror("Unable to ftruncate a test file"); + return -1; + } + + if (setup_sigsegv_handler()) { + pr_err("setup_sigsegv_handler() failed\n"); + return 1; + } + + /* let's find a large enough area in address space */ + map_base = mmap_pages(NULL, pages_num, false); + if (map_base == MAP_FAILED) { + pr_err("mmap of %d pages failed\n", pages_num); + return 1; + } + + /* + * Now we know that we have a free vm address space area + * [map_base, map_base + pages_num * PAGE_SIZE). + * We can use (map_base) as a hint for our further mmaps. 
+ */ + if (prepare_vmas()) { + pr_err("prepare_vmas() failed\n"); + return 1; + } + + /* fill non-guarded pages with data and preserve checksums */ + gen_pages_data(); + + if (set_pages_madvs()) { + pr_err("set_pages_madvs() failed\n"); + return 1; + } + + /* ensure that madvise(MADV_GUARD_INSTALL) works like expected */ + if (check_guards("before")) { + pr_err("check_guards(\"before\") failed\n"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* ensure that guards are at their places */ + if (check_guards("after")) { + fail("check_guards(\"after\") failed"); + return 1; + } + + /* check that non-guarded pages still contain original data */ + if (check_pages_data()) { + fail("check_pages_data() failed"); + return 1; + } + + pass(); + munmap(map_base, pages_num * PAGE_SIZE); + close(fd); + return 0; +} diff --git a/test/zdtm/static/maps12.desc b/test/zdtm/static/maps12.desc new file mode 100644 index 000000000..3f7627ff3 --- /dev/null +++ b/test/zdtm/static/maps12.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'feature': 'pagemap_scan_guard_pages'} From 98f2bd525a5eb6db84bdabf4566b18aeaacf32af Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Sun, 10 Aug 2025 18:22:23 +0200 Subject: [PATCH 650/775] ci/vagrant: install vanilla kernel for Fedora Rawhide test We need at least 6.16 to test MADV_GUARD_INSTALL support, but our current Fedora Rawhide test uses only Rawhide's user space, while using Fedora 42 kernel. Let's start using a vanilla kernel. 
Suggested-by: Adrian Reber Signed-off-by: Alexander Mikhalitsyn --- scripts/ci/vagrant.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 008a01fb3..98942e756 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -66,6 +66,10 @@ fedora-no-vdso() { } fedora-rawhide() { + # Upgrade the kernel to the latest vanilla one + ssh default sudo dnf -y copr enable @kernel-vanilla/stable + ssh default sudo dnf upgrade -y + # The 6.2 kernel of Fedora 38 in combination with rawhide userspace breaks # zdtm/static/socket-tcp-nfconntrack. To activate the new kernel previously # installed this reboots the VM. From dcee5bd6ff2d632bd4e1d4d09d2ffb2bf683d6a2 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 15 Aug 2025 01:44:01 +0000 Subject: [PATCH 651/775] make: Disable branch-protection for PIE code on ARM64 Branch protection uses PAC. It cryptographically "signs" a function's return address before it is stored on the stack. Upon return, the address is authenticated using a secret key. If the signature is invalid, the program will fault. The PIE code is used for the parasite and the restorer. In both cases, it runs in a foreign process. The case of the restorer is even trickier because it needs to restore the original PAC keys, which invalidates all previously "signed" pointers within the restorer itself. 
Fixes #2709 Signed-off-by: Andrei Vagin --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 5d8e89ac1..7272cfce1 100644 --- a/Makefile +++ b/Makefile @@ -64,6 +64,8 @@ endif ifeq ($(ARCH),aarch64) DEFINES := -DCONFIG_AARCH64 + CC_MBRANCH_PROT := $(shell $(CC) -c -x c /dev/null -mbranch-protection=none -o /dev/null >/dev/null 2>&1 && echo "-mbranch-protection=none") + CFLAGS_PIE := $(CC_MBRANCH_PROT) endif ifeq ($(ARCH),ppc64) From 2ba343010663f12979ca29fa22c54e511f2d6473 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 16 Aug 2025 15:45:05 +0100 Subject: [PATCH 652/775] test/zdtm/static/maps12: fix pointer-to-int cast The `offset` argument to `mmap()` was computed with a direct cast from pointer to `off_t`: `(off_t)addr_hint - (off_t)map_base` This causes a build failure when compiling since pointers and `off_t` may differ in size on some platforms. maps12.c: In function 'mmap_pages': maps12.c:114:50: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] 114 | filemap ? fd : -1, filemap ? ((off_t)addr_hint - (off_t)map_base) : 0); | ^ maps12.c:114:69: error: cast from pointer to integer of different size [-Werror=pointer-to-int-cast] 114 | filemap ? fd : -1, filemap ? ((off_t)addr_hint - (off_t)map_base) : 0); The fix in this patch is to cast both pointers to `intptr_t`, perform the subtraction in that type, and then cast the result back to `off_t`. Signed-off-by: Radostin Stoyanov --- test/zdtm/static/maps12.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/zdtm/static/maps12.c b/test/zdtm/static/maps12.c index b645595be..f0d6c2381 100644 --- a/test/zdtm/static/maps12.c +++ b/test/zdtm/static/maps12.c @@ -111,7 +111,8 @@ static inline void *mmap_pages(void *addr_hint, unsigned int count, bool filemap map = mmap(addr_hint, count * PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_PRIVATE | (filemap ? 0 : MAP_ANONYMOUS) | (addr_hint ? MAP_FIXED : 0), - filemap ? 
fd : -1, filemap ? ((off_t)addr_hint - (off_t)map_base) : 0); + filemap ? fd : -1, + filemap ? (off_t)((intptr_t)addr_hint - (intptr_t)map_base) : 0); if (map == MAP_FAILED || (addr_hint && (map != addr_hint))) return MAP_FAILED; From fa1b399064575be2aff7d3c6486f0503b0098038 Mon Sep 17 00:00:00 2001 From: Dong Sunchao Date: Wed, 20 Aug 2025 12:38:18 +0000 Subject: [PATCH 653/775] zdtm/static/sock_opts00: use unix socket to test SO_PASSCRED and SO_PASSSEC SO_PASSCRED and SO_PASSSEC are only valid for AF_UNIX and AF_NETLINK. This patch updates the test logic to use a unix socket for these options, while preserving the original value consistency check. Fixes: #2705 Signed-off-by: Dong Sunchao --- test/zdtm/static/sock_opts00.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/test/zdtm/static/sock_opts00.c b/test/zdtm/static/sock_opts00.c index fcf00ffed..854aaa591 100644 --- a/test/zdtm/static/sock_opts00.c +++ b/test/zdtm/static/sock_opts00.c @@ -31,7 +31,7 @@ int main(int argc, char **argv) static const int NOPTS = sizeof(vname) / sizeof(*vname); #undef OPT - int sock, ret = 0, val[NOPTS], rval, i; + int sock, usock, sk, ret = 0, val[NOPTS], rval, i; socklen_t len = sizeof(int); test_init(argc, argv); @@ -42,8 +42,15 @@ int main(int argc, char **argv) return 1; } + usock = socket(AF_UNIX, SOCK_STREAM, 0); + if (usock < 0) { + pr_perror("can't create unix socket"); + return 1; + } + for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], &len); + sk = vname[i].opt == SO_PASSCRED || vname[i].opt == SO_PASSSEC ? 
usock : sock; + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &val[i], &len); if (ret) { pr_perror("can't get %s", vname[i].name); return 1; @@ -51,13 +58,13 @@ int main(int argc, char **argv) val[i]++; - ret = setsockopt(sock, SOL_SOCKET, vname[i].opt, &val[i], len); + ret = setsockopt(sk, SOL_SOCKET, vname[i].opt, &val[i], len); if (ret) { pr_perror("can't set %s = %d", vname[i].name, val[i]); return 1; } - ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { pr_perror("can't re-get %s", vname[i].name); return 1; @@ -78,7 +85,8 @@ int main(int argc, char **argv) test_waitsig(); for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i].opt, &rval, &len); + sk = vname[i].opt == SO_PASSCRED || vname[i].opt == SO_PASSSEC ? usock : sock; + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { pr_perror("can't verify %s", vname[i].name); return 1; @@ -93,6 +101,7 @@ int main(int argc, char **argv) pass(); close(sock); + close(usock); return 0; } From 4b73985955ecc01604d8ed1247605a5875042e4c Mon Sep 17 00:00:00 2001 From: Dong Sunchao Date: Wed, 20 Aug 2025 12:38:37 +0000 Subject: [PATCH 654/775] criu/sockets: Restrict SO_PASSCRED and SO_PASSSEC to supported families Linux 6.16+ restricts SO_PASSCRED and SO_PASSSEC to AF_UNIX, AF_NETLINK, and AF_BLUETOOTH This patch updates CRIU to check the socket family before dumping these options Fixes: #2705 Signed-off-by: Dong Sunchao --- criu/include/sockets.h | 2 +- criu/sk-inet.c | 2 +- criu/sk-netlink.c | 2 +- criu/sk-packet.c | 2 +- criu/sk-unix.c | 2 +- criu/sockets.c | 16 +++++++++------- 6 files changed, 14 insertions(+), 12 deletions(-) diff --git a/criu/include/sockets.h b/criu/include/sockets.h index c3e7c879a..6c81d3edd 100644 --- a/criu/include/sockets.h +++ b/criu/include/sockets.h @@ -25,7 +25,7 @@ struct socket_desc { }; extern int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *); -extern 
int dump_socket_opts(int sk, SkOptsEntry *soe); +extern int dump_socket_opts(int sk, int family, SkOptsEntry *soe); extern int restore_socket_opts(int sk, SkOptsEntry *soe); extern int sk_setbufs(int sk, uint32_t *bufs); extern void release_skopts(SkOptsEntry *); diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 6e0acf2ce..422edc656 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -581,7 +581,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa if (dump_ip_opts(lfd, family, type, proto, &ipopts)) goto err; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, family, &skopts)) goto err; pr_info("Dumping inet socket at %d\n", p->fd); diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c index a219b69be..dc2baa1b8 100644 --- a/criu/sk-netlink.c +++ b/criu/sk-netlink.c @@ -165,7 +165,7 @@ static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p) ne.fown = (FownEntry *)&p->fown; ne.opts = &skopts; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, AF_NETLINK, &skopts)) goto err; fe.type = FD_TYPES__NETLINKSK; diff --git a/criu/sk-packet.c b/criu/sk-packet.c index 1d2e23522..6530bff58 100644 --- a/criu/sk-packet.c +++ b/criu/sk-packet.c @@ -173,7 +173,7 @@ static int dump_one_packet_fd(int lfd, u32 id, const struct fd_parms *p) psk.fown = (FownEntry *)&p->fown; psk.opts = &skopts; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, AF_PACKET, &skopts)) return -1; psk.protocol = sd->proto; diff --git a/criu/sk-unix.c b/criu/sk-unix.c index 70ca16be4..6145fe734 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -527,7 +527,7 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) } } dump: - if (dump_socket_opts(lfd, skopts)) + if (dump_socket_opts(lfd, AF_UNIX, skopts)) goto err; pr_info("Dumping unix socket at %d\n", p->fd); diff --git a/criu/sockets.c b/criu/sockets.c index 0affccad0..e4adae03c 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -649,7 
+649,7 @@ int do_dump_opt(int sk, int level, int name, void *val, int len) return 0; } -int dump_socket_opts(int sk, SkOptsEntry *soe) +int dump_socket_opts(int sk, int family, SkOptsEntry *soe) { int ret = 0, val; struct timeval tv; @@ -688,13 +688,15 @@ int dump_socket_opts(int sk, SkOptsEntry *soe) soe->so_reuseport = val ? true : false; soe->has_so_reuseport = true; - ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); - soe->has_so_passcred = true; - soe->so_passcred = val ? true : false; + if (family == AF_UNIX || family == AF_NETLINK) { + ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); + soe->has_so_passcred = true; + soe->so_passcred = val ? true : false; - ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); - soe->has_so_passsec = true; - soe->so_passsec = val ? true : false; + ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); + soe->has_so_passsec = true; + soe->so_passsec = val ? true : false; + } ret |= dump_opt(sk, SOL_SOCKET, SO_DONTROUTE, &val); soe->has_so_dontroute = true; From 254ba3e8cc60790eec2369e2fb9ca3702a3f7019 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 8 Sep 2025 12:48:34 -0700 Subject: [PATCH 655/775] ci: avoid Docker 28 due to regression This change modifies the CI script to avoid Docker version 28, which has a known regression that breaks Checkpoint/Restore (C/R) functionality. The issue is tracked in the moby/moby project as https://github.com/moby/moby/issues/50750. Signed-off-by: Andrei Vagin --- scripts/ci/docker-test.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index aaf443afd..ae7f52454 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -2,6 +2,24 @@ set -x -e -o pipefail +# Workaround: Docker 28.x has a known regression that breaks the checkpoint and +# restore (C/R) feature. Let's install previous, or next major version. See +# https://github.com/moby/moby/issues/50750 for details on the bug. 
+export DEBIAN_FRONTEND=noninteractive +apt remove -y docker-ce docker-ce-cli +./apt-install -y ca-certificates curl +install -m 0755 -d /etc/apt/keyrings +curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc +chmod a+r /etc/apt/keyrings/docker.asc +# shellcheck disable=SC1091 +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" > /etc/apt/sources.list.d/docker.list +apt update -y +apt-cache madison docker-ce | awk '{ print $3 }' +verstr="$(apt-cache madison docker-ce | awk '{ print $3 }' | sort | grep -v ':28\.'| tail -n 1)" +./apt-install -y "docker-ce=$verstr" "docker-ce-cli=$verstr" + # docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json service docker restart From a779417a3fa59e55209c50a1a0c40f48a1c456ee Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 14 Sep 2025 19:29:16 -0700 Subject: [PATCH 656/775] zdtm: stop importing junit_xml We are dropping support for generating JUnit XML reports in zdtm.py as we've migrated testing infrastructure entirely to `GitHub Actions` and other third-party test runners. This package has been removed from some distribution repositories (e.g., Fedora), making it simpler to remove the dependency than to force installation via pip. 
Signed-off-by: Andrei Vagin --- .cirrus.yml | 2 +- scripts/build/Dockerfile.alpine | 2 -- scripts/build/Dockerfile.archlinux | 1 - scripts/build/Dockerfile.centos8 | 2 -- scripts/ci/prepare-for-fedora-rawhide.sh | 1 - scripts/ci/run-ci-tests.sh | 2 +- scripts/ci/vagrant.sh | 2 +- test/jenkins/criu-lazy-migration.pipeline | 1 - test/zdtm.py | 24 +---------------------- 9 files changed, 4 insertions(+), 33 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index bddd5a3f1..848e14132 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -34,7 +34,7 @@ task: setup_script: | dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python-junit_xml python3-importlib-metadata xmlto libdrm-devel libuuid-devel + dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python3-importlib-metadata xmlto libdrm-devel libuuid-devel # The image has a too old version of nettle which does not work with gnutls. # Just upgrade to the latest to make the error go away. 
dnf -y upgrade nettle nettle-devel diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index d843793ea..819fda0c3 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -48,6 +48,4 @@ RUN apk add \ # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 -D test -RUN pip3 install junit_xml --break-system-packages - RUN make -C test/zdtm diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index 9d11194bb..d4b432f8d 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -32,7 +32,6 @@ RUN pacman -Syu --noconfirm \ go \ python-yaml \ asciidoctor \ - python-junit-xml \ python-importlib-metadata \ libdrm \ util-linux-libs \ diff --git a/scripts/build/Dockerfile.centos8 b/scripts/build/Dockerfile.centos8 index a67212344..5ab6c9cfa 100644 --- a/scripts/build/Dockerfile.centos8 +++ b/scripts/build/Dockerfile.centos8 @@ -45,6 +45,4 @@ RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 test -RUN pip3 install junit_xml - RUN make -C test/zdtm -j $(nproc) diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index f8ad9cf97..f8f797c1e 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -26,7 +26,6 @@ dnf install -y \ protobuf-devel \ python3-PyYAML \ python3-protobuf \ - python3-junit_xml \ python3-pip \ python3-importlib-metadata \ python-unversioned-command \ diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 0c4a08975..617f54fc6 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -6,7 +6,7 @@ CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev libnl-3-dev gdb bash libnet-dev util-linux asciidoctor libnl-route-3-dev time libbsd-dev python3-yaml uuid-dev 
libperl-dev pkg-config python3-protobuf python3-pip - python3-importlib-metadata python3-junit.xml libdrm-dev) + python3-importlib-metadata libdrm-dev) X86_64_PKGS=(gcc-multilib) diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 98942e756..c222e30e0 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -44,7 +44,7 @@ setup() { ssh default sudo dnf upgrade -y ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ - protobuf-devel python3-protobuf python3-importlib-metadata python3-junit_xml \ + protobuf-devel python3-protobuf python3-importlib-metadata \ rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml libuuid-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket diff --git a/test/jenkins/criu-lazy-migration.pipeline b/test/jenkins/criu-lazy-migration.pipeline index 2c863f170..45dc2c776 100644 --- a/test/jenkins/criu-lazy-migration.pipeline +++ b/test/jenkins/criu-lazy-migration.pipeline @@ -21,7 +21,6 @@ pipeline { stage('Test'){ steps { sh './test/jenkins/run_ct sh -c "mount --make-rprivate / && mount --rbind . 
/mnt && cd /mnt && ./test/jenkins/criu-lazy-migration.sh"' - junit 'test/report/criu-testreport*.xml' } } } diff --git a/test/zdtm.py b/test/zdtm.py index 3339dd816..7e83aa4df 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2078,8 +2078,6 @@ class Launcher: self.__subs = {} self.__fail = False self.__file_report = None - self.__junit_file = None - self.__junit_test_cases = None self.__failed = [] self.__nr_skip = 0 if self.__max > 1 and self.__total > 1: @@ -2091,22 +2089,14 @@ class Launcher: if opts['report'] and (opts['keep_going'] or self.__total == 1): global TestSuite, TestCase - from junit_xml import TestCase, TestSuite now = datetime.datetime.now() att = 0 reportname = os.path.join(report_dir, "criu-testreport.tap") - junitreport = os.path.join(report_dir, "criu-testreport.xml") - while os.access(reportname, os.F_OK) or os.access( - junitreport, os.F_OK): + while os.access(reportname, os.F_OK): reportname = os.path.join(report_dir, "criu-testreport" + ".%d.tap" % att) - junitreport = os.path.join(report_dir, - "criu-testreport" + ".%d.xml" % att) att += 1 - self.__junit_file = open(junitreport, 'a') - self.__junit_test_cases = [] - self.__file_report = open(reportname, 'a') print(u"TAP version 13", file=self.__file_report) print(u"# Hardware architecture: " + arch, file=self.__file_report) @@ -2141,10 +2131,6 @@ class Launcher: self.__runtest += 1 self.__nr_skip += 1 - if self.__junit_test_cases is not None: - tc = TestCase(name) - tc.add_skipped_info(reason) - self.__junit_test_cases.append(tc) if self.__file_report: testline = u"ok %d - %s # SKIP %s" % (self.__runtest, name, reason) print(testline, file=self.__file_report) @@ -2247,10 +2233,6 @@ class Launcher: # It's useful for taming warnings in subprocess.Popen.__del__() sub['sub'].wait() tc = None - if self.__junit_test_cases is not None: - tc = TestCase(sub['name'], - elapsed_sec=time.time() - sub['start']) - self.__junit_test_cases.append(tc) if status != 0: self.__fail = True failed_flavor = 
decode_flav(os.WEXITSTATUS(status)) @@ -2307,10 +2289,6 @@ class Launcher: if not opts['fault'] and check_core_files(): self.__fail = True if self.__file_report: - ts = TestSuite(opts['title'], self.__junit_test_cases, - os.getenv("NODE_NAME")) - self.__junit_file.write(TestSuite.to_xml_string([ts])) - self.__junit_file.close() self.__file_report.close() if opts['keep_going']: From 053a22a23bf05c91223d48dc609defa641354a87 Mon Sep 17 00:00:00 2001 From: Lorenzo Fontana Date: Thu, 18 Sep 2025 10:01:48 +0200 Subject: [PATCH 657/775] pagemap: prevent integer overflow in pagemap_len Fixes #2738 Original-patch-by: Andrey Vagin Signed-off-by: Lorenzo Fontana --- criu/include/pagemap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index 3ae15deb9..fae110108 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -121,7 +121,7 @@ extern int dedup_one_iovec(struct page_read *pr, unsigned long base, unsigned lo static inline unsigned long pagemap_len(PagemapEntry *pe) { - return pe->nr_pages * PAGE_SIZE; + return (unsigned long)pe->nr_pages * PAGE_SIZE; } static inline bool page_read_has_parent(struct page_read *pr) From 80c280610e43fc78e1479ad681bc22e69b4b5287 Mon Sep 17 00:00:00 2001 From: dong sunchao Date: Thu, 18 Sep 2025 03:09:30 +1000 Subject: [PATCH 658/775] compel/mips: Relax ELF magic check to support MIPS libraries On MIPS platforms, shared libraries may use EI_ABIVERSION = 5 to indicate support for .MIPS.xhash sections. The previous ELF header check in handle_binary() strictly compared e_ident against a hardcoded value, causing legitimate shared objects to be rejected. This patch replaces the memcmp-based check with a structured validation of ELF magic and class, and allows EI_ABIVERSION values other than 0.
fixes: #2745 Signed-off-by: dong sunchao --- compel/arch/mips/src/lib/handle-elf.c | 31 +++++++++++++++++++-------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/compel/arch/mips/src/lib/handle-elf.c b/compel/arch/mips/src/lib/handle-elf.c index a605a5a45..e086761c2 100644 --- a/compel/arch/mips/src/lib/handle-elf.c +++ b/compel/arch/mips/src/lib/handle-elf.c @@ -5,18 +5,31 @@ #include "piegen.h" #include "log.h" -static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -}; - extern int __handle_elf(void *mem, size_t size); int handle_binary(void *mem, size_t size) { - if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) - return __handle_elf(mem, size); + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)mem; - pr_err("Unsupported Elf format detected\n"); - return -EINVAL; + /* check ELF magic */ + if (ehdr->e_ident[EI_MAG0] != ELFMAG0 || + ehdr->e_ident[EI_MAG1] != ELFMAG1 || + ehdr->e_ident[EI_MAG2] != ELFMAG2 || + ehdr->e_ident[EI_MAG3] != ELFMAG3) { + pr_err("Invalid ELF magic\n"); + return -EINVAL; + } + + /* check ELF class and data encoding */ + if (ehdr->e_ident[EI_CLASS] != ELFCLASS64 || + ehdr->e_ident[EI_DATA] != ELFDATA2LSB) { + pr_err("Unsupported ELF class or data encoding\n"); + return -EINVAL; + } + + if (ehdr->e_ident[EI_ABIVERSION] != 0) { + pr_warn("Unusual ABI version: %d\n", ehdr->e_ident[EI_ABIVERSION]); + } + + return __handle_elf(mem, size); } From a8c5e11715673926f95ecaebd6e805c2d311636b Mon Sep 17 00:00:00 2001 From: Filip Hejsek Date: Sat, 13 Sep 2025 19:49:24 +0200 Subject: [PATCH 659/775] lsm: use attr/apparmor/current to get apparmor label On some kernels, attr/current can be intercepted by BPF LSM, causing errors (#2033). Using attr/apparmor/current is preferable, because it is guaranteed to return the apparmor label. attr/current will still be used as a fallback for older kernels. 
Fixes: #2033 Signed-off-by: Filip Hejsek --- criu/lsm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/criu/lsm.c b/criu/lsm.c index 70b66d42e..5faf3e5b2 100644 --- a/criu/lsm.c +++ b/criu/lsm.c @@ -29,7 +29,9 @@ static int apparmor_get_label(pid_t pid, char **profile_name) FILE *f; char *space; - f = fopen_proc(pid, "attr/current"); + f = fopen_proc(pid, "attr/apparmor/current"); + if (!f) + f = fopen_proc(pid, "attr/current"); if (!f) return -1; From c7395f4cbedc5cf0dd86a2c7aa12e58e33ffc2f4 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 14 Sep 2025 18:44:51 -0700 Subject: [PATCH 660/775] files: fork helpers without CLONE_FILES | CLONE_FS On restore, CRIU needs to change mount namespaces to properly restore files and unix sockets. However, the kernel prevents this if a process is sharing its file system information (fs) with other processes. Fixes #2687 Signed-off-by: Andrei Vagin --- criu/files.c | 1 - criu/pstree.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/criu/files.c b/criu/files.c index f16ec32a2..af4b8aeac 100644 --- a/criu/files.c +++ b/criu/files.c @@ -1329,7 +1329,6 @@ int prepare_fds(struct pstree_item *me) } } - BUG_ON(current->pid->state == TASK_HELPER); ret = open_fdinfos(me); if (rsti(me)->fdt) diff --git a/criu/pstree.c b/criu/pstree.c index 75c2fc8d0..cee8b5741 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -237,9 +237,8 @@ int init_pstree_helper(struct pstree_item *ret) { BUG_ON(!ret->parent); ret->pid->state = TASK_HELPER; - rsti(ret)->clone_flags = CLONE_FILES | CLONE_FS; - if (shared_fdt_prepare(ret) < 0) - return -1; + rsti(ret)->clone_flags = 0; + INIT_LIST_HEAD(&rsti(ret)->fds); task_entries->nr_helpers++; return 0; } From afb2e6c3f95dd0b15f739d9669bd7eaf120a2f31 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 18 Sep 2025 14:48:42 +0000 Subject: [PATCH 661/775] pagemap: change PagemapEntry.nr_pages to uint64 to support huge mappings Update the nr_pages field in 
PagemapEntry to uint64 to prepare for checkpointing and restoring huge memory mappings. Backward compatibility with older pagemap images is preserved. Signed-off-by: Andrei Vagin --- criu/include/pagemap.h | 2 +- criu/page-xfer.c | 1 + criu/pagemap.c | 5 ++++- images/pagemap.proto | 3 ++- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index fae110108..3ae15deb9 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -121,7 +121,7 @@ extern int dedup_one_iovec(struct page_read *pr, unsigned long base, unsigned lo static inline unsigned long pagemap_len(PagemapEntry *pe) { - return (unsigned long)pe->nr_pages * PAGE_SIZE; + return pe->nr_pages * PAGE_SIZE; } static inline bool page_read_has_parent(struct page_read *pr) diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 0314963e6..b0e04d82c 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -326,6 +326,7 @@ static int write_pagemap_loc(struct page_xfer *xfer, struct iovec *iov, u32 flag pe.nr_pages = iov->iov_len / PAGE_SIZE; pe.has_flags = true; pe.flags = flags; + pe.has_nr_pages = true; if (flags & PE_PRESENT) { if (opts.auto_dedup && xfer->parent != NULL) { diff --git a/criu/pagemap.c b/criu/pagemap.c index 85bb92259..d9ccc03eb 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -171,7 +171,7 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%u vs %lx:%u\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%lu vs %lx:%u\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } @@ -682,6 +682,9 @@ static void init_compat_pagemap_entry(PagemapEntry *pe) pe->flags |= PE_PARENT; else if (!pe->has_flags) pe->flags = PE_PRESENT; + + if (!pe->has_nr_pages) + pe->nr_pages = pe->compat_nr_pages; } /* 
diff --git a/images/pagemap.proto b/images/pagemap.proto index e6d341b0f..f2436a51a 100644 --- a/images/pagemap.proto +++ b/images/pagemap.proto @@ -10,7 +10,8 @@ message pagemap_head { message pagemap_entry { required uint64 vaddr = 1 [(criu).hex = true]; - required uint32 nr_pages = 2; + required uint32 compat_nr_pages = 2; optional bool in_parent = 3; optional uint32 flags = 4 [(criu).flags = "pmap.flags" ]; + optional uint64 nr_pages = 5; } From 7e0da4d9757e67d8bd0ee8441a581483ad97b12e Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 18 Sep 2025 15:20:32 +0000 Subject: [PATCH 662/775] pagemap: use unsigned long for page counts Variables storing page counts were previously `unsigned int`, limiting them to a maximum of 2^32 pages. With a 4k page size, this corresponds to a 16TB memory mapping, which is insufficient for larger mappings. This commit changes the type for these variables to `unsigned long` to support larger memory mappings. Signed-off-by: Andrei Vagin --- criu/include/page-pipe.h | 6 +++--- criu/include/page-xfer.h | 6 +++--- criu/include/pagemap.h | 6 +++--- criu/include/parasite.h | 2 +- criu/mem.c | 2 +- criu/page-pipe.c | 6 +++--- criu/page-xfer.c | 20 ++++++++++---------- criu/pagemap.c | 22 +++++++++++----------- criu/pie/parasite.c | 2 +- criu/uffd.c | 25 ++++++++++++------------- 10 files changed, 48 insertions(+), 49 deletions(-) diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h index 15178c015..65292b7ab 100644 --- a/criu/include/page-pipe.h +++ b/criu/include/page-pipe.h @@ -92,9 +92,9 @@ struct kernel_pipe_buffer { struct page_pipe_buf { int p[2]; /* pipe with pages */ unsigned int pipe_size; /* how many pages can be fit into pipe */ - unsigned int pipe_off; /* where this buf is started in a pipe */ - unsigned int pages_in; /* how many pages are there */ unsigned int nr_segs; /* how many iov-s are busy */ + unsigned long pipe_off; /* where this buf is started in a pipe */ + unsigned long pages_in; /* how many pages 
are there */ #define PPB_LAZY (1 << 0) unsigned int flags; struct iovec *iov; /* vaddr:len map */ @@ -149,7 +149,7 @@ struct pipe_read_dest { }; extern int pipe_read_dest_init(struct pipe_read_dest *prd); -extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned int *nr_pages, +extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned long *nr_pages, unsigned int ppb_flags); #endif /* __CR_PAGE_PIPE_H__ */ diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h index 36fe67092..0d9b35019 100644 --- a/criu/include/page-xfer.h +++ b/criu/include/page-xfer.h @@ -69,9 +69,9 @@ extern int check_parent_page_xfer(int fd_type, unsigned long id); */ /* async request/receive of remote pages */ -extern int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages); +extern int request_remote_pages(unsigned long img_id, unsigned long addr, unsigned long nr_pages); -typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, int nr_pages, void *); -extern int page_server_start_read(void *buf, int nr_pages, ps_async_read_complete complete, void *priv, unsigned flags); +typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, unsigned long nr_pages, void *); +extern int page_server_start_read(void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv, unsigned flags); #endif /* __CR_PAGE_XFER__H__ */ diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index 3ae15deb9..4cbc87cc6 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -44,7 +44,7 @@ struct page_read { /* reads page from current pagemap */ - int (*read_pages)(struct page_read *, unsigned long vaddr, int nr, void *, unsigned flags); + int (*read_pages)(struct page_read *, unsigned long vaddr, unsigned long nr, void *, unsigned flags); /* Advance page_read to the next entry */ int (*advance)(struct page_read *pr); 
void (*close)(struct page_read *); @@ -52,8 +52,8 @@ struct page_read { int (*sync)(struct page_read *pr); int (*seek_pagemap)(struct page_read *pr, unsigned long vaddr); void (*reset)(struct page_read *pr); - int (*io_complete)(struct page_read *, unsigned long vaddr, int nr); - int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags); + int (*io_complete)(struct page_read *, unsigned long vaddr, unsigned long nr); + int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags); /* Whether or not pages can be read in PIE code */ bool pieok; diff --git a/criu/include/parasite.h b/criu/include/parasite.h index b33d6710f..176357711 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -63,7 +63,7 @@ struct parasite_dump_pages_args { unsigned int add_prot; unsigned int off; unsigned int nr_segs; - unsigned int nr_pages; + unsigned long nr_pages; }; static inline struct parasite_vma_entry *pargs_vmas(struct parasite_dump_pages_args *a) diff --git a/criu/mem.c b/criu/mem.c index 0636273cb..f8c550842 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -336,7 +336,7 @@ static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl, struct pa list_for_each_entry(ppb, &pp->bufs, l) { args->nr_segs = ppb->nr_segs; args->nr_pages = ppb->pages_in; - pr_debug("PPB: %d pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, + pr_debug("PPB: %ld pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, args->off); ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl); diff --git a/criu/page-pipe.c b/criu/page-pipe.c index aab6742be..f8e3520f7 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -381,7 +381,7 @@ int pipe_read_dest_init(struct pipe_read_dest *prd) return 0; } -int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned int *nr_pages, +int page_pipe_read(struct page_pipe *pp, 
struct pipe_read_dest *prd, unsigned long addr, unsigned long int *nr_pages, unsigned int ppb_flags) { struct page_pipe_buf *ppb; @@ -406,7 +406,7 @@ int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned lo } /* clamp the request if it passes the end of iovec */ - len = min((unsigned long)iov->iov_base + iov->iov_len - addr, (unsigned long)(*nr_pages) * PAGE_SIZE); + len = min((unsigned long)iov->iov_base + iov->iov_len - addr, *nr_pages * PAGE_SIZE); *nr_pages = len / PAGE_SIZE; skip += ppb->pipe_off * PAGE_SIZE; @@ -446,7 +446,7 @@ void debug_show_page_pipe(struct page_pipe *pp) pr_debug("Page pipe:\n"); pr_debug("* %u pipes %u/%u iovs:\n", pp->nr_pipes, pp->free_iov, pp->nr_iovs); list_for_each_entry(ppb, &pp->bufs, l) { - pr_debug("\tbuf %u pages, %u iovs, flags: %x pipe_off: %x :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, + pr_debug("\tbuf %lu pages, %u iovs, flags: %x pipe_off: %lx :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, ppb->pipe_off); for (i = 0; i < ppb->nr_segs; i++) { iov = &ppb->iov[i]; diff --git a/criu/page-xfer.c b/criu/page-xfer.c index b0e04d82c..4d057163d 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -32,7 +32,7 @@ static int page_server_sk = -1; struct page_server_iov { u32 cmd; - u32 nr_pages; + u64 nr_pages; u64 vaddr; u64 dst_id; }; @@ -886,7 +886,7 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) list_for_each_entry(ppb, &pp->bufs, l) { unsigned int i; - pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs); + pr_debug("\tbuf %ld/%d\n", ppb->pages_in, ppb->nr_segs); for (i = 0; i < ppb->nr_segs; i++) { struct iovec iov = ppb->iov[i]; @@ -1071,7 +1071,7 @@ static int page_server_add(int sk, struct page_server_iov *pi, u32 flags) struct page_xfer *lxfer = &cxfer.loc_xfer; struct iovec iov; - pr_debug("Adding %" PRIx64 "/%u\n", pi->vaddr, pi->nr_pages); + pr_debug("Adding %" PRIx64 "/%lu\n", pi->vaddr, pi->nr_pages); if (prep_loc_xfer(pi)) return -1; @@ -1348,7 +1348,7 @@ 
static int fill_page_pipe(struct page_read *pr, struct page_pipe *pp) static int page_pipe_from_pagemap(struct page_pipe **pp, int pid) { struct page_read pr; - int nr_pages = 0; + unsigned long nr_pages = 0; if (open_page_read(pid, &pr, PR_TASK) <= 0) { pr_err("Failed to open page read for %d\n", pid); @@ -1551,13 +1551,13 @@ struct ps_async_read { static LIST_HEAD(async_reads); -static inline void async_read_set_goal(struct ps_async_read *ar, int nr_pages) +static inline void async_read_set_goal(struct ps_async_read *ar, unsigned long nr_pages) { ar->goal = sizeof(ar->pi) + nr_pages * PAGE_SIZE; ar->nr_pages = nr_pages; } -static void init_ps_async_read(struct ps_async_read *ar, void *buf, int nr_pages, ps_async_read_complete complete, +static void init_ps_async_read(struct ps_async_read *ar, void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv) { ar->pages = buf; @@ -1567,7 +1567,7 @@ static void init_ps_async_read(struct ps_async_read *ar, void *buf, int nr_pages async_read_set_goal(ar, nr_pages); } -static int page_server_start_async_read(void *buf, int nr_pages, ps_async_read_complete complete, void *priv) +static int page_server_start_async_read(void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv) { struct ps_async_read *ar; @@ -1667,7 +1667,7 @@ int connect_to_page_server_to_recv(int epfd) return epoll_add_rfd(epfd, &ps_rfd); } -int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages) +int request_remote_pages(unsigned long img_id, unsigned long addr, unsigned long nr_pages) { struct page_server_iov pi = { .cmd = PS_IOV_GET, @@ -1684,7 +1684,7 @@ int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages) return 0; } -static int page_server_start_sync_read(void *buf, int nr, ps_async_read_complete complete, void *priv) +static int page_server_start_sync_read(void *buf, unsigned long nr, ps_async_read_complete complete, void *priv) { struct ps_async_read ar; int 
ret = 1; @@ -1695,7 +1695,7 @@ static int page_server_start_sync_read(void *buf, int nr, ps_async_read_complete return ret; } -int page_server_start_read(void *buf, int nr, ps_async_read_complete complete, void *priv, unsigned flags) +int page_server_start_read(void *buf, unsigned long nr, ps_async_read_complete complete, void *priv, unsigned flags) { if (flags & PR_ASYNC) return page_server_start_async_read(buf, nr, complete, priv); diff --git a/criu/pagemap.c b/criu/pagemap.c index d9ccc03eb..16d680fdb 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -168,15 +168,15 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) return 0; } -static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, int nr) +static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, unsigned long int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%lu vs %lx:%u\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%lu vs %lx:%lu\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } -static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int read_parent_page(struct page_read *pr, unsigned long vaddr, unsigned long int nr, void *buf, unsigned flags) { struct page_read *ppr = pr->parent; int ret; @@ -195,7 +195,7 @@ static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, v */ do { - int p_nr; + unsigned long int p_nr; pr_debug("\tpr%lu-%u Read from parent\n", pr->img_id, pr->id); ret = ppr->seek_pagemap(ppr, vaddr); @@ -210,7 +210,7 @@ static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, v * read as much as we can. 
*/ p_nr = ppr->pe->nr_pages - (vaddr - ppr->pe->vaddr) / PAGE_SIZE; - pr_info("\tparent has %u pages in\n", p_nr); + pr_info("\tparent has %lu pages in\n", p_nr); if (p_nr > nr) p_nr = nr; @@ -374,7 +374,7 @@ int pagemap_enqueue_iovec(struct page_read *pr, void *buf, unsigned long len, st return 0; } -static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { int ret; unsigned long len = nr * PAGE_SIZE; @@ -402,7 +402,7 @@ static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, int * We cannot use maybe_read_page_local() for streaming images as it uses * pread(), seeking in the file. Instead, we use this custom page reader. */ -static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { unsigned long len = nr * PAGE_SIZE; int fd; @@ -445,7 +445,7 @@ static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vadd return ret; } -static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_pages, void *priv) +static int read_page_complete(unsigned long img_id, unsigned long vaddr, unsigned long int nr_pages, void *priv) { int ret = 0; struct page_read *pr = priv; @@ -463,7 +463,7 @@ static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_ return ret; } -static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { int ret; @@ -474,9 +474,9 @@ static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, int return ret; } -static int 
read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { - pr_info("pr%lu-%u Read %lx %u pages\n", pr->img_id, pr->id, vaddr, nr); + pr_info("pr%lu-%u Read %lx %lu pages\n", pr->img_id, pr->id, vaddr, nr); pagemap_bound_check(pr->pe, vaddr, nr); if (pagemap_in_parent(pr->pe)) { diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index 1bc03dc2a..c966e9e62 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -101,7 +101,7 @@ static int dump_pages(struct parasite_dump_pages_args *args) } if (spliced_bytes != args->nr_pages * PAGE_SIZE) { sys_close(p); - pr_err("Can't splice all pages to pipe (%ld/%d)\n", spliced_bytes, args->nr_pages); + pr_err("Can't splice all pages to pipe (%ld/%ld)\n", spliced_bytes, args->nr_pages); return -1; } diff --git a/criu/uffd.c b/criu/uffd.c index 98c2b7e07..8e12dcd63 100644 --- a/criu/uffd.c +++ b/criu/uffd.c @@ -668,12 +668,11 @@ static int remap_iovs(struct lazy_pages_info *lpi, unsigned long from, unsigned */ static int collect_iovs(struct lazy_pages_info *lpi) { + unsigned long start, end, len, nr_pages = 0; + int n_vma = 0, max_iov_len = 0, ret = -1; struct page_read *pr = &lpi->pr; struct lazy_iov *iov; MmEntry *mm; - int nr_pages = 0, n_vma = 0, max_iov_len = 0; - int ret = -1; - unsigned long start, end, len; mm = init_mm_entry(lpi); if (!mm) @@ -728,7 +727,7 @@ free_mm: return ret; } -static int uffd_io_complete(struct page_read *pr, unsigned long vaddr, int nr); +static int uffd_io_complete(struct page_read *pr, unsigned long vaddr, unsigned long nr); static int ud_open(int client, struct lazy_pages_info **_lpi) { @@ -822,7 +821,7 @@ static bool uffd_recoverable_error(int mcopy_rc) return false; } -static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, int *nr_pages, long mcopy_rc) +static int uffd_check_op_error(struct lazy_pages_info 
*lpi, const char *op, unsigned long *nr_pages, long mcopy_rc) { if (errno == ENOSPC || errno == ESRCH) { handle_exit(lpi); @@ -844,7 +843,7 @@ static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, int return 0; } -static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, int *nr_pages) +static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, unsigned long *nr_pages) { struct uffdio_copy uffdio_copy; unsigned long len = *nr_pages * page_size(); @@ -865,12 +864,12 @@ static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, int *nr_pages) return 0; } -static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, int nr) +static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, unsigned long nr) { struct lazy_pages_info *lpi; - unsigned long addr = 0; - int req_pages, ret; + unsigned long addr = 0, req_pages; struct lazy_iov *req; + int ret; lpi = container_of(pr, struct lazy_pages_info, pr); @@ -920,7 +919,7 @@ static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, int nr return drop_iovs(lpi, addr, nr * PAGE_SIZE); } -static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, int nr_pages) +static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, unsigned long nr_pages) { struct uffdio_zeropage uffdio_zeropage; unsigned long len = page_size() * nr_pages; @@ -946,7 +945,7 @@ static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, int nr_pages) * Returns 0 for zero pages, 1 for "real" pages and negative value on * error */ -static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, int nr) +static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, unsigned long nr) { int ret; @@ -961,7 +960,7 @@ static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, int nr) return 0; } -static int uffd_handle_pages(struct lazy_pages_info *lpi, __u64 address, int nr, unsigned flags) +static int uffd_handle_pages(struct 
lazy_pages_info *lpi, __u64 address, unsigned long nr, unsigned flags) { int ret; @@ -1003,7 +1002,7 @@ static void update_xfer_len(struct lazy_pages_info *lpi, bool pf) static int xfer_pages(struct lazy_pages_info *lpi) { struct lazy_iov *iov; - unsigned int nr_pages; + unsigned long nr_pages; unsigned long len; int err; From 2e26b36d44e9ccee7d9b6978a36cbaa308f9a119 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 19 Sep 2025 15:10:25 +0000 Subject: [PATCH 663/775] pagemap: print page regions in the format `start - end` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During investigations, it’s much easier to read logs when regions are printed in the start - end format rather than `start/size`. In addition, all page counters and memory sizes are now printed in hexadecimal, as they are hard to read in decimal form. Signed-off-by: Andrei Vagin --- criu/cr-dedup.c | 3 ++- criu/page-pipe.c | 6 +++--- criu/page-xfer.c | 23 +++++++++++++---------- criu/pagemap.c | 2 +- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/criu/cr-dedup.c b/criu/cr-dedup.c index c0c21f53e..feeb9ebb0 100644 --- a/criu/cr-dedup.c +++ b/criu/cr-dedup.c @@ -87,7 +87,8 @@ static int cr_dedup_one_pagemap(unsigned long img_id, int flags) if (ret <= 0) goto exit; - pr_debug("dedup iovec base=%" PRIx64 ", len=%lu\n", pr.pe->vaddr, pagemap_len(pr.pe)); + pr_debug("dedup iovec %" PRIx64 " - %" PRIx64 "\n", + pr.pe->vaddr, pr.pe->vaddr + pagemap_len(pr.pe)); if (!pagemap_in_parent(pr.pe)) { ret = dedup_one_iovec(prp, pr.pe->vaddr, pagemap_len(pr.pe)); if (ret) diff --git a/criu/page-pipe.c b/criu/page-pipe.c index f8e3520f7..4601d8f9c 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -446,17 +446,17 @@ void debug_show_page_pipe(struct page_pipe *pp) pr_debug("Page pipe:\n"); pr_debug("* %u pipes %u/%u iovs:\n", pp->nr_pipes, pp->free_iov, pp->nr_iovs); list_for_each_entry(ppb, &pp->bufs, l) { - pr_debug("\tbuf %lu pages, %u iovs, flags: 
%x pipe_off: %lx :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, + pr_debug("\tbuf %lx pages, %u iovs, flags: %x pipe_off: %lx :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, ppb->pipe_off); for (i = 0; i < ppb->nr_segs; i++) { iov = &ppb->iov[i]; - pr_debug("\t\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE); + pr_debug("\t\t%p - %p\n", iov->iov_base, iov->iov_base + iov->iov_len); } } pr_debug("* %u holes:\n", pp->free_hole); for (i = 0; i < pp->free_hole; i++) { iov = &pp->holes[i]; - pr_debug("\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE); + pr_debug("\t%p - %p\n", iov->iov_base, iov->iov_base + iov->iov_len); } } diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 4d057163d..e2913b924 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -178,12 +178,12 @@ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long le ssize_t ret, left = len; if (opts.tls) { - pr_debug("Sending %lu bytes / %lu pages\n", len, len / PAGE_SIZE); + pr_debug("Sending %lx bytes\n", len); if (tls_send_data_from_fd(p, len)) return -1; } else { - pr_debug("Splicing %lu bytes / %lu pages into socket\n", len, len / PAGE_SIZE); + pr_debug("Splicing %lx bytes into socket\n", len); while (left > 0) { ret = splice(p, NULL, xfer->sk, NULL, left, SPLICE_F_MOVE); @@ -192,7 +192,7 @@ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long le return -1; } - pr_debug("\tSpliced: %lu bytes sent\n", (unsigned long)ret); + pr_debug("\tSpliced: %lx bytes sent\n", (unsigned long)ret); left -= ret; } } @@ -288,7 +288,7 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov) * read_pagemap_page routine. 
*/ - pr_debug("Checking %p/%zu hole\n", iov->iov_base, iov->iov_len); + pr_debug("Checking %p - %p hole\n", iov->iov_base, iov->iov_base + iov->iov_len); off = (unsigned long)iov->iov_base; end = off + iov->iov_len; while (1) { @@ -300,7 +300,8 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov) return -1; } - pr_debug("\tFound %" PRIx64 "/%lu\n", p->pe->vaddr, pagemap_len(p->pe)); + pr_debug("\tFound %" PRIx64 " - %" PRIx64 "\n", + p->pe->vaddr, p->pe->vaddr + pagemap_len(p->pe)); /* * The pagemap entry in parent may happen to be @@ -340,7 +341,8 @@ static int write_pagemap_loc(struct page_xfer *xfer, struct iovec *iov, u32 flag if (xfer->parent != NULL) { ret = check_pagehole_in_parent(xfer->parent, iov); if (ret) { - pr_err("Hole %p/%zu not found in parent\n", iov->iov_base, iov->iov_len); + pr_err("Hole %p - %p not found in parent\n", + iov->iov_base, iov->iov_base + iov->iov_len); return -1; } } @@ -850,7 +852,7 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p BUG_ON(iov.iov_base < (void *)xfer->offset); iov.iov_base -= xfer->offset; - pr_debug("\t p %p [%u]\n", iov.iov_base, (unsigned int)(iov.iov_len / PAGE_SIZE)); + pr_debug("\t p %p - %p\n", iov.iov_base, iov.iov_base + iov.iov_len); flags = ppb_xfer_flags(xfer, ppb); @@ -886,7 +888,7 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) list_for_each_entry(ppb, &pp->bufs, l) { unsigned int i; - pr_debug("\tbuf %ld/%d\n", ppb->pages_in, ppb->nr_segs); + pr_debug("\tbuf %lx/%d\n", ppb->pages_in, ppb->nr_segs); for (i = 0; i < ppb->nr_segs; i++) { struct iovec iov = ppb->iov[i]; @@ -898,7 +900,7 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) BUG_ON(iov.iov_base < (void *)xfer->offset); iov.iov_base -= xfer->offset; - pr_debug("\tp %p [%u]\n", iov.iov_base, (unsigned int)(iov.iov_len / PAGE_SIZE)); + pr_debug("\tp %p - %p\n", iov.iov_base, iov.iov_base + iov.iov_len); flags = ppb_xfer_flags(xfer, ppb); 
@@ -1071,7 +1073,8 @@ static int page_server_add(int sk, struct page_server_iov *pi, u32 flags) struct page_xfer *lxfer = &cxfer.loc_xfer; struct iovec iov; - pr_debug("Adding %" PRIx64 "/%lu\n", pi->vaddr, pi->nr_pages); + pr_debug("Adding %" PRIx64 " - %" PRIx64 "\n", + pi->vaddr, pi->vaddr + pi->nr_pages * PAGE_SIZE); if (prep_loc_xfer(pi)) return -1; diff --git a/criu/pagemap.c b/criu/pagemap.c index 16d680fdb..b6ec3e333 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -171,7 +171,7 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, unsigned long int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%lu vs %lx:%lu\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%lx vs %lx:%lx\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } From 2d2168fc9c142a2eaf18f319b0d21825775d5660 Mon Sep 17 00:00:00 2001 From: dong sunchao Date: Tue, 23 Sep 2025 01:00:12 +1000 Subject: [PATCH 664/775] vdso: relax EI_OSABI check to support linux in ELF header On some ARM/aarch64 systems, the VDSO ELF header sets EI_OSABI to 3 (Linux), while CRIU expects 0 (System V). This strict check causes restore to fail with "ELF header magic mismatch" This patch relaxes the check to accept both values, improving compatibility with modern toolchains and kernels (e.g. Linux 6.12+) Fixes: #2751 Signed-off-by: dong sunchao --- criu/pie/util-vdso.c | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index 8daf5c71f..45fb6a648 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -98,25 +98,45 @@ static unsigned long elf_gnu_hash(const unsigned char *name) static int has_elf_identity(Ehdr_t *ehdr) { - /* - * See Elf specification for this magic values. 
- */ + /* check ELF magic */ + + if (ehdr->e_ident[EI_MAG0] != ELFMAG0 || + ehdr->e_ident[EI_MAG1] != ELFMAG1 || + ehdr->e_ident[EI_MAG2] != ELFMAG2 || + ehdr->e_ident[EI_MAG3] != ELFMAG3) { + pr_err("Invalid ELF magic\n"); + return false; + }; + + /* check ELF class */ #if defined(CONFIG_VDSO_32) - static const char elf_ident[] = { - 0x7f, 0x45, 0x4c, 0x46, 0x01, BORD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + if (ehdr->e_ident[EI_CLASS] != ELFCLASS32) { + pr_err("Unsupported ELF class: %d\n", ehdr->e_ident[EI_CLASS]); + return false; }; #else - static const char elf_ident[] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, BORD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + if (ehdr->e_ident[EI_CLASS] != ELFCLASS64) { + pr_err("Unsupported ELF class: %d\n", ehdr->e_ident[EI_CLASS]); + return false; }; #endif - BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident)); - - if (memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) { - pr_err("ELF header magic mismatch\n"); + /* check ELF data encoding */ + if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) { + pr_err("Unsupported ELF data encoding: %d\n", ehdr->e_ident[EI_DATA]); return false; - } + }; + /* check ELF version */ + if (ehdr->e_ident[EI_VERSION] != EV_CURRENT) { + pr_err("Unsupported ELF version: %d\n", ehdr->e_ident[EI_VERSION]); + return false; + }; + /* check ELF OSABI */ + if (ehdr->e_ident[EI_OSABI] != ELFOSABI_NONE && + ehdr->e_ident[EI_OSABI] != ELFOSABI_LINUX) { + pr_err("Unsupported OSABI version: %d\n", ehdr->e_ident[EI_OSABI]); + return false; + }; return true; } From 91758a68e929f1bf9fb2e682aa53e924806ac475 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 22 Sep 2025 17:59:29 +0000 Subject: [PATCH 665/775] zdtm: Remove junit_xml leftovers The previous commit 4cd4a6b1ac15 ("zdtm: stop importing junit_xml") removed the junit_xml library, but some variables related to it were left in the code. 
This commit removes the unused `tc` variable and a call to its `add_error_info` method. Fixes: 4cd4a6b1ac15 ("zdtm: stop importing junit_xml") Signed-off-by: Andrei Vagin --- test/zdtm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/zdtm.py b/test/zdtm.py index 7e83aa4df..e21356c30 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -2232,7 +2232,6 @@ class Launcher: # The following wait() is not useful for our domain logic. # It's useful for taming warnings in subprocess.Popen.__del__() sub['sub'].wait() - tc = None if status != 0: self.__fail = True failed_flavor = decode_flav(os.WEXITSTATUS(status)) @@ -2243,7 +2242,6 @@ class Launcher: with open(sub['log']) as sublog: output = sublog.read() details = {'output': output} - tc.add_error_info(output=output) print(testline, file=self.__file_report) print("%s" % yaml.safe_dump(details, explicit_start=True, From 67751bc11b2906a3bc6e7bf65fce19717c272356 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 19 Sep 2025 00:34:56 +0000 Subject: [PATCH 666/775] docs: add developer overviews for AI assistants This commit adds the document to provide high-level overviews of the CRIU project for AI assistants like Claude and Gemini. These documents are intended to be used as context for AI-powered developer assistants to help them understand the project's goals, architecture, and development process. This will allow them to provide more accurate and helpful responses to developer questions. 
The documents include: - A brief introduction to CRIU - A quick start guide for checkpointing and restoring a simple process - An overview of the dump and restore process - A description of the Compel subproject - Information about the project's coding style, code layout, and tests Signed-off-by: Andrei Vagin --- CLAUDE.md | 1 + GEMINI.md | 136 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 120000 CLAUDE.md create mode 100644 GEMINI.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 120000 index 000000000..e3c5a92d9 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +GEMINI.md \ No newline at end of file diff --git a/GEMINI.md b/GEMINI.md new file mode 100644 index 000000000..e56c1de12 --- /dev/null +++ b/GEMINI.md @@ -0,0 +1,136 @@ +# CRIU (Checkpoint/Restore In User-space) + +CRIU is a tool for saving the state of a running application to a set of files +(checkpointing) and restoring it back to a live state. It is primarily used for +live migration of containers, in-place updates, and fast application startup. + +It is implemented as a command-line tool called `criu`. The two primary commands +are `dump` and `restore`. + +- `dump`: Saves a process tree and all its related resources (file + descriptors, IPC, sockets, namespaces, etc.) into a collection of image + files. +- `restore`: Restores processes from image files to the same state they were + in before the dump. + +## Quick Start + +To get a feel for `criu`, you can try checkpointing and restoring a simple +process. + +1. **Run a simple process:** + Open a terminal and run a command that will run for a while. Find its PID. + ```bash + sleep 1000 & + [1] 12345 + ``` + +2. **Dump the process:** + As root, use `criu dump` with the process ID (`-t`) and a directory for the + image files (`-D`). + ```bash + sudo criu dump -t 12345 -D /tmp/sleep_images -v4 --shell-job + ``` + The `sleep` process will no longer be running. + +3. 
**Restore the process:** + Use `criu restore` to bring the process back to life from the images. + ```bash + sudo criu restore -D /tmp/sleep_images -v4 --shell-job + ``` + The `sleep` process will be running again as if nothing happened. + +# For Developers and Contributors + +This section contains more technical details about CRIU's internals and +development process. + +## Dump Process + +On dump, CRIU uses available kernel interfaces to collect information about +processes. For properties that can only be retrieved from within the process +itself, CRIU injects a binary blob (called a "parasite") into the process's +address space and executes it in the context of one of the process's threads. +This injection is handled by a subproject called **Compel**. + +## Restore Process + +On restore, CRIU reads the image files to reconstruct the processes. The goal is +to restore them to the exact state they were in before the dump. The restore +process is divided into several stages (defined as `CR_STATE_*` in +`./criu/include/restorer.h`). + +The main `criu` process acts as a coordinator. It first restores resources with +inter-process dependencies (file descriptors, sockets, shared memory, +namespaces, etc.). It then forks the process tree and sets up namespaces. +Finally, it restores process-specific resources like file descriptors and memory +mappings. + +A key step involves a small, self-contained binary called the "restorer". All +restored processes switch to executing this code, which unmaps the CRIU-specific +memory and restores the application's original memory mappings. On the final +step, the restorer calls `sigreturn` on a prepared signal frame to resume the +process with the state it had at the moment of the dump. + +## Compel + +Compel is a subproject responsible for generating the binary blobs used for the +parasite code (for dumping) and the restorer code (for restoring). 
It provides a +library for injecting and executing this code within the target process's +address space. It is a separate project because the logic for generating and +injecting Position-Independent Executable (PIE) code is complex and +self-contained. + +## Coding Style + +The C code in the CRIU project follows the +[Linux Kernel Coding Style](https://www.kernel.org/doc/html/latest/process/coding-style.html). +Here are some of the main points: + +- **Indentation**: Use tabs, which are set to 8 characters. +- **Line Length**: The preferred line limit is 80 characters, but it can be + extended to 120 if it improves code readability. +- **Braces**: + - The opening brace for a function goes on a new line. + - The opening brace for a block (like `if`, `for`, `while`, `switch`) goes + on the same line. +- **Spaces**: Use spaces around operators (`+`, `-`, `*`, `/`, `%`, `<`, `>`, + `=`, etc.). +- **Naming**: Use descriptive names for functions and variables. +- **Comments**: Use C-style comments (`/* ... */`). For multi-line comments, + the preferred format is: + ```c + /* + * This is a multi-line + * comment. + */ + ``` + +## Code Layout + +The code is organized into the following directories: + +- `./compel`: The Compel sub-project. +- `./criu`: The main `criu` tool source code. +- `./images`: Protobuf descriptions for the image files. +- `./test`: All tests. +- `./test/zdtm`: The Zero-Downtime Migration (ZDTM) test suite. +- `./test/zdtm.py`: The executor script for ZDTM tests. +- `./scripts`: Helper scripts. +- `./scripts/build`: Docker image files used for CI and cross-compilation + checks. +- `./crit`: A tool to inspect and manipulate CRIU image files. +- `./soccr`: A library for TCP socket checkpoint/restore. + +## Tests + +The main test suite is ZDTM. Here is an example of how to run a single test: + +```bash +sudo ./test/zdtm.py run -t zdtm/static/env00 +``` + +Each ZDTM test has three stages: preparation, C/R, and results checks. 
During +the test, a process calls `test_daemon()` to signal it is ready for C/R, then +calls `test_waitsig()` to wait for the C/R stage to complete. After being +restored, the test checks that all its resources are still in a valid state. From 25f8be0f6016bd6ef0e0a1222cd3fbfca8e0b6fd Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Wed, 17 Sep 2025 19:14:36 +0900 Subject: [PATCH 667/775] ci: use package-manager dependency install scripts Currently, adding a package which is required either for development or testing requires it to be added in multiple places due to many duplicated Dockerfiles and installation scripts. This makes it difficult to ensure that all scripts are updated appropriately and can lead to some places being missed. This patch consolidates the list of dependencies and adds installation scripts for each package-manager used in our CI (apk, apt, dnf, pacman). This change also replaces the `debian/dev-packages.lst` as this subfolder conflicts with the Ubuntu/Debian packaging scripts used for CRIU: https://github.com/rst0git/criu-deb-packages This patch also removes the CentOS 8 build scripts as it is EOL and the container registry is no longer available. 
Signed-off-by: Shashank Balaji Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 12 +-- .github/workflows/check-commits.yml | 2 +- .github/workflows/codeql.yml | 2 +- .github/workflows/nftables-test.yml | 2 +- CONTRIBUTING.md | 84 ++++++++++++------- Makefile | 3 +- {scripts/ci => contrib}/apt-install | 0 contrib/debian/dev-packages.lst | 19 ----- contrib/dependencies/apk-packages.sh | 38 +++++++++ contrib/dependencies/apt-cross-packages.sh | 34 ++++++++ contrib/dependencies/apt-packages.sh | 40 +++++++++ contrib/dependencies/dnf-packages.sh | 35 ++++++++ contrib/dependencies/pacman-packages.sh | 31 +++++++ scripts/build/Dockerfile.alpine | 43 +--------- scripts/build/Dockerfile.archlinux | 35 +------- scripts/build/Dockerfile.centos8 | 48 ----------- scripts/build/Dockerfile.fedora.tmpl | 5 +- scripts/build/Dockerfile.hotspot-alpine | 25 +----- scripts/build/Dockerfile.hotspot-ubuntu | 28 +------ scripts/build/Dockerfile.linux32.tmpl | 26 +----- scripts/build/Dockerfile.openj9-ubuntu | 28 +------ .../Dockerfile.riscv64-stable-cross.tmpl | 33 +------- scripts/build/Dockerfile.stable-cross.tmpl | 25 +----- scripts/build/Dockerfile.tmpl | 36 +------- scripts/build/Dockerfile.unstable-cross.tmpl | 26 +----- scripts/build/Dockerfile.x86_64.hdr | 2 +- scripts/build/Makefile | 2 +- scripts/ci/Makefile | 2 +- scripts/ci/docker-test.sh | 4 +- scripts/ci/java-test.sh | 2 + scripts/ci/loongarch64-qemu-test.sh | 4 +- scripts/ci/prepare-for-fedora-rawhide.sh | 29 +------ scripts/ci/run-ci-tests.sh | 12 +-- scripts/ci/vagrant.sh | 12 +-- scripts/install-debian-pkgs.sh | 25 ------ 35 files changed, 295 insertions(+), 459 deletions(-) rename {scripts/ci => contrib}/apt-install (100%) delete mode 100644 contrib/debian/dev-packages.lst create mode 100755 contrib/dependencies/apk-packages.sh create mode 100755 contrib/dependencies/apt-cross-packages.sh create mode 100755 contrib/dependencies/apt-packages.sh create mode 100755 contrib/dependencies/dnf-packages.sh create mode 100755 
contrib/dependencies/pacman-packages.sh delete mode 100644 scripts/build/Dockerfile.centos8 delete mode 100755 scripts/install-debian-pkgs.sh diff --git a/.cirrus.yml b/.cirrus.yml index 848e14132..99dd70d63 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -13,7 +13,7 @@ task: nested_virtualization: true setup_script: | - scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok build_script: | make -C scripts/ci vagrant-fedora-no-vdso @@ -34,7 +34,7 @@ task: setup_script: | dnf config-manager --set-enabled crb # Same as CentOS 8 powertools dnf -y install epel-release epel-next-release - dnf -y install --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python-devel python-PyYAML python-protobuf python3-importlib-metadata xmlto libdrm-devel libuuid-devel + contrib/dependencies/dnf-packages.sh # The image has a too old version of nettle which does not work with gnutls. # Just upgrade to the latest to make the error go away. 
dnf -y upgrade nettle nettle-devel @@ -63,7 +63,7 @@ task: nested_virtualization: true setup_script: | - scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok build_script: | make -C scripts/ci vagrant-fedora-rawhide @@ -83,7 +83,7 @@ task: nested_virtualization: true setup_script: | - scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok build_script: | make -C scripts/ci vagrant-fedora-non-root @@ -96,7 +96,7 @@ task: memory: 4G script: uname -a build_script: | - scripts/ci/apt-install make + contrib/apt-install make make -C scripts/ci local task: @@ -107,7 +107,7 @@ task: memory: 4G script: uname -a build_script: | - scripts/ci/apt-install make + contrib/apt-install make make -C scripts/ci local CLANG=1 task: diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml index 354873909..bf7d06697 100644 --- a/.github/workflows/check-commits.yml +++ b/.github/workflows/check-commits.yml @@ -19,7 +19,7 @@ jobs: # Checkout pull request HEAD commit instead of merge commit ref: ${{ github.event.pull_request.head.sha }} - name: Install dependencies - run: sudo scripts/ci/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev uuid-dev + run: sudo contrib/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev uuid-dev - name: Configure git user details run: | git config --global user.email "checkpoint-restore@users.noreply.github.com" diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 88e21d3d1..9c9e46c1b 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ 
-34,7 +34,7 @@ jobs: - name: Install Packages (cpp) if: ${{ matrix.language == 'cpp' }} run: | - sudo scripts/ci/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev python3-yaml libnl-route-3-dev gnutls-dev + sudo contrib/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev python3-yaml libnl-route-3-dev gnutls-dev - name: Initialize CodeQL uses: github/codeql-action/init@v3 with: diff --git a/.github/workflows/nftables-test.yml b/.github/workflows/nftables-test.yml index eb3d8e814..7a7d8bd30 100644 --- a/.github/workflows/nftables-test.yml +++ b/.github/workflows/nftables-test.yml @@ -15,7 +15,7 @@ jobs: - name: Remove iptables run: sudo apt remove -y iptables - name: Install libnftables-dev - run: sudo scripts/ci/apt-install libnftables-dev + run: sudo contrib/apt-install libnftables-dev - name: chmod 755 /home/runner # CRIU's tests are sometimes running as some random user and need # to be able to access the test files. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 712e7b813..3ad4aa101 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -27,19 +27,43 @@ The repository may contain multiple branches. Development happens in the **criu- To clone CRIU repo and switch to the proper branch, run: ``` - git clone https://github.com/checkpoint-restore/criu criu - cd criu - git checkout criu-dev +git clone https://github.com/checkpoint-restore/criu criu +cd criu +git checkout criu-dev ``` -### Compile +### Building from source -First, you need to install compile-time dependencies. 
Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. Alternatively, you can use the Nix flake to set up a development environment by running `nix develop`. +Follow these steps to compile CRIU from source code. -To compile CRIU, run: +#### Installing build dependencies + +First, you need to install the required build dependencies. We provide scripts to simplify this process for several Linux distributions in [contrib/dependencies](contrib/dependencies). For a complete list of dependencies, please refer to the [installation guide](https://criu.org/Installation). + +##### On Ubuntu/Debian-based systems: ``` - make +./contrib/dependencies/apt-packages.sh +``` + +##### On Fedora/CentOS-based systems: + +``` +./contrib/dependencies/dnf-packages.sh +``` + +##### Using Nix: + +``` +nix develop +``` + +#### Compiling CRIU + +Once the dependencies are installed, you can compile CRIU by running the `make` command from the root of the source directory: + +``` +make ``` This should create the `./criu/criu` executable. @@ -63,7 +87,7 @@ The following command can be used to automatically run a code linter for Python text spelling (codespell), and a number of CRIU-specific checks (usage of print macros and EOL whitespace for C files). ``` - make lint +make lint ``` In addition, we have adopted a [clang-format configuration file](https://www.kernel.org/doc/Documentation/process/clang-format.rst) @@ -73,7 +97,7 @@ results in decreased readability, we may choose to ignore these errors. Run the following command to check if your changes are compliant with the clang-format rules: ``` - make indent +make indent ``` This command is built upon the `git-clang-format` tool and supports two options `BASE` and `OPTS`. The `BASE` option allows you to @@ -83,7 +107,7 @@ can use `BASE=origin/criu-dev`. 
The `OPTS` option can be used to pass additional to check the last *N* commits for formatting errors, without applying the changes to the codebase you can use the following command. ``` - make indent OPTS=--diff BASE=HEAD~N +make indent OPTS=--diff BASE=HEAD~N ``` Note that for pull requests, the "Run code linter" workflow runs these checks for all commits. If a clang-format error is detected @@ -96,7 +120,7 @@ Here are some bad examples of clang-format-ing: ``` @@ -58,8 +59,7 @@ static int register_membarriers(void) } - + if (!all_ok) { - fail("can't register membarrier()s - tried %#x, kernel %#x", - barriers_registered, barriers_supported); @@ -129,7 +153,7 @@ Here are some bad examples of clang-format-ing: CRIU comes with an extensive test suite. To check whether your changes introduce any regressions, run ``` - make test +make test ``` The command runs [ZDTM Test Suite](https://criu.org/ZDTM_Test_Suite). Check for any error messages produced by it. @@ -166,21 +190,21 @@ If your change fixes a bug in a specific commit, e.g. you found an issue using the SHA-1 ID, and the one line summary. For example: ``` - Fixes: 9433b7b9db3e ("make: use cflags/ldflags for config.h detection mechanism") +Fixes: 9433b7b9db3e ("make: use cflags/ldflags for config.h detection mechanism") ``` The following `git config` settings can be used to add a pretty format for outputting the above style in the `git log` or `git show` commands: ``` - [pretty] - fixes = Fixes: %h (\"%s\") +[pretty] + fixes = Fixes: %h (\"%s\") ``` If your change address an issue listed in GitHub, please use `Fixes:` tag with the number of the issue. For instance: ``` - Fixes: #339 +Fixes: #339 ``` The `Fixes:` tags should be put at the end of the detailed description. 
@@ -263,7 +287,7 @@ can certify the below: then you just add a line saying ``` - Signed-off-by: Random J Developer +Signed-off-by: Random J Developer ``` using your real name (please, no pseudonyms or anonymous contributions if @@ -275,14 +299,14 @@ commit message. To append such line to a commit you already made, use ``` From: Random J Developer - Subject: [PATCH] component: Short patch description +Subject: [PATCH] component: Short patch description - Long patch description (could be skipped if patch - is trivial enough) +Long patch description (could be skipped if patch +is trivial enough) - Signed-off-by: Random J Developer - --- - Patch body here +Signed-off-by: Random J Developer +--- +Patch body here ``` ## Submit your work upstream @@ -316,8 +340,8 @@ contains the following: revisions should be listed. For example: ``` - v3: rebase on the current criu-dev - v2: add commit to foo() and update bar() coding style +v3: rebase on the current criu-dev +v2: add commit to foo() and update bar() coding style ``` If there are only minor updates to the commits in a pull request, it is @@ -335,7 +359,7 @@ Historically, CRIU worked with mailing lists and patches so if you still prefer To create a patch, run ``` - git format-patch --signoff origin/criu-dev +git format-patch --signoff origin/criu-dev ``` You might need to read GIT documentation on how to prepare patches @@ -346,8 +370,8 @@ at all. 
We recommend to post patches using `git send-email` ``` - git send-email --cover-letter --no-chain-reply-to --annotate \ - --confirm=always --to=criu@openvz.org criu-dev +git send-email --cover-letter --no-chain-reply-to --annotate \ + --confirm=always --to=criu@openvz.org criu-dev ``` Note that the `git send-email` subcommand may not be in @@ -359,14 +383,14 @@ If this is your first time using git send-email, you might need to configure it to point it to your SMTP server with something like: ``` - git config --global sendemail.smtpServer stmp.example.net +git config --global sendemail.smtpServer stmp.example.net ``` If you get tired of typing `--to=criu@openvz.org` all the time, you can configure that to be automatically handled as well: ``` - git config sendemail.to criu@openvz.org +git config sendemail.to criu@openvz.org ``` If a developer is sending another version of the patch (e.g. to address diff --git a/Makefile b/Makefile index 7272cfce1..3e5d62726 100644 --- a/Makefile +++ b/Makefile @@ -464,7 +464,8 @@ ruff: shellcheck: shellcheck --version shellcheck scripts/*.sh - shellcheck scripts/ci/*.sh scripts/ci/apt-install + shellcheck scripts/ci/*.sh + shellcheck contrib/apt-install contrib/dependencies/*.sh shellcheck -x test/others/crit/*.sh shellcheck -x test/others/libcriu/*.sh shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh diff --git a/scripts/ci/apt-install b/contrib/apt-install similarity index 100% rename from scripts/ci/apt-install rename to contrib/apt-install diff --git a/contrib/debian/dev-packages.lst b/contrib/debian/dev-packages.lst deleted file mode 100644 index ce45f1b7c..000000000 --- a/contrib/debian/dev-packages.lst +++ /dev/null @@ -1,19 +0,0 @@ -# Required packages for development in Debian -build-essential -libprotobuf-dev -libprotobuf-c-dev -protobuf-c-compiler -protobuf-compiler -python3-protobuf -libnet-dev - -# Extra packages, required for testing and building other tools -pkg-config -libnl-3-dev -libbsd0 
-libbsd-dev -iproute2 -libcap-dev -libaio-dev -python3-yaml -libnl-route-3-dev diff --git a/contrib/dependencies/apk-packages.sh b/contrib/dependencies/apk-packages.sh new file mode 100755 index 000000000..0084dea3a --- /dev/null +++ b/contrib/dependencies/apk-packages.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env sh + +apk add --no-cache \ + asciidoctor \ + bash \ + build-base \ + coreutils \ + e2fsprogs \ + git \ + gnutls-dev \ + go \ + ip6tables \ + iproute2 \ + iptables \ + iptables-legacy \ + libaio-dev \ + libbsd-dev \ + libcap-dev \ + libcap-utils \ + libdrm-dev \ + libnet-dev \ + libnl3-dev \ + nftables \ + nftables-dev \ + pkgconfig \ + procps \ + protobuf-c-compiler \ + protobuf-c-dev \ + protobuf-dev \ + py3-importlib-metadata \ + py3-pip \ + py3-protobuf \ + py3-yaml \ + python3 \ + sudo \ + tar \ + util-linux \ + util-linux-dev diff --git a/contrib/dependencies/apt-cross-packages.sh b/contrib/dependencies/apt-cross-packages.sh new file mode 100755 index 000000000..588be40d0 --- /dev/null +++ b/contrib/dependencies/apt-cross-packages.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env sh + +APT_INSTALL="$(cd "$(dirname "$0")/.." >/dev/null 2>&1 && pwd)/apt-install" +if [ ! 
-x "$APT_INSTALL" ]; then + echo "Error: apt-install not found or not executable" + exit 1 +fi + +"$APT_INSTALL" \ + crossbuild-essential-"${DEBIAN_ARCH}" \ + iproute2:"${DEBIAN_ARCH}" \ + libaio-dev:"${DEBIAN_ARCH}" \ + libbz2-dev:"${DEBIAN_ARCH}" \ + libc6-"${DEBIAN_ARCH}"-cross \ + libc6-dev-"${DEBIAN_ARCH}"-cross \ + libcap-dev:"${DEBIAN_ARCH}" \ + libexpat1-dev:"${DEBIAN_ARCH}" \ + libgnutls28-dev:"${DEBIAN_ARCH}" \ + libnet-dev:"${DEBIAN_ARCH}" \ + libnftables-dev:"${DEBIAN_ARCH}" \ + libnl-3-dev:"${DEBIAN_ARCH}" \ + libnl-route-3-dev:"${DEBIAN_ARCH}" \ + libprotobuf-c-dev:"${DEBIAN_ARCH}" \ + libprotobuf-dev:"${DEBIAN_ARCH}" \ + libssl-dev:"${DEBIAN_ARCH}" \ + ncurses-dev:"${DEBIAN_ARCH}" \ + uuid-dev:"${DEBIAN_ARCH}" \ + libdrm-dev:"${DEBIAN_ARCH}" \ + build-essential \ + pkg-config \ + git \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-protobuf diff --git a/contrib/dependencies/apt-packages.sh b/contrib/dependencies/apt-packages.sh new file mode 100755 index 000000000..c60ba9041 --- /dev/null +++ b/contrib/dependencies/apt-packages.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env sh + +APT_INSTALL="$(cd "$(dirname "$0")/.." >/dev/null 2>&1 && pwd)/apt-install" +if [ ! 
-x "$APT_INSTALL" ]; then + echo "Error: apt-install not found or not executable" + exit 1 +fi + +"$APT_INSTALL" \ + asciidoctor \ + bash \ + bsdmainutils \ + build-essential \ + gdb \ + git-core \ + iptables \ + kmod \ + libaio-dev \ + libbsd-dev \ + libcap-dev \ + libdrm-dev \ + libgnutls28-dev \ + libgnutls30 \ + libnet-dev \ + libnl-3-dev \ + libnl-route-3-dev \ + libperl-dev \ + libprotobuf-c-dev \ + libprotobuf-dev \ + libselinux-dev \ + pkg-config \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-importlib-metadata \ + python3-pip \ + python3-protobuf \ + python3-yaml \ + time \ + util-linux \ + uuid-dev diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh new file mode 100755 index 000000000..efbb659c5 --- /dev/null +++ b/contrib/dependencies/dnf-packages.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env sh + +dnf install -y \ + asciidoc \ + binutils \ + gcc \ + git \ + glibc-devel \ + gnutls-devel \ + iproute \ + iptables \ + libaio-devel \ + libasan \ + libbpf-devel \ + libbsd-devel \ + libcap-devel \ + libdrm-devel \ + libnet-devel \ + libnl3-devel \ + libselinux-devel \ + libuuid-devel \ + make \ + nftables \ + pkg-config \ + protobuf \ + protobuf-c \ + protobuf-c-devel \ + protobuf-compiler \ + protobuf-devel \ + python-devel \ + python3-importlib-metadata \ + python3-protobuf \ + python3-pyyaml \ + rubygem-asciidoctor \ + xmlto diff --git a/contrib/dependencies/pacman-packages.sh b/contrib/dependencies/pacman-packages.sh new file mode 100755 index 000000000..5fe6995fb --- /dev/null +++ b/contrib/dependencies/pacman-packages.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env sh + +pacman -Syu --noconfirm \ + asciidoctor \ + base-devel \ + bash \ + coreutils \ + diffutils \ + git \ + gnutls \ + go \ + iproute2 \ + iptables \ + libaio \ + libbsd \ + libcap \ + libdrm \ + libnet \ + libnl \ + nftables \ + pkg-config \ + protobuf \ + protobuf-c \ + python-importlib-metadata \ + python-pip \ + python-protobuf \ + python-yaml \ + sudo \ + tar 
\ + util-linux \ + util-linux-libs diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index 819fda0c3..ed883f300 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -1,49 +1,12 @@ FROM alpine ARG CC=gcc -RUN apk update && apk add \ - $CC \ - bash \ - build-base \ - coreutils \ - procps \ - git \ - gnutls-dev \ - libaio-dev \ - libcap-dev \ - libnet-dev \ - libnl3-dev \ - nftables \ - nftables-dev \ - pkgconfig \ - protobuf-c-dev \ - protobuf-dev \ - py3-pip \ - py3-protobuf \ - python3 \ - sudo \ - libcap-utils \ - libdrm-dev \ - util-linux \ - util-linux-dev - COPY . /criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) CC="$CC" && date -RUN apk add \ - ip6tables \ - iptables \ - iptables-legacy \ - nftables \ - iproute2 \ - tar \ - bash \ - go \ - e2fsprogs \ - py-yaml \ - py3-importlib-metadata \ - asciidoctor +RUN apk add --no-cache "$CC" && /criu/contrib/dependencies/apk-packages.sh + +RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 -D test diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index d4b432f8d..261bd2d79 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -5,40 +5,11 @@ ARG CC=gcc # Initialize machine ID RUN systemd-machine-id-setup -RUN pacman -Syu --noconfirm \ - $CC \ - bash \ - make \ - coreutils \ - git \ - gnutls \ - libaio \ - libcap \ - libnet \ - libnl \ - nftables \ - pkgconfig \ - protobuf-c \ - protobuf \ - python-pip \ - python-protobuf \ - which \ - sudo \ - iptables \ - nftables \ - iproute2 \ - tar \ - bash \ - go \ - python-yaml \ - asciidoctor \ - python-importlib-metadata \ - libdrm \ - util-linux-libs \ - diffutils - COPY . 
/criu WORKDIR /criu + +RUN pacman -Syu --noconfirm "$CC" && contrib/dependencies/pacman-packages.sh + RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user diff --git a/scripts/build/Dockerfile.centos8 b/scripts/build/Dockerfile.centos8 deleted file mode 100644 index 5ab6c9cfa..000000000 --- a/scripts/build/Dockerfile.centos8 +++ /dev/null @@ -1,48 +0,0 @@ -FROM registry.centos.org/centos/centos:8 - -ARG CC=gcc - -RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm dnf-plugins-core -RUN yum config-manager --set-enabled powertools -RUN yum install -y --allowerasing \ - asciidoc \ - coreutils \ - chkconfig \ - diffutils \ - findutils \ - gcc \ - git \ - gnutls-devel \ - iproute \ - iptables \ - libaio-devel \ - libasan \ - libcap-devel \ - libnet-devel \ - libnl3-devel \ - libselinux-devel \ - make \ - procps-ng \ - protobuf-c-devel \ - protobuf-devel \ - python3-devel \ - python3-PyYAML \ - python3-protobuf \ - python3-pip \ - sudo \ - tar \ - which \ - xmlto - -RUN alternatives --set python /usr/bin/python3 -ENV PYTHON=python3 - -COPY . /criu -WORKDIR /criu - -RUN make mrproper && date && make -j $(nproc) CC="$CC" && date - -# The rpc test cases are running as user #1000, let's add the user -RUN adduser -u 1000 test - -RUN make -C test/zdtm -j $(nproc) diff --git a/scripts/build/Dockerfile.fedora.tmpl b/scripts/build/Dockerfile.fedora.tmpl index 9d3bb0f87..c26a5fd57 100644 --- a/scripts/build/Dockerfile.fedora.tmpl +++ b/scripts/build/Dockerfile.fedora.tmpl @@ -1,11 +1,10 @@ ARG CC=gcc -COPY scripts/ci/prepare-for-fedora-rawhide.sh /bin/prepare-for-fedora-rawhide.sh -RUN /bin/prepare-for-fedora-rawhide.sh - COPY . 
/criu WORKDIR /criu +RUN dnf install -y "$CC" && scripts/ci/prepare-for-fedora-rawhide.sh + RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user diff --git a/scripts/build/Dockerfile.hotspot-alpine b/scripts/build/Dockerfile.hotspot-alpine index 6caf9d0b1..cd632dddf 100644 --- a/scripts/build/Dockerfile.hotspot-alpine +++ b/scripts/build/Dockerfile.hotspot-alpine @@ -1,30 +1,11 @@ FROM docker.io/library/eclipse-temurin:11-alpine ARG CC=gcc -RUN apk update && apk add \ - bash \ - build-base \ - coreutils \ - git \ - gnutls-dev \ - libaio-dev \ - libcap-dev \ - libnet-dev \ - libnl3-dev \ - pkgconfig \ - protobuf-c-dev \ - protobuf-dev \ - python3 \ - sudo \ - maven \ - ip6tables \ - iptables \ - util-linux-dev \ - bash - COPY . /criu WORKDIR /criu +RUN apk add --no-cache maven "$CC" && contrib/dependencies/apk-packages.sh + RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -q -f test/javaTests/pom.xml test +ENTRYPOINT ["mvn", "-q", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 67de916ac..76aa571fa 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -1,33 +1,11 @@ FROM docker.io/library/eclipse-temurin:11-focal ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -RUN apt-install protobuf-c-compiler \ - libprotobuf-c-dev \ - libaio-dev \ - libprotobuf-dev \ - protobuf-compiler \ - libcap-dev \ - libnl-3-dev \ - gdb \ - bash \ - python3-protobuf \ - python3-yaml \ - libnet-dev \ - libnl-route-3-dev \ - libbsd-dev \ - make \ - git \ - pkg-config \ - iptables \ - gcc \ - uuid-dev \ - maven - COPY . 
/criu WORKDIR /criu +RUN contrib/apt-install maven "$CC" && contrib/dependencies/apt-packages.sh + RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -q -f test/javaTests/pom.xml test +ENTRYPOINT ["mvn", "-q", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.linux32.tmpl b/scripts/build/Dockerfile.linux32.tmpl index d218e0641..a37f16e49 100644 --- a/scripts/build/Dockerfile.linux32.tmpl +++ b/scripts/build/Dockerfile.linux32.tmpl @@ -1,32 +1,10 @@ ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -RUN apt-install \ - libnet-dev \ - libnl-route-3-dev \ - $CC \ - bsdmainutils \ - build-essential \ - git-core \ - iptables \ - libaio-dev \ - libcap-dev \ - libgnutls28-dev \ - libgnutls30 \ - libnl-3-dev \ - libprotobuf-c-dev \ - libprotobuf-dev \ - libselinux-dev \ - pkg-config \ - protobuf-c-compiler \ - protobuf-compiler \ - uuid-dev \ - python3-minimal - COPY . /criu WORKDIR /criu +RUN contrib/apt-install "$CC" && contrib/dependencies/apt-packages.sh + RUN uname -m && setarch linux32 uname -m && setarch --list RUN make mrproper && date && \ diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 0ae4727d2..825495659 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -1,34 +1,12 @@ FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-focal ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -RUN apt-install protobuf-c-compiler \ - libprotobuf-c-dev \ - libaio-dev \ - libprotobuf-dev \ - protobuf-compiler \ - libcap-dev \ - libnl-3-dev \ - gdb \ - bash \ - python3-protobuf \ - python3-yaml \ - libnet-dev \ - libnl-route-3-dev \ - libbsd-dev \ - make \ - git \ - pkg-config \ - iptables \ - gcc \ - uuid-dev \ - maven - RUN mkdir -p /etc/criu && echo 'ghost-limit 16777216' > /etc/criu/default.conf COPY . 
/criu WORKDIR /criu +RUN contrib/apt-install maven "$CC" && contrib/dependencies/apt-packages.sh + RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -f test/javaTests/pom.xml test +ENTRYPOINT ["mvn", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl index e95a43306..8933a6c82 100644 --- a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl +++ b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl @@ -1,5 +1,3 @@ -COPY scripts/ci/apt-install /bin/apt-install - # Add the cross compiler sources RUN apt-get clean -y && apt-get update -y && apt-get install -y --no-install-recommends gnupg2 @@ -12,33 +10,6 @@ COPY scripts/ci/riscv64-cross/riscv64-sources.list /etc/apt/sources.list.d/ RUN dpkg --add-architecture ${DEBIAN_ARCH} && \ apt-get update -y -# Install required packages -RUN apt-get install -y --no-install-recommends \ - build-essential \ - pkg-config \ - git \ - crossbuild-essential-${DEBIAN_ARCH} \ - libc6-dev-${DEBIAN_ARCH}-cross \ - libc6-${DEBIAN_ARCH}-cross \ - libbz2-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - ncurses-dev:${DEBIAN_ARCH} \ - libssl-dev:${DEBIAN_ARCH} \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf \ - libnl-3-dev:${DEBIAN_ARCH} \ - libprotobuf-dev:${DEBIAN_ARCH} \ - libnet-dev:${DEBIAN_ARCH} \ - libprotobuf-c-dev:${DEBIAN_ARCH} \ - libcap-dev:${DEBIAN_ARCH} \ - libaio-dev:${DEBIAN_ARCH} \ - uuid-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} \ - libnftables-dev:${DEBIAN_ARCH} \ - libgnutls28-dev:${DEBIAN_ARCH} \ - iproute2:${DEBIAN_ARCH} - ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ AS=/usr/bin/${CROSS_TRIPLET}-as \ @@ -55,4 +26,6 @@ ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . 
/criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) zdtm && date +RUN contrib/dependencies/apt-cross-packages.sh + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.stable-cross.tmpl b/scripts/build/Dockerfile.stable-cross.tmpl index 65ae55833..56104081f 100644 --- a/scripts/build/Dockerfile.stable-cross.tmpl +++ b/scripts/build/Dockerfile.stable-cross.tmpl @@ -1,30 +1,7 @@ -COPY scripts/ci/apt-install /bin/apt-install - # Add the cross compiler sources RUN echo "deb http://deb.debian.org/debian/ stable main" >> /etc/apt/sources.list && \ dpkg --add-architecture ${DEBIAN_ARCH} -RUN apt-install \ - crossbuild-essential-${DEBIAN_ARCH} \ - libc6-dev-${DEBIAN_ARCH}-cross \ - libc6-${DEBIAN_ARCH}-cross \ - libbz2-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - ncurses-dev:${DEBIAN_ARCH} \ - libssl-dev:${DEBIAN_ARCH} \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf \ - libnl-3-dev:${DEBIAN_ARCH} \ - libprotobuf-dev:${DEBIAN_ARCH} \ - libnet-dev:${DEBIAN_ARCH} \ - uuid-dev:${DEBIAN_ARCH} \ - libprotobuf-c-dev:${DEBIAN_ARCH} \ - libcap-dev:${DEBIAN_ARCH} \ - libaio-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} \ - libdrm-dev:${DEBIAN_ARCH} - ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ AS=/usr/bin/${CROSS_TRIPLET}-as \ @@ -41,6 +18,8 @@ ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . 
/criu WORKDIR /criu +RUN contrib/dependencies/apt-cross-packages.sh + # amdgpu_plugin with armv7 is not supported RUN make mrproper && date && \ make -j $(nproc) && \ diff --git a/scripts/build/Dockerfile.tmpl b/scripts/build/Dockerfile.tmpl index 3d6de1044..498b99be9 100644 --- a/scripts/build/Dockerfile.tmpl +++ b/scripts/build/Dockerfile.tmpl @@ -1,40 +1,12 @@ ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -# On Ubuntu, kernel modules such as ip_tables and xt_mark may not be loaded by default -# We need to install kmod to enable iptables to load these modules for us. -RUN apt-install \ - libnet-dev \ - libnl-route-3-dev \ - $CC \ - bsdmainutils \ - build-essential \ - git-core \ - iptables \ - libaio-dev \ - libbsd-dev \ - libcap-dev \ - libgnutls28-dev \ - libgnutls30 \ - libnftables-dev \ - libnl-3-dev \ - libprotobuf-c-dev \ - libprotobuf-dev \ - libselinux-dev \ - iproute2 \ - kmod \ - pkg-config \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-minimal \ - python3-protobuf \ - uuid-dev \ - python3-yaml - COPY . /criu WORKDIR /criu +# On Ubuntu, kernel modules such as ip_tables and xt_mark may not be loaded by default +# We need to install kmod to enable iptables to load these modules for us. 
+RUN contrib/apt-install "$CC" && contrib/dependencies/apt-packages.sh + RUN git clean -dfx && date && \ # Check single object build make -j $(nproc) CC="$CC" criu/parasite-syscall.o && \ diff --git a/scripts/build/Dockerfile.unstable-cross.tmpl b/scripts/build/Dockerfile.unstable-cross.tmpl index 3504b0433..7edb289b6 100644 --- a/scripts/build/Dockerfile.unstable-cross.tmpl +++ b/scripts/build/Dockerfile.unstable-cross.tmpl @@ -1,29 +1,7 @@ -COPY scripts/ci/apt-install /bin/apt-install - # Add the cross compiler sources RUN echo "deb http://deb.debian.org/debian/ unstable main" >> /etc/apt/sources.list && \ dpkg --add-architecture ${DEBIAN_ARCH} -RUN apt-install \ - crossbuild-essential-${DEBIAN_ARCH} \ - libc6-dev-${DEBIAN_ARCH}-cross \ - libc6-${DEBIAN_ARCH}-cross \ - libbz2-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - ncurses-dev:${DEBIAN_ARCH} \ - libssl-dev:${DEBIAN_ARCH} \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf \ - libnl-3-dev:${DEBIAN_ARCH} \ - libprotobuf-dev:${DEBIAN_ARCH} \ - uuid-dev:${DEBIAN_ARCH} \ - libnet-dev:${DEBIAN_ARCH} \ - libprotobuf-c-dev:${DEBIAN_ARCH} \ - libcap-dev:${DEBIAN_ARCH} \ - libaio-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} - ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ AS=/usr/bin/${CROSS_TRIPLET}-as \ @@ -40,4 +18,6 @@ ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . 
/criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) zdtm && date +RUN contrib/dependencies/apt-cross-packages.sh + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.x86_64.hdr b/scripts/build/Dockerfile.x86_64.hdr index 566b4c916..a666f6c26 100644 --- a/scripts/build/Dockerfile.x86_64.hdr +++ b/scripts/build/Dockerfile.x86_64.hdr @@ -1,5 +1,5 @@ FROM ubuntu:24.04 -COPY scripts/ci/apt-install /bin/apt-install +COPY contrib/apt-install /bin/apt-install RUN apt-install gcc-multilib diff --git a/scripts/build/Makefile b/scripts/build/Makefile index 389315227..a420cea94 100644 --- a/scripts/build/Makefile +++ b/scripts/build/Makefile @@ -1,4 +1,4 @@ -ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf centos8 +ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross riscv64-stable-cross UNSTABLE_CROSS_ARCHES := armv7-unstable-cross aarch64-unstable-cross ppc64-unstable-cross mips64el-unstable-cross NON_CLANG := $(UNSTABLE_CROSS_ARCHES) $(STABLE_CROSS_ARCHES) diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 9dc0190b3..ed30e4268 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -11,7 +11,7 @@ ifdef CLANG target-suffix = -clang endif -TARGETS := alpine fedora-rawhide centos8 archlinux +TARGETS := alpine fedora-rawhide archlinux ZDTM_OPTS := UNAME := $(shell uname -m) export UNAME diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index ae7f52454..bc5a74667 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -7,7 +7,7 @@ set -x -e -o pipefail # https://github.com/moby/moby/issues/50750 for details on the bug. 
export DEBIAN_FRONTEND=noninteractive apt remove -y docker-ce docker-ce-cli -./apt-install -y ca-certificates curl +../../contrib/apt-install -y ca-certificates curl install -m 0755 -d /etc/apt/keyrings curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc chmod a+r /etc/apt/keyrings/docker.asc @@ -18,7 +18,7 @@ echo \ apt update -y apt-cache madison docker-ce | awk '{ print $3 }' verstr="$(apt-cache madison docker-ce | awk '{ print $3 }' | sort | grep -v ':28\.'| tail -n 1)" -./apt-install -y "docker-ce=$verstr" "docker-ce-cli=$verstr" +../../contrib/apt-install -y "docker-ce=$verstr" "docker-ce-cli=$verstr" # docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json diff --git a/scripts/ci/java-test.sh b/scripts/ci/java-test.sh index 7cf704f07..a5b13a107 100755 --- a/scripts/ci/java-test.sh +++ b/scripts/ci/java-test.sh @@ -2,6 +2,8 @@ cd ../.. || exit 1 +sudo modprobe iptable_filter + failures="" docker build -t criu-openj9-ubuntu-test:latest -f scripts/build/Dockerfile.openj9-ubuntu . diff --git a/scripts/ci/loongarch64-qemu-test.sh b/scripts/ci/loongarch64-qemu-test.sh index d5646468e..7e00ab65a 100755 --- a/scripts/ci/loongarch64-qemu-test.sh +++ b/scripts/ci/loongarch64-qemu-test.sh @@ -4,7 +4,7 @@ set -o nounset set -o errexit set -x -./apt-install \ +../../contrib/apt-install \ apt-transport-https \ ca-certificates \ curl \ @@ -19,7 +19,7 @@ add-apt-repository \ $(lsb_release -cs) \ stable test" -./apt-install docker-ce +../../contrib/apt-install docker-ce # shellcheck source=/dev/null . 
/etc/lsb-release diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index f8f797c1e..ff75717c5 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -1,43 +1,22 @@ #!/bin/bash set -e -x +contrib/dependencies/dnf-packages.sh dnf install -y \ diffutils \ + e2fsprogs \ findutils \ gawk \ - gcc \ - git \ - gnutls-devel \ gzip \ - iproute \ - iptables \ - nftables \ - nftables-devel \ - libaio-devel \ - libasan \ - libcap-devel \ - libnet-devel \ - libnl3-devel \ - libbsd-devel \ + kmod \ libselinux-utils \ - make \ procps-ng \ - protobuf-c-devel \ - protobuf-devel \ - python3-PyYAML \ - python3-protobuf \ python3-pip \ - python3-importlib-metadata \ python-unversioned-command \ redhat-rpm-config \ sudo \ tar \ - which \ - e2fsprogs \ - rubygem-asciidoctor \ - libdrm-devel \ - libuuid-devel \ - kmod + which # /tmp is no longer 755 in the rawhide container image and breaks CI - fix it chmod 1777 /tmp diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 617f54fc6..9fbdd8e30 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -1,12 +1,7 @@ #!/bin/bash set -x -e -CI_PKGS=(protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev - libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev - libnl-3-dev gdb bash libnet-dev util-linux asciidoctor - libnl-route-3-dev time libbsd-dev python3-yaml uuid-dev - libperl-dev pkg-config python3-protobuf python3-pip - python3-importlib-metadata libdrm-dev) +CI_PKGS=() X86_64_PKGS=(gcc-multilib) @@ -60,7 +55,8 @@ ci_prep () { CI_PKGS+=("${X86_64_PKGS[@]}") fi - scripts/ci/apt-install "${CI_PKGS[@]}" + contrib/dependencies/apt-packages.sh + contrib/apt-install "${CI_PKGS[@]}" chmod a+x "$HOME" } @@ -187,7 +183,7 @@ if [ "${COMPAT_TEST}x" = "yx" ] ; then done apt-get remove "${INCOMPATIBLE_LIBS[@]}" dpkg --add-architecture i386 - scripts/ci/apt-install "${IA32_PKGS[@]}" + contrib/apt-install 
"${IA32_PKGS[@]}" mkdir -p /usr/lib/x86_64-linux-gnu/ mv "$REFUGE"/* /usr/lib/x86_64-linux-gnu/ fi diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index c222e30e0..f69b11352 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -22,9 +22,8 @@ setup() { wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb - ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu-system \ - ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base \ - openssh-client + ../../contrib/apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu-system \ + ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base openssh-client systemctl restart libvirtd vagrant plugin install vagrant-libvirt vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} @@ -41,16 +40,13 @@ setup() { vagrant up --provider=libvirt --no-tty mkdir -p /root/.ssh vagrant ssh-config >> /root/.ssh/config - ssh default sudo dnf upgrade -y - ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ - libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ - protobuf-devel python3-protobuf python3-importlib-metadata \ - rubygem-asciidoctor iptables libselinux-devel libbpf-devel python3-yaml libuuid-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd ssh default 'sudo mkdir -p --mode=777 /vagrant && mv $HOME/criu.tar /vagrant && cd /vagrant && tar xf criu.tar' + ssh default sudo dnf upgrade -y + ssh default sudo /vagrant/criu/contrib/dependencies/dnf-packages.sh ssh default cat /proc/cmdline } diff --git a/scripts/install-debian-pkgs.sh b/scripts/install-debian-pkgs.sh deleted file mode 100755 index 8be49c787..000000000 --- 
a/scripts/install-debian-pkgs.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -# Install required packages for development environment in Debian Distro - -REQ_PKGS=${REQ_PKGS:=contrib/debian/dev-packages.lst} - -help_msg="Install required packages for development environment in Debian Distro -Usage: - scripts/install-debian-pkgs.sh" - -function print_help() -{ - exec echo -e "$help_msg" -} - -function process() -{ - sudo apt-get update - sudo apt-get install -yq "$( sed 's/\#.*$//' "${REQ_PKGS}" )" -} - -if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then - print_help -else - process -fi From b25ff1d3363ce9ccfc0854009ac0f96431439848 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 26 Sep 2025 16:54:49 +0100 Subject: [PATCH 668/775] Remove travis-ci leftovers Travis CI stopped providing CI minutes for open-source projects some time ago and we have migrated to GitHub actions. Signed-off-by: Radostin Stoyanov --- .travis.yml | 35 ----------------------------------- CONTRIBUTING.md | 7 ------- Makefile | 2 +- Makefile.compel | 4 ++-- scripts/ci/Makefile | 4 ++-- scripts/ci/run-ci-tests.sh | 16 +++++++--------- scripts/ci/vagrant.sh | 7 +------ test/inhfd/memfd.py.checkskip | 2 +- test/zdtm/Makefile.inc | 2 +- 9 files changed, 15 insertions(+), 64 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 94841b3f3..000000000 --- a/.travis.yml +++ /dev/null @@ -1,35 +0,0 @@ -language: c -os: linux -dist: bionic -services: - - docker -jobs: - include: - - os: linux - arch: ppc64le - env: TR_ARCH=local - dist: bionic - - os: linux - arch: ppc64le - env: TR_ARCH=local CLANG=1 - dist: bionic - - os: linux - arch: s390x - env: TR_ARCH=local - dist: bionic - - os: linux - arch: arm64-graviton2 - env: TR_ARCH=local RUN_TESTS=1 - dist: focal - group: edge - virt: vm - - os: linux - arch: arm64-graviton2 - env: TR_ARCH=local CLANG=1 RUN_TESTS=1 - group: edge - virt: vm - dist: bionic -script: - - sudo make -C 
scripts/ci $TR_ARCH -after_success: - - make -C scripts/ci after_success diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3ad4aa101..2d1dc8227 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -158,11 +158,6 @@ make test The command runs [ZDTM Test Suite](https://criu.org/ZDTM_Test_Suite). Check for any error messages produced by it. -In case you'd rather have someone else run the tests, you can use travis-ci for your -own GitHub fork of CRIU. It will check the compilation for various supported platforms, -as well as run most of the tests from the suite. See https://travis-ci.org/checkpoint-restore/criu -for more details. - ## Describe your changes Describe your problem. Whether your change is a one-line bug fix or @@ -420,5 +415,3 @@ sometimes a patch may fly around a week before it gets reviewed. Wiki article: [Continuous integration](https://criu.org/Continuous_integration) CRIU tests are run for each series sent to the mailing list. If you get a message from our patchwork that patches failed to pass the tests, you have to investigate what is wrong. - -We also recommend you to [enable Travis CI for your repo](https://criu.org/Continuous_integration#Enable_Travis_CI_for_your_repo) to check patches in your git branch, before sending them to the mailing list. diff --git a/Makefile b/Makefile index 3e5d62726..611bcdd5a 100644 --- a/Makefile +++ b/Makefile @@ -43,7 +43,7 @@ ifeq ($(ARCH),arm) endif ifeq ($(ARMV),8) - # Running 'setarch linux32 uname -m' returns armv8l on travis aarch64. + # Running 'setarch linux32 uname -m' returns armv8l on aarch64. # This tells CRIU to handle armv8l just as armv7hf. Right now this is # only used for compile testing. No further verification of armv8l exists. 
ARCHCFLAGS += -march=armv7-a diff --git a/Makefile.compel b/Makefile.compel index 764afadc8..a4209edc5 100644 --- a/Makefile.compel +++ b/Makefile.compel @@ -50,8 +50,8 @@ compel/plugins/%: $(compel-deps) .FORCE # # GNU make 4.x supports targets matching via wide -# match targeting, where GNU make 3.x series (used on -# Travis) is not, so we have to write them here explicitly. +# match targeting, where GNU make 3.x series is not, +# so we have to write them here explicitly. compel/plugins/std.lib.a: $(compel-deps) .FORCE $(Q) $(MAKE) $(build)=compel/plugins $@ diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index ed30e4268..bad8065f2 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -30,9 +30,9 @@ endif export CONTAINER_TERMINAL +# Here we assume that any CPU architecture besides x86_64 is running in containers +# that may not support running docker with '--privileged'. ifeq ($(UNAME),x86_64) - # On anything besides x86_64 Travis is running unprivileged LXD - # containers which do not support running docker with '--privileged'. CONTAINER_OPTS := --rm $(CONTAINER_TERMINAL) --privileged --userns=host --cgroupns=host -v /lib/modules:/lib/modules --tmpfs /run else CONTAINER_OPTS := --rm -v /lib/modules:/lib/modules --tmpfs /run diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 9fbdd8e30..7a8345b7c 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -11,13 +11,11 @@ IFS=" " read -r -a ZDTM_OPTS <<< "$ZDTM_OPTS" UNAME_M=$(uname -m) if [ "$UNAME_M" != "x86_64" ]; then - # For Travis only x86_64 seems to be baremetal. Other - # architectures are running in unprivileged LXD containers. - # That seems to block most of CRIU's interfaces. - - # But with the introduction of baremetal aarch64 systems in - # Travis (arch: arm64-graviton2) we can override this using - # an environment variable + # Some tests rely on kernel features that may not be availble + # when running in a container. 
Here we assume that x86_64 + # systems are baremetal, and skip the tests for all other + # CPU architectures. We can override this using the RUN_TESTS + # environment variable (e.g., for aarch64). [ -n "$RUN_TESTS" ] || SKIP_CI_TEST=1 fi @@ -31,7 +29,7 @@ ci_prep () { # not run anymore with 'sudo -u \#1000' if the UID does not exist. adduser -u 1000 --disabled-password --gecos "criutest" criutest || : - # This can fail on aarch64 travis + # This can fail on aarch64 service apport stop || : # Ubuntu has set up AppArmor in 24.04 so that it blocks use of user @@ -258,7 +256,7 @@ if [ -z "$SKIP_EXT_DEV_TEST" ]; then fi make -C test/others/make/ run CC="$CC" -if [ -n "$TRAVIS" ] || [ -n "$CIRCLECI" ]; then +if [ -n "$CIRCLECI" ]; then # GitHub Actions (and Cirrus CI) does not provide a real TTY and CRIU will fail with: # Error (criu/tty.c:1014): tty: Don't have tty to inherit session from, aborting make -C test/others/shell-job/ run diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index f69b11352..5f2de32b8 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -11,11 +11,6 @@ FEDORA_VERSION=42 FEDORA_BOX_VERSION=1.1.0 setup() { - if [ -n "$TRAVIS" ]; then - # Load the kvm modules for vagrant to use qemu - modprobe kvm kvm_intel - fi - # Tar up the git checkout to have vagrant rsync it to the VM tar cf /tmp/criu.tar -C ../../../ criu # Cirrus has problems with the following certificate. @@ -29,7 +24,7 @@ setup() { vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} # The default libvirt Vagrant VM uses 512MB. - # Travis VMs should have around 7.5GB. + # VMs in our CI typically have around 16GB. # Increasing it to 4GB should work. 
sed -i Vagrantfile -e 's,^end$, config.vm.provider :libvirt do |libvirt|'"\n"' libvirt.memory = 4096;end'"\n"'end,g' # Sync /tmp/criu.tar into the VM diff --git a/test/inhfd/memfd.py.checkskip b/test/inhfd/memfd.py.checkskip index 27e2b7b15..32c57d929 100755 --- a/test/inhfd/memfd.py.checkskip +++ b/test/inhfd/memfd.py.checkskip @@ -3,5 +3,5 @@ import ctypes libc = ctypes.CDLL(None) -# libc may not have memfd_create (e.g., centos on travis) +# libc may not have memfd_create (e.g., centos) libc.memfd_create("test".encode('utf8'), 0) diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index c19888da3..3b349ed4d 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -27,7 +27,7 @@ ifeq ($(ARCH),arm) else ifeq ($(ARMV),7) ARCHCFLAGS += -march=armv7-a+fp else ifeq ($(ARMV),8) - # To build aarch32 on armv8 Travis-CI (see criu Makefile) + # To build aarch32 on armv8 (see criu Makefile) ARCHCFLAGS += -march=armv7-a ARMV := 7 endif From 0a81dc8bbe9aa4acadc5a47f7a0e276940f9edb5 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Fri, 26 Sep 2025 23:38:08 +0900 Subject: [PATCH 669/775] ci/java: update base image from focal to jammy Ubuntu Focal Fossa (20.04) reached its end-of-life on 31 May 2025. So, move over to using Ubuntu Jammy (22.04) base images. Also, focal repos do not have libtracefs, which the uprobes zdtm test needs. Signed-off-by: Shashank Balaji --- scripts/build/Dockerfile.hotspot-ubuntu | 2 +- scripts/build/Dockerfile.openj9-ubuntu | 2 +- scripts/ci/run-ci-tests.sh | 9 ++++----- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu index 76aa571fa..a459e1ec7 100644 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -1,4 +1,4 @@ -FROM docker.io/library/eclipse-temurin:11-focal +FROM docker.io/library/eclipse-temurin:11-jammy ARG CC=gcc COPY . 
/criu diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 825495659..18664f100 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -1,4 +1,4 @@ -FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-focal +FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-jammy ARG CC=gcc RUN mkdir -p /etc/criu && echo 'ghost-limit 16777216' > /etc/criu/default.conf diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 7a8345b7c..05a3b71e8 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -11,11 +11,10 @@ IFS=" " read -r -a ZDTM_OPTS <<< "$ZDTM_OPTS" UNAME_M=$(uname -m) if [ "$UNAME_M" != "x86_64" ]; then - # Some tests rely on kernel features that may not be availble - # when running in a container. Here we assume that x86_64 - # systems are baremetal, and skip the tests for all other - # CPU architectures. We can override this using the RUN_TESTS - # environment variable (e.g., for aarch64). + # Some tests rely on kernel features that may not be available + # when running in a container. Here we assume that x86_64 systems + # are baremetal, and skip the tests for all other CPU architectures. + # The RUN_TESTS environment variable can override this, e.g., for aarch64. [ -n "$RUN_TESTS" ] || SKIP_CI_TEST=1 fi From 76394e93a818af92a682946a0dcb97fdabb71099 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 27 Sep 2025 09:21:26 +0100 Subject: [PATCH 670/775] ci: consolidate aarch64 tests on GitHub runners Currently we run aarch64 tests on both Cirrus CI and GitHub runners. However, Cirrus CI fails with "Monthly compute limit exceeded!". This change removes the redundant tests to streamline our CI process. 
Signed-off-by: Radostin Stoyanov --- .cirrus.yml | 22 ---------------------- .github/workflows/aarch64-test.yaml | 6 ++++-- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 99dd70d63..72dbb3898 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -88,28 +88,6 @@ task: build_script: | make -C scripts/ci vagrant-fedora-non-root -task: - name: aarch64 build GCC (native) - arm_container: - image: docker.io/library/ubuntu:jammy - cpu: 4 - memory: 4G - script: uname -a - build_script: | - contrib/apt-install make - make -C scripts/ci local - -task: - name: aarch64 build CLANG (native) - arm_container: - image: docker.io/library/ubuntu:jammy - cpu: 4 - memory: 4G - script: uname -a - build_script: | - contrib/apt-install make - make -C scripts/ci local CLANG=1 - task: name: aarch64 Fedora Rawhide arm_container: diff --git a/.github/workflows/aarch64-test.yaml b/.github/workflows/aarch64-test.yaml index 32b19e176..ebbecadb3 100644 --- a/.github/workflows/aarch64-test.yaml +++ b/.github/workflows/aarch64-test.yaml @@ -9,14 +9,16 @@ concurrency: jobs: build: - runs-on: ubuntu-24.04-arm strategy: matrix: + os: [ubuntu-24.04-arm, ubuntu-22.04-arm] target: [GCC=1, CLANG=1] + runs-on: ${{ matrix.os }} + steps: - uses: actions/checkout@v4 - - name: Run Tests ${{ matrix.target }} + - name: Run Tests ${{ matrix.target }} on ${{ matrix.os }} # Following tests are failing on the VMs: # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) From 7a4b35a91032d36be3469ac4c142ea2d0c399313 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 1 Oct 2025 11:20:13 +0100 Subject: [PATCH 671/775] contributing: update links to mailing list Our previous mailing list had some technical issues and we created a new one that is hopefully more reliable. 
Signed-off-by: Radostin Stoyanov --- CONTRIBUTING.md | 12 ++++++------ crit/pyproject.toml | 2 +- crit/setup.cfg | 2 +- lib/pyproject.toml | 2 +- lib/setup.cfg | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2d1dc8227..03875639d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -8,8 +8,8 @@ Here are some useful hints to get involved. * We have both -- [very simple](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement) and [more sophisticated](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3A%22new+feature%22) coding tasks; * CRIU does need [extensive testing](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Atesting); * Documentation is always hard, we have [some information](https://criu.org/Category:Empty_articles) that is to be extracted from people's heads into wiki pages as well as [some texts](https://criu.org/Category:Editor_help_needed) that all need to be converted into useful articles; -* Feedback is expected on the GitHub issues page and on the [mailing list](https://lists.openvz.org/mailman/listinfo/criu); -* We accept GitHub pull requests and this is the preferred way to contribute to CRIU. If you prefer to send patches by email, you are welcome to send them to [CRIU development mailing list](https://lists.openvz.org/mailman/listinfo/criu). +* Feedback is expected on the GitHub issues page and on the [mailing list](https://lore.kernel.org/criu); +* We accept GitHub pull requests and this is the preferred way to contribute to CRIU. If you prefer to send patches by email, you are welcome to send them to [CRIU development mailing list](https://lore.kernel.org/criu). Below we describe in more detail recommend practices for CRIU development. 
* Spread the word about CRIU in [social networks](http://criu.org/Contacts); * If you're giving a talk about CRIU -- let us know, we'll mention it on the [wiki main page](https://criu.org/News/events); @@ -366,7 +366,7 @@ We recommend to post patches using `git send-email` ``` git send-email --cover-letter --no-chain-reply-to --annotate \ - --confirm=always --to=criu@openvz.org criu-dev + --confirm=always --to=criu@lists.linux.dev criu-dev ``` Note that the `git send-email` subcommand may not be in @@ -381,11 +381,11 @@ configure it to point it to your SMTP server with something like: git config --global sendemail.smtpServer stmp.example.net ``` -If you get tired of typing `--to=criu@openvz.org` all the time, +If you get tired of typing `--to=criu@lists.linux.dev` all the time, you can configure that to be automatically handled as well: ``` -git config sendemail.to criu@openvz.org +git config sendemail.to criu@lists.linux.dev ``` If a developer is sending another version of the patch (e.g. to address @@ -398,7 +398,7 @@ version if needed though). ### Mail patches -The patches should be sent to CRIU development mailing list, `criu AT openvz.org`. Note that you need to be subscribed first in order to post. The list web interface is available at https://openvz.org/mailman/listinfo/criu; you can also use standard mailman aliases to work with it. +The patches should be sent to CRIU development mailing list, `criu AT lists.linux.dev`. Note that you need to be subscribed first in order to post. The list web interface is available at https://lore.kernel.org/criu; you can also use standard mailman aliases to work with it. Please make sure the email client you're using doesn't screw your patch (line wrapping and so on). 
diff --git a/crit/pyproject.toml b/crit/pyproject.toml index 9089f0a39..f0b185eb7 100644 --- a/crit/pyproject.toml +++ b/crit/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" name = "crit" description = "CRiu Image Tool" authors = [ - {name = "CRIU team", email = "criu@openvz.org"}, + {name = "CRIU team", email = "criu@lists.linux.dev"}, ] license = {text = "GPLv2"} dynamic = ["version"] diff --git a/crit/setup.cfg b/crit/setup.cfg index fbc9a5143..37895923f 100644 --- a/crit/setup.cfg +++ b/crit/setup.cfg @@ -7,7 +7,7 @@ name = crit description = CRiu Image Tool author = CRIU team -author_email = criu@openvz.org +author_email = criu@lists.linux.dev license = GPLv2 version = attr: crit.__version__ diff --git a/lib/pyproject.toml b/lib/pyproject.toml index 8eb4b7084..c9e11551b 100644 --- a/lib/pyproject.toml +++ b/lib/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" name = "pycriu" description = "Python bindings for CRIU" authors = [ - {name = "CRIU team", email = "criu@openvz.org"}, + {name = "CRIU team", email = "criu@lists.linux.dev"}, ] license = {text = "GPLv2"} dynamic = ["version"] diff --git a/lib/setup.cfg b/lib/setup.cfg index 23ee48dd5..5d75719ca 100644 --- a/lib/setup.cfg +++ b/lib/setup.cfg @@ -7,7 +7,7 @@ name = pycriu description = Python bindings for CRIU author = CRIU team -author_email = criu@openvz.org +author_email = criu@lists.linux.dev license = GPLv2 version = attr: pycriu.__version__ From 3379c122e53524a47a31867fa96d5809253c7c4a Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 2 Oct 2025 08:39:30 +0100 Subject: [PATCH 672/775] page-xfer: fix incompatible pointer type on armv7 page_pipe_read() expects an 'unsigned long *', but pi->nr_pages is u64. On 32-bit platforms (e.g., armv7), passing &pi->nr_pages directly causes a compiler error. To fix this we introduce a temporary variable and copy the result back to pi->nr_pages. 
Fixes: #2756 Suggested-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- criu/page-xfer.c | 9 +++++++-- criu/pagemap.c | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/criu/page-xfer.c b/criu/page-xfer.c index e2913b924..463d4c506 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -1139,13 +1139,17 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi) { struct pstree_item *item; struct page_pipe *pp; - unsigned long len; + unsigned long len, nr_pages; int ret; item = pstree_item_by_virt(pi->dst_id); pp = dmpi(item)->mem_pp; - ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, &pi->nr_pages, PPB_LAZY); + /* page_pipe_read() uses 'unsigned long *' but pi->nr_pages is u64. + * Use a temporary variable to fix the incompatible pointer type + * on 32-bit platforms (e.g. armv7). */ + nr_pages = pi->nr_pages; + ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, &nr_pages, PPB_LAZY); if (ret) return ret; @@ -1154,6 +1158,7 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi) * .dst_id all remain intact. 
*/ + pi->nr_pages = nr_pages; if (pi->nr_pages == 0) { pr_debug("no iovs found, zero pages\n"); return -1; diff --git a/criu/pagemap.c b/criu/pagemap.c index b6ec3e333..6c9c4f7fe 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -171,7 +171,7 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, unsigned long int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%lx vs %lx:%lx\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%" PRIx64 " vs %lx:%lx\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } From 77553f07d3057dec544ed243cb6a20d933bdd7b5 Mon Sep 17 00:00:00 2001 From: Pepper Gray Date: Tue, 30 Sep 2025 22:58:29 +0200 Subject: [PATCH 673/775] make: prevent redefinition of 'struct sigcontext' Compilation on gentoo/arm64 (llvm+musl) fails with: In file included from compel/include/uapi/compel/asm/sigframe.h:4, from compel/plugins/std/infect.c:14: /usr/include/asm/sigcontext.h:28:8: error: redefinition of 'struct sigcontext' 28 | struct sigcontext { | ^~~~~~~~~~ In file included from criu/arch/aarch64/include/asm/restorer.h:4, from criu/arch/aarch64/crtools.c:11: /usr/include/asm/sigcontext.h:28:8: error: redefinition of 'struct sigcontext' 28 | struct sigcontext { | ^~~~~~~~~~ This is happening because and are mutually incompatible on Linux. To fix, use instead of for arm64 (like all others arches do). 
Fixes: #2766 Signed-off-by: Pepper Gray --- compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h | 3 ++- criu/arch/aarch64/include/asm/restorer.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h index 9152024fd..a3528500d 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h @@ -1,10 +1,11 @@ #ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ -#include +#include #include #include +#include /* Copied from the kernel header arch/arm64/include/uapi/asm/sigcontext.h */ diff --git a/criu/arch/aarch64/include/asm/restorer.h b/criu/arch/aarch64/include/asm/restorer.h index 64a9c24eb..2174df4fa 100644 --- a/criu/arch/aarch64/include/asm/restorer.h +++ b/criu/arch/aarch64/include/asm/restorer.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ -#include +#include #include #include "asm/types.h" From 790b3cf425400cdea794466f3f11c55ca42e8552 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 2 Oct 2025 12:03:57 -0700 Subject: [PATCH 674/775] ci: run alpine tests on arm64 These tests reveal the following build error: In file included from compel/include/uapi/compel/asm/sigframe.h:4, from compel/plugins/std/infect.c:14: /usr/include/asm/sigcontext.h:28:8: error: redefinition of 'struct sigcontext' 28 | struct sigcontext { | ^~~~~~~~~~ In file included from criu/arch/aarch64/include/asm/restorer.h:4, from criu/arch/aarch64/crtools.c:11: /usr/include/asm/sigcontext.h:28:8: error: redefinition of 'struct sigcontext' 28 | struct sigcontext { | ^~~~~~~~~~ Inspired by #2766 / #2767. 
Signed-off-by: Kir Kolyshkin Signed-off-by: Radostin Stoyanov --- .github/workflows/alpine-test.yml | 3 ++- contrib/dependencies/apk-packages.sh | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/alpine-test.yml b/.github/workflows/alpine-test.yml index 73530d79a..0f5c20f48 100644 --- a/.github/workflows/alpine-test.yml +++ b/.github/workflows/alpine-test.yml @@ -9,10 +9,11 @@ concurrency: jobs: build: - runs-on: ubuntu-22.04 strategy: matrix: + os: [ubuntu-22.04, ubuntu-22.04-arm] target: [GCC=1, CLANG=1] + runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 diff --git a/contrib/dependencies/apk-packages.sh b/contrib/dependencies/apk-packages.sh index 0084dea3a..d02704b15 100755 --- a/contrib/dependencies/apk-packages.sh +++ b/contrib/dependencies/apk-packages.sh @@ -22,6 +22,7 @@ apk add --no-cache \ libnl3-dev \ nftables \ nftables-dev \ + perl \ pkgconfig \ procps \ protobuf-c-compiler \ From 520266d8959b48bac345985874f6008f70755af2 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 3 Oct 2025 17:02:25 +0100 Subject: [PATCH 675/775] zdtm: add sk-unix-restore-fs-share test Add a ZDTM test case where CRIU uses a helper process to restore a non-empty process group with a terminated leader and a Unix domain socket. 
This reproduces a corner case in which mount namespace switching can fail during restore: https://github.com/checkpoint-restore/criu/issues/2687 Signed-off-by: Qiao Ma Signed-off-by: Radostin Stoyanov --- test/zdtm/static/Makefile | 1 + test/zdtm/static/sk-unix-restore-fs-share.c | 196 ++++++++++++++++++ .../zdtm/static/sk-unix-restore-fs-share.desc | 1 + 3 files changed, 198 insertions(+) create mode 100644 test/zdtm/static/sk-unix-restore-fs-share.c create mode 100644 test/zdtm/static/sk-unix-restore-fs-share.desc diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index e73f964be..6b262c443 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -382,6 +382,7 @@ TST_FILE = \ sk-unix-listen02 \ sk-unix-listen03 \ sk-unix-listen04 \ + sk-unix-restore-fs-share \ mnt_ext_file_bind_auto \ TST_DIR = \ diff --git a/test/zdtm/static/sk-unix-restore-fs-share.c b/test/zdtm/static/sk-unix-restore-fs-share.c new file mode 100644 index 000000000..d4f6dde75 --- /dev/null +++ b/test/zdtm/static/sk-unix-restore-fs-share.c @@ -0,0 +1,196 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test non-empty process group with terminated parent and unix socket"; +const char *test_author = "Qiao Ma "; + +char *filename; +TEST_OPTION(filename, string, "socket file name", 1); + +static int create_and_connect(void) +{ + struct sockaddr_un addr; + int client_fd; + + client_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (client_fd == -1) { + pr_perror("socket"); + return -1; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + if (snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", filename) >= (int)sizeof(addr.sun_path)) { + pr_err("Socket path too long\n"); + close(client_fd); + return -1; + } + + if (connect(client_fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + pr_perror("connect"); + close(client_fd); + return -1; + } + + return 0; +} + +static int 
child(int ready_fd) +{ + int listen_fd; + struct sockaddr_un addr; + int ret = EXIT_FAILURE; + + listen_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (listen_fd == -1) { + pr_perror("socket"); + return EXIT_FAILURE; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + if (strlen(filename) >= sizeof(addr.sun_path)) { + pr_err("Socket path too long\n"); + goto cleanup; + } + strncpy(addr.sun_path, filename, sizeof(addr.sun_path)); + + unlink(filename); /* Ignore error if file doesn't exist */ + + if (bind(listen_fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + pr_perror("bind"); + goto cleanup; + } + + if (listen(listen_fd, 5) == -1) { + pr_perror("listen"); + goto cleanup; + } + + if (create_and_connect() != 0) { + pr_err("Failed to create and connect\n"); + goto cleanup; + } + + /* Signal parent that socket is ready */ + if (write(ready_fd, "1", 1) != 1) { + pr_perror("write ready_fd"); + goto cleanup; + } + + /* Wait indefinitely */ + pause(); + + ret = EXIT_SUCCESS; +cleanup: + if (listen_fd != -1) + close(listen_fd); + unlink(filename); + + return ret; +} + +static int zombie_leader(int *cpid) +{ + char buf; + pid_t pid; + int pipefd[2]; + + if (pipe(pipefd) == -1) { + pr_perror("pipe"); + return EXIT_FAILURE; + } + + if (setpgid(0, 0) == -1) { + pr_perror("setpgid"); + return EXIT_FAILURE; + } + + pid = fork(); + if (pid < 0) { + pr_perror("Failed to fork child"); + return EXIT_FAILURE; + } + + if (pid == 0) { + /* Close read end */ + close(pipefd[0]); + exit(child(pipefd[1])); + } + + /* Close write end in parent */ + close(pipefd[1]); + + /* Wait for child to set up socket */ + if (read(pipefd[0], &buf, 1) != 1) { + pr_err("Failed to receive readiness signal from child\n"); + close(pipefd[0]); + return EXIT_FAILURE; + } + close(pipefd[0]); + + *cpid = pid; + return EXIT_SUCCESS; +} + +int main(int argc, char **argv) +{ + int ret = EXIT_FAILURE, status; + pid_t pid; + int *cpid; + + test_init(argc, argv); + + cpid = mmap(NULL, sizeof(int), 
PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (cpid == MAP_FAILED) { + pr_perror("mmap"); + return EXIT_FAILURE; + } + *cpid = 0; + + pid = fork(); + if (pid < 0) { + pr_perror("Failed to fork zombie"); + goto out; + } + + if (pid == 0) + exit(zombie_leader(cpid)); + + if (waitpid(pid, &status, 0) < 0) { + pr_perror("Failed to waitpid zombie"); + goto out; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != EXIT_SUCCESS) { + pr_err("Unexpected exit code: %d\n", WEXITSTATUS(status)); + goto out; + } + + if (!*cpid) { + pr_err("Don't know grandchild's pid\n"); + goto out; + } + + test_daemon(); + test_waitsig(); + + ret = EXIT_SUCCESS; + pass(); +out: + /* Clean up */ + if (*cpid) + kill(*cpid, SIGKILL); + + munmap(cpid, sizeof(int)); + + return ret; +} diff --git a/test/zdtm/static/sk-unix-restore-fs-share.desc b/test/zdtm/static/sk-unix-restore-fs-share.desc new file mode 100644 index 000000000..6c4afe5f0 --- /dev/null +++ b/test/zdtm/static/sk-unix-restore-fs-share.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns'} From 7bf402f6b3f117e9e464c39fcebf23b2a1af3644 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Mon, 18 Aug 2025 11:00:07 +0900 Subject: [PATCH 676/775] vma: introduce VMA_AREA_UPROBES flag This flag will be used for a "[uprobes]" vma. Signed-off-by: Shashank Balaji --- criu/include/image.h | 7 +++++++ criu/util.c | 1 + 2 files changed, 8 insertions(+) diff --git a/criu/include/image.h b/criu/include/image.h index 934f7d4e9..b5951d3d4 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -74,6 +74,12 @@ * about virtual address space ranges covered by * MADV_GUARD_INSTALL guards. These ones must be always at * the end of the vma_area_list and properly skipped a.e. + * - uprobes + * stands for a "[uprobes]" vma that's automatically mapped by + * the kernel when an active uprobe is hit. Contents of this vma + * are not dumped and neither are its madvise bits restored, + * because the kernel is in complete control of this vma. 
This is + * just used to track the existence of the uprobes vma. */ #define VMA_AREA_NONE (0 << 0) #define VMA_AREA_REGULAR (1 << 0) @@ -94,6 +100,7 @@ #define VMA_AREA_MEMFD (1 << 14) #define VMA_AREA_SHSTK (1 << 15) #define VMA_AREA_GUARD (1 << 16) +#define VMA_AREA_UPROBES (1 << 17) #define VMA_EXT_PLUGIN (1 << 27) #define VMA_CLOSE (1 << 28) diff --git a/criu/util.c b/criu/util.c index 58c18e20b..e2f80e4c6 100644 --- a/criu/util.c +++ b/criu/util.c @@ -195,6 +195,7 @@ static void vma_opt_str(const struct vma_area *v, char *opt) opt2s(VMA_ANON_PRIVATE, "ap"); opt2s(VMA_AREA_SYSVIPC, "sysv"); opt2s(VMA_AREA_SOCKET, "sk"); + opt2s(VMA_AREA_UPROBES, "uprobes"); #undef opt2s } From 0ff2e0a66e49c0ad0f8b8997ea773a0fc94b1223 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Mon, 18 Aug 2025 10:53:18 +0900 Subject: [PATCH 677/775] criu-coredump: add VMA_AREA_UPROBES flag Signed-off-by: Shashank Balaji --- coredump/criu_coredump/coredump.py | 1 + 1 file changed, 1 insertion(+) diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index c6a758c8a..9454d8f0b 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -55,6 +55,7 @@ status = { "VMA_AREA_VVAR": 1 << 12, "VMA_AREA_AIORING": 1 << 13, "VMA_AREA_MEMFD": 1 << 14, + "VMA_AREA_UPROBES": 1 << 17, "VMA_AREA_UNSUPP": 1 << 31 } From 74bf40feeb683a668a9f1b192da627bb2d16fa67 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Mon, 18 Aug 2025 10:54:28 +0900 Subject: [PATCH 678/775] crit: add VMA_AREA_UPROBES flag Signed-off-by: Shashank Balaji --- lib/pycriu/images/pb2dict.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index 6c4f68889..a35dd3c3f 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -105,6 +105,7 @@ mmap_status_map = [ ('VMA_AREA_AIORING', 1 << 13), ('VMA_AREA_MEMFD', 1 << 14), ('VMA_AREA_SHSTK', 1 << 15), + ('VMA_AREA_UPROBES', 1 << 17), 
('VMA_UNSUPP', 1 << 31), ] From bab72af9a5d5d9f715c351cdc5de51eabc3f7727 Mon Sep 17 00:00:00 2001 From: "Mahadasyam, Shashank (SGC)" Date: Mon, 18 Aug 2025 01:03:39 +0900 Subject: [PATCH 679/775] vma: introduce --allow-uprobes option This commit teaches criu to deal with processes which have a "[uprobes]" vma. This vma is mapped by the kernel when execution hits a uprobe location. This is done so as to execute the uprobe'd instruction out-of-line in the special vma. The uprobe'd location is replaced by a software breakpoint instruction, which is int3 on x86. When execution reaches that location, control is transferred over to the kernel, which then executes whatever handler code it has to, for the uprobe, and then executes the replaced instruction out-of-line in the special vma. For more details, refer to this commit: https://github.com/torvalds/linux/commit/d4b3b6384f98f8692ad0209891ccdbc7e78bbefe Reason for adding a new option ------------------------------ A new option is added instead of making the uprobes vma handling transparent to the user, so that when a dump is attempted on a process tree in which a process has the uprobes vma, criu will error, asking the user to use this option. This gives the user a chance to check what uprobes are attached to the processes being dumped, and try to ensure that those uprobes are active on restore as well. Again, the same reason for requiring this option on restore as well. Because if a process is dumped with an active uprobe, and on restore if the uprobe is not active, then if execution reaches the uprobe location, then the process will be sent a SIGTRAP, whose default behaviour will terminate and core dump the process. This is because the code pages are dumped with the software breakpoint instruction replacement at the uprobe'd locations. On restore, if execution reaches these locations and the kernel sees no associated active uprobes, then it'll send a SIGTRAP. 
So, using this option on dump and restore is an implicit guarantee on the user's behalf that they'll take care of the active uprobes and that any future SIGTRAPs because of this are not on us! :) Handling uprobes vma on dump ---------------------------- We don't need to store any information about the uprobes vma because it's completely handled by the kernel, transparent to userspace. So, when a uprobes vma is detected, we check if the --allow-uprobes option was specified or not. If so, then the allow_uprobes boolean in the inventory image is set (this is used on restore). The uprobes vma is skipped from being added to the vma list. Handling uprobes vma on restore ------------------------------- If allow_uprobes is set in the inventory image, then check if --allow-uprobes is specified or not. Restoring the vma is not required. Fixes: checkpoint-restore#1961 Signed-off-by: Shashank Balaji --- criu/config.c | 2 ++ criu/cr-dump.c | 4 ++++ criu/crtools.c | 2 ++ criu/image.c | 5 +++++ criu/include/cr_options.h | 1 + criu/include/image.h | 2 ++ criu/include/proc_parse.h | 2 ++ criu/proc_parse.c | 24 +++++++++++++++++++++++- images/inventory.proto | 1 + 9 files changed, 42 insertions(+), 1 deletion(-) diff --git a/criu/config.c b/criu/config.c index 1322a490a..d7ef3f8e8 100644 --- a/criu/config.c +++ b/criu/config.c @@ -18,6 +18,7 @@ #include "cr_options.h" #include "filesystems.h" #include "file-lock.h" +#include "image.h" #include "irmap.h" #include "mount.h" #include "mount-v2.h" @@ -703,6 +704,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode), BOOL_OPT("unprivileged", &opts.unprivileged), BOOL_OPT("ghost-fiemap", &opts.ghost_fiemap), + BOOL_OPT(OPT_ALLOW_UPROBES, &opts.allow_uprobes), {}, }; diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 10c485cbe..60b8e793c 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2319,6 +2319,10 @@ int cr_dump_tasks(pid_t pid) goto err; 
he.has_pre_dump_mode = false; + if (found_uprobes_vma()) { + he.has_allow_uprobes = true; + he.allow_uprobes = true; + } ret = write_img_inventory(&he); if (ret) diff --git a/criu/crtools.c b/criu/crtools.c index 509e73d74..203bded81 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -427,6 +427,8 @@ usage: " can be 'nftables' or 'iptables' (default).\n" " --unprivileged accept limitations when running as non-root\n" " consult documentation for further details\n" + " --allow-uprobes allow dump/restore with uprobes vma\n" + " consult documentation for further details\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" diff --git a/criu/image.c b/criu/image.c index f3747d6ff..c4f05e159 100644 --- a/criu/image.c +++ b/criu/image.c @@ -95,6 +95,11 @@ int check_img_inventory(bool restore) goto out_err; } + if (restore && he->allow_uprobes && !opts.allow_uprobes) { + pr_err("Dumped with --" OPT_ALLOW_UPROBES ". Need to set it on restore as well.\n"); + goto out_err; + } + if (restore) { if (!he->has_network_lock_method) { /* diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 4df8056b7..8c5707b41 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -196,6 +196,7 @@ struct cr_options { char *work_dir; int network_lock_method; int skip_file_rwx_check; + int allow_uprobes; /* * When we scheduler for removal some functionality we first diff --git a/criu/include/image.h b/criu/include/image.h index b5951d3d4..b06dbf706 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -114,6 +114,8 @@ #define CR_PARENT_LINK "parent" +#define OPT_ALLOW_UPROBES "allow-uprobes" + extern bool ns_per_id; extern bool img_common_magic; diff --git a/criu/include/proc_parse.h b/criu/include/proc_parse.h index 0bd79bf55..76d3242d2 100644 --- a/criu/include/proc_parse.h +++ b/criu/include/proc_parse.h @@ -105,4 +105,6 @@ extern int parse_uptime(uint64_t *upt); extern int 
parse_timens_offsets(struct timespec *boff, struct timespec *moff); +extern bool found_uprobes_vma(void); + #endif /* __CR_PROC_PARSE_H__ */ diff --git a/criu/proc_parse.c b/criu/proc_parse.c index d7eb25662..0d3b5b23f 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -74,6 +74,8 @@ struct buffer { static struct buffer __buf; static char *buf = __buf.buf; +/* only ever goes from false to true, if at all */ +static bool uprobes_vma_exists = false; /* * This is how AIO ring buffers look like in proc @@ -202,8 +204,11 @@ static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) * vmsplice doesn't work for VM_IO and VM_PFNMAP mappings, the * only exception is VVAR area that mapped by the kernel as * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP + * + * The uprobes vma is also mapped by the kernel with VM_IO, among other flags */ - if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED)) + if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED) + && !vma_area_is(vma_area, VMA_AREA_UPROBES)) vma_area->e->status |= VMA_UNSUPP; if (vma_area->e->madv) @@ -603,6 +608,14 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat goto err; } else if (!strcmp(file_path, "[heap]")) { vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP; + } else if (!strcmp(file_path, "[uprobes]")) { + uprobes_vma_exists = true; + if (!opts.allow_uprobes) { + pr_err("PID %d has uprobes vma. 
Consider using --" OPT_ALLOW_UPROBES ".\n", + pid); + goto err; + } + vma_area->e->status |= VMA_AREA_UPROBES; } else { vma_area->e->status = VMA_AREA_REGULAR; } @@ -739,6 +752,10 @@ static int vma_list_add(struct vma_area *vma_area, struct vm_area_list *vma_area */ pr_debug("Device file mapping %016" PRIx64 "-%016" PRIx64 " supported via device plugins\n", vma_area->e->start, vma_area->e->end); + } else if (vma_area->e->status & VMA_AREA_UPROBES) { + pr_debug("Skipping uprobes vma %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, + vma_area->e->end); + return 0; } else if (vma_area->e->status & VMA_UNSUPP) { pr_err("Unsupported mapping found %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, vma_area->e->end); @@ -2929,3 +2946,8 @@ int parse_uptime(uint64_t *upt) fclose(f); return 0; } + +bool found_uprobes_vma(void) +{ + return uprobes_vma_exists; +} diff --git a/images/inventory.proto b/images/inventory.proto index 1e18815bb..feed5b850 100644 --- a/images/inventory.proto +++ b/images/inventory.proto @@ -33,4 +33,5 @@ message inventory_entry { // This is currently used to delete the correct nftables // network locking rule. optional string dump_criu_run_id = 13; + optional bool allow_uprobes = 14; } From aeec40bf026df5218be0a8c381f33bc47de94203 Mon Sep 17 00:00:00 2001 From: "Mahadasyam, Shashank (SGC)" Date: Mon, 18 Aug 2025 01:04:10 +0900 Subject: [PATCH 680/775] docs: add documentation for --allow-uprobes Signed-off-by: Shashank Balaji --- Documentation/criu.txt | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 606935790..40ede84e2 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -465,6 +465,30 @@ The 'mode' may be one of the following: *skip*::: Don't lock the network. If *--tcp-close* is not used, the network must be locked externally to allow CRIU to dump TCP connections. +*--allow-uprobes*:: + Allow dumping when uprobes vma is present. 
When used on dump, this option is + required on restore as well. + + A uprobes vma is automatically created by the kernel once a uprobe is + triggered. This mapping is not removed even once the uprobe is deleted. So, + even if a process once had uprobes attached to it, and they're removed by + the time the process is dumped, this option is still required because criu + has no way of knowing whether there are active uprobes or not. + + When using this option on restore, make sure the uprobes (if any) active on + the dumped processes are still active. Otherwise, when execution reaches + a uprobe'd location in any of the restored processes, that process will be + sent a SIGTRAP. + + As an example, say a uprobe is set at function foo in the executable of the + process p_bar. Whenever execution in p_bar reaches function foo, the uprobe + is triggered. If the uprobe has been triggered at least once, then the kernel + will have created the uprobes vma. To dump p_bar, this option is + necessary. After dumping, say the uprobe is deleted. Now, on restoring with + this option, once execution reaches function foo, SIGTRAP will be sent to + the restored p_bar. Unless it has a signal handler installed for SIGTRAP, + it will be terminated and core dumped. + *restore* ~~~~~~~~~ Restores previously checkpointed processes. @@ -692,6 +716,10 @@ The 'mode' may be one of the following: *--skip-file-rwx-check*:: Skip checking file permissions (r/w/x for u/g/o) on restore. +*--allow-uprobes*:: + Required when dumped with this option. Refer to this option in the section + on dumping for more details. + *check* ~~~~~~~ Checks whether the kernel supports the features needed by *criu* to From f548d3af4a8fd2d71dcb0592dec7d66e54786f26 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Wed, 20 Aug 2025 22:05:03 +0900 Subject: [PATCH 681/775] crtools: remove "consult documentation" Most people know this, don't they? 
:) Suggested-by: Radostin Stoyanov Signed-off-by: Shashank Balaji --- criu/crtools.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/criu/crtools.c b/criu/crtools.c index 203bded81..e207133ac 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -426,9 +426,7 @@ usage: " --network-lock METHOD network locking/unlocking method; argument\n" " can be 'nftables' or 'iptables' (default).\n" " --unprivileged accept limitations when running as non-root\n" - " consult documentation for further details\n" " --allow-uprobes allow dump/restore with uprobes vma\n" - " consult documentation for further details\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" From dcce9bd0e2fb330cf2dc124c6ea2ee09af5133e1 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Fri, 22 Aug 2025 12:47:16 +0900 Subject: [PATCH 682/775] zdtm: add a test for --allow-uprobes option Program flow: - Parse the test's own executable to calculate the file offset of the uprobe target function symbol - Enable the uprobe at the target function - Call the target function to trigger the uprobe, and hence the uprobes vma creation - C/R - Call the target function again to check that no SIGTRAP is sent, since the uprobe is still active At least v1.7 of libtracefs is required because that's when tracefs_instance_reset was introduced. The uprobes API was introduced in v1.4, and the dynamic events API was introduced in v1.3. Ubuntu Focal doesn't have libtracefs. Jammy has v1.2.5, and Noble has v1.7. 
Signed-off-by: Shashank Balaji --- contrib/dependencies/apk-packages.sh | 3 + contrib/dependencies/apt-cross-packages.sh | 5 +- contrib/dependencies/apt-packages.sh | 3 + contrib/dependencies/dnf-packages.sh | 5 +- contrib/dependencies/pacman-packages.sh | 3 + test/zdtm/static/Makefile | 9 +- test/zdtm/static/uprobes.c | 295 +++++++++++++++++++++ test/zdtm/static/uprobes.desc | 6 + 8 files changed, 326 insertions(+), 3 deletions(-) create mode 100644 test/zdtm/static/uprobes.c create mode 100644 test/zdtm/static/uprobes.desc diff --git a/contrib/dependencies/apk-packages.sh b/contrib/dependencies/apk-packages.sh index d02704b15..c47fb9fe0 100755 --- a/contrib/dependencies/apk-packages.sh +++ b/contrib/dependencies/apk-packages.sh @@ -6,6 +6,7 @@ apk add --no-cache \ build-base \ coreutils \ e2fsprogs \ + elfutils-dev \ git \ gnutls-dev \ go \ @@ -20,6 +21,8 @@ apk add --no-cache \ libdrm-dev \ libnet-dev \ libnl3-dev \ + libtraceevent-dev \ + libtracefs-dev \ nftables \ nftables-dev \ perl \ diff --git a/contrib/dependencies/apt-cross-packages.sh b/contrib/dependencies/apt-cross-packages.sh index 588be40d0..30ce6874c 100755 --- a/contrib/dependencies/apt-cross-packages.sh +++ b/contrib/dependencies/apt-cross-packages.sh @@ -14,6 +14,8 @@ fi libc6-"${DEBIAN_ARCH}"-cross \ libc6-dev-"${DEBIAN_ARCH}"-cross \ libcap-dev:"${DEBIAN_ARCH}" \ + libdrm-dev:"${DEBIAN_ARCH}" \ + libelf-dev:"${DEBIAN_ARCH}" \ libexpat1-dev:"${DEBIAN_ARCH}" \ libgnutls28-dev:"${DEBIAN_ARCH}" \ libnet-dev:"${DEBIAN_ARCH}" \ @@ -23,9 +25,10 @@ fi libprotobuf-c-dev:"${DEBIAN_ARCH}" \ libprotobuf-dev:"${DEBIAN_ARCH}" \ libssl-dev:"${DEBIAN_ARCH}" \ + libtraceevent-dev:"${DEBIAN_ARCH}" \ + libtracefs-dev:"${DEBIAN_ARCH}" \ ncurses-dev:"${DEBIAN_ARCH}" \ uuid-dev:"${DEBIAN_ARCH}" \ - libdrm-dev:"${DEBIAN_ARCH}" \ build-essential \ pkg-config \ git \ diff --git a/contrib/dependencies/apt-packages.sh b/contrib/dependencies/apt-packages.sh index c60ba9041..1fd42d4e6 100755 --- 
a/contrib/dependencies/apt-packages.sh +++ b/contrib/dependencies/apt-packages.sh @@ -19,6 +19,7 @@ fi libbsd-dev \ libcap-dev \ libdrm-dev \ + libelf-dev \ libgnutls28-dev \ libgnutls30 \ libnet-dev \ @@ -28,6 +29,8 @@ fi libprotobuf-c-dev \ libprotobuf-dev \ libselinux-dev \ + libtraceevent-dev \ + libtracefs-dev \ pkg-config \ protobuf-c-compiler \ protobuf-compiler \ diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh index efbb659c5..00dc91a2e 100755 --- a/contrib/dependencies/dnf-packages.sh +++ b/contrib/dependencies/dnf-packages.sh @@ -3,6 +3,7 @@ dnf install -y \ asciidoc \ binutils \ + elfutils-libelf-devel \ gcc \ git \ glibc-devel \ @@ -18,6 +19,8 @@ dnf install -y \ libnet-devel \ libnl3-devel \ libselinux-devel \ + libtraceevent-devel \ + libtracefs-devel \ libuuid-devel \ make \ nftables \ @@ -27,9 +30,9 @@ dnf install -y \ protobuf-c-devel \ protobuf-compiler \ protobuf-devel \ - python-devel \ python3-importlib-metadata \ python3-protobuf \ python3-pyyaml \ + python-devel \ rubygem-asciidoctor \ xmlto diff --git a/contrib/dependencies/pacman-packages.sh b/contrib/dependencies/pacman-packages.sh index 5fe6995fb..260797606 100755 --- a/contrib/dependencies/pacman-packages.sh +++ b/contrib/dependencies/pacman-packages.sh @@ -15,8 +15,11 @@ pacman -Syu --noconfirm \ libbsd \ libcap \ libdrm \ + libelf \ libnet \ libnl \ + libtraceevent \ + libtracefs \ nftables \ pkg-config \ protobuf \ diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 6b262c443..ea901a805 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -290,6 +290,7 @@ TST_NOFILE := \ PKG_CONFIG ?= pkg-config pkg-config-check = $(shell sh -c '$(PKG_CONFIG) $(1) && echo y') +pkg-config-atleast-version = $(shell sh -c '$(PKG_CONFIG) --atleast-version=$(2) $(1) && echo y') ifeq ($(call pkg-config-check,libbpf),y) TST_NOFILE += \ bpf_hash \ @@ -298,7 +299,10 @@ endif ifneq ($(ARCH),arm) ifneq ($(COMPAT_TEST),y) - 
TST_NOFILE += maps03 + TST_NOFILE += maps03 +ifeq ($(call pkg-config-atleast-version,libtracefs,1.7),y) + TST_NOFILE += uprobes +endif endif endif @@ -727,6 +731,9 @@ sk-unix-listen04: CFLAGS += -DSK_UNIX_LISTEN02 -DSK_UNIX_LISTEN03 cgroupv2_01: LDLIBS += -pthread +uprobes: CFLAGS += $(call pkg-cflags, libtracefs libtraceevent) +uprobes: LDLIBS += $(call pkg-libs, libtracefs libelf) + $(LIB): force $(Q) $(MAKE) -C $(LIBDIR) diff --git a/test/zdtm/static/uprobes.c b/test/zdtm/static/uprobes.c new file mode 100644 index 000000000..4164375b7 --- /dev/null +++ b/test/zdtm/static/uprobes.c @@ -0,0 +1,295 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test the --allow-uprobes option"; +const char *test_author = "Shashank Balaji "; + +#define UPROBE_GROUP_NAME "zdtm" +#define UPROBE_EVENT_NAME "uprobes_test" +#define UPROBED_FUNCTION uprobe_target + +/* + * A uprobe can be set at the start of a function, but not all instructions + * will trigger the creation of a uprobes vma. 
+ * + * Examples: + * - aarch64: if the function is a single `ret`, then no vma creation + * - x64: if the function is `nop; ret`, then no vma creation + * + * So to guarantee vma creation, create a volatile dummy variable (to prevent + * compiler optimization) and use it (to prevent "unused variable" warning) + */ +void UPROBED_FUNCTION(void) { + volatile int dummy = 0; + dummy += 1; +} +/* Calling via volatile function pointer ensures noinline at callsite */ +typedef void (*func_ptr)(void); +volatile func_ptr uprobe_target_alias = UPROBED_FUNCTION; + +struct uprobe_context { + struct tracefs_instance *instance; + struct tracefs_dynevent *uprobe; +}; + +volatile bool got_sigtrap = false; + +/* + * Returns the file offset of a symbol in the executable of this program + * Returns 0 on failure +*/ +uint64_t calc_sym_offset(const char *sym_name) +{ + GElf_Shdr section_header; + Elf_Scn *section = NULL; + Elf_Data *symtab_data; + uint64_t offset = 0; + char buf[PATH_MAX]; + GElf_Sym symbol; + ssize_t n_bytes; + int n_entries; + Elf *elf; + int fd; + int i; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_err("ELF version of libelf is lower than that of the program\n"); + return 0; + } + + n_bytes = readlink("/proc/self/exe", buf, sizeof(buf)); + if (n_bytes < 0) { + pr_perror("Failed to readlink /proc/self/exe"); + return 0; + } + buf[n_bytes] = '\0'; + + fd = open(buf, O_RDONLY); + if (fd < 0) { + pr_perror("Failed to open self-executable"); + return 0; + } + + elf = elf_begin(fd, ELF_C_READ, NULL); + if (!elf) { + pr_err("%s\n", elf_errmsg(elf_errno())); + goto out_fd; + } + + /* Look for the symbol table section and its header */ + while ((section = elf_nextscn(elf, section)) != NULL) { + gelf_getshdr(section, §ion_header); + if (section_header.sh_type == SHT_SYMTAB) + break; + } + if (!section) { + pr_err("Failed to find symbol table\n"); + goto out_elf; + } + symtab_data = elf_getdata(section, NULL); + n_entries = section_header.sh_size / 
section_header.sh_entsize; + + /* Look for a symbol with the required name */ + for (i = 0; i < n_entries; i++) { + gelf_getsym(symtab_data, i, &symbol); + /* Symbol table's sh_link is the index of the string table section header */ + if (!strcmp(sym_name, + elf_strptr(elf, section_header.sh_link, symbol.st_name))) + break; + } + if (i == n_entries) { + pr_err("Failed to find symbol \"%s\"\n", sym_name); + goto out_elf; + } + + /* Get the section the symbol belongs to (mostly .text) */ + section = elf_getscn(elf, symbol.st_shndx); + gelf_getshdr(section, §ion_header); + offset = symbol.st_value - section_header.sh_addr + section_header.sh_offset; + +out_elf: + elf_end(elf); +out_fd: + close(fd); + return offset; +} + +/* + * Set and enable a uprobe on the file at the given offset + * Returns struct uprobe_context with members set to NULL on failure +*/ +struct uprobe_context enable_uprobe(const char *file, uint64_t offset) +{ + struct tracefs_instance *trace_instance; + struct tracefs_dynevent *uprobe; + struct uprobe_context context = {}; + + trace_instance = tracefs_instance_create("zdtm_uprobes_test"); + if (!trace_instance) { + pr_perror("Failed to create tracefs instance"); + return context; + } + tracefs_instance_reset(trace_instance); + + uprobe = tracefs_uprobe_alloc(UPROBE_GROUP_NAME, UPROBE_EVENT_NAME, file, offset, NULL); + if (!uprobe) { + pr_perror("Failed to allocate uprobe"); + goto instance_destroy; + } + + if (tracefs_dynevent_create(uprobe)) { + pr_perror("Failed to create uprobe"); + goto uprobe_free; + } + + if (tracefs_event_enable(trace_instance, UPROBE_GROUP_NAME, UPROBE_EVENT_NAME)) { + pr_perror("Failed to enable uprobe"); + goto uprobe_destroy; + } + + context.instance = trace_instance; + context.uprobe = uprobe; + return context; + +uprobe_destroy: + tracefs_dynevent_destroy(uprobe, false); +uprobe_free: + tracefs_dynevent_free(uprobe); +instance_destroy: + tracefs_instance_destroy(trace_instance); + tracefs_instance_free(trace_instance); 
+ return context; +} + +void destroy_uprobe(struct uprobe_context context) +{ + tracefs_dynevent_destroy(context.uprobe, true); + tracefs_dynevent_free(context.uprobe); + tracefs_instance_destroy(context.instance); + tracefs_instance_free(context.instance); +} + +/* + * Check for the existence of the "[uprobes]" vma in /proc/self/maps + * Returns -1 on failure, 0 if not found, 1 if found +*/ +int uprobes_vma_exists(void) +{ + FILE *f; + char buf[LINE_MAX]; + int ret = 0; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + pr_perror("Failed to open /proc/self/maps"); + return -1; + } + + while (fgets(buf, sizeof(buf), f)) { + if (strstr(buf, "[uprobes]")) { + ret = 1; + break; + } + } + if (ret == 0 && !feof(f)) { + pr_err("Failed to finish reading /proc/self/maps\n"); + ret = -1; + } + + fclose(f); + return ret; +} + +/* + * SIGTRAP is sent if execution reaches a previously set uprobed location, and + * the corresponding uprobe is not active. We don't want this to happen on restore +*/ +void sigtrap_handler(int signo, siginfo_t *info, void* context) +{ + if (info->si_code == SI_KERNEL) { + got_sigtrap = true; + fail("SIGTRAP on attempting to call uprobed function"); + } +} + +int main(int argc, char **argv) +{ + struct uprobe_context context; + struct sigaction sa; + char buf[PATH_MAX]; + uint64_t offset; + int n_bytes; + int ret = 1; + + test_init(argc, argv); + + offset = calc_sym_offset(__stringify(UPROBED_FUNCTION)); + if (!offset) + return 1; + + n_bytes = readlink("/proc/self/exe", buf, sizeof(buf)); + if (n_bytes < 0) { + pr_perror("Failed to readlink /proc/self/exe"); + return 1; + } + buf[n_bytes] = '\0'; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = sigtrap_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGTRAP, &sa, NULL)) { + pr_perror("Failed to set SIGTRAP handler"); + return 1; + } + + context = enable_uprobe(buf, offset); + if (!context.instance) + return 1; + + /* + * Execution must reach the uprobed location at least once + * for the 
kernel to create the uprobes vma + */ + uprobe_target_alias(); + + switch (uprobes_vma_exists()) { + case -1: + goto out_uprobe; + break; + case 0: + pr_err("uprobes vma does not exist\n"); + goto out_uprobe; + break; + case 1: + test_msg("Found uprobes vma\n"); + break; + } + + test_daemon(); + test_waitsig(); + + /* + * Calling the uprobed function after restore should not cause + * a SIGTRAP, since the uprobe is still active + */ + uprobe_target_alias(); + if (!got_sigtrap) { + pass(); + ret = 0; + } + +out_uprobe: + destroy_uprobe(context); + return ret; +} diff --git a/test/zdtm/static/uprobes.desc b/test/zdtm/static/uprobes.desc new file mode 100644 index 000000000..6eab1f498 --- /dev/null +++ b/test/zdtm/static/uprobes.desc @@ -0,0 +1,6 @@ +{ + 'feature': 'cgroupns', + 'flags': 'suid nouser', + 'flavor': 'h', + 'opts': '--allow-uprobes' +} From c03c08d1bca96132a34833c0233ddd48b016f2d7 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 10 Sep 2025 10:50:46 +0100 Subject: [PATCH 683/775] cr-service: refactor rpc config parsing When an additional configuration file is specified via RPC, this file is parsed twice: first at an early stage to load options such as --log-file, --work-dir, and --images-dir; and again after all RPC options and configuration files have been evaluated. This allows users to overwrite options specified via RPC by the container runtime (e.g., --tcp-established). However, processing the RPC config file twice leads to silently duplicating the values of repeatable options such as `--action-script`. To address this problem, we adjust the order of options parsing so that the RPC config file is evaluated only once. This change should not introduce any functional changes. Note that this change does not affect the logging functionality, as early log messages are temporarily buffered and only written to the log file once it has been initialized (see commit 1ff2333 "Printout early log messages"). 
Fixes #2727 Suggested-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 299 +++++++++++++++++++++------------------------- 1 file changed, 138 insertions(+), 161 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index a1089ad5c..e6aac232e 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -312,156 +312,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) BUG_ON(st.st_ino == -1); service_sk_ino = st.st_ino; - /* - * Evaluate an additional configuration file if specified. - * This needs to happen twice, because it is needed early to detect - * things like work_dir, imgs_dir and logfile. The second parsing - * of the optional RPC configuration file happens at the end and - * overwrites all options set via RPC. - */ - if (req->config_file) { - char *tmp_output = opts.output; - char *tmp_work = opts.work_dir; - char *tmp_imgs = opts.imgs_dir; - - opts.output = NULL; - opts.work_dir = NULL; - opts.imgs_dir = NULL; - - rpc_cfg_file = req->config_file; - i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); - if (i) { - xfree(tmp_output); - xfree(tmp_work); - xfree(tmp_imgs); - goto err; - } - /* If this is non-NULL, the RPC configuration file had a value, use it.*/ - if (opts.output) - output_changed_by_rpc_conf = true; - /* If this is NULL, use the old value if it was set. */ - if (!opts.output && tmp_output) { - opts.output = tmp_output; - tmp_output = NULL; - } - - if (opts.work_dir) - work_changed_by_rpc_conf = true; - if (!opts.work_dir && tmp_work) { - opts.work_dir = tmp_work; - tmp_work = NULL; - } - - if (opts.imgs_dir) - imgs_changed_by_rpc_conf = true; - /* - * As the images directory is a required RPC setting, it is not - * necessary to use the value from other configuration files. - * Either it is set in the RPC configuration file or it is set - * via RPC. 
- */ - xfree(tmp_output); - xfree(tmp_work); - xfree(tmp_imgs); - } - - /* - * open images_dir - images_dir_fd is a required RPC parameter - * - * This assumes that if opts.imgs_dir is set we have a value - * from the configuration file parser. The test to see that - * imgs_changed_by_rpc_conf is true is used to make sure the value - * is from the RPC configuration file. - * The idea is that only the RPC configuration file is able to - * overwrite RPC settings: - * * apply_config(global_conf) - * * apply_config(user_conf) - * * apply_config(environment variable) - * * apply_rpc_options() - * * apply_config(rpc_conf) - */ - if (imgs_changed_by_rpc_conf) - strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); - else if (req->images_dir_fd != -1) - sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); - else if (req->images_dir) - strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); - else { - pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); - goto err; - } - - if (req->parent_img) - SET_CHAR_OPTS(img_parent, req->parent_img); - - /* - * Image streaming is not supported with CRIU's service feature as - * the streamer must be started for each dump/restore operation. - * It is unclear how to do that with RPC, so we punt for now. - * This explains why we provide the argument mode=-1 instead of - * O_RSTR or O_DUMP. - */ - if (open_image_dir(images_dir_path, -1) < 0) { - pr_perror("Can't open images directory"); - goto err; - } - - /* get full path to images_dir to use in process title */ - if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { - pr_perror("Can't readlink %s", images_dir_path); - goto err; - } - - /* chdir to work dir */ - if (work_changed_by_rpc_conf) - /* Use the value from the RPC configuration file first. */ - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else if (req->has_work_dir_fd) - /* Use the value set via RPC. 
*/ - sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); - else if (opts.work_dir) - /* Use the value from one of the other configuration files. */ - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else - /* Use the images directory a work directory. */ - strcpy(work_dir_path, images_dir_path); - - if (chdir(work_dir_path)) { - pr_perror("Can't chdir to work_dir"); - goto err; - } - - /* initiate log file in work dir */ - if (req->log_file && !output_changed_by_rpc_conf) { - /* - * If RPC sets a log file and if there nothing from the - * RPC configuration file, use the RPC value. - */ - if (strchr(req->log_file, '/')) { - pr_perror("No subdirs are allowed in log_file name"); - goto err; - } - - SET_CHAR_OPTS(output, req->log_file); - } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { - xfree(opts.output); - opts.output = NULL; - } else if (!opts.output) { - SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); - } - - /* This is needed later to correctly set the log_level */ - opts.log_level = req->log_level; - log_set_loglevel(req->log_level); - if (log_init(opts.output) == -1) { - pr_perror("Can't initiate log"); - goto err; - } - - if (req->config_file) { - pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); - } - if (req->has_unprivileged) opts.unprivileged = req->unprivileged; @@ -753,14 +603,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->empty_ns & ~(CLONE_NEWNET)) goto err; } - - if (req->n_irmap_scan_paths) { - for (i = 0; i < req->n_irmap_scan_paths; i++) { - if (irmap_scan_path_add(req->irmap_scan_paths[i])) - goto err; - } - } - if (req->has_status_fd) { pr_warn("status_fd is obsoleted; use status-ready notification instead\n"); @@ -781,13 +623,148 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_display_stats) opts.display_stats = req->display_stats; - /* Evaluate additional configuration file a second time to overwrite - * all 
RPC settings. */ + /* Evaluate additional configuration file (e.g., runc.conf) to overwrite all RPC settings. */ if (req->config_file) { + char *tmp_output = opts.output; + char *tmp_work = opts.work_dir; + + opts.output = NULL; + opts.work_dir = NULL; + + /* + * As the images directory is a required RPC setting, it is not + * necessary to use the value from other configuration files. + * Either it is set in the RPC configuration file or it is set + * via RPC. + */ + xfree(opts.imgs_dir); + opts.imgs_dir = NULL; + + pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); + rpc_cfg_file = req->config_file; i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); - if (i) + if (i) { + xfree(tmp_output); + xfree(tmp_work); goto err; + } + + /* If opts.{output,work_dir} is non-NULL, the RPC configuration file had a value, use it.*/ + /* If opts.{output,work_dir} is NULL, use the old value if it was set. */ + if (opts.output) { + output_changed_by_rpc_conf = true; + } else { + opts.output = tmp_output; + tmp_output = NULL; + } + + if (opts.work_dir) { + work_changed_by_rpc_conf = true; + } else { + opts.work_dir = tmp_work; + tmp_work = NULL; + } + + if (opts.imgs_dir) + imgs_changed_by_rpc_conf = true; + + xfree(tmp_output); + xfree(tmp_work); + } + + /* + * open images_dir - images_dir_fd is a required RPC parameter + * + * This assumes that if opts.imgs_dir is set we have a value + * from the configuration file parser. The test to see that + * imgs_changed_by_rpc_conf is true is used to make sure the value + * is from the RPC configuration file. 
The idea is that only the + * RPC configuration file is able to overwrite RPC settings: + * * apply_config(global_conf) + * * apply_config(user_conf) + * * apply_config(environment variable) + * * apply_rpc_options() + * * apply_config(rpc_conf) + */ + if (imgs_changed_by_rpc_conf) { + strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); + } else if (req->images_dir_fd != -1) { + sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); + } else if (req->images_dir) { + strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); + } else { + pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); + goto err; + } + + if (req->parent_img) + SET_CHAR_OPTS(img_parent, req->parent_img); + + /* + * Image streaming is not supported with CRIU's service feature as + * the streamer must be started for each dump/restore operation. + * It is unclear how to do that with RPC, so we punt for now. + * This explains why we provide the argument mode=-1 instead of + * O_RSTR or O_DUMP. 
+ */ + if (open_image_dir(images_dir_path, -1) < 0) { + pr_perror("Can't open images directory"); + goto err; + } + + /* get full path to images_dir to use in process title */ + if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { + pr_perror("Can't readlink %s", images_dir_path); + goto err; + } + + if (work_changed_by_rpc_conf) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else if (req->has_work_dir_fd) + sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); + else if (opts.work_dir) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else + strcpy(work_dir_path, images_dir_path); + + if (chdir(work_dir_path)) { + pr_perror("Can't chdir to work_dir"); + goto err; + } + + if (req->n_irmap_scan_paths) { + for (i = 0; i < req->n_irmap_scan_paths; i++) { + if (irmap_scan_path_add(req->irmap_scan_paths[i])) + goto err; + } + } + + /* initiate log file in work dir */ + if (req->log_file && !output_changed_by_rpc_conf) { + /* + * If RPC sets a log file and if there nothing from the + * RPC configuration file, use the RPC value. 
+ */ + if (strchr(req->log_file, '/')) { + pr_perror("No subdirs are allowed in log_file name"); + goto err; + } + + SET_CHAR_OPTS(output, req->log_file); + } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { + xfree(opts.output); + opts.output = NULL; + } else if (!opts.output) { + SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); + } + + /* This is needed later to correctly set the log_level */ + opts.log_level = req->log_level; + log_set_loglevel(req->log_level); + if (log_init(opts.output) == -1) { + pr_perror("Can't initiate log"); + goto err; } if (req->mntns_compat_mode) From 9d072222ef7a895c644ffe5be30ed4821dc9e30c Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 3 Sep 2025 18:29:34 +0100 Subject: [PATCH 684/775] test/others/rpc: parse action-script via config Extend the test for overwriting config options via RPC with repeatable option (--action-script) and verify that the value will not be silently duplicated. Signed-off-by: Radostin Stoyanov --- test/others/rpc/Makefile | 1 + test/others/rpc/action-script.sh | 17 +++++++++++++++++ test/others/rpc/config_file.py | 11 +++++++++++ 3 files changed, 29 insertions(+) create mode 100755 test/others/rpc/action-script.sh diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index 384eb0539..c0e56d528 100644 --- a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -12,6 +12,7 @@ run: all chmod a+rwx build chmod a+rwx build/{imgs_errno,imgs_ps,imgs_c,imgs_loop,imgs_py} rm -f build/status + rm -f build/_marker_* @# Create all log files to be accessible for anybody @# so that they can be displayed by any user. 
for i in imgs_errno/criu.log imgs_ps/page-server.log imgs_ps/dump.log \ diff --git a/test/others/rpc/action-script.sh b/test/others/rpc/action-script.sh new file mode 100755 index 000000000..991e315de --- /dev/null +++ b/test/others/rpc/action-script.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +MARKER_FILE="_marker_${CRTOOLS_SCRIPT_ACTION}" + +if [ -z "$CRTOOLS_SCRIPT_ACTION" ]; then + echo "Error: CRTOOLS_SCRIPT_ACTION is not set." + exit 2 +fi + +if [ ! -f "$MARKER_FILE" ]; then + touch "$MARKER_FILE" +else + echo "Error: Running the same action hook for the second time" + exit 1 +fi + +exit 0 diff --git a/test/others/rpc/config_file.py b/test/others/rpc/config_file.py index 6cffe270d..f5ec40818 100755 --- a/test/others/rpc/config_file.py +++ b/test/others/rpc/config_file.py @@ -13,6 +13,9 @@ from setup_swrk import setup_swrk log_file = 'config_file_test.log' does_not_exist = 'does-not.exist' +script_path = os.path.dirname(os.path.abspath(__file__)) +action_script_file = os.path.join(script_path, 'action-script.sh') + def setup_config_file(content): # Creating a temporary file which will be used as configuration file. @@ -156,6 +159,7 @@ def test_rpc_with_configuration_file_overwriting_rpc(): # file settings in the default configuration. 
log = does_not_exist content = 'log-file ' + log + '\n' + content += 'action-script ' + action_script_file + '\n' content += 'no-tcp-established\nno-shell-job' path = setup_config_file(content) # Only set the configuration file via RPC; @@ -180,11 +184,18 @@ args = vars(parser.parse_args()) cleanup_output(args['dir']) +print("*** Test broken config file ***") test_broken_configuration_file() cleanup_output(args['dir']) + +print("*** Test RPC without config file ***") test_rpc_without_configuration_file() cleanup_output(args['dir']) + +print("*** Test RPC with config file ***") test_rpc_with_configuration_file() cleanup_output(args['dir']) + +print("*** Test configuration file overwriting RPC ***") test_rpc_with_configuration_file_overwriting_rpc() cleanup_output(args['dir']) From bb9a7202a7a7965495456d3bd5f7aa07e9d06af3 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 3 Sep 2025 21:40:02 +0100 Subject: [PATCH 685/775] test/others/rpc: show logs on error Signed-off-by: Radostin Stoyanov --- test/others/rpc/config_file.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/test/others/rpc/config_file.py b/test/others/rpc/config_file.py index f5ec40818..c1a8276d8 100755 --- a/test/others/rpc/config_file.py +++ b/test/others/rpc/config_file.py @@ -92,29 +92,37 @@ def test_broken_configuration_file(): sys.exit(-1) -def search_in_log_file(log, message): - with open(os.path.join(args['dir'], log)) as f: +def search_in_log_file(log_path, message): + with open(log_path) as f: if message not in f.read(): - print( - 'FAIL: Missing the expected error message (%s) in the log file' - % message) + print('FAIL: Missing the expected error message (%s) in the log file' % message) sys.exit(-1) +def print_log_file(log_path): + print("\n--- Begin log file: %s ---" % log_path) + with open(log_path, 'r') as f: + print(f.read()) + print("--- End log file ---\n") + + def check_results(resp, log): # Check if the specified log file exists - if 
not os.path.isfile(os.path.join(args['dir'], log)): + log_path = os.path.join(args['dir'], log) + if not os.path.isfile(log_path): print('FAIL: Expected log file %s does not exist' % log) sys.exit(-1) # Dump should have failed with: 'The criu itself is within dumped tree' if resp.type != rpc.DUMP: print('FAIL: Unexpected msg type %r' % resp.type) + print_log_file(log_path) sys.exit(-1) if 'The criu itself is within dumped tree' not in resp.cr_errmsg: print('FAIL: Missing the expected error message in RPC response') + print_log_file(log_path) sys.exit(-1) # Look into the log file for the same message - search_in_log_file(log, 'The criu itself is within dumped tree') + search_in_log_file(log_path, 'The criu itself is within dumped tree') def test_rpc_without_configuration_file(): From 3365c7c02583b6e6cdab976d484b47b1fae5f19d Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Thu, 4 Sep 2025 21:35:37 +0200 Subject: [PATCH 686/775] restorer: shstk: add restorer shadow stack stubs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * shstk_restorer_stack_size() – restorer shadow stack size * shstk_set_restorer_stack() – set restorer shadow stack start Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- criu/include/restore.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/criu/include/restore.h b/criu/include/restore.h index 04d006505..2c4e4e267 100644 --- a/criu/include/restore.h +++ b/criu/include/restore.h @@ -9,6 +9,7 @@ extern int arch_set_thread_regs_nosigrt(struct pid *pid); struct task_restore_args; struct pstree_item; +struct rst_shstk_info; #ifndef arch_shstk_prepare static inline int arch_shstk_prepare(struct pstree_item *item, @@ -38,4 +39,18 @@ static inline int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *cor #define arch_shstk_trampoline arch_shstk_trampoline #endif +#ifndef shstk_restorer_stack_size +static always_inline 
long shstk_restorer_stack_size(void) +{ + return 0; +} +#endif + +#ifndef shstk_set_restorer_stack +static always_inline long shstk_set_restorer_stack(struct rst_shstk_info *info, void *ptr) +{ + return 0; +} +#endif + #endif From f29cb750dbf292249126402b5f4d40e03d6cefd7 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Thu, 4 Sep 2025 21:45:19 +0200 Subject: [PATCH 687/775] x86/criu: shstk restorer memory accounting functions * shstk_restorer_stack_size(): PAGE_SIZE * shstk_set_restorer_stack(): set restorer temporary shadow stack start Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- criu/arch/x86/include/asm/shstk.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h index 7814c351d..2b9a303b8 100644 --- a/criu/arch/x86/include/asm/shstk.h +++ b/criu/arch/x86/include/asm/shstk.h @@ -73,6 +73,17 @@ int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, int (*func)(void *arg), void *arg); #define arch_shstk_trampoline arch_shstk_trampoline +static always_inline long shstk_restorer_stack_size(void) +{ + return PAGE_SIZE; +} +#define shstk_restorer_stack_size shstk_restorer_stack_size +static always_inline void shstk_set_restorer_stack(struct rst_shstk_info *info, void *ptr) +{ + info->tmp_shstk = (unsigned long)ptr; +} +#define shstk_set_restorer_stack shstk_set_restorer_stack + #ifdef CR_NOGLIBC #include From b18c07d8a856bce56387e30c851858ee4745b5fa Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 17 Oct 2025 18:53:01 +0200 Subject: [PATCH 688/775] restorer: shstk: add shstk_min_mmap_addr() * default: return whatever is passed in, e.g. to be used as shstk_min_mmap_addr(kdat.mmap_min_addr) * x86: ignore def and return 4G On x86, CET shadow stack is required to be mapped above 4GiB. On the other hand, forcing 4GiB globally would break 32-bit restores.
Co-Authored-By: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- criu/arch/x86/include/asm/shstk.h | 6 ++++++ criu/cr-restore.c | 9 +++++---- criu/include/restore.h | 7 +++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h index 2b9a303b8..f62b8c3e9 100644 --- a/criu/arch/x86/include/asm/shstk.h +++ b/criu/arch/x86/include/asm/shstk.h @@ -84,6 +84,12 @@ static always_inline void shstk_set_restorer_stack(struct rst_shstk_info *info, } #define shstk_set_restorer_stack shstk_set_restorer_stack +static always_inline long shstk_min_mmap_addr(struct rst_shstk_info *info, unsigned long __maybe_unused def) +{ + return !(info->cet & ARCH_SHSTK_SHSTK) ? def : (4UL << 30); +} +#define shstk_min_mmap_addr shstk_min_mmap_addr + #ifdef CR_NOGLIBC #include diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 1c3b36451..9781dbfa0 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2431,16 +2431,15 @@ err: return ret; } -static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_head *self_vma_list, long vma_len) +static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_head *self_vma_list, long min_addr, long vma_len) { struct vma_area *t_vma, *s_vma; - long prev_vma_end = 0; + long prev_vma_end = min_addr; struct vma_area end_vma; VmaEntry end_e; end_vma.e = &end_e; end_e.start = end_e.end = kdat.task_size; - prev_vma_end = kdat.mmap_min_addr; s_vma = list_first_entry(self_vma_list, struct vma_area, list); t_vma = list_first_entry(tgt_vma_list, struct vma_area, list); @@ -3226,7 +3225,9 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns * or inited from scratch). 
*/ - mem = (void *)restorer_get_vma_hint(&vmas->h, &self_vmas.h, task_args->bootstrap_len); + mem = (void *)restorer_get_vma_hint(&vmas->h, &self_vmas.h, + shstk_min_mmap_addr(&task_args->shstk, kdat.mmap_min_addr), + task_args->bootstrap_len); if (mem == (void *)-1) { pr_err("No suitable area for task_restore bootstrap (%ldK)\n", task_args->bootstrap_len); goto err; diff --git a/criu/include/restore.h b/criu/include/restore.h index 2c4e4e267..189051826 100644 --- a/criu/include/restore.h +++ b/criu/include/restore.h @@ -53,4 +53,11 @@ static always_inline long shstk_set_restorer_stack(struct rst_shstk_info *info, } #endif +#ifndef shstk_min_mmap_addr +static always_inline long shstk_min_mmap_addr(struct rst_shstk_info *info, unsigned long def) +{ + return def; +} +#endif + #endif From 02462c19c443e18ee6cdd54d849086eb22815b7d Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Sat, 6 Sep 2025 17:25:06 +0200 Subject: [PATCH 689/775] restorer: shstk: allocate restorer shadow stack * reserve space for restorer shadow stack * set tmp_shstk at mem, advance mem by PAGE_SIZE * forget the extra PAGE_SIZE (shstk) for premapped VMAs Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin [ alex: small code cleanups ] Signed-off-by: Alexander Mikhalitsyn --- criu/arch/x86/shstk.c | 1 - criu/cr-restore.c | 6 +++++- criu/mem.c | 9 --------- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/criu/arch/x86/shstk.c b/criu/arch/x86/shstk.c index b752f114a..0810efac5 100644 --- a/criu/arch/x86/shstk.c +++ b/criu/arch/x86/shstk.c @@ -45,7 +45,6 @@ static int shstk_prepare_task(struct vm_area_list *vmas, shstk->vma_start = vma->e->start; shstk->vma_size = size; shstk->premmaped_addr = premmaped_addr; - shstk->tmp_shstk = premmaped_addr + size; break; } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9781dbfa0..057ec0e93 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -3195,7 +3195,7 @@ static int sigreturn_restore(pid_t pid, struct 
task_restore_args *task_args, uns rst_mem_size = rst_mem_lock(); memzone_size = round_up(sizeof(struct restore_mem_zone) * current->nr_threads, page_size()); - task_args->bootstrap_len = restorer_len + memzone_size + alen + rst_mem_size; + task_args->bootstrap_len = restorer_len + memzone_size + alen + rst_mem_size + shstk_restorer_stack_size(); BUG_ON(task_args->bootstrap_len & (PAGE_SIZE - 1)); pr_info("%d threads require %ldK of memory\n", current->nr_threads, KBYTES(task_args->bootstrap_len)); @@ -3466,6 +3466,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns * self-vmas are unmaped. */ mem += rst_mem_size; + + shstk_set_restorer_stack(&task_args->shstk, mem); + mem += shstk_restorer_stack_size(); + task_args->vdso_rt_parked_at = (unsigned long)mem; task_args->vdso_maps_rt = vdso_maps_rt; task_args->vdso_rt_size = vdso_rt_size; diff --git a/criu/mem.c b/criu/mem.c index f8c550842..9e8740c07 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -787,8 +787,6 @@ int prepare_mm_pid(struct pstree_item *i) ri->vmas.rst_priv_size += vma_area_len(vma); if (vma_has_guard_gap_hidden(vma)) ri->vmas.rst_priv_size += PAGE_SIZE; - if (vma_area_is(vma, VMA_AREA_SHSTK)) - ri->vmas.rst_priv_size += PAGE_SIZE; } pr_info("vma 0x%" PRIx64 " 0x%" PRIx64 "\n", vma->e->start, vma->e->end); @@ -931,13 +929,6 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void size = vma_entry_len(vma->e); - /* - * map an extra page for shadow stack VMAs, it will be used as a - * temporary shadow stack - */ - if (vma_area_is(vma, VMA_AREA_SHSTK)) - size += PAGE_SIZE; - if (!vma_inherited(vma)) { int flag = 0; /* From abf4a71d9945cb841fe8d5406cd32c3b46e9e2a0 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Sat, 6 Sep 2025 18:02:37 +0200 Subject: [PATCH 690/775] x86/criu: shstk: add shstk_vma_restore() 1. create shadow stack vma during vma_remap cycle 2. copy contents from a premapped non-shstk VMA into it 3. 
unmap premapped non-shstk VMA 4. Mark shstk VMA for remap into the final destination Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin Co-Authored-By: Alexander Mikhalitsyn [ alex: debugging, rework together with Andrei and code cleanup ] Signed-off-by: Alexander Mikhalitsyn --- criu/arch/x86/include/asm/shstk.h | 37 +++++++++++++++++++++++++++++++ criu/include/restorer.h | 7 ++++++ 2 files changed, 44 insertions(+) diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h index f62b8c3e9..da4fb80cd 100644 --- a/criu/arch/x86/include/asm/shstk.h +++ b/criu/arch/x86/include/asm/shstk.h @@ -163,6 +163,43 @@ static inline int shstk_finalize(void) return ret; } +/* + * Create shadow stack vma and restore its content from premmapped anonymous (non-shstk) vma + */ +static always_inline int shstk_vma_restore(VmaEntry *vma_entry) +{ + long shstk, i; + unsigned long *shstk_data = (void *)vma_premmaped_start(vma_entry); + unsigned long vma_size = vma_entry_len(vma_entry); + long ret; + + shstk = sys_map_shadow_stack(0, vma_size, SHADOW_STACK_SET_TOKEN); + if (shstk < 0) { + pr_err("Failed to map shadow stack: %ld\n", shstk); + return -1; + } + + /* restore shadow stack contents */ + for (i = 0; i < vma_size / 8; i++) + wrssq(shstk + i * 8, shstk_data[i]); + + ret = sys_munmap(shstk_data, vma_size); + if (ret < 0) { + pr_err("Failed to unmap premmaped shadow stack\n"); + return ret; + } + + /* + * From that point premapped vma is (shstk) and we need + * to mremap() it to the final location. Originally premapped + * (shstk_data) has been unmapped already. 
+ */ + vma_premmaped_start(vma_entry) = shstk; + + return 0; +} +#define shstk_vma_restore shstk_vma_restore + /* * Restore contents of the shadow stack and set shadow stack pointer */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 56bea0fcc..14c0a3768 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -357,4 +357,11 @@ static inline int arch_shstk_restore(struct rst_shstk_info *shstk) #define arch_shstk_restore arch_shstk_restore #endif +#ifndef shstk_vma_restore +static always_inline int shstk_vma_restore(VmaEntry *vma_entry) +{ + return -1; +} +#endif + #endif /* __CR_RESTORER_H__ */ From 6fd71b9ee9775f7b275051d0cd028397235f86e8 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Sat, 6 Sep 2025 18:13:37 +0200 Subject: [PATCH 691/775] x86/criu: shstk: restore SHSTK via premap loops * call shstk_vma_restore() for VMA_AREA_SHSTK in vma_remap() * delete map/copy/unmap from shstk_restore() and keep token setup + finalize * before the loop naturally stopped at cet->ssp-8, so a -8 nudge is required here Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin [ alex: small code cleanups ] Signed-off-by: Alexander Mikhalitsyn --- criu/arch/x86/include/asm/shstk.h | 26 ++------------------------ criu/pie/restorer.c | 31 +++++++++++++++++-------------- 2 files changed, 19 insertions(+), 38 deletions(-) diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h index da4fb80cd..d113fd8ab 100644 --- a/criu/arch/x86/include/asm/shstk.h +++ b/criu/arch/x86/include/asm/shstk.h @@ -205,28 +205,11 @@ static always_inline int shstk_vma_restore(VmaEntry *vma_entry) */ static always_inline int shstk_restore(struct rst_shstk_info *cet) { - unsigned long *shstk_data = (unsigned long *)cet->premmaped_addr; - unsigned long ssp = cet->vma_start + cet->vma_size - 8; - unsigned long shstk_top = cet->vma_size / 8 - 1; - unsigned long val; - long ret; + unsigned long ssp, val; if (!(cet->cet & ARCH_SHSTK_SHSTK)) 
return 0; - if (shstk_map(cet->vma_start, cet->vma_size)) - return -1; - - /* - * Switch shadow stack from temporary location to the actual task's - * shadow stack VMA - */ - shstk_switch_ssp(ssp); - - /* restore shadow stack contents */ - for (; ssp >= cet->ssp; ssp -= 8, shstk_top--) - wrssq(ssp, shstk_data[shstk_top]); - /* * Add tokens for sigreturn frame and for switch of the shadow stack. * The sigreturn token will be checked by the kernel during @@ -236,6 +219,7 @@ static always_inline int shstk_restore(struct rst_shstk_info *cet) */ /* token for sigreturn frame */ + ssp = cet->ssp - 8; val = ALIGN_DOWN(cet->ssp, 8) | SHSTK_DATA_BIT; wrssq(ssp, val); @@ -247,12 +231,6 @@ static always_inline int shstk_restore(struct rst_shstk_info *cet) /* reset shadow stack pointer to the proper location */ shstk_switch_ssp(ssp); - ret = sys_munmap(shstk_data, cet->vma_size + PAGE_SIZE); - if (ret < 0) { - pr_err("Failed to unmap premmaped shadow stack\n"); - return ret; - } - return shstk_finalize(); } #define arch_shstk_restore shstk_restore diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 394d3dea0..5c40b0e93 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1112,6 +1112,23 @@ static int vma_remap(VmaEntry *vma_entry, int uffd) pr_info("Remap %lx->%lx len %lx\n", src, dst, len); + /* + * SHSTK VMAs are a bit special, in fact we create shstk vma right in the + * shstk_vma_restore() and populate it with contents from a premapped VMA + * (which in turns is just a normal anonymous VMA!). Then, we munmap() this + * premapped VMA. After, we need to adjust vma_premmaped_start(vma_entry) + * to point to a created shstk vma and treat it as a premmaped one in vma_remap(). 
+ */ + if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) { + if (shstk_vma_restore(vma_entry)) { + pr_err("Unable to prepare shadow stack vma for remap %lx -> %lx\n", src, dst); + return -1; + } + + /* shstk_vma_restore() modifies vma premapped address */ + src = vma_premmaped_start(vma_entry); + } + if (src - dst < len) guard = dst; else if (dst - src < len) @@ -1811,13 +1828,6 @@ __visible long __export_restore_task(struct task_restore_args *args) if (vma_entry->start > vma_entry->shmid) break; - /* - * shadow stack VMAs cannot be remapped, they must be - * recreated with map_shadow_stack system call - */ - if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) - continue; - if (vma_remap(vma_entry, args->uffd)) goto core_restore_end; } @@ -1835,13 +1845,6 @@ __visible long __export_restore_task(struct task_restore_args *args) if (vma_entry->start < vma_entry->shmid) break; - /* - * shadow stack VMAs cannot be remapped, they must be - * recreated with map_shadow_stack system call - */ - if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) - continue; - if (vma_remap(vma_entry, args->uffd)) goto core_restore_end; } From 697c31abe442c3fe5e783994312ccfdbe5b4d265 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Sat, 6 Sep 2025 19:40:35 +0200 Subject: [PATCH 692/775] zdtm: shstk: add SHSTK_ENABLE test build option * add SHSTK_ENABLE=1 toggle * passes -mshstk to compiler and -z shstk to linker Example: $ make -C test/zdtm/static clean $ make -C test/zdtm/static V=1 SHSTK_ENABLE=1 env00 $ readelf --notes test/zdtm/static/env00 | grep SHSTK Properties: x86 feature: SHSTK Signed-off-by: Igor Svilenkov Bozic Co-Authored-By: Andrei Vagin Signed-off-by: Alexander Mikhalitsyn --- test/zdtm/Makefile.inc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index 3b349ed4d..465285f08 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -66,6 +66,11 @@ endif export PKG_CONFIG_PATH endif +ifeq ($(SHSTK_ENABLE),1) + CFLAGS += 
-mshstk + LDFLAGS += -Wl,-z,shstk +endif + define pkg-libs $(shell PKG_CONFIG_PATH="$(PKG_CONFIG_PATH)" $(PKG_CONFIG) --libs $(1)) endef From a5ae3c184be47ca76b3c09f47406bcc234480966 Mon Sep 17 00:00:00 2001 From: Andrii Herheliuk Date: Sat, 18 Oct 2025 06:39:17 +0100 Subject: [PATCH 693/775] pycriu: set licence to LGPLv2.1 We use LGPL-v2.1 license for the libcriu and pycriu as they are intended to be usable by both proprietary and open-source applications. Signed-off-by: Andrii Herheliuk Signed-off-by: Radostin Stoyanov --- lib/pyproject.toml | 2 +- lib/setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pyproject.toml b/lib/pyproject.toml index c9e11551b..63d9b5f47 100644 --- a/lib/pyproject.toml +++ b/lib/pyproject.toml @@ -8,7 +8,7 @@ description = "Python bindings for CRIU" authors = [ {name = "CRIU team", email = "criu@lists.linux.dev"}, ] -license = {text = "GPLv2"} +license = {text = "LGPLv2.1"} dynamic = ["version"] requires-python = ">=3.6" diff --git a/lib/setup.cfg b/lib/setup.cfg index 5d75719ca..902fed9ee 100644 --- a/lib/setup.cfg +++ b/lib/setup.cfg @@ -8,7 +8,7 @@ name = pycriu description = Python bindings for CRIU author = CRIU team author_email = criu@lists.linux.dev -license = GPLv2 +license = LGPLv2.1 version = attr: pycriu.__version__ [options] From 540c631dd006b071cc5f46968ec3a01757d7e66b Mon Sep 17 00:00:00 2001 From: Andrii Herheliuk Date: Fri, 17 Oct 2025 06:05:14 +0100 Subject: [PATCH 694/775] pycriu: add missing protobuf dependency pycriu depends on protobuf to function correctly. Currently, it raises an error if protobuf is not installed. Adding protobuf to the dependencies ensures it is available after installing pycriu. 
Signed-off-by: Andrii Herheliuk --- lib/pyproject.toml | 1 + lib/setup.cfg | 2 ++ 2 files changed, 3 insertions(+) diff --git a/lib/pyproject.toml b/lib/pyproject.toml index 63d9b5f47..ea9f88dcc 100644 --- a/lib/pyproject.toml +++ b/lib/pyproject.toml @@ -11,6 +11,7 @@ authors = [ license = {text = "LGPLv2.1"} dynamic = ["version"] requires-python = ">=3.6" +dependencies = ["protobuf"] [tool.setuptools] packages = ["pycriu", "pycriu.images"] diff --git a/lib/setup.cfg b/lib/setup.cfg index 902fed9ee..28c9e49c3 100644 --- a/lib/setup.cfg +++ b/lib/setup.cfg @@ -14,3 +14,5 @@ version = attr: pycriu.__version__ [options] packages = find: python_requires = >=3.6 +install_requires = + protobuf From d5c81f810816ae69d83d71ecd09c562f5bd50167 Mon Sep 17 00:00:00 2001 From: Andrii Herheliuk Date: Sat, 18 Oct 2025 04:00:08 +0100 Subject: [PATCH 695/775] pycriu: prevent always appending "Unknown" to error messages Regardless of the actual error message, "Unknown" was always appended to the end of the string, resulting in messages like: "DUMP failed: Error(3): No process with such pidUnknown". Fixed by changing standalone if statements to else-if blocks so "Unknown" is only added when no specific error condition matches. 
Signed-off-by: Andrii Herheliuk --- lib/pycriu/criu.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py index f3e018095..5bd7ffecd 100644 --- a/lib/pycriu/criu.py +++ b/lib/pycriu/criu.py @@ -181,15 +181,14 @@ class CRIUExceptionExternal(CRIUException): if self.errno == errno.EBADRQC: s += "Bad options" - if self.typ == rpc.DUMP: - if self.errno == errno.ESRCH: - s += "No process with such pid" + elif self.typ == rpc.DUMP and self.errno == errno.ESRCH: + s += "No process with such pid" - if self.typ == rpc.RESTORE: - if self.errno == errno.EEXIST: - s += "Process with requested pid already exists" + elif self.typ == rpc.RESTORE and self.errno == errno.EEXIST: + s += "Process with requested pid already exists" - s += "Unknown" + else: + s += "Unknown" return s From f824dc735bb905d2a3c7adc70a7abd68cdea8a99 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 Oct 2025 12:06:56 +0100 Subject: [PATCH 696/775] ci: consolidate action-script tests This patch consolidates the action-script tests into `test/others/action-script` to ensure all tests are executed consistently and reduce duplication. Since we had two tests that appear to do the same thing, we can remove the one that doesn't use zdtm.py. 
Signed-off-by: Radostin Stoyanov --- Makefile | 1 + test/jenkins/actions.sh | 8 --- test/others/action-script/.gitignore | 2 +- test/others/action-script/Makefile | 2 - test/others/action-script/action-script.sh | 2 - .../action-script}/check_actions.py | 0 test/others/action-script/run.sh | 59 ++----------------- .../{ => others/action-script}/show_action.sh | 3 +- 8 files changed, 9 insertions(+), 68 deletions(-) delete mode 100755 test/jenkins/actions.sh delete mode 100755 test/others/action-script/action-script.sh rename test/{ => others/action-script}/check_actions.py (100%) rename test/{ => others/action-script}/show_action.sh (66%) diff --git a/Makefile b/Makefile index 611bcdd5a..e6653bd6c 100644 --- a/Makefile +++ b/Makefile @@ -451,6 +451,7 @@ ruff: test/zdtm.py \ test/inhfd/*.py \ test/others/rpc/config_file.py \ + test/others/action-script/check_actions.py \ lib/pycriu/images/pb2dict.py \ lib/pycriu/images/images.py \ scripts/criu-ns \ diff --git a/test/jenkins/actions.sh b/test/jenkins/actions.sh deleted file mode 100755 index 801904500..000000000 --- a/test/jenkins/actions.sh +++ /dev/null @@ -1,8 +0,0 @@ -# Check how crit de/encodes images -set -e -source `dirname $0`/criu-lib.sh -# prep -rm -f actions_called.txt -./test/zdtm.py run -t zdtm/static/env00 --script "$(pwd)/test/show_action.sh" || fail -./test/check_actions.py || fail -exit 0 diff --git a/test/others/action-script/.gitignore b/test/others/action-script/.gitignore index c0b6a2490..ca9a0b541 100644 --- a/test/others/action-script/.gitignore +++ b/test/others/action-script/.gitignore @@ -1 +1 @@ -img-dir-* +actions_called.txt diff --git a/test/others/action-script/Makefile b/test/others/action-script/Makefile index f1ce191db..594edc070 100644 --- a/test/others/action-script/Makefile +++ b/test/others/action-script/Makefile @@ -1,5 +1,3 @@ run: - @make -C .. 
loop ./run.sh - .PHONY: run diff --git a/test/others/action-script/action-script.sh b/test/others/action-script/action-script.sh deleted file mode 100755 index aba8292c0..000000000 --- a/test/others/action-script/action-script.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -touch action-hook-"$CRTOOLS_SCRIPT_ACTION" diff --git a/test/check_actions.py b/test/others/action-script/check_actions.py similarity index 100% rename from test/check_actions.py rename to test/others/action-script/check_actions.py diff --git a/test/others/action-script/run.sh b/test/others/action-script/run.sh index a82fccf35..f18301502 100755 --- a/test/others/action-script/run.sh +++ b/test/others/action-script/run.sh @@ -1,60 +1,11 @@ #!/bin/bash -set -ebm +set -e -# shellcheck source=test/others/env.sh -source ../env.sh || exit 1 +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -SELFDIR="$(dirname "$(readlink -f "$0")")" -SCRIPT="$SELFDIR/action-script.sh" -IMGDIR="$SELFDIR/img-dir-$$" +rm -f "${SCRIPT_DIR}"/actions_called.txt +"${SCRIPT_DIR}"/../../zdtm.py run -t zdtm/static/env00 --script "$SCRIPT_DIR/show_action.sh" || exit 1 +"${SCRIPT_DIR}"/check_actions.py || exit 1 -rm -rf "$IMGDIR" -mkdir "$IMGDIR" - -trap "cleanup" QUIT TERM INT HUP EXIT - -# shellcheck disable=SC2317 -# https://github.com/koalaman/shellcheck/issues/2660 -function cleanup() -{ - if [[ -n "$PID" ]]; then - kill -9 "$PID" - fi -} - -PID=$(../loop) -if ! $CRIU dump -v4 -o dump.log -t "$PID" -D "$IMGDIR" --action-script "$SCRIPT"; then - echo "Failed to checkpoint process $PID" - cat dump.log - kill -9 "$PID" - exit 1 -fi - -if ! $CRIU restore -v4 -o restore.log -D "$IMGDIR" -d --pidfile test.pidfile --action-script "$SCRIPT"; then - echo "CRIU restore failed" - echo FAIL - exit 1 -fi - -PID=$(cat "$IMGDIR"/test.pidfile) - -found_missing_file=false -hooks=("pre-dump" "post-dump" "pre-restore" "pre-resume" "post-restore" "post-resume") - -for hook in "${hooks[@]}" -do - if [ ! 
-e "$IMGDIR/action-hook-$hook" ]; then - echo "ERROR: action-hook-$hook does not exist" - found_missing_file=true - fi -done - -if [ "$found_missing_file" = true ]; then - exit 1 -fi - -echo PASS - -rm -rf "$IMGDIR" exit 0 diff --git a/test/show_action.sh b/test/others/action-script/show_action.sh similarity index 66% rename from test/show_action.sh rename to test/others/action-script/show_action.sh index 86468b67a..afbfc3f27 100755 --- a/test/show_action.sh +++ b/test/others/action-script/show_action.sh @@ -1,3 +1,4 @@ #!/bin/bash + echo "${CRTOOLS_SCRIPT_ACTION} ${CRTOOLS_IMAGE_DIR} ${CRTOOLS_INIT_PID}" \ - >> "$(dirname $0)/actions_called.txt" + >> "$(dirname "$0")/actions_called.txt" From f74e68daf90aa2401024b1f106d84677e6354e47 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 Oct 2025 12:11:45 +0100 Subject: [PATCH 697/775] ci: verify call order of action-script hooks The existing test collects all action-script hooks triggered during `h`, `ns`, and `uns` runs with ZDTM into `actions_called.txt`, then verifies that each hook appears at least once. However, the test does not verify that hooks are invoked *exactly once* or in *correct order*. This change updates the test to run ZDTM only with ns flavour as this seems to cover all action-script hooks, and checks that all hooks are called correctly. 
Signed-off-by: Radostin Stoyanov --- test/others/action-script/check_actions.py | 65 +++++++++++++--------- test/others/action-script/run.sh | 2 +- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/test/others/action-script/check_actions.py b/test/others/action-script/check_actions.py index 84d738dbb..0140d8762 100755 --- a/test/others/action-script/check_actions.py +++ b/test/others/action-script/check_actions.py @@ -1,41 +1,54 @@ #!/usr/bin/env python3 -import sys import os +import sys + +EXPECTED_ACTIONS = [ + 'pre-dump', + 'network-lock', + 'post-dump', + 'pre-restore', + 'setup-namespaces', + 'post-setup-namespaces', + 'post-restore', + 'network-unlock', + 'pre-resume', + 'post-resume', +] -actions = set(['pre-dump', 'pre-restore', 'post-dump', 'setup-namespaces', \ - 'post-setup-namespaces', 'post-restore', 'post-resume', \ - 'network-lock', 'network-unlock' ]) errors = [] -af = os.path.dirname(os.path.abspath(__file__)) + '/actions_called.txt' +actions_called = [] +actions_called_file = os.path.join(os.path.dirname(__file__), 'actions_called.txt') -for act in open(af): - act = act.strip().split() - act.append('EMPTY') - act.append('EMPTY') +with open(actions_called_file) as f: + for index, line in enumerate(f): + parts = line.strip().split() + parts += ['EMPTY'] * (3 - len(parts)) + action_hook, image_dir, pid = parts - if act[0] == 'EMPTY': - raise Exception("Error in test, bogus actions line") + if action_hook == 'EMPTY': + raise ValueError("Error in test: bogus actions line") - if act[1] == 'EMPTY': - errors.append('Action %s misses CRTOOLS_IMAGE_DIR' % act[0]) + expected_action = EXPECTED_ACTIONS[index] if index < len(EXPECTED_ACTIONS) else None + if action_hook != expected_action: + raise ValueError(f"Invalid action: {action_hook} != {expected_action}") - if act[0] in ('post-dump', 'setup-namespaces', 'post-setup-namespaces', \ - 'post-restore', 'post-resume', 'network-lock', 'network-unlock'): - if act[2] == 'EMPTY': - 
errors.append('Action %s misses CRTOOLS_INIT_PID' % act[0]) - elif not act[2].isdigit() or int(act[2]) == 0: - errors.append('Action %s PID is not number (%s)' % - (act[0], act[2])) + if image_dir == 'EMPTY': + errors.append(f'Action {action_hook} misses CRTOOLS_IMAGE_DIR') - actions -= set([act[0]]) + if action_hook != 'pre-restore': + if pid == 'EMPTY': + errors.append(f'Action {action_hook} misses CRTOOLS_INIT_PID') + elif not pid.isdigit() or int(pid) == 0: + errors.append(f'Action {action_hook} PID is not a valid number ({pid})') -if actions: - errors.append('Not all actions called: %r' % actions) + actions_called.append(action_hook) + +if actions_called != EXPECTED_ACTIONS: + errors.append(f'Not all actions called: {actions_called!r}') if errors: - for x in errors: - print(x) + print('\n'.join(errors)) sys.exit(1) -print('PASS') +print('Check Actions PASS') diff --git a/test/others/action-script/run.sh b/test/others/action-script/run.sh index f18301502..574f6fc86 100755 --- a/test/others/action-script/run.sh +++ b/test/others/action-script/run.sh @@ -5,7 +5,7 @@ set -e SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" rm -f "${SCRIPT_DIR}"/actions_called.txt -"${SCRIPT_DIR}"/../../zdtm.py run -t zdtm/static/env00 --script "$SCRIPT_DIR/show_action.sh" || exit 1 +"${SCRIPT_DIR}"/../../zdtm.py run -t zdtm/static/env00 -f ns --script "$SCRIPT_DIR/show_action.sh" || exit 1 "${SCRIPT_DIR}"/check_actions.py || exit 1 exit 0 From d3dfb663b1022ec89431a0e61113f55a771bc73c Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 Oct 2025 10:51:46 +0100 Subject: [PATCH 698/775] make: don't install external dependencies Don't install external pip dependencies when running `make install`. As we are not really into developing a Python project, we should not install additional packages. CRIU does that nowhere else. 
Signed-off-by: Radostin Stoyanov --- Makefile.install | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Makefile.install b/Makefile.install index 455735f3b..70c607ec6 100644 --- a/Makefile.install +++ b/Makefile.install @@ -46,9 +46,13 @@ endif endif # Default flags for pip install: -# --upgrade: Upgrade crit/pycriu packages -# --ignore-installed: Ignore existing packages and reinstall them -PIPFLAGS ?= --upgrade --ignore-installed +# --ignore-installed: Overwrite already installed pycriu/crit packages +# --no-build-isolation: Use current Python environment to build pycriu/crit packages +# --no-deps: Don't install any dependencies +# --no-index: Don't use PyPI index to find packages +# --progress-bar: Cleaner output +# --upgrade: Treat the install as an upgrade when replacing the installed version +PIPFLAGS ?= --ignore-installed --no-build-isolation --no-deps --no-index --progress-bar off --upgrade export SKIP_PIP_INSTALL PIPFLAGS From 68601814747470bc0ef28b3ce42f5b8d61f230e8 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 Oct 2025 11:43:17 +0100 Subject: [PATCH 699/775] ci: add wheel and setuptools in dnf packages These dependencies are required for `pip install`.
Signed-off-by: Radostin Stoyanov --- contrib/dependencies/dnf-packages.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh index 00dc91a2e..793f267a5 100755 --- a/contrib/dependencies/dnf-packages.sh +++ b/contrib/dependencies/dnf-packages.sh @@ -30,9 +30,11 @@ dnf install -y \ protobuf-c-devel \ protobuf-compiler \ protobuf-devel \ + python-devel \ python3-importlib-metadata \ python3-protobuf \ python3-pyyaml \ - python-devel \ + python3-setuptools \ + python3-wheel \ rubygem-asciidoctor \ xmlto From afcfcd3bf68bb0e1c45c1951b0469fe9588512b4 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 21 Oct 2025 12:49:00 +0100 Subject: [PATCH 700/775] ci: add which dependency in dnf packages which is used in Makefiles to check for dependencies: Example: export USE_ASCIIDOCTOR ?= $(shell which asciidoctor 2>/dev/null) Signed-off-by: Radostin Stoyanov --- contrib/dependencies/dnf-packages.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh index 793f267a5..60f21db6d 100755 --- a/contrib/dependencies/dnf-packages.sh +++ b/contrib/dependencies/dnf-packages.sh @@ -37,4 +37,5 @@ dnf install -y \ python3-setuptools \ python3-wheel \ rubygem-asciidoctor \ + which \ xmlto From 07ad2473f27a2afd09e0379d18cf046782752d6c Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Sun, 26 Oct 2025 17:14:03 -0700 Subject: [PATCH 701/775] Use command -v instead of which Unlike "which", which is a separate executable not always installed by default, "command -v" is a shell built-in available at least for bash, dash, and busybox shell. Unlike "which", "command -v" is also easier to grep for, and it is already used in a few places here. Inspired by commit 57251d811. 
Signed-off-by: Kir Kolyshkin --- .github/workflows/lint.yml | 2 +- Makefile | 2 +- contrib/dependencies/dnf-packages.sh | 1 - contrib/docker_cr.sh | 4 ++-- scripts/ci/prepare-for-fedora-rawhide.sh | 3 +-- scripts/nmk/scripts/tools.mk | 4 ++-- 6 files changed, 7 insertions(+), 9 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 862d68245..f7da4f6f6 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -14,7 +14,7 @@ jobs: image: registry.fedoraproject.org/fedora:latest steps: - name: Install tools - run: sudo dnf -y install git make ruff xz clang-tools-extra which codespell git-clang-format ShellCheck + run: sudo dnf -y install git make ruff xz clang-tools-extra codespell git-clang-format ShellCheck - uses: actions/checkout@v4 diff --git a/Makefile b/Makefile index e6653bd6c..1824ea180 100644 --- a/Makefile +++ b/Makefile @@ -489,7 +489,7 @@ lint: ruff shellcheck codespell ! git --no-pager grep -E '\s+$$' \*.c \*.h .PHONY: lint ruff shellcheck codespell -codecov: SHELL := $(shell which bash) +codecov: SHELL := $(shell command -v bash) codecov: curl -Os https://uploader.codecov.io/latest/linux/codecov chmod +x codecov diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh index 60f21db6d..793f267a5 100755 --- a/contrib/dependencies/dnf-packages.sh +++ b/contrib/dependencies/dnf-packages.sh @@ -37,5 +37,4 @@ dnf install -y \ python3-setuptools \ python3-wheel \ rubygem-asciidoctor \ - which \ xmlto diff --git a/contrib/docker_cr.sh b/contrib/docker_cr.sh index 9b43d8ba1..04ef676cd 100755 --- a/contrib/docker_cr.sh +++ b/contrib/docker_cr.sh @@ -418,7 +418,7 @@ resolve_path() { local p p="${2}" - if which realpath > /dev/null; then + if command -v realpath > /dev/null; then p=$(realpath "${p}") fi ${ECHO} "${1}: ${p}" @@ -427,7 +427,7 @@ resolve_path() { resolve_cmd() { local cpath - cpath=$(which "${2}") + cpath=$(command -v "${2}") resolve_path "${1}" "${cpath}" } diff --git 
a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index ff75717c5..b0b45fcc3 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -15,8 +15,7 @@ dnf install -y \ python-unversioned-command \ redhat-rpm-config \ sudo \ - tar \ - which + tar # /tmp is no longer 755 in the rawhide container image and breaks CI - fix it chmod 1777 /tmp diff --git a/scripts/nmk/scripts/tools.mk b/scripts/nmk/scripts/tools.mk index 724204a03..de5782c13 100644 --- a/scripts/nmk/scripts/tools.mk +++ b/scripts/nmk/scripts/tools.mk @@ -23,7 +23,7 @@ MAKE := make MKDIR := mkdir -p AWK := awk PERL := perl -FULL_PYTHON := $(shell which python3 2>/dev/null) +FULL_PYTHON := $(shell command -v python3 2>/dev/null) PYTHON ?= $(shell basename $(FULL_PYTHON)) FIND := find SH := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ @@ -36,7 +36,7 @@ CTAGS := ctags export RM HOSTLD LD HOSTCC CC CPP AS AR STRIP OBJCOPY OBJDUMP export NM SH MAKE MKDIR AWK PERL PYTHON SH CSCOPE -export USE_ASCIIDOCTOR ?= $(shell which asciidoctor 2>/dev/null) +export USE_ASCIIDOCTOR ?= $(shell command -v asciidoctor 2>/dev/null) # # Footer. From 2878faa74c96c0b816453d0a0c86e219e4e33fed Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 7 Oct 2025 16:31:17 +0100 Subject: [PATCH 702/775] libcriu: enable setting of RPC config file Container runtimes that use libcriu (e.g., crun) need to specify a CRIU configuration file that allows overwriting default options set via RPC. This is particularly useful for setting options such as `--tcp-established` via `/etc/criu/runc.conf` in Kubernetes.
Signed-off-by: Radostin Stoyanov --- lib/c/criu.c | 19 +++++++++++++++++++ lib/c/criu.h | 2 ++ 2 files changed, 21 insertions(+) diff --git a/lib/c/criu.c b/lib/c/criu.c index c16fe5dcd..485c8b178 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -2041,3 +2041,22 @@ void criu_set_empty_ns(int namespaces) { criu_local_set_empty_ns(global_opts, namespaces); } + +int criu_local_set_config_file(criu_opts *opts, const char *path) +{ + char *new; + + new = strdup(path); + if (!new) + return -ENOMEM; + + free(opts->rpc->config_file); + opts->rpc->config_file = new; + + return 0; +} + +int criu_set_config_file(const char *path) +{ + return criu_local_set_config_file(global_opts, path); +} diff --git a/lib/c/criu.h b/lib/c/criu.h index c1c607869..44446f664 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -116,6 +116,7 @@ void criu_set_pidfd_store_sk(int sk); int criu_set_network_lock(enum criu_network_lock_method method); int criu_join_ns_add(const char *ns, const char *ns_file, const char *extra_opt); void criu_set_mntns_compat_mode(bool val); +int criu_set_config_file(const char *path); /* * The criu_notify_arg_t na argument is an opaque @@ -281,6 +282,7 @@ void criu_local_set_pidfd_store_sk(criu_opts *opts, int sk); int criu_local_set_network_lock(criu_opts *opts, enum criu_network_lock_method method); int criu_local_join_ns_add(criu_opts *opts, const char *ns, const char *ns_file, const char *extra_opt); void criu_local_set_mntns_compat_mode(criu_opts *opts, bool val); +int criu_local_set_config_file(criu_opts *opts, const char *path); void criu_local_set_notify_cb(criu_opts *opts, int (*cb)(char *action, criu_notify_arg_t na)); From 3f97cfe876b4e54be42c19263796de61633402ac Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Thu, 9 Oct 2025 11:21:35 +0100 Subject: [PATCH 703/775] test/libcriu: check setting of RPC config file Signed-off-by: Radostin Stoyanov --- test/others/libcriu/.gitignore | 1 + test/others/libcriu/Makefile | 1 + test/others/libcriu/run.sh | 1 + 
test/others/libcriu/test_rpc_config.c | 223 ++++++++++++++++++++++++++ 4 files changed, 226 insertions(+) create mode 100644 test/others/libcriu/test_rpc_config.c diff --git a/test/others/libcriu/.gitignore b/test/others/libcriu/.gitignore index 0f6e52bb4..30a56999c 100644 --- a/test/others/libcriu/.gitignore +++ b/test/others/libcriu/.gitignore @@ -8,3 +8,4 @@ test_pre_dump test_feature_check output/ libcriu.so.* +test_rpc_config diff --git a/test/others/libcriu/Makefile b/test/others/libcriu/Makefile index ae7330533..e0ee5b2ab 100644 --- a/test/others/libcriu/Makefile +++ b/test/others/libcriu/Makefile @@ -3,6 +3,7 @@ include ../../../../criu/Makefile.versions TESTS += test_sub TESTS += test_self TESTS += test_notify +TESTS += test_rpc_config TESTS += test_iters TESTS += test_errno TESTS += test_join_ns diff --git a/test/others/libcriu/run.sh b/test/others/libcriu/run.sh index f7d363aab..804af9b83 100755 --- a/test/others/libcriu/run.sh +++ b/test/others/libcriu/run.sh @@ -55,6 +55,7 @@ run_test() { run_test test_sub run_test test_self run_test test_notify +run_test test_rpc_config if [ "$(uname -m)" = "x86_64" ]; then # Skip this on aarch64 as aarch64 has no dirty page tracking run_test test_iters diff --git a/test/others/libcriu/test_rpc_config.c b/test/others/libcriu/test_rpc_config.c new file mode 100644 index 000000000..529f13637 --- /dev/null +++ b/test/others/libcriu/test_rpc_config.c @@ -0,0 +1,223 @@ +#include "criu.h" +#include "lib.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RANDOM_NAME_LEN 6 +#define PATH_BUF_SIZE 128 + +static volatile sig_atomic_t stop = 0; +static char base_name[RANDOM_NAME_LEN + 1]; +static char log_file[PATH_BUF_SIZE]; +static char conf_file[PATH_BUF_SIZE]; + +static void handle_signal(int sig) +{ + (void)sig; + stop = 1; +} + +static void generate_random_base_name(void) +{ + const char charset[] = 
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + size_t charset_len; + int i; + + charset_len = sizeof(charset) - 1; + + for (i = 0; i < RANDOM_NAME_LEN; i++) { + base_name[i] = charset[rand() % charset_len]; + } + base_name[i] = '\0'; + + snprintf(log_file, sizeof(log_file), "/tmp/criu-%s.log", base_name); + snprintf(conf_file, sizeof(conf_file), "/tmp/criu-%s.conf", base_name); +} + +static int create_criu_config_file(void) +{ + int fd; + FILE *fp; + + srand(time(NULL)); + generate_random_base_name(); + + fd = open(conf_file, O_CREAT | O_EXCL | O_WRONLY, 0600); + if (fd < 0) { + perror("Failed to create config file"); + return -1; + } + + fp = fdopen(fd, "w"); + if (!fp) { + perror("fdopen failed"); + close(fd); + unlink(conf_file); + return -1; + } + + fprintf(fp, "log-file=%s\n", log_file); + fflush(fp); + fclose(fp); + + return 0; +} + +static int check_log_file(void) +{ + struct stat st; + + if (stat(log_file, &st) < 0) { + perror("Config file does not exist"); + return -1; + } + + if (st.st_size == 0) { + fprintf(stderr, "Config file is empty\n"); + return -1; + } + + unlink(log_file); + return 0; +} + +int main(int argc, char **argv) +{ + int pipe_fd[2]; + pid_t pid; + int ret; + int child_ret; + + int img_fd = open(argv[2], O_DIRECTORY); + if (img_fd < 0) { + perror("Failed to open images directory"); + goto cleanup; + } + + if (create_criu_config_file() < 0) { + printf("Failed to create config file\n"); + return EXIT_FAILURE; + } + + if (pipe(pipe_fd) < 0) { + perror("pipe"); + return EXIT_FAILURE; + } + + pid = fork(); + if (pid < 0) { + perror("fork failed"); + return EXIT_FAILURE; + } + + if (pid == 0) { + /** child process **/ + printf(" `- loop: initializing\n"); + + if (setsid() < 0 || signal(SIGUSR1, handle_signal) == SIG_ERR) { + _exit(EXIT_FAILURE); + } + + close(STDIN_FILENO); + close(STDOUT_FILENO); + close(STDERR_FILENO); + close(pipe_fd[0]); + + child_ret = SUCC_ECODE; + write(pipe_fd[1], &child_ret, sizeof(child_ret)); + 
close(pipe_fd[1]); + + while (!stop) { + sleep(1); + } + + _exit(SUCC_ECODE); + } + + /** parent process **/ + close(pipe_fd[1]); + + ret = -1; + if (read(pipe_fd[0], &ret, sizeof(ret)) != sizeof(ret) || ret != SUCC_ECODE) { + printf("Error starting loop\n"); + goto cleanup; + } + + read(pipe_fd[0], &ret, 1); + close(pipe_fd[0]); + + printf("--- Loop process started (pid: %d) ---\n", pid); + + printf("--- Checkpoint ---\n"); + criu_init_opts(); + criu_set_service_binary(argv[1]); + criu_set_images_dir_fd(img_fd); + criu_set_pid(pid); + criu_set_log_level(CRIU_LOG_DEBUG); + + /* The RPC config file should overwrite the log-file set below */ + printf("Setting dump RPC config file: %s\n", conf_file); + criu_set_config_file(conf_file); + criu_set_log_file("dump.log"); + + ret = criu_dump(); + if (ret < 0) { + what_err_ret_mean(ret); + kill(pid, SIGKILL); + printf("criu dump failed\n"); + goto cleanup; + } + + printf(" `- Dump succeeded\n"); + waitpid(pid, NULL, 0); + + if (check_log_file()) { + printf("Error: log file not overwritten by RPC config file\n"); + goto cleanup; + } + + printf("--- Restore loop ---\n"); + criu_init_opts(); + criu_set_images_dir_fd(img_fd); + criu_set_log_level(CRIU_LOG_DEBUG); + + /* The RPC config file should overwrite the log-file set below */ + printf("Setting restore RPC config file: %s\n", conf_file); + criu_set_config_file(conf_file); + criu_set_log_file("restore.log"); + + pid = criu_restore_child(); + if (pid <= 0) { + what_err_ret_mean(pid); + ret = EXIT_FAILURE; + goto cleanup; + } + + printf(" `- Restore returned pid %d\n", pid); + kill(pid, SIGUSR1); + + if (check_log_file()) { + printf("Error: log file not overwritten by RPC config file\n"); + goto cleanup; + } + +cleanup: + if (waitpid(pid, &ret, 0) < 0) { + perror("waitpid failed"); + return EXIT_FAILURE; + } + + printf("Remove RPC config file: %s\n", conf_file); + unlink(conf_file); + return chk_exit(ret, SUCC_ECODE); +} From 7aad7317b407925519d2b9137f87b1a0f53fc879 Mon Sep 
17 00:00:00 2001 From: Andrii Herheliuk Date: Wed, 22 Oct 2025 21:51:28 +0100 Subject: [PATCH 704/775] lib/pycriu: changing the default behavior to use the system binary Use system-installed CRIU binary instead of a local file Thanks to @avagin for suggesting this solution. Co-authored-by: Andrei Vagin Signed-off-by: Andrii Herheliuk --- lib/pycriu/criu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py index 5bd7ffecd..5973b4b91 100644 --- a/lib/pycriu/criu.py +++ b/lib/pycriu/criu.py @@ -103,7 +103,7 @@ class _criu_comm_bin(_criu_comm): os.close(2) css[0].send(struct.pack('i', os.getpid())) - os.execv(self.comm, + os.execvp(self.comm, [self.comm, 'swrk', "%d" % css[0].fileno()]) os._exit(1) From d2c46b92b0d394e04b7da5d16909ed7f88e84271 Mon Sep 17 00:00:00 2001 From: Andrii Herheliuk Date: Thu, 23 Oct 2025 10:50:40 +0100 Subject: [PATCH 705/775] pycriu: better socket error handling [Errno 2] No such file or directory -> Socket file not found. [Errno 111] Connection refused -> Service not running. 
Signed-off-by: Andrii Herheliuk --- lib/pycriu/criu.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py index 5973b4b91..43550c3ca 100644 --- a/lib/pycriu/criu.py +++ b/lib/pycriu/criu.py @@ -45,7 +45,14 @@ class _criu_comm_sk(_criu_comm): def connect(self, daemon): self.sk = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) - self.sk.connect(self.comm) + try: + self.sk.connect(self.comm) + + except FileNotFoundError: + raise FileNotFoundError("Socket file not found.") + + except ConnectionRefusedError: + raise ConnectionRefusedError("Service not running.") return self.sk From 71a637923f420dd50cef02912519b44722338ae4 Mon Sep 17 00:00:00 2001 From: Andrii Herheliuk Date: Mon, 27 Oct 2025 21:57:41 +0000 Subject: [PATCH 706/775] pycriu: set default value for sk_name This change allows users to call criu.use_sk() without any parameters to use the default socket name. Co-authored-by: Radostin Stoyanov Signed-off-by: Andrii Herheliuk --- lib/pycriu/criu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py index 43550c3ca..05a85c58d 100644 --- a/lib/pycriu/criu.py +++ b/lib/pycriu/criu.py @@ -8,6 +8,7 @@ import struct import pycriu.rpc_pb2 as rpc +CR_DEFAULT_SERVICE_ADDRESS = "./criu_service.socket" class _criu_comm: """ @@ -213,7 +214,7 @@ class criu: self.opts = rpc.criu_opts() self.sk = None - def use_sk(self, sk_name): + def use_sk(self, sk_name=CR_DEFAULT_SERVICE_ADDRESS): """ Access criu using unix socket which that belongs to criu service daemon. """ From ee4100c09f7de7ef9e9db59288118646f28cd4b4 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 06:42:53 +0100 Subject: [PATCH 707/775] cr-service: refactor images/workdir setup Move the code that opens the images directory, resolves its absolute path via readlink(), selects the work_dir, and chdir()s into it into a new function: setup_images_and_workdir(). 
This reduces the size of `setup_opts_from_req()`, improves its readability, and allows this functionality to be reused. While at it, change open_image_dir() to take a const char *dir parameter, reflecting that the path is not modified by the function and allowing callers to pass string literals without casts. No functional changes are intended. Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 74 +++++++++++++++++++++++++------------------- criu/image.c | 2 +- criu/include/image.h | 2 +- 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index e6aac232e..36ef8d72b 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -285,13 +285,54 @@ int exec_rpc_query_external_files(char *name, int sk) static char images_dir[PATH_MAX]; +static int setup_images_and_workdir(const char *images_dir_path, + bool work_changed_by_rpc_conf, + CriuOpts *req, + pid_t peer_pid) +{ + char work_dir_path[PATH_MAX]; + + /* + * Image streaming is not supported with CRIU's service feature as + * the streamer must be started for each dump/restore operation. + * It is unclear how to do that with RPC, so we punt for now. + * This explains why we provide the argument mode=-1 instead of + * O_RSTR or O_DUMP. 
+ */ + if (open_image_dir(images_dir_path, -1) < 0) { + pr_perror("Can't open images directory"); + return -1; + } + + /* get full path to images_dir to use in process title */ + if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { + pr_perror("Can't readlink %s", images_dir_path); + return -1; + } + + if (work_changed_by_rpc_conf) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else if (req->has_work_dir_fd) + sprintf(work_dir_path, "/proc/%d/fd/%d", peer_pid, req->work_dir_fd); + else if (opts.work_dir) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else + strcpy(work_dir_path, images_dir_path); + + if (chdir(work_dir_path)) { + pr_perror("Can't chdir to work_dir"); + return -1; + } + + return 0; +} + static int setup_opts_from_req(int sk, CriuOpts *req) { struct ucred ids; struct stat st; socklen_t ids_len = sizeof(struct ucred); char images_dir_path[PATH_MAX]; - char work_dir_path[PATH_MAX]; char status_fd[PATH_MAX]; bool output_changed_by_rpc_conf = false; bool work_changed_by_rpc_conf = false; @@ -701,37 +742,8 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->parent_img) SET_CHAR_OPTS(img_parent, req->parent_img); - /* - * Image streaming is not supported with CRIU's service feature as - * the streamer must be started for each dump/restore operation. - * It is unclear how to do that with RPC, so we punt for now. - * This explains why we provide the argument mode=-1 instead of - * O_RSTR or O_DUMP. 
- */ - if (open_image_dir(images_dir_path, -1) < 0) { - pr_perror("Can't open images directory"); + if (setup_images_and_workdir(images_dir_path, work_changed_by_rpc_conf, req, ids.pid)) goto err; - } - - /* get full path to images_dir to use in process title */ - if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { - pr_perror("Can't readlink %s", images_dir_path); - goto err; - } - - if (work_changed_by_rpc_conf) - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else if (req->has_work_dir_fd) - sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); - else if (opts.work_dir) - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else - strcpy(work_dir_path, images_dir_path); - - if (chdir(work_dir_path)) { - pr_perror("Can't chdir to work_dir"); - goto err; - } if (req->n_irmap_scan_paths) { for (i = 0; i < req->n_irmap_scan_paths; i++) { diff --git a/criu/image.c b/criu/image.c index c4f05e159..91101c3eb 100644 --- a/criu/image.c +++ b/criu/image.c @@ -717,7 +717,7 @@ struct cr_img *img_from_fd(int fd) * This is used when opts.stream is enabled for picking the right streamer * socket name. `mode` is ignored when opts.stream is not enabled. 
*/ -int open_image_dir(char *dir, int mode) +int open_image_dir(const char *dir, int mode) { int fd, ret; diff --git a/criu/include/image.h b/criu/include/image.h index b06dbf706..30e32323d 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -165,7 +165,7 @@ static inline int img_raw_fd(struct cr_img *img) extern off_t img_raw_size(struct cr_img *img); -extern int open_image_dir(char *dir, int mode); +extern int open_image_dir(const char *dir, int mode); extern void close_image_dir(void); /* * Return -1 -- parent symlink points to invalid target From 60a731ab38d53c69fbf0fc8bf7bb02701930424c Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 11:04:35 +0100 Subject: [PATCH 708/775] cr-service: drop images_dir from setproctitle Commit 9089ce8 ("service: use setproctitle") extended cr-service to get the full path of images_dir using readlink(). However, the RPC API was later extended to allow a custom path (folder) to be set instead of passing a file descriptor, which causes readlink() to fail as the path is not a symbolic link. It would be better to drop the code setting the images-dir path as a string in the proctitle.
Fixes: #2794 Suggested-by: Andrei Vagin Co-authored-by: Andrii Herheliuk Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index 36ef8d72b..0808be3e7 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -283,8 +283,6 @@ int exec_rpc_query_external_files(char *name, int sk) return ret; } -static char images_dir[PATH_MAX]; - static int setup_images_and_workdir(const char *images_dir_path, bool work_changed_by_rpc_conf, CriuOpts *req, @@ -304,12 +302,6 @@ static int setup_images_and_workdir(const char *images_dir_path, return -1; } - /* get full path to images_dir to use in process title */ - if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { - pr_perror("Can't readlink %s", images_dir_path); - return -1; - } - if (work_changed_by_rpc_conf) strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); else if (req->has_work_dir_fd) @@ -802,7 +794,7 @@ static int dump_using_req(int sk, CriuOpts *req) if (setup_opts_from_req(sk, req)) goto exit; - __setproctitle("dump --rpc -t %d -D %s", req->pid, images_dir); + __setproctitle("dump --rpc -t %d", req->pid); if (init_pidfd_store_hash()) goto pidfd_store_err; @@ -845,7 +837,7 @@ static int restore_using_req(int sk, CriuOpts *req) if (setup_opts_from_req(sk, req)) goto exit; - __setproctitle("restore --rpc -D %s", images_dir); + __setproctitle("restore --rpc"); if (cr_restore_tasks()) goto exit; @@ -940,7 +932,7 @@ static int pre_dump_using_req(int sk, CriuOpts *req, bool single) if (setup_opts_from_req(sk, req)) goto cout; - __setproctitle("pre-dump --rpc -t %d -D %s", req->pid, images_dir); + __setproctitle("pre-dump --rpc -t %d", req->pid); if (init_pidfd_store_hash()) goto pidfd_store_err; @@ -1276,8 +1268,7 @@ static int handle_cpuinfo(int sk, CriuReq *msg) if (setup_opts_from_req(sk, msg->opts)) goto cout; - __setproctitle("cpuinfo %s --rpc -D %s", msg->type == 
CRIU_REQ_TYPE__CPUINFO_DUMP ? "dump" : "check", - images_dir); + __setproctitle("cpuinfo %s --rpc", msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ? "dump" : "check"); if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ret = cpuinfo_dump(); From 5966ffe8a7fa452a8c8256962436dceb4479237e Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 06:50:31 +0100 Subject: [PATCH 709/775] cr-service: refactor images_dir path resolution Move the images_dir selection logic from setup_opts_from_req() into a new function: resolve_images_dir_path(). This improves readability and allows the code to be reused. While at it, use snprintf() instead of sprintf() for the /proc path and ensure NULL termination after strncpy(). Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 59 +++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index 0808be3e7..7d17a63e0 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -283,6 +283,41 @@ int exec_rpc_query_external_files(char *name, int sk) return ret; } +static int resolve_images_dir_path(char *images_dir_path, + bool imgs_changed_by_rpc_conf, + const CriuOpts *req, + pid_t peer_pid) +{ + /* + * images_dir_fd is a required RPC parameter with -1 as default value. + * + * This assumes that if opts.imgs_dir is set, we have a value + * from the configuration file parser. The test to see that + * imgs_changed_by_rpc_conf is true is used to make sure the value + * is from the RPC configuration file. 
The idea is that only the + * RPC configuration file is able to overwrite RPC settings: + * * apply_config(global_conf) + * * apply_config(user_conf) + * * apply_config(environment variable) + * * apply_rpc_options() + * * apply_config(rpc_conf) + */ + if (imgs_changed_by_rpc_conf) { + strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); + images_dir_path[PATH_MAX - 1] = '\0'; + } else if (req->images_dir_fd != -1) { + snprintf(images_dir_path, PATH_MAX, "/proc/%d/fd/%d", peer_pid, req->images_dir_fd); + } else if (req->images_dir) { + strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); + images_dir_path[PATH_MAX - 1] = '\0'; + } else { + pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); + return -1; + } + + return 0; +} + static int setup_images_and_workdir(const char *images_dir_path, bool work_changed_by_rpc_conf, CriuOpts *req, @@ -706,30 +741,8 @@ static int setup_opts_from_req(int sk, CriuOpts *req) xfree(tmp_work); } - /* - * open images_dir - images_dir_fd is a required RPC parameter - * - * This assumes that if opts.imgs_dir is set we have a value - * from the configuration file parser. The test to see that - * imgs_changed_by_rpc_conf is true is used to make sure the value - * is from the RPC configuration file. 
The idea is that only the - * RPC configuration file is able to overwrite RPC settings: - * * apply_config(global_conf) - * * apply_config(user_conf) - * * apply_config(environment variable) - * * apply_rpc_options() - * * apply_config(rpc_conf) - */ - if (imgs_changed_by_rpc_conf) { - strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); - } else if (req->images_dir_fd != -1) { - sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); - } else if (req->images_dir) { - strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); - } else { - pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); + if (resolve_images_dir_path(images_dir_path, imgs_changed_by_rpc_conf, req, ids.pid) < 0) goto err; - } if (req->parent_img) SET_CHAR_OPTS(img_parent, req->parent_img); From 72ca94db4de93105f89c9411b87a70e8f2353745 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 28 Oct 2025 18:37:31 +0000 Subject: [PATCH 710/775] cr-service: refactor logging setup Move the logging initialization into a helper function that can be reused. No functional change intended. 
Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 51 ++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index 7d17a63e0..b4e8629c9 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -354,6 +354,31 @@ static int setup_images_and_workdir(const char *images_dir_path, return 0; } +static int setup_logging_from_req(CriuOpts *req, bool output_changed_by_rpc_conf) +{ + if (req->log_file && !output_changed_by_rpc_conf) { + if (strchr(req->log_file, '/')) { + pr_perror("No subdirs are allowed in log_file name"); + return -1; + } + SET_CHAR_OPTS(output, req->log_file); + } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { + xfree(opts.output); + opts.output = NULL; /* log_init(NULL) writes to stderr */ + } else if (!opts.output) { + SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); + } + + opts.log_level = req->log_level; + log_set_loglevel(opts.log_level); + if (log_init(opts.output)) { + pr_perror("Can't initiate log"); + return -1; + } + + return 0; +} + static int setup_opts_from_req(int sk, CriuOpts *req) { struct ucred ids; @@ -758,36 +783,12 @@ static int setup_opts_from_req(int sk, CriuOpts *req) } /* initiate log file in work dir */ - if (req->log_file && !output_changed_by_rpc_conf) { - /* - * If RPC sets a log file and if there nothing from the - * RPC configuration file, use the RPC value. 
- */ - if (strchr(req->log_file, '/')) { - pr_perror("No subdirs are allowed in log_file name"); - goto err; - } - - SET_CHAR_OPTS(output, req->log_file); - } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { - xfree(opts.output); - opts.output = NULL; - } else if (!opts.output) { - SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); - } - - /* This is needed later to correctly set the log_level */ - opts.log_level = req->log_level; - log_set_loglevel(req->log_level); - if (log_init(opts.output) == -1) { - pr_perror("Can't initiate log"); + if (setup_logging_from_req(req, output_changed_by_rpc_conf)) goto err; - } if (req->mntns_compat_mode) opts.mntns_compat_mode = true; - log_set_loglevel(opts.log_level); if (check_options()) goto err; From 9371c4a789889f26d11ca04a4c7c9847a2abbbcc Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 07:12:03 +0100 Subject: [PATCH 711/775] cr-service: refactor RPC opts parsing for check() The check() functionality is very different from dump, pre-dump, and restore. It is used only to check if the kernel supports required features, and does not need the majority of options set via RPC. In particular, we don't need to open `image_dir` when running `check()` because this functionality doesn't create or process image files. In this case, `image_dir` is used as `work_dir`, only when the latter is not specified and a log file is used. This patch updates the RPC options parser so that it only handles the logging options when check() is used. Logging to a file is required when log_file is explicitly set or no log_to_stderr is used. In such case, we also resolve images_dir and work_dir where the log file will be created. 
Fixes: #2758 Suggested-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- criu/cr-service.c | 57 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index b4e8629c9..b4718dde2 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -311,6 +311,12 @@ static int resolve_images_dir_path(char *images_dir_path, strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); images_dir_path[PATH_MAX - 1] = '\0'; } else { + /* + * Since images dir is not required in CHECK mode, we need to + * check for work_dir_fd in setup_images_and_workdir() + */ + if (opts.mode == CR_CHECK) + return 0; pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); return -1; } @@ -323,18 +329,21 @@ static int setup_images_and_workdir(const char *images_dir_path, CriuOpts *req, pid_t peer_pid) { - char work_dir_path[PATH_MAX]; + char work_dir_path[PATH_MAX] = ""; - /* - * Image streaming is not supported with CRIU's service feature as - * the streamer must be started for each dump/restore operation. - * It is unclear how to do that with RPC, so we punt for now. - * This explains why we provide the argument mode=-1 instead of - * O_RSTR or O_DUMP. - */ - if (open_image_dir(images_dir_path, -1) < 0) { - pr_perror("Can't open images directory"); - return -1; + /* We don't need to open images dir in CHECK mode. */ + if (opts.mode != CR_CHECK) { + /* + * Image streaming is not supported with CRIU's service feature as + * the streamer must be started for each dump/restore operation. + * It is unclear how to do that with RPC, so we punt for now. + * This explains why we provide the argument mode=-1 instead of + * O_RSTR or O_DUMP. 
+ */ + if (open_image_dir(images_dir_path, -1) < 0) { + pr_perror("Can't open images directory"); + return -1; + } } if (work_changed_by_rpc_conf) @@ -343,9 +352,14 @@ static int setup_images_and_workdir(const char *images_dir_path, sprintf(work_dir_path, "/proc/%d/fd/%d", peer_pid, req->work_dir_fd); else if (opts.work_dir) strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else + else if (images_dir_path[0] != '\0') strcpy(work_dir_path, images_dir_path); + if (work_dir_path[0] == '\0') { + pr_err("images-dir or work-dir is required when using log file\n"); + return -1; + } + if (chdir(work_dir_path)) { pr_perror("Can't chdir to work_dir"); return -1; @@ -384,7 +398,7 @@ static int setup_opts_from_req(int sk, CriuOpts *req) struct ucred ids; struct stat st; socklen_t ids_len = sizeof(struct ucred); - char images_dir_path[PATH_MAX]; + char images_dir_path[PATH_MAX] = ""; char status_fd[PATH_MAX]; bool output_changed_by_rpc_conf = false; bool work_changed_by_rpc_conf = false; @@ -397,6 +411,23 @@ static int setup_opts_from_req(int sk, CriuOpts *req) goto err; } + /* + * The options relevant in CHECK mode are: log_file, log_to_stderr, and log_level. + * When logging to a file, we also need to resolve images_dir and work_dir. + */ + if (opts.mode == CR_CHECK) { + if (!req) + return 0; /* nothing to do */ + + /* + * A log file is needed only if: + * - log_file is explicitly set, or + * - log_to_stderr is NOT requested (i.e., using DEFAULT_LOG_FILENAME) + */ + if (!req->log_file || (req->has_log_to_stderr && req->log_to_stderr)) + return 0; /* no log file, don't require images_dir or work_dir */ + } + if (fstat(sk, &st)) { pr_perror("Can't get socket stat"); goto err; From f7ccb63bdd496409d968390aa15a3a8c4b877110 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 09:28:28 +0100 Subject: [PATCH 712/775] pycriu: set RPC opts for CHECK This allows users to specify RPC options when using the check() functionality. 
Co-authored-by: Andrii Herheliuk Signed-off-by: Radostin Stoyanov --- lib/pycriu/criu.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py index 05a85c58d..760d2be78 100644 --- a/lib/pycriu/criu.py +++ b/lib/pycriu/criu.py @@ -211,7 +211,8 @@ class criu: def __init__(self): self.use_binary('criu') - self.opts = rpc.criu_opts() + # images_dir_fd is required field with default value of -1 + self.opts = rpc.criu_opts(images_dir_fd=-1) self.sk = None def use_sk(self, sk_name=CR_DEFAULT_SERVICE_ADDRESS): @@ -273,6 +274,7 @@ class criu: """ req = rpc.criu_req() req.type = rpc.CHECK + req.opts.MergeFrom(self.opts) resp = self._send_req_and_recv_resp(req) From 3c841af2cf1f1769c2fa1527bf2706b705da1202 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sat, 25 Oct 2025 12:35:19 +0100 Subject: [PATCH 713/775] pycriu: use explicit imports for __init__ __init__.py defines the public API for pycriu. It is important to use explicit imports to avoid leaking every symbol from criu.py into the pycriu namespace. This avoids import-time side effects, prevents name collisions, and circular-import traps. Fixes the following lint error: F403 `from .criu import *` used; unable to detect undefined names Signed-off-by: Radostin Stoyanov --- Makefile | 2 ++ lib/pycriu/__init__.py | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 1824ea180..05834d682 100644 --- a/Makefile +++ b/Makefile @@ -452,6 +452,8 @@ ruff: test/inhfd/*.py \ test/others/rpc/config_file.py \ test/others/action-script/check_actions.py \ + lib/pycriu/criu.py \ + lib/pycriu/__init__.py \ lib/pycriu/images/pb2dict.py \ lib/pycriu/images/images.py \ scripts/criu-ns \ diff --git a/lib/pycriu/__init__.py b/lib/pycriu/__init__.py index 2abcf029d..28f1e9424 100644 --- a/lib/pycriu/__init__.py +++ b/lib/pycriu/__init__.py @@ -1,4 +1,15 @@ from . import rpc_pb2 as rpc from . 
import images -from .criu import * -from .version import __version__ \ No newline at end of file +from .criu import criu, CRIUExceptionExternal, CRIUException +from .criu import CR_DEFAULT_SERVICE_ADDRESS +from .version import __version__ + +__all__ = ( + "rpc", + "images", + "criu", + "CRIUExceptionExternal", + "CRIUException", + "CR_DEFAULT_SERVICE_ADDRESS", + "__version__", +) \ No newline at end of file From a1dc885027f6866f01df38c9f14e71a9102e298a Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Fri, 24 Oct 2025 19:32:23 +0100 Subject: [PATCH 714/775] test/rpc: update errno check The --mntns-compat-mode option is no longer parsed with CHECK. Use --log-file instead to test the error message. Signed-off-by: Radostin Stoyanov --- test/others/rpc/errno.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/test/others/rpc/errno.py b/test/others/rpc/errno.py index a5a3eb54d..ea841199f 100755 --- a/test/others/rpc/errno.py +++ b/test/others/rpc/errno.py @@ -49,8 +49,8 @@ class test: if err and resp.cr_errno != err: raise Exception('Unexpected cr_errno ' + str(resp.cr_errno)) - - if errmsg and errmsg not in resp.cr_errmsg: + + if errmsg and errmsg not in str(resp.cr_errmsg): raise Exception('Unexpected cr_msg \'' + str(resp.cr_errmsg) + '\'') def no_process(self): @@ -134,20 +134,19 @@ class test: self.check_resp(resp, rpc.EMPTY, None) print('Success') - + def child_first_err(self): print('Receive correct first error message') req = self.get_base_req() req.type = rpc.CHECK - - # mntns_compat_mode options is only allowed on restore - req.opts.mntns_compat_mode = True + # Log file must not have subdirectory + req.opts.log_file = "/foo/bar.log" self.send_req(req) resp = self.recv_resp() - self.check_resp(resp, rpc.CHECK, None, "Option --mntns-compat-mode is only valid on restore\n") + self.check_resp(resp, rpc.CHECK, None, "No subdirs are allowed in log_file name") print('Success') From 567f70ce191326c56cd223ce94a079dceb7c71fd Mon Sep 
17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 20 Oct 2025 10:24:49 +0100 Subject: [PATCH 715/775] test/others: add test for check() with libcriu Signed-off-by: Radostin Stoyanov --- test/others/libcriu/Makefile | 1 + test/others/libcriu/run.sh | 1 + test/others/libcriu/test_check.c | 17 +++++++++++++++++ 3 files changed, 19 insertions(+) create mode 100644 test/others/libcriu/test_check.c diff --git a/test/others/libcriu/Makefile b/test/others/libcriu/Makefile index e0ee5b2ab..927f17c23 100644 --- a/test/others/libcriu/Makefile +++ b/test/others/libcriu/Makefile @@ -8,6 +8,7 @@ TESTS += test_iters TESTS += test_errno TESTS += test_join_ns TESTS += test_pre_dump +TESTS += test_check TESTS += test_feature_check all: $(TESTS) diff --git a/test/others/libcriu/run.sh b/test/others/libcriu/run.sh index 804af9b83..6b36d4496 100755 --- a/test/others/libcriu/run.sh +++ b/test/others/libcriu/run.sh @@ -63,6 +63,7 @@ if [ "$(uname -m)" = "x86_64" ]; then fi run_test test_errno run_test test_join_ns +run_test test_check if criu check --feature mem_dirty_track > /dev/null; then export CRIU_FEATURE_MEM_TRACK=1 fi diff --git a/test/others/libcriu/test_check.c b/test/others/libcriu/test_check.c new file mode 100644 index 000000000..4af3b3630 --- /dev/null +++ b/test/others/libcriu/test_check.c @@ -0,0 +1,17 @@ +#include +#include "criu.h" +#include "lib.h" + +int main(int argc, char **argv) +{ + int ret; + + printf("--- Start check ---\n"); + criu_init_opts(); + criu_set_service_binary(argv[1]); + + if (criu_check()) + return -1; + + return 0; +} From 0fa6ff3d188245091dce1516fa7804ebfa6be337 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 26 Oct 2025 10:00:39 +0000 Subject: [PATCH 716/775] test/others: add tests for check() with pycriu Signed-off-by: Radostin Stoyanov --- Makefile | 1 + test/others/pycriu/.gitignore | 1 + test/others/pycriu/Makefile | 63 ++++++++++++++++++++ test/others/pycriu/read.py | 1 + test/others/pycriu/test_check.py | 29 +++++++++ 
test/others/pycriu/test_check_fail.py | 32 ++++++++++ test/others/pycriu/test_check_images_dir.py | 44 ++++++++++++++ test/others/pycriu/test_check_work_dir_fd.py | 44 ++++++++++++++ test/others/rpc/read.py | 0 9 files changed, 215 insertions(+) create mode 100644 test/others/pycriu/.gitignore create mode 100644 test/others/pycriu/Makefile create mode 120000 test/others/pycriu/read.py create mode 100755 test/others/pycriu/test_check.py create mode 100755 test/others/pycriu/test_check_fail.py create mode 100755 test/others/pycriu/test_check_images_dir.py create mode 100755 test/others/pycriu/test_check_work_dir_fd.py mode change 100644 => 100755 test/others/rpc/read.py diff --git a/Makefile b/Makefile index 05834d682..e26807158 100644 --- a/Makefile +++ b/Makefile @@ -452,6 +452,7 @@ ruff: test/inhfd/*.py \ test/others/rpc/config_file.py \ test/others/action-script/check_actions.py \ + test/others/pycriu/*.py \ lib/pycriu/criu.py \ lib/pycriu/__init__.py \ lib/pycriu/images/pb2dict.py \ diff --git a/test/others/pycriu/.gitignore b/test/others/pycriu/.gitignore new file mode 100644 index 000000000..567609b12 --- /dev/null +++ b/test/others/pycriu/.gitignore @@ -0,0 +1 @@ +build/ diff --git a/test/others/pycriu/Makefile b/test/others/pycriu/Makefile new file mode 100644 index 000000000..b6e3b4814 --- /dev/null +++ b/test/others/pycriu/Makefile @@ -0,0 +1,63 @@ +.SHELLFLAGS := -eu -o pipefail -c +.ONESHELL: + +CRIU ?= ../../../criu/criu +BUILD_DIR ?= build +SOCKET_NAME ?= criu_service.socket +PIDFILE_NAME ?= pidfile +SERVICE_LOG ?= service.log +PYTHON ?= python3 + +PIDFILE := $(BUILD_DIR)/$(PIDFILE_NAME) +CRIU_SOCKET := $(BUILD_DIR)/$(SOCKET_NAME) +STATUS_FIFO := $(BUILD_DIR)/startup.status +STATUS_FD := 200 + +run: start + cleanup() { $(MAKE) --no-print-directory stop || true; } + trap cleanup EXIT INT TERM + "$(PYTHON)" test_check.py + "$(PYTHON)" test_check_fail.py + "$(PYTHON)" test_check_images_dir.py + "$(PYTHON)" test_check_work_dir_fd.py + +start: + mkdir -p 
"$(BUILD_DIR)" + if [ -s "$(PIDFILE)" ] && kill -0 "$$(cat "$(PIDFILE)")" 2>/dev/null; then + echo "Service running (PID $$(cat "$(PIDFILE)"))." + exit 0 + fi + if ! command -v "$(CRIU)" >/dev/null 2>&1; then + echo "CRIU not found at $(CRIU)" + exit 1 + fi + mkfifo "$(STATUS_FIFO)" + exec $(STATUS_FD)<>"$(STATUS_FIFO)" + "$(CRIU)" service \ + -v4 \ + -W "$(BUILD_DIR)" \ + --address "$(SOCKET_NAME)" \ + -d \ + --pidfile "$(PIDFILE_NAME)" \ + -o "$(SERVICE_LOG)" \ + --status-fd "$(STATUS_FD)" + "$(PYTHON)" read.py "$(STATUS_FIFO)" + +stop: + if [ ! -s "$(PIDFILE)" ]; then + echo "pidfile missing or empty" + exit 1 + fi + pid=$$(cat "$(PIDFILE)") + if kill -0 "$$pid" 2>/dev/null; then + kill -9 "$$pid" || true + fi + rm -f "$(PIDFILE)" "$(CRIU_SOCKET)" "$(STATUS_FIFO)" + +clean: + if [ -s "$(PIDFILE)" ] && kill -0 "$$(cat "$(PIDFILE)")" 2>/dev/null; then + kill -9 "$$(cat "$(PIDFILE)")" || true + fi + rm -rf "$(BUILD_DIR)" + +.PHONY: start stop clean run \ No newline at end of file diff --git a/test/others/pycriu/read.py b/test/others/pycriu/read.py new file mode 120000 index 000000000..c2c1e1365 --- /dev/null +++ b/test/others/pycriu/read.py @@ -0,0 +1 @@ +../rpc/read.py \ No newline at end of file diff --git a/test/others/pycriu/test_check.py b/test/others/pycriu/test_check.py new file mode 100755 index 000000000..9888158db --- /dev/null +++ b/test/others/pycriu/test_check.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def main(): + socket_path = os.path.join(SCRIPT_DIR, "build", "criu_service.socket") + + criu = pycriu.criu() + criu.use_sk(socket_path) + + try: + criu.check() + except Exception as e: + print(f"FAIL: {e}") + return 1 + + print("PASS") + return 0 + +if __name__ == 
"__main__": + sys.exit(main()) diff --git a/test/others/pycriu/test_check_fail.py b/test/others/pycriu/test_check_fail.py new file mode 100755 index 000000000..b5634c60b --- /dev/null +++ b/test/others/pycriu/test_check_fail.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def main(): + socket_path = os.path.join(SCRIPT_DIR, "build", "criu_service.socket") + + criu = pycriu.criu() + criu.use_sk(socket_path) + + # Intentionally set only log_file (no images/work dir) to ensure check() fails + criu.opts.log_file = "check.log" + + try: + criu.check() + except Exception: + print("PASS") + return 0 + + print("FAIL: check() did not fail when log_file is set without images/work dir") + return 1 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/pycriu/test_check_images_dir.py b/test/others/pycriu/test_check_images_dir.py new file mode 100755 index 000000000..f479c2a88 --- /dev/null +++ b/test/others/pycriu/test_check_images_dir.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def _log_path(images_dir, log_file): + return log_file if os.path.isabs(log_file) else os.path.join(images_dir, log_file) + +def main(): + build_dir = os.path.join(SCRIPT_DIR, "build") + socket_path = os.path.join(build_dir, "criu_service.socket") + + criu = pycriu.criu() + criu.use_sk(socket_path) + + criu.opts.images_dir = build_dir + criu.opts.log_file = "check.log" + criu.opts.log_level = 4 + + try: + criu.check() + 
except Exception as e: + lp = _log_path(build_dir, criu.opts.log_file) + msg = f"FAIL: {e} ({'see log: ' + lp if os.path.exists(lp) else 'no log found'})" + print(msg) + return 1 + + lp = _log_path(build_dir, criu.opts.log_file) + if not (os.path.isfile(lp) and os.path.getsize(lp) > 0): + print(f"FAIL: log file missing or empty: {lp}") + return 1 + + print("PASS") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/pycriu/test_check_work_dir_fd.py b/test/others/pycriu/test_check_work_dir_fd.py new file mode 100755 index 000000000..e20a83097 --- /dev/null +++ b/test/others/pycriu/test_check_work_dir_fd.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def main(): + build_dir = os.path.join(SCRIPT_DIR, "build") + socket_path = os.path.join(build_dir, "criu_service.socket") + os.makedirs(build_dir, exist_ok=True) + + # Open a directory FD to use as work_dir_fd (prefer O_PATH if available) + flags = getattr(os, "O_PATH", 0) or os.O_RDONLY + fd = os.open(build_dir, flags) + + criu = pycriu.criu() + criu.use_sk(socket_path) + + criu.opts.work_dir_fd = fd + criu.opts.log_file = "check.log" + criu.opts.log_level = 4 + + try: + criu.check() + except Exception as e: + print(f"FAIL: {e}") + return 1 + finally: + try: + os.close(fd) + except Exception: + pass + + print("PASS") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/rpc/read.py b/test/others/rpc/read.py old mode 100644 new mode 100755 From cb8e1da3f483f53fcacb642b574866625f7dbb0a Mon Sep 17 00:00:00 2001 From: alam0rt Date: Fri, 31 Oct 2025 14:32:17 +1100 Subject: [PATCH 717/775] coredump: use compat_nr_pages as fallback Use nr_pages when available, falling back to 
compat_nr_pages for compatibility. Signed-off-by: alam0rt Signed-off-by: Radostin Stoyanov --- coredump/criu_coredump/coredump.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 9454d8f0b..3c9cd45aa 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -794,7 +794,8 @@ class coredump_generator: off = 0 # in pages for m in pagemap[1:]: found = False - for i in range(m["nr_pages"]): + num_pages = m.get("nr_pages", m.compat_nr_pages) + for i in range(num_pages): if m["vaddr"] + i * PAGESIZE == page_no * PAGESIZE: found = True break From 1d08ff8ca7b2a8bee5238e80bddd52a627c637cf Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 9 Nov 2025 16:24:48 +0000 Subject: [PATCH 718/775] coredump: fix handling of num_pages This patch fixes the following error: $ sudo make -C test/others/criu-coredump run ... Traceback (most recent call last): File "/home/circleci/criu/coredump/coredump", line 55, in main() File "/home/circleci/criu/coredump/coredump", line 47, in main coredump(opts) File "/home/circleci/criu/coredump/coredump", line 14, in coredump cores = generator(os.path.realpath(opts['in'])) File "/home/circleci/criu/coredump/criu_coredump/coredump.py", line 192, in __call__ self.coredumps[pid] = self._gen_coredump(pid) File "/home/circleci/criu/coredump/criu_coredump/coredump.py", line 214, in _gen_coredump cd.vmas = self._gen_vmas(pid) File "/home/circleci/criu/coredump/criu_coredump/coredump.py", line 992, in _gen_vmas v.data = self._gen_mem_chunk(pid, vma, v.filesz) File "/home/circleci/criu/coredump/criu_coredump/coredump.py", line 879, in _gen_mem_chunk page_mem = self._get_page(pid, page_no) File "/home/circleci/criu/coredump/criu_coredump/coredump.py", line 797, in _get_page num_pages = m.get("nr_pages", m.compat_nr_pages) AttributeError: 'dict' object has no attribute 'compat_nr_pages' + exit 1 make[1]: *** [Makefile:3: 
run] Error 1 Signed-off-by: Radostin Stoyanov Signed-off-by: Andrei Vagin --- coredump/criu_coredump/coredump.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 3c9cd45aa..acb806ace 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -794,7 +794,8 @@ class coredump_generator: off = 0 # in pages for m in pagemap[1:]: found = False - num_pages = m.get("nr_pages", m.compat_nr_pages) + num_pages = m.get("nr_pages", m["compat_nr_pages"]) + for i in range(num_pages): if m["vaddr"] + i * PAGESIZE == page_no * PAGESIZE: found = True From ce680fc6c71ddac19fec25669dffe123c36595e7 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 8 Nov 2025 15:57:22 +0000 Subject: [PATCH 719/775] Revert "plugins/amdgpu: Implement parallel restore" This functionality (#2527) is being reverted and excluded from this release due to issue #2812. It will be included in a subsequent release once all associated issues are resolved. 
Signed-off-by: Andrei Vagin --- Documentation/criu-amdgpu-plugin.txt | 1 - plugins/amdgpu/Makefile | 2 +- plugins/amdgpu/README.md | 23 +- plugins/amdgpu/amdgpu_plugin.c | 420 +++--------------------- plugins/amdgpu/amdgpu_plugin_topology.c | 2 +- plugins/amdgpu/amdgpu_plugin_topology.h | 1 - plugins/amdgpu/amdgpu_socket_utils.c | 320 ------------------ plugins/amdgpu/amdgpu_socket_utils.h | 54 --- 8 files changed, 52 insertions(+), 771 deletions(-) delete mode 100644 plugins/amdgpu/amdgpu_socket_utils.c delete mode 100644 plugins/amdgpu/amdgpu_socket_utils.h diff --git a/Documentation/criu-amdgpu-plugin.txt b/Documentation/criu-amdgpu-plugin.txt index fe76fc3bc..68803f3db 100644 --- a/Documentation/criu-amdgpu-plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -15,7 +15,6 @@ Checkpoint / Restore inside a docker container Pytorch Tensorflow Using CRIU Image Streamer -Parallel Restore DESCRIPTION ----------- diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 870a039cd..3d55f8bb4 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -27,7 +27,7 @@ endif criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc --proto_path=. --c_out=. criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: diff --git a/plugins/amdgpu/README.md b/plugins/amdgpu/README.md index b808fbc4f..1078eafe6 100644 --- a/plugins/amdgpu/README.md +++ b/plugins/amdgpu/README.md @@ -3,8 +3,7 @@ Supporting ROCm with CRIU _Felix Kuehling _
_Rajneesh Bardwaj _
-_David Yat Sin _
-_Yanning Yang _ +_David Yat Sin _ # Introduction @@ -225,26 +224,6 @@ to resume execution on the GPUs. *This new plugin is enabled by the new hook `__RESUME_DEVICES_LATE` in our RFC patch series.* -## Restoring BO content in parallel - -Restoring the BO content is an important part in the restore of GPU state and -usually takes a significant amount of time. A possible location for this -procedure is the `cr_plugin_restore_file` hook. However, restoring in this hook -blocks the target process from performing other restore operations, which -hinders further optimization of the restore process. - -Therefore, a new plugin hook that runs in the master restore process is -introduced, and it interacts with the `cr_plugin_restore_file` hook to complete -the restore of BO content. Specifically, the target process only needs to send -the relevant BOs to the master restore process, while this new hook handles all -the restore of buffer objects. Through this method, during the restore of the BO -content, the target process can perform other restore operations, thus -accelerating the restore procedure. This is an implementation of the gCROP -method proposed in the ACM SoCC'24 paper: [On-demand and Parallel -Checkpoint/Restore for GPU Applications](https://dl.acm.org/doi/10.1145/3698038.3698510). 
- -*This optimization technique is enabled by the `__POST_FORKING` hook.* - ## Other CRIU changes In addition to the new plugins, we need to make some changes to CRIU itself to diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 69194fbc7..96c086162 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -28,13 +28,11 @@ #include "xmalloc.h" #include "criu-log.h" #include "files.h" -#include "pstree.h" #include "common/list.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" -#include "amdgpu_socket_utils.h" #include "img-streamer.h" #include "image.h" @@ -66,18 +64,6 @@ bool plugin_added_to_inventory = false; bool plugin_disabled = false; -/* - * In the case of a single process (common case), this optimization can effectively - * reduce the restore latency with parallel restore. In the case of multiple processes, - * states are already restored in parallel within different processes. Therefore, this - * optimization does not introduce further improvement and will be disabled by default - * in this case. The flag, parallel_disabled, is used to control whether the - * optimization is enabled or disabled. 
- */ -bool parallel_disabled = false; - -pthread_t parallel_thread = 0; -int parallel_thread_result = 0; /**************************************************************************************************/ /* Call ioctl, restarting if it is interrupted */ @@ -365,15 +351,6 @@ int amdgpu_plugin_init(int stage) maps_init(&restore_maps); if (stage == CR_PLUGIN_STAGE__RESTORE) { - if (has_children(root_item)) { - pr_info("Parallel restore disabled\n"); - parallel_disabled = true; - } else { - if (install_parallel_sock() < 0) { - pr_err("Failed to install parallel socket\n"); - return -1; - } - } /* Default Values */ kfd_fw_version_check = true; kfd_sdma_fw_version_check = true; @@ -1462,9 +1439,14 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e) { - struct thread_data *thread_datas = NULL; + struct thread_data *thread_datas; int thread_i, ret = 0; - int offset = 0; + + thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); + if (!thread_datas) { + ret = -ENOMEM; + goto exit; + } for (int i = 0; i < e->num_of_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; @@ -1507,101 +1489,56 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf } } - if (!parallel_disabled) { - parallel_restore_cmd restore_cmd; - pr_info("Begin to send parallel restore cmd\n"); - ret = init_parallel_restore_cmd(e->num_of_bos, id, e->num_of_gpus, &restore_cmd); - if (ret) - goto exit_parallel; + thread_i = 0; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + struct tp_node *dev; + int ret_thread = 0; + uint32_t target_gpu_id; - for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { - uint32_t target_gpu_id; - struct tp_node *dev; + if (!e->device_entries[i]->gpu_id) + continue; - if (!e->device_entries[i]->gpu_id) - continue; + /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ + 
target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); - target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); - dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); - if (!dev) { - pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); - ret = -ENODEV; - goto exit_parallel; - } - parallel_restore_gpu_id_add(e->device_entries[i]->gpu_id, dev->drm_render_minor, &restore_cmd); - - for (int j = 0; j < e->num_of_bos; j++) { - if (bo_buckets[j].gpu_id != e->device_entries[i]->gpu_id) - continue; - if (bo_buckets[j].alloc_flags & - (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { - parallel_restore_bo_add(bo_buckets[j].dmabuf_fd, bo_buckets[j].gpu_id, - bo_buckets[j].size, offset, &restore_cmd); - offset += bo_buckets[j].size; - } - } - } - ret = send_parallel_restore_cmd(&restore_cmd); -exit_parallel: - free_parallel_restore_cmd(&restore_cmd); - } else { - thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); - if (!thread_datas) { - ret = -ENOMEM; + /* We need the fd for actual_gpu_id */ + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; goto exit; } - thread_i = 0; - for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { - struct tp_node *dev; - int ret_thread = 0; - uint32_t target_gpu_id; + thread_datas[thread_i].id = id; + thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; + thread_datas[thread_i].bo_buckets = bo_buckets; + thread_datas[thread_i].bo_entries = e->bo_entries; + thread_datas[thread_i].pid = e->pid; + thread_datas[thread_i].num_of_bos = e->num_of_bos; - if (!e->device_entries[i]->gpu_id) - continue; - - /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ - target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); - - /* We need the fd for actual_gpu_id */ - dev = 
sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); - if (!dev) { - pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); - ret = -ENODEV; - goto exit; - } - - thread_datas[thread_i].id = id; - thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; - thread_datas[thread_i].bo_buckets = bo_buckets; - thread_datas[thread_i].bo_entries = e->bo_entries; - thread_datas[thread_i].pid = e->pid; - thread_datas[thread_i].num_of_bos = e->num_of_bos; - - thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); - if (thread_datas[thread_i].drm_fd < 0) { - ret = -thread_datas[thread_i].drm_fd; - goto exit; - } - - ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, - (void *)&thread_datas[thread_i]); - if (ret_thread) { - pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); - ret = -ret_thread; - goto exit; - } - thread_i++; + thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); + if (thread_datas[thread_i].drm_fd < 0) { + ret = -thread_datas[thread_i].drm_fd; + goto exit; } - for (int i = 0; i < e->num_of_gpus; i++) { - pthread_join(thread_datas[i].thread, NULL); - pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); + ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, + (void *)&thread_datas[thread_i]); + if (ret_thread) { + pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); + ret = -ret_thread; + goto exit; + } + thread_i++; + } - if (thread_datas[i].ret) { - ret = thread_datas[i].ret; - goto exit; - } + for (int i = 0; i < e->num_of_gpus; i++) { + pthread_join(thread_datas[i].thread, NULL); + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); + + if (thread_datas[i].ret) { + ret = thread_datas[i].ret; + goto exit; } } exit: @@ -1609,8 +1546,8 @@ exit: if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD) close(bo_buckets[i].dmabuf_fd); } - if (thread_datas) - 
xfree(thread_datas); + + xfree(thread_datas); return ret; } @@ -1899,24 +1836,6 @@ int amdgpu_plugin_resume_devices_late(int target_pid) if (plugin_disabled) return -ENOTSUP; - if (!parallel_disabled) { - pr_info("Close parallel restore server\n"); - if (close_parallel_restore_server()) { - pr_err("Close parallel restore server fail\n"); - return -1; - } - - exit_code = pthread_join(parallel_thread, NULL); - if (exit_code) { - pr_err("Failed to join parallel thread ret:%d\n", exit_code); - return -1; - } - if (parallel_thread_result) { - pr_err("Parallel restore fail\n"); - return parallel_thread_result; - } - } - pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); @@ -1943,244 +1862,3 @@ int amdgpu_plugin_resume_devices_late(int target_pid) } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) - -int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size, - amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) -{ - return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer, - buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); -} - -int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size) -{ - int ret = 0; - int drm_fd = -1; - uint32_t major, minor; - - struct amdgpu_gpu_info gpu_info = { 0 }; - - drm_fd = open_drm_render_device(dev_minor); - if (drm_fd < 0) { - return drm_fd; - } - - ret = amdgpu_device_initialize(drm_fd, &major, &minor, h_dev); - if (ret) { - pr_perror("Failed to initialize device"); - goto err; - } - - ret = amdgpu_query_gpu_info(*h_dev, &gpu_info); - if (ret) { - pr_perror("failed to query gpuinfo via libdrm"); - goto err; - } - *max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? 
SDMA_LINEAR_COPY_MAX_SIZE : - SDMA_LINEAR_COPY_MAX_SIZE - 1; - return 0; -err: - amdgpu_device_deinitialize(*h_dev); - return ret; -} - -FILE *get_bo_contents_fp(int id, int gpu_id, size_t tot_size) -{ - char img_path[PATH_MAX]; - size_t image_size = 0; - FILE *bo_contents_fp = NULL; - - snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, id, gpu_id); - bo_contents_fp = open_img_file(img_path, false, &image_size); - if (!bo_contents_fp) { - pr_perror("Cannot fopen %s", img_path); - return NULL; - } - - if (tot_size != image_size) { - pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, tot_size); - fclose(bo_contents_fp); - return NULL; - } - return bo_contents_fp; -} - -struct parallel_thread_data { - pthread_t thread; - uint32_t gpu_id; - int minor; - parallel_restore_cmd *restore_cmd; - int ret; -}; - -void *parallel_restore_bo_contents(void *_thread_data) -{ - struct parallel_thread_data *thread_data = (struct parallel_thread_data *)_thread_data; - amdgpu_device_handle h_dev; - uint64_t max_copy_size; - size_t total_bo_size = 0, max_bo_size = 0, buffer_size = 0; - FILE *bo_contents_fp = NULL; - parallel_restore_entry *entry; - parallel_restore_cmd *restore_cmd = thread_data->restore_cmd; - int ret = 0; - int offset = 0; - void *buffer = NULL; - - ret = init_dev(thread_data->minor, &h_dev, &max_copy_size); - if (ret) { - goto err; - } - - for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { - if (restore_cmd->entries[i].gpu_id == thread_data->gpu_id) { - total_bo_size += restore_cmd->entries[i].size; - max_bo_size = max(restore_cmd->entries[i].size, max_bo_size); - } - } - - buffer_size = kfd_max_buffer_size > 0 ? 
min(kfd_max_buffer_size, max_bo_size) : max_bo_size; - - bo_contents_fp = get_bo_contents_fp(restore_cmd->cmd_head.id, thread_data->gpu_id, total_bo_size); - if (bo_contents_fp == NULL) { - ret = -1; - goto err_sdma; - } - offset = ftell(bo_contents_fp); - - posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); - if (!buffer) { - pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); - ret = -ENOMEM; - goto err_sdma; - } - - for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { - if (restore_cmd->entries[i].gpu_id != thread_data->gpu_id) - continue; - - entry = &restore_cmd->entries[i]; - fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET); - ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer, - buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); - if (ret) { - pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); - goto err_sdma; - } - } - -err_sdma: - if (bo_contents_fp) - fclose(bo_contents_fp); - if (buffer) - xfree(buffer); - amdgpu_device_deinitialize(h_dev); -err: - thread_data->ret = ret; - return NULL; -} - -void *restore_device_parallel_worker(void *arg) -{ - while (1) { - parallel_restore_cmd restore_cmd = { 0 }; - struct parallel_thread_data *thread_datas = NULL; - int ret; - int error_occurred = 0, join_ret = 0, created_threads = 0; - - ret = recv_parallel_restore_cmd(&restore_cmd); - if (ret) { - if (ret == 1) { - *(int *)arg = 0; - goto exit; - } - goto err; - } - - thread_datas = xzalloc(sizeof(*thread_datas) * restore_cmd.cmd_head.gpu_num); - if (!thread_datas) { - ret = -ENOMEM; - goto err; - } - - for (; created_threads < restore_cmd.cmd_head.gpu_num; created_threads++) { - thread_datas[created_threads].gpu_id = restore_cmd.gpu_ids[created_threads].gpu_id; - thread_datas[created_threads].minor = restore_cmd.gpu_ids[created_threads].minor; - thread_datas[created_threads].restore_cmd = &restore_cmd; - - ret = 
pthread_create(&thread_datas[created_threads].thread, NULL, parallel_restore_bo_contents, - (void *)&thread_datas[created_threads]); - if (ret) { - pr_err("Failed to create thread[0x%x] ret:%d\n", thread_datas[created_threads].gpu_id, ret); - error_occurred = 1; - break; - } - } - - for (int i = 0; i < created_threads; i++) { - join_ret = pthread_join(thread_datas[i].thread, NULL); - if (join_ret != 0) { - pr_err("pthread_join failed for Thread[0x%x] ret:%d\n", - thread_datas[i].gpu_id, join_ret); - if (!error_occurred) { - ret = join_ret; - error_occurred = 1; - } - } - - pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - - /* Check thread return value */ - if (thread_datas[i].ret && !error_occurred) { - ret = thread_datas[i].ret; - error_occurred = 1; - } - } - - if (thread_datas) - xfree(thread_datas); -err: - free_parallel_restore_cmd(&restore_cmd); - - if (ret) { - *(int *)arg = ret; - return NULL; - } - } -exit: - return NULL; -} - -/* - * While the background thread is running, some processing functions (e.g., stop_cgroupd) - * in the main thread need to block SIGCHLD. To prevent interference from this background - * thread, SIGCHLD is blocked in this thread. 
- */ -static int back_thread_create(pthread_t *newthread, void *(*f)(void *), void *arg) -{ - int ret = 0; - sigset_t blockmask, oldmask; - - sigemptyset(&blockmask); - sigaddset(&blockmask, SIGCHLD); - sigprocmask(SIG_BLOCK, &blockmask, &oldmask); - - ret = pthread_create(newthread, NULL, f, arg); - if (ret) { - pr_err("Create worker thread fail: %d\n", ret); - return -1; - } - - sigprocmask(SIG_SETMASK, &oldmask, NULL); - return 0; -} - -int amdgpu_plugin_post_forking(void) -{ - if (plugin_disabled) - return -ENOTSUP; - - if (parallel_disabled) - return 0; - - return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result); -} -CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking) \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index 730f2e028..5b4396a0c 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -45,7 +45,7 @@ bool kfd_capability_check = true; */ int fd_next = -1; -int open_drm_render_device(int minor) +static int open_drm_render_device(int minor) { char path[128]; int fd, ret_fd; diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h index e19f8e7ce..c890e3dda 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.h +++ b/plugins/amdgpu/amdgpu_plugin_topology.h @@ -118,7 +118,6 @@ struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32 struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor); struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index); -int open_drm_render_device(int minor); int node_get_drm_render_device(struct tp_node *node); void sys_close_drm_render_devices(struct tp_system *sys); diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c deleted file mode 100644 index c8bf6d1ba..000000000 --- 
a/plugins/amdgpu/amdgpu_socket_utils.c +++ /dev/null @@ -1,320 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include "amdgpu_socket_utils.h" -#include "criu-log.h" -#include "common/scm.h" -#include "fdstore.h" -#include "util-pie.h" -#include "util.h" - -int parallel_socket_addr_len; -struct sockaddr_un parallel_socket_addr; -int parallel_socket_id = 0; - -static void amdgpu_socket_name_gen(struct sockaddr_un *addr, int *len) -{ - addr->sun_family = AF_UNIX; - snprintf(addr->sun_path, UNIX_PATH_MAX, "x/criu-amdgpu-parallel-%s", criu_run_id); - *len = SUN_LEN(addr); - *addr->sun_path = '\0'; -} - -int install_parallel_sock(void) -{ - int ret = 0; - int sock_fd; - - sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - if (sock_fd < 0) { - pr_perror("socket creation failed"); - return -1; - } - - amdgpu_socket_name_gen(&parallel_socket_addr, &parallel_socket_addr_len); - ret = bind(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); - if (ret < 0) { - pr_perror("bind failed"); - goto err; - } - - ret = listen(sock_fd, SOMAXCONN); - if (ret < 0) { - pr_perror("listen failed"); - goto err; - } - - parallel_socket_id = fdstore_add(sock_fd); - if (parallel_socket_id < 0) { - ret = -1; - goto err; - } -err: - close(sock_fd); - return ret; -} - -void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, - parallel_restore_cmd *restore_cmd) -{ - parallel_restore_entry *restore_entry = &restore_cmd->entries[restore_cmd->cmd_head.entry_num]; - restore_entry->gpu_id = gpu_id; - restore_entry->write_id = restore_cmd->cmd_head.fd_write_num; - restore_entry->write_offset = 0; - restore_entry->read_offset = offset; - restore_entry->size = size; - - restore_cmd->fds_write[restore_cmd->cmd_head.fd_write_num] = dmabuf_fd; - - restore_cmd->cmd_head.entry_num += 1; - restore_cmd->cmd_head.fd_write_num += 1; -} - -void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd 
*restore_cmd) -{ - restore_cmd->gpu_ids[restore_cmd->cmd_head.gpu_num] = (parallel_gpu_info){ gpu_id, minor }; - restore_cmd->cmd_head.gpu_num += 1; -} - -static int send_metadata(int sock_fd, parallel_restore_cmd *restore_cmd) -{ - if (send(sock_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { - pr_perror("Send parallel restore command head fail"); - return -1; - } - return 0; -} - -static int send_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd) -{ - if (send(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { - pr_perror("Send GPU ids of parallel restore command fail"); - return -1; - } - return 0; -} - -static int send_cmds(int sock_fd, parallel_restore_cmd *restore_cmd) -{ - if (send(sock_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { - pr_perror("Send parallel restore command fail"); - return -1; - } - return 0; -} - -static int send_dmabuf_fds(int sock_fd, parallel_restore_cmd *restore_cmd) -{ - if (send_fds(sock_fd, NULL, 0, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { - pr_perror("Send dmabuf fds fail"); - return -1; - } - return 0; -} - -int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) -{ - int sock_fd; - int ret = 0; - - sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - if (sock_fd < 0) { - pr_perror("Socket creation failed"); - return -1; - } - - ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); - if (ret < 0) { - pr_perror("Connect failed"); - goto err; - } - - ret = send_metadata(sock_fd, restore_cmd); - if (ret) { - goto err; - } - - ret = send_gpu_ids(sock_fd, restore_cmd); - if (ret) { - goto err; - } - - ret = send_cmds(sock_fd, restore_cmd); - if (ret) { - goto err; - } - - ret = send_dmabuf_fds(sock_fd, restore_cmd); - -err: - close(sock_fd); - return ret; -} - -int init_parallel_restore_cmd(int num, int id, int gpu_num, 
parallel_restore_cmd *restore_cmd) -{ - restore_cmd->cmd_head.id = id; - restore_cmd->cmd_head.fd_write_num = 0; - restore_cmd->cmd_head.entry_num = 0; - restore_cmd->cmd_head.gpu_num = 0; - - restore_cmd->gpu_ids = xzalloc(gpu_num * sizeof(parallel_gpu_info)); - if (!restore_cmd->gpu_ids) - return -ENOMEM; - restore_cmd->fds_write = xzalloc(num * sizeof(int)); - if (!restore_cmd->fds_write) - return -ENOMEM; - restore_cmd->entries = xzalloc(num * sizeof(parallel_restore_entry)); - if (!restore_cmd->entries) - return -ENOMEM; - return 0; -} - -void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) -{ - if (restore_cmd->gpu_ids) - xfree(restore_cmd->gpu_ids); - if (restore_cmd->fds_write) - xfree(restore_cmd->fds_write); - if (restore_cmd->entries) - xfree(restore_cmd->entries); -} - -static int init_parallel_restore_cmd_by_head(parallel_restore_cmd *restore_cmd) -{ - restore_cmd->gpu_ids = xzalloc(restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info)); - if (!restore_cmd->gpu_ids) - return -ENOMEM; - restore_cmd->fds_write = xzalloc(restore_cmd->cmd_head.fd_write_num * sizeof(int)); - if (!restore_cmd->fds_write) - return -ENOMEM; - restore_cmd->entries = xzalloc(restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry)); - if (!restore_cmd->entries) - return -ENOMEM; - return 0; -} - -static int check_quit_cmd(parallel_restore_cmd *restore_cmd) -{ - return restore_cmd->cmd_head.fd_write_num == 0; -} - -static int recv_metadata(int client_fd, parallel_restore_cmd *restore_cmd) -{ - if (recv(client_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { - pr_perror("Recv parallel restore command head fail"); - return -1; - } - return 0; -} - -static int recv_cmds(int client_fd, parallel_restore_cmd *restore_cmd) -{ - if (recv(client_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { - pr_perror("Recv parallel restore command fail"); - return -1; - } - return 0; -} - -static int 
recv_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd) -{ - if (recv(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { - pr_perror("Send GPU ids of parallel restore command fail"); - return -1; - } - return 0; -} - -static int recv_dmabuf_fds(int client_fd, parallel_restore_cmd *restore_cmd) -{ - if (recv_fds(client_fd, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { - pr_perror("Recv dmabuf fds fail"); - return -1; - } - return 0; -} - -int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) -{ - int sock_fd, client_fd; - int ret = 0; - - sock_fd = fdstore_get(parallel_socket_id); - if (sock_fd < 0) - return -1; - - client_fd = accept(sock_fd, NULL, NULL); - if (client_fd < 0) { - ret = client_fd; - goto err_accept; - } - - ret = recv_metadata(client_fd, restore_cmd); - if (ret) { - goto err; - } - - // Return 1 to quit - if (check_quit_cmd(restore_cmd)) { - ret = 1; - goto err; - } - - ret = init_parallel_restore_cmd_by_head(restore_cmd); - if (ret) { - goto err; - } - - ret = recv_gpu_ids(client_fd, restore_cmd); - if (ret) { - goto err; - } - - ret = recv_cmds(client_fd, restore_cmd); - if (ret) { - goto err; - } - - ret = recv_dmabuf_fds(client_fd, restore_cmd); - -err: - close(client_fd); -err_accept: - close(sock_fd); - return ret; -} - -int close_parallel_restore_server(void) -{ - int sock_fd; - int ret = 0; - parallel_restore_cmd_head cmd_head; - - sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - if (sock_fd < 0) { - pr_perror("Socket creation failed"); - return -1; - } - - ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); - if (ret < 0) { - pr_perror("Connect failed"); - goto err; - } - - memset(&cmd_head, 0, sizeof(parallel_restore_cmd_head)); - if (send(sock_fd, &cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { - pr_perror("Send parallel restore command head fail"); - return -1; - } - -err: - close(sock_fd); - 
return ret; -} \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h deleted file mode 100644 index d7200c6bd..000000000 --- a/plugins/amdgpu/amdgpu_socket_utils.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ -#define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ - -typedef struct { - int id; - int fd_write_num; /* The number of buffer objects to be restored. */ - int entry_num; /* The number of restore commands.*/ - int gpu_num; -} parallel_restore_cmd_head; - -typedef struct { - int gpu_id; - int minor; -} parallel_gpu_info; - -typedef struct { - int gpu_id; - int write_id; - uint64_t read_offset; - uint64_t write_offset; - uint64_t size; -} parallel_restore_entry; - -typedef struct { - parallel_restore_cmd_head cmd_head; - int *fds_write; - parallel_gpu_info *gpu_ids; - parallel_restore_entry *entries; -} parallel_restore_cmd; - -/* - * For parallel_restore, a background thread in the main CRIU process is used to restore the GPU - * buffer object. However, initially, the ownership of these buffer objects and the metadata for - * restoration are all with the target process. Therefore, we introduce a series of functions to - * help the target process send these tasks to the main CRIU process. 
- */ -int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd); - -void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); - -int install_parallel_sock(void); - -int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); - -int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); - -void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, - parallel_restore_cmd *restore_cmd); - -void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd); - -int close_parallel_restore_server(void); - -#endif \ No newline at end of file From a525b3c32ea0a4b8bff66ad31941fc574914d18d Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 9 Nov 2025 20:26:50 -0800 Subject: [PATCH 720/775] test/vdso-proxy: handle merged vma-s When we compare two list of vma-s, we need to take into account that some of them could be merged. Fixes #12286 Signed-off-by: Andrei Vagin --- test/zdtm/static/vdso-proxy.c | 51 +++++++++++++++-------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/test/zdtm/static/vdso-proxy.c b/test/zdtm/static/vdso-proxy.c index 43334974f..a53e6cdc0 100644 --- a/test/zdtm/static/vdso-proxy.c +++ b/test/zdtm/static/vdso-proxy.c @@ -70,6 +70,7 @@ static int parse_maps(struct vm_area *vmas) #endif v->is_vvar_or_vdso |= strstr(buf, "[vdso]") != NULL; v->is_vvar_or_vdso |= strstr(buf, "[vvar]") != NULL; + v->is_vvar_or_vdso |= strstr(buf, "[vvar_vclock]") != NULL; test_msg("[NOTE]\tVMA: [%#" PRIx64 ", %#" PRIx64 "]\n", v->start, v->end); } @@ -86,42 +87,35 @@ static int parse_maps(struct vm_area *vmas) return i; } -int compare_vmas(struct vm_area *vmax, struct vm_area *vmay) -{ - if (vmax->start > vmay->start) - return 1; - if (vmax->start < vmay->start) - return -1; - if (vmax->end > vmay->end) - return 1; - if (vmax->end < vmay->end) - return -1; - - return 0; -} - -static int check_vvar_vdso(struct vm_area *before, struct 
vm_area *after) +static int check_vvar_vdso(struct vm_area *before, int nr_before, struct vm_area *after, int nr_after) { int i, j = 0; - for (i = 0; i < MAX_VMAS && j < MAX_VMAS; i++, j++) { - int cmp = compare_vmas(&before[i], &after[j]); - - if (cmp == 0) - continue; - - if (cmp < 0) { /* Lost mapping */ + for (i = 0, j = 0; i < nr_before || j < nr_after;) { + if (j == nr_after || before[i].start < after[j].start) { test_msg("[NOTE]\tLost mapping: %#" PRIx64 "-%#" PRIx64 "\n", before[i].start, before[i].end); - j--; if (before[i].is_vvar_or_vdso) { fail("Lost vvar/vdso mapping"); return -1; } + i++; continue; } - - test_msg("[NOTE]\tNew mapping appeared: %#" PRIx64 "-%#" PRIx64 "\n", after[j].start, after[j].end); - i--; + if (i == nr_before || before[i].start > after[j].start) { + test_msg("[NOTE]\tNew mapping appeared: %#" PRIx64 "-%#" PRIx64 "\n", after[j].start, after[j].end); + j++; + continue; + } + if (before[i].end == after[j].end) { + i++; + j++; + } else if (before[i].end > after[j].end) { + before[i].start = after[j].end; + j++; + } else { + after[j].start = before[i].end; + i++; + } } return 0; @@ -129,11 +123,10 @@ static int check_vvar_vdso(struct vm_area *before, struct vm_area *after) static struct vm_area vmas_before[MAX_VMAS]; static struct vm_area vmas_after[MAX_VMAS]; +static int nr_before, nr_after; int main(int argc, char *argv[]) { - int nr_before, nr_after; - test_init(argc, argv); test_msg("[NOTE]\tMappings before:\n"); @@ -154,7 +147,7 @@ int main(int argc, char *argv[]) } /* After restore vDSO/VVAR blobs must remain in the old place. 
*/ - if (check_vvar_vdso(vmas_before, vmas_after)) + if (check_vvar_vdso(vmas_before, nr_before, vmas_after, nr_after)) return -1; if (nr_before + 2 < nr_after) { From 6344e8d71c57d44600fca0c34ec64827688c737d Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 11 Nov 2025 22:10:36 +0000 Subject: [PATCH 721/775] cr-servce: move kerndat_init after log_init kerndat_init() can generate a significant volume of logs. If called before log_init(), all these messages will be saved in the early_log_buffer, which has a limited capacity. Additionally, saving to the early_log_buffer can introduce a performance penalty, especially when verbose mode is not enabled. Signed-off-by: Radostin Stoyanov Signed-off-by: Andrei Vagin --- criu/cr-service.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/criu/cr-service.c b/criu/cr-service.c index b4718dde2..dccf4ef38 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -439,12 +439,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_unprivileged) opts.unprivileged = req->unprivileged; - if (check_caps()) - return 1; - - if (kerndat_init()) - return 1; - if (log_keep_err()) { pr_perror("Can't tune log"); goto err; @@ -738,9 +732,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) } } - if (req->has_pidfd_store_sk && init_pidfd_store_sk(ids.pid, req->pidfd_store_sk)) - goto err; - if (req->orphan_pts_master) opts.orphan_pts_master = true; @@ -817,6 +808,16 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (setup_logging_from_req(req, output_changed_by_rpc_conf)) goto err; + if (check_caps()) + goto err; + + if (kerndat_init()) + goto err; + + /* init_pidfd_store_sk must be called after kerndat_init. 
*/ + if (req->has_pidfd_store_sk && init_pidfd_store_sk(ids.pid, req->pidfd_store_sk)) + goto err; + if (req->mntns_compat_mode) opts.mntns_compat_mode = true; From e689d902b3d5dabcad8107c00a463a607ef49ebb Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 11 Nov 2025 15:21:09 -0800 Subject: [PATCH 722/775] criu/log: properly handle truncated length from vsnprintf vsnprintf does not always return the number of bytes actually written to the buffer. If the output was truncated due to the buffer limit, the return value is the total number of bytes which WOULD have been written to the final string if enough space had been available. This means we must cap the return value to the buffer size excluding the terminating null byte to correctly calculate the log entry size. Signed-off-by: Andrei Vagin --- criu/log.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/criu/log.c b/criu/log.c index a02a8df20..fe7077702 100644 --- a/criu/log.c +++ b/criu/log.c @@ -202,7 +202,7 @@ void flush_early_log_buffer(int fd) } pos += hdr->len; } - if (early_log_buf_off == EARLY_LOG_BUF_LEN) + if ((early_log_buf_off + sizeof(struct early_log_hdr)) >= EARLY_LOG_BUF_LEN) pr_warn("The early log buffer is full, some messages may have been lost\n"); early_log_buf_off = 0; } @@ -320,7 +320,7 @@ unsigned int log_get_loglevel(void) static void early_vprint(const char *format, unsigned int loglevel, va_list params) { - unsigned int log_size = 0; + int log_size = 0, log_space; struct early_log_hdr *hdr; if ((early_log_buf_off + sizeof(hdr)) >= EARLY_LOG_BUF_LEN) @@ -332,6 +332,7 @@ static void early_vprint(const char *format, unsigned int loglevel, va_list para hdr->level = loglevel; /* Skip the log entry size */ early_log_buf_off += sizeof(hdr); + log_space = EARLY_LOG_BUF_LEN - early_log_buf_off; if (loglevel >= LOG_TIMESTAMP) { /* * If logging is not yet setup we just write zeros @@ -339,12 +340,17 @@ static void early_vprint(const char *format, unsigned int 
loglevel, va_list para * keep the same format as the other messages on * log levels with timestamps (>=LOG_TIMESTAMP). */ - log_size = snprintf(early_log_buffer + early_log_buf_off, sizeof(early_log_buffer) - early_log_buf_off, + log_size = snprintf(early_log_buffer + early_log_buf_off, log_space, "(00.000000) "); } - log_size += vsnprintf(early_log_buffer + early_log_buf_off + log_size, - sizeof(early_log_buffer) - early_log_buf_off - log_size, format, params); + if (log_size < log_space) + log_size += vsnprintf(early_log_buffer + early_log_buf_off + log_size, + log_space - log_size, format, params); + if (log_size > log_space) { + /* vsnprintf always add the terminating null byte. */ + log_size = log_space - 1; + } /* Save log entry size */ hdr->len = log_size; From 0a7e7d09dd91354277e697495bd8fb05626987a9 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 12 Nov 2025 05:50:23 +0000 Subject: [PATCH 723/775] log: use sizeof(*hdr) instead of sizeof(hdr) Using sizeof(hdr) where hdr is a pointer gives the size of the pointer, not the size of the structure it points to. Reported-by: Kir Kolyshkin Signed-off-by: Andrei Vagin --- criu/log.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/criu/log.c b/criu/log.c index fe7077702..bf6f657f2 100644 --- a/criu/log.c +++ b/criu/log.c @@ -190,7 +190,7 @@ void flush_early_log_buffer(int fd) * with reading the log_level. 
*/ struct early_log_hdr *hdr = (void *)early_log_buffer + pos; - pos += sizeof(hdr); + pos += sizeof(*hdr); if (hdr->level <= current_loglevel) { size_t size = 0; while (size < hdr->len) { @@ -323,7 +323,7 @@ static void early_vprint(const char *format, unsigned int loglevel, va_list para int log_size = 0, log_space; struct early_log_hdr *hdr; - if ((early_log_buf_off + sizeof(hdr)) >= EARLY_LOG_BUF_LEN) + if ((early_log_buf_off + sizeof(*hdr)) >= EARLY_LOG_BUF_LEN) return; /* Save loglevel */ @@ -331,7 +331,7 @@ static void early_vprint(const char *format, unsigned int loglevel, va_list para hdr = (void *)early_log_buffer + early_log_buf_off; hdr->level = loglevel; /* Skip the log entry size */ - early_log_buf_off += sizeof(hdr); + early_log_buf_off += sizeof(*hdr); log_space = EARLY_LOG_BUF_LEN - early_log_buf_off; if (loglevel >= LOG_TIMESTAMP) { /* From 3c7d4fa013297b431da48eff821db7f2e8b90c27 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sat, 8 Nov 2025 06:53:19 +0000 Subject: [PATCH 724/775] criu: Version 4.2 (CRIUTIBILITY) Major changes: * plugins/amdgpu: Implement parallel restore * Handle processes with uprobes vma * Fix: getsockopt usage for SO_PASSCRED/SO_PASSSEC on Linux 6.16 * Relax ELF magic check to support MIPS libraries * pagemap: prevent integer overflow in pagemap_len This release's name is a nod to the growing challenge we face in maintaining compatibility across the rapidly evolving Linux kernel ecosystem. The full changelog can be found here: https://criu.org/Download/criu/4.2. Signed-off-by: Andrei Vagin --- Makefile.versions | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.versions b/Makefile.versions index 0b1a46a16..3e6c9ed22 100644 --- a/Makefile.versions +++ b/Makefile.versions @@ -1,10 +1,10 @@ # # CRIU version. 
CRIU_VERSION_MAJOR := 4 -CRIU_VERSION_MINOR := 1 -CRIU_VERSION_SUBLEVEL := 1 +CRIU_VERSION_MINOR := 2 +CRIU_VERSION_SUBLEVEL := CRIU_VERSION_EXTRA := -CRIU_VERSION_NAME := CRISCV +CRIU_VERSION_NAME := CRIUTIBILITY CRIU_VERSION := $(CRIU_VERSION_MAJOR)$(if $(CRIU_VERSION_MINOR),.$(CRIU_VERSION_MINOR))$(if $(CRIU_VERSION_SUBLEVEL),.$(CRIU_VERSION_SUBLEVEL))$(if $(CRIU_VERSION_EXTRA),.$(CRIU_VERSION_EXTRA)) export CRIU_VERSION_MAJOR CRIU_VERSION_MINOR CRIU_VERSION_SUBLEVEL From ddbb3dbd8d84a785ad211be42d2ba0d034c0291f Mon Sep 17 00:00:00 2001 From: Pengda Yang Date: Wed, 15 Mar 2023 16:58:31 +0800 Subject: [PATCH 725/775] limit the field width of 'scanf' Fixes: #2121 Signed-off-by: Pengda Yang --- criu/proc_parse.c | 6 +++--- test/zdtm/lib/fs.c | 2 +- test/zdtm/static/apparmor.c | 2 +- test/zdtm/static/apparmor_stacking.c | 2 +- test/zdtm/static/cgroup01.c | 2 +- test/zdtm/static/cgroup02.c | 2 +- test/zdtm/static/change_mnt_context.c | 2 +- test/zdtm/static/file_locks01.c | 2 +- test/zdtm/static/file_locks02.c | 2 +- test/zdtm/static/file_locks03.c | 2 +- test/zdtm/static/file_locks04.c | 2 +- test/zdtm/static/netns-dev.c | 2 +- test/zdtm/static/ofd_file_locks.c | 2 +- 13 files changed, 15 insertions(+), 15 deletions(-) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 0d3b5b23f..f51f2e801 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -1477,7 +1477,7 @@ static int parse_mountinfo_ent(char *str, struct mount_info *new, char **fsname) goto err; new->mountpoint[0] = '.'; - ret = sscanf(str, "%i %i %u:%u %ms %s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, + ret = sscanf(str, "%i %i %u:%u %ms %4094s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, new->mountpoint + 1, &opt, &n); if (ret != 7) goto err; @@ -2302,10 +2302,10 @@ static int parse_file_lock_buf(char *buf, struct file_lock *fl, bool is_blocked) char fl_flag[10], fl_type[15], fl_option[10]; if (is_blocked) { - num = sscanf(buf, "%lld: -> %s %s %s 
%d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = sscanf(buf, "%lld: -> %9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } else { - num = sscanf(buf, "%lld:%s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = sscanf(buf, "%lld:%9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } diff --git a/test/zdtm/lib/fs.c b/test/zdtm/lib/fs.c index bf8cd9cd3..efcc7a1d0 100644 --- a/test/zdtm/lib/fs.c +++ b/test/zdtm/lib/fs.c @@ -54,7 +54,7 @@ mnt_info_t *get_cwd_mnt_info(void) while (fgets(str, sizeof(str), f)) { char *hyphen = strchr(str, '-'); - ret = sscanf(str, "%i %i %u:%u %s %s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); + ret = sscanf(str, "%i %i %u:%u %4095s %4095s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); if (ret != 6 || !hyphen) goto err; ret = sscanf(hyphen + 1, " %ms", &fsname); diff --git a/test/zdtm/static/apparmor.c b/test/zdtm/static/apparmor.c index 713ffaa46..dc1636821 100644 --- a/test/zdtm/static/apparmor.c +++ b/test/zdtm/static/apparmor.c @@ -59,7 +59,7 @@ int checkprofile(void) return -1; } - len = fscanf(f, "%[^ \n]s", profile); + len = fscanf(f, "%1023[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/apparmor_stacking.c b/test/zdtm/static/apparmor_stacking.c index 76de8b8b4..0bc36048c 100644 --- a/test/zdtm/static/apparmor_stacking.c +++ b/test/zdtm/static/apparmor_stacking.c @@ -56,7 +56,7 @@ static int checkprofile(pid_t pid, char *expected) return -1; } - len = fscanf(f, "%[^ \n]s", profile); + len = fscanf(f, "%1023[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/cgroup01.c b/test/zdtm/static/cgroup01.c index bc8515264..7bfb67762 
100644 --- a/test/zdtm/static/cgroup01.c +++ b/test/zdtm/static/cgroup01.c @@ -79,7 +79,7 @@ int main(int argc, char **argv) if (!s) continue; - sscanf(paux, "%*d %*d %*d:%*d %*s %s", aux); + sscanf(paux, "%*d %*d %*d:%*d %*s %1023s", aux); test_msg("found cgroup at %s\n", aux); for (i = 0; i < 2; i++) { diff --git a/test/zdtm/static/cgroup02.c b/test/zdtm/static/cgroup02.c index 6229a8a08..8a925c0a4 100644 --- a/test/zdtm/static/cgroup02.c +++ b/test/zdtm/static/cgroup02.c @@ -75,7 +75,7 @@ bool test_exists(char *mountinfo_line, char *path) char aux[1024], paux[1024]; struct stat st; - sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %s", aux); + sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %1023s", aux); test_msg("found cgroup at %s\n", aux); ssprintf(paux, "%s/%s", aux, path); diff --git a/test/zdtm/static/change_mnt_context.c b/test/zdtm/static/change_mnt_context.c index 6d436014b..8787ae5cf 100644 --- a/test/zdtm/static/change_mnt_context.c +++ b/test/zdtm/static/change_mnt_context.c @@ -46,7 +46,7 @@ int main(int argc, char **argv) if (!pos) continue; - result = sscanf(pos, " - %*s %*s %s", opts); + result = sscanf(pos, " - %*s %*s %1023s", opts); if (result != 1) { fail("Not able to sscanf line from mountinfo"); goto out; diff --git a/test/zdtm/static/file_locks01.c b/test/zdtm/static/file_locks01.c index beea171f5..bfdca51d9 100644 --- a/test/zdtm/static/file_locks01.c +++ b/test/zdtm/static/file_locks01.c @@ -107,7 +107,7 @@ static int check_file_lock(int fd, char *expected_type, char *expected_option, u memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d %x:%x:%ld %*d %*s", fl_flag, fl_type, fl_option, &fl_owner, &maj, + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %x:%x:%ld %*d %*s", fl_flag, fl_type, fl_option, &fl_owner, &maj, &min, &i_no); if (num < 7) { pr_err("Invalid lock info\n"); diff --git a/test/zdtm/static/file_locks02.c b/test/zdtm/static/file_locks02.c index d2049ebaa..ae4827de9 
100644 --- a/test/zdtm/static/file_locks02.c +++ b/test/zdtm/static/file_locks02.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d", fl_flag, fl_type, fl_option, &fl_owner); + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks03.c b/test/zdtm/static/file_locks03.c index 35ef41a21..228e66892 100644 --- a/test/zdtm/static/file_locks03.c +++ b/test/zdtm/static/file_locks03.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d", fl_flag, fl_type, fl_option, &fl_owner); + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks04.c b/test/zdtm/static/file_locks04.c index 11d224fa7..7e0d2654e 100644 --- a/test/zdtm/static/file_locks04.c +++ b/test/zdtm/static/file_locks04.c @@ -34,7 +34,7 @@ static int check_file_locks(pid_t child_pid, int fd, int child_fd) continue; test_msg("c: %s", buf); - num = sscanf(buf, "%*s %*d:%s %s %s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { diff --git a/test/zdtm/static/netns-dev.c b/test/zdtm/static/netns-dev.c index 1e6ee1dea..f268f2fec 100644 --- a/test/zdtm/static/netns-dev.c +++ b/test/zdtm/static/netns-dev.c @@ -414,7 +414,7 @@ static int check_stable_secret(struct test_conf *tc) return -1; } - ret = fscanf(fp, "%s", val); + ret = fscanf(fp, "%200s", val); if (ret != 1) { pr_perror("fscanf"); fclose(fp); diff 
--git a/test/zdtm/static/ofd_file_locks.c b/test/zdtm/static/ofd_file_locks.c index 68b6f22f5..a68fa38ee 100644 --- a/test/zdtm/static/ofd_file_locks.c +++ b/test/zdtm/static/ofd_file_locks.c @@ -16,7 +16,7 @@ static int parse_ofd_lock(char *buf, struct flock *lck) if (strncmp(buf, "lock:\t", 6) != 0) return 1; /* isn't lock, skip record */ - num = sscanf(buf, "%*s %*d: %s %s %s %*d %*x:%*x:%*d %lld %s", fl_flag, fl_type, fl_option, &start, fl_end); + num = sscanf(buf, "%*s %*d: %9s %14s %9s %*d %*x:%*x:%*d %lld %31s", fl_flag, fl_type, fl_option, &start, fl_end); if (num < 4) { pr_err("Invalid lock info %s\n", buf); From 63861407544172a04c8b03d3387ea6a8b23d9be2 Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:36:33 +0000 Subject: [PATCH 726/775] plugins/amdgpu: Add socket operations When enabling parallel restore, the target process and the main CRIU process need an IPC interface to communicate and transfer restore commands. This patch adds a Unix domain TCP socket and stores this socket in `fdstore`. 
Signed-off-by: Yanning Yang --- plugins/amdgpu/amdgpu_socket_utils.c | 59 ++++++++++++++++++++++++++++ plugins/amdgpu/amdgpu_socket_utils.h | 6 +++ 2 files changed, 65 insertions(+) create mode 100644 plugins/amdgpu/amdgpu_socket_utils.c create mode 100644 plugins/amdgpu/amdgpu_socket_utils.h diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c new file mode 100644 index 000000000..9e957ae54 --- /dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.c @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include +#include + +#include "amdgpu_socket_utils.h" +#include "criu-log.h" +#include "common/scm.h" +#include "fdstore.h" +#include "util-pie.h" +#include "util.h" + +int parallel_socket_addr_len; +struct sockaddr_un parallel_socket_addr; +int parallel_socket_id = 0; + +static void amdgpu_socket_name_gen(struct sockaddr_un *addr, int *len) +{ + addr->sun_family = AF_UNIX; + snprintf(addr->sun_path, UNIX_PATH_MAX, "x/criu-amdgpu-parallel-%s", criu_run_id); + *len = SUN_LEN(addr); + *addr->sun_path = '\0'; +} + +int install_parallel_sock(void) +{ + int ret = 0; + int sock_fd; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("socket creation failed"); + return -1; + } + + amdgpu_socket_name_gen(&parallel_socket_addr, &parallel_socket_addr_len); + ret = bind(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("bind failed"); + goto err; + } + + ret = listen(sock_fd, SOMAXCONN); + if (ret < 0) { + pr_perror("listen failed"); + goto err; + } + + parallel_socket_id = fdstore_add(sock_fd); + if (parallel_socket_id < 0) { + ret = -1; + goto err; + } +err: + close(sock_fd); + return ret; +} \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h new file mode 100644 index 000000000..4e7aa2aa4 --- /dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.h @@ -0,0 +1,6 @@ +#ifndef
__KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ +#define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ + +int install_parallel_sock(void); + +#endif \ No newline at end of file From 33ed774c8dd13fc48955557434bad9908031379e Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:38:48 +0000 Subject: [PATCH 727/775] plugins/amdgpu: Add parallel restore command Currently the restore of buffer object consumes a significant amount of time. However, this part has no logical dependencies with other restore operations. This patch introduces some structures and some helper functions for the target process to offload this task to the main CRIU process. Signed-off-by: Yanning Yang --- plugins/amdgpu/amdgpu_socket_utils.c | 261 +++++++++++++++++++++++++++ plugins/amdgpu/amdgpu_socket_utils.h | 48 +++++ 2 files changed, 309 insertions(+) diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c index 9e957ae54..c8bf6d1ba 100644 --- a/plugins/amdgpu/amdgpu_socket_utils.c +++ b/plugins/amdgpu/amdgpu_socket_utils.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "amdgpu_socket_utils.h" #include "criu-log.h" @@ -53,6 +54,266 @@ int install_parallel_sock(void) ret = -1; goto err; } +err: + close(sock_fd); + return ret; +} + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd) +{ + parallel_restore_entry *restore_entry = &restore_cmd->entries[restore_cmd->cmd_head.entry_num]; + restore_entry->gpu_id = gpu_id; + restore_entry->write_id = restore_cmd->cmd_head.fd_write_num; + restore_entry->write_offset = 0; + restore_entry->read_offset = offset; + restore_entry->size = size; + + restore_cmd->fds_write[restore_cmd->cmd_head.fd_write_num] = dmabuf_fd; + + restore_cmd->cmd_head.entry_num += 1; + restore_cmd->cmd_head.fd_write_num += 1; +} + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd) +{ +
restore_cmd->gpu_ids[restore_cmd->cmd_head.gpu_num] = (parallel_gpu_info){ gpu_id, minor }; + restore_cmd->cmd_head.gpu_num += 1; +} + +static int send_metadata(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + return -1; + } + return 0; +} + +static int send_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Send GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_cmds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Send parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_dmabuf_fds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send_fds(sock_fd, NULL, 0, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Send dmabuf fds fail"); + return -1; + } + return 0; +} + +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd; + int ret = 0; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + ret = send_metadata(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_gpu_ids(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_cmds(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_dmabuf_fds(sock_fd, restore_cmd); + +err: + close(sock_fd); + return ret; +} + +int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd
*restore_cmd) +{ + restore_cmd->cmd_head.id = id; + restore_cmd->cmd_head.fd_write_num = 0; + restore_cmd->cmd_head.entry_num = 0; + restore_cmd->cmd_head.gpu_num = 0; + + restore_cmd->gpu_ids = xzalloc(gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + if (restore_cmd->gpu_ids) + xfree(restore_cmd->gpu_ids); + if (restore_cmd->fds_write) + xfree(restore_cmd->fds_write); + if (restore_cmd->entries) + xfree(restore_cmd->entries); +} + +static int init_parallel_restore_cmd_by_head(parallel_restore_cmd *restore_cmd) +{ + restore_cmd->gpu_ids = xzalloc(restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(restore_cmd->cmd_head.fd_write_num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +static int check_quit_cmd(parallel_restore_cmd *restore_cmd) +{ + return restore_cmd->cmd_head.fd_write_num == 0; +} + +static int recv_metadata(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Recv parallel restore command head fail"); + return -1; + } + return 0; +} + +static int recv_cmds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Recv parallel restore command fail"); + return -1; + } + return 0; +} + +static int recv_gpu_ids(int 
sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Send GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int recv_dmabuf_fds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv_fds(client_fd, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Recv dmabuf fds fail"); + return -1; + } + return 0; +} + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd, client_fd; + int ret = 0; + + sock_fd = fdstore_get(parallel_socket_id); + if (sock_fd < 0) + return -1; + + client_fd = accept(sock_fd, NULL, NULL); + if (client_fd < 0) { + ret = client_fd; + goto err_accept; + } + + ret = recv_metadata(client_fd, restore_cmd); + if (ret) { + goto err; + } + + // Return 1 to quit + if (check_quit_cmd(restore_cmd)) { + ret = 1; + goto err; + } + + ret = init_parallel_restore_cmd_by_head(restore_cmd); + if (ret) { + goto err; + } + + ret = recv_gpu_ids(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_cmds(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_dmabuf_fds(client_fd, restore_cmd); + +err: + close(client_fd); +err_accept: + close(sock_fd); + return ret; +} + +int close_parallel_restore_server(void) +{ + int sock_fd; + int ret = 0; + parallel_restore_cmd_head cmd_head; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + memset(&cmd_head, 0, sizeof(parallel_restore_cmd_head)); + if (send(sock_fd, &cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + return -1; + } + err: close(sock_fd); return ret; diff --git
a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h index 4e7aa2aa4..d7200c6bd 100644 --- a/plugins/amdgpu/amdgpu_socket_utils.h +++ b/plugins/amdgpu/amdgpu_socket_utils.h @@ -1,6 +1,54 @@ #ifndef __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ #define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ +typedef struct { + int id; + int fd_write_num; /* The number of buffer objects to be restored. */ + int entry_num; /* The number of restore commands.*/ + int gpu_num; +} parallel_restore_cmd_head; + +typedef struct { + int gpu_id; + int minor; +} parallel_gpu_info; + +typedef struct { + int gpu_id; + int write_id; + uint64_t read_offset; + uint64_t write_offset; + uint64_t size; +} parallel_restore_entry; + +typedef struct { + parallel_restore_cmd_head cmd_head; + int *fds_write; + parallel_gpu_info *gpu_ids; + parallel_restore_entry *entries; +} parallel_restore_cmd; + +/* + * For parallel_restore, a background thread in the main CRIU process is used to restore the GPU + * buffer object. However, initially, the ownership of these buffer objects and the metadata for + * restoration are all with the target process. Therefore, we introduce a series of functions to + * help the target process send these tasks to the main CRIU process. 
+ */ +int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd); + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + int install_parallel_sock(void); +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd); + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd); + +int close_parallel_restore_server(void); + #endif \ No newline at end of file From 4a3a695dfb9da7338174549b0cadcc4279cbf51a Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Wed, 15 Jan 2025 06:38:27 +0000 Subject: [PATCH 728/775] plugins/amdgpu: Implement parallel restore This patch implements the entire logic to enable the offloading of buffer object content restoration. The goal of this patch is to offload the buffer object content restoration to the main CRIU process so that this restoration can occur in parallel with other restoration logic (mainly the restoration of memory state in the restore blob, which is time-consuming) to speed up the restore phase. The restoration of buffer object content usually takes a significant amount of time for GPU applications, so parallelizing it with other operations can reduce the overall restore time. It has three parts: the first replaces the restoration of buffer objects in the target process by sending a parallel restore command to the main CRIU process; the second implements the POST_FORKING hook in the amdgpu plugin to enable buffer object content restoration in the main CRIU process; the third stops the parallel thread in the RESUME_DEVICES_LATE hook. This optimization only focuses on the single-process situation (common case). In other scenarios, it will turn to the original method. This is achieved with the new `parallel_disabled` flag. 
Signed-off-by: Yanning Yang --- plugins/amdgpu/Makefile | 2 +- plugins/amdgpu/amdgpu_plugin.c | 420 +++++++++++++++++++++--- plugins/amdgpu/amdgpu_plugin_topology.c | 2 +- plugins/amdgpu/amdgpu_plugin_topology.h | 1 + 4 files changed, 374 insertions(+), 51 deletions(-) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 3d55f8bb4..870a039cd 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -27,7 +27,7 @@ endif criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc --proto_path=. --c_out=. criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 96c086162..69194fbc7 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -28,11 +28,13 @@ #include "xmalloc.h" #include "criu-log.h" #include "files.h" +#include "pstree.h" #include "common/list.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" +#include "amdgpu_socket_utils.h" #include "img-streamer.h" #include "image.h" @@ -64,6 +66,18 @@ bool plugin_added_to_inventory = false; bool plugin_disabled = false; +/* + * In the case of a single process (common case), this optimization can effectively + * reduce the restore latency with parallel restore. In the case of multiple processes, + * states are already restored in parallel within different processes. Therefore, this + * optimization does not introduce further improvement and will be disabled by default + * in this case. The flag, parallel_disabled, is used to control whether the + * optimization is enabled or disabled. 
+ */ +bool parallel_disabled = false; + +pthread_t parallel_thread = 0; +int parallel_thread_result = 0; /**************************************************************************************************/ /* Call ioctl, restarting if it is interrupted */ @@ -351,6 +365,15 @@ int amdgpu_plugin_init(int stage) maps_init(&restore_maps); if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (has_children(root_item)) { + pr_info("Parallel restore disabled\n"); + parallel_disabled = true; + } else { + if (install_parallel_sock() < 0) { + pr_err("Failed to install parallel socket\n"); + return -1; + } + } /* Default Values */ kfd_fw_version_check = true; kfd_sdma_fw_version_check = true; @@ -1439,14 +1462,9 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e) { - struct thread_data *thread_datas; + struct thread_data *thread_datas = NULL; int thread_i, ret = 0; - - thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); - if (!thread_datas) { - ret = -ENOMEM; - goto exit; - } + int offset = 0; for (int i = 0; i < e->num_of_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; @@ -1489,56 +1507,101 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf } } - thread_i = 0; - for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { - struct tp_node *dev; - int ret_thread = 0; - uint32_t target_gpu_id; + if (!parallel_disabled) { + parallel_restore_cmd restore_cmd; + pr_info("Begin to send parallel restore cmd\n"); + ret = init_parallel_restore_cmd(e->num_of_bos, id, e->num_of_gpus, &restore_cmd); + if (ret) + goto exit_parallel; - if (!e->device_entries[i]->gpu_id) - continue; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + uint32_t target_gpu_id; + struct tp_node *dev; - /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ - target_gpu_id = maps_get_dest_gpu(&restore_maps, 
e->device_entries[i]->gpu_id); + if (!e->device_entries[i]->gpu_id) + continue; - /* We need the fd for actual_gpu_id */ - dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); - if (!dev) { - pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); - ret = -ENODEV; + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit_parallel; + } + parallel_restore_gpu_id_add(e->device_entries[i]->gpu_id, dev->drm_render_minor, &restore_cmd); + + for (int j = 0; j < e->num_of_bos; j++) { + if (bo_buckets[j].gpu_id != e->device_entries[i]->gpu_id) + continue; + if (bo_buckets[j].alloc_flags & + (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { + parallel_restore_bo_add(bo_buckets[j].dmabuf_fd, bo_buckets[j].gpu_id, + bo_buckets[j].size, offset, &restore_cmd); + offset += bo_buckets[j].size; + } + } + } + ret = send_parallel_restore_cmd(&restore_cmd); +exit_parallel: + free_parallel_restore_cmd(&restore_cmd); + } else { + thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); + if (!thread_datas) { + ret = -ENOMEM; goto exit; } - thread_datas[thread_i].id = id; - thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; - thread_datas[thread_i].bo_buckets = bo_buckets; - thread_datas[thread_i].bo_entries = e->bo_entries; - thread_datas[thread_i].pid = e->pid; - thread_datas[thread_i].num_of_bos = e->num_of_bos; + thread_i = 0; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + struct tp_node *dev; + int ret_thread = 0; + uint32_t target_gpu_id; - thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); - if (thread_datas[thread_i].drm_fd < 0) { - ret = -thread_datas[thread_i].drm_fd; - goto exit; + if (!e->device_entries[i]->gpu_id) + continue; + + /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is 
actual_gpu_id */ + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + + /* We need the fd for actual_gpu_id */ + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit; + } + + thread_datas[thread_i].id = id; + thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; + thread_datas[thread_i].bo_buckets = bo_buckets; + thread_datas[thread_i].bo_entries = e->bo_entries; + thread_datas[thread_i].pid = e->pid; + thread_datas[thread_i].num_of_bos = e->num_of_bos; + + thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); + if (thread_datas[thread_i].drm_fd < 0) { + ret = -thread_datas[thread_i].drm_fd; + goto exit; + } + + ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, + (void *)&thread_datas[thread_i]); + if (ret_thread) { + pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); + ret = -ret_thread; + goto exit; + } + thread_i++; } - ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, - (void *)&thread_datas[thread_i]); - if (ret_thread) { - pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); - ret = -ret_thread; - goto exit; - } - thread_i++; - } + for (int i = 0; i < e->num_of_gpus; i++) { + pthread_join(thread_datas[i].thread, NULL); + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - for (int i = 0; i < e->num_of_gpus; i++) { - pthread_join(thread_datas[i].thread, NULL); - pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - - if (thread_datas[i].ret) { - ret = thread_datas[i].ret; - goto exit; + if (thread_datas[i].ret) { + ret = thread_datas[i].ret; + goto exit; + } } } exit: @@ -1546,8 +1609,8 @@ exit: if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD) close(bo_buckets[i].dmabuf_fd); } - - xfree(thread_datas); + if 
(thread_datas) + xfree(thread_datas); return ret; } @@ -1836,6 +1899,24 @@ int amdgpu_plugin_resume_devices_late(int target_pid) if (plugin_disabled) return -ENOTSUP; + if (!parallel_disabled) { + pr_info("Close parallel restore server\n"); + if (close_parallel_restore_server()) { + pr_err("Close parallel restore server fail\n"); + return -1; + } + + exit_code = pthread_join(parallel_thread, NULL); + if (exit_code) { + pr_err("Failed to join parallel thread ret:%d\n", exit_code); + return -1; + } + if (parallel_thread_result) { + pr_err("Parallel restore fail\n"); + return parallel_thread_result; + } + } + pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); @@ -1862,3 +1943,244 @@ int amdgpu_plugin_resume_devices_late(int target_pid) } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) + +int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size, + amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) +{ + return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer, + buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); +} + +int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size) +{ + int ret = 0; + int drm_fd = -1; + uint32_t major, minor; + + struct amdgpu_gpu_info gpu_info = { 0 }; + + drm_fd = open_drm_render_device(dev_minor); + if (drm_fd < 0) { + return drm_fd; + } + + ret = amdgpu_device_initialize(drm_fd, &major, &minor, h_dev); + if (ret) { + pr_perror("Failed to initialize device"); + goto err; + } + + ret = amdgpu_query_gpu_info(*h_dev, &gpu_info); + if (ret) { + pr_perror("failed to query gpuinfo via libdrm"); + goto err; + } + *max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? 
SDMA_LINEAR_COPY_MAX_SIZE : + SDMA_LINEAR_COPY_MAX_SIZE - 1; + return 0; +err: + amdgpu_device_deinitialize(*h_dev); + return ret; +} + +FILE *get_bo_contents_fp(int id, int gpu_id, size_t tot_size) +{ + char img_path[PATH_MAX]; + size_t image_size = 0; + FILE *bo_contents_fp = NULL; + + snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, id, gpu_id); + bo_contents_fp = open_img_file(img_path, false, &image_size); + if (!bo_contents_fp) { + pr_perror("Cannot fopen %s", img_path); + return NULL; + } + + if (tot_size != image_size) { + pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, tot_size); + fclose(bo_contents_fp); + return NULL; + } + return bo_contents_fp; +} + +struct parallel_thread_data { + pthread_t thread; + uint32_t gpu_id; + int minor; + parallel_restore_cmd *restore_cmd; + int ret; +}; + +void *parallel_restore_bo_contents(void *_thread_data) +{ + struct parallel_thread_data *thread_data = (struct parallel_thread_data *)_thread_data; + amdgpu_device_handle h_dev; + uint64_t max_copy_size; + size_t total_bo_size = 0, max_bo_size = 0, buffer_size = 0; + FILE *bo_contents_fp = NULL; + parallel_restore_entry *entry; + parallel_restore_cmd *restore_cmd = thread_data->restore_cmd; + int ret = 0; + int offset = 0; + void *buffer = NULL; + + ret = init_dev(thread_data->minor, &h_dev, &max_copy_size); + if (ret) { + goto err; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id == thread_data->gpu_id) { + total_bo_size += restore_cmd->entries[i].size; + max_bo_size = max(restore_cmd->entries[i].size, max_bo_size); + } + } + + buffer_size = kfd_max_buffer_size > 0 ? 
min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + bo_contents_fp = get_bo_contents_fp(restore_cmd->cmd_head.id, thread_data->gpu_id, total_bo_size); + if (bo_contents_fp == NULL) { + ret = -1; + goto err_sdma; + } + offset = ftell(bo_contents_fp); + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); + if (!buffer) { + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); + ret = -ENOMEM; + goto err_sdma; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id != thread_data->gpu_id) + continue; + + entry = &restore_cmd->entries[i]; + fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET); + ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer, + buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); + if (ret) { + pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); + goto err_sdma; + } + } + +err_sdma: + if (bo_contents_fp) + fclose(bo_contents_fp); + if (buffer) + xfree(buffer); + amdgpu_device_deinitialize(h_dev); +err: + thread_data->ret = ret; + return NULL; +} + +void *restore_device_parallel_worker(void *arg) +{ + while (1) { + parallel_restore_cmd restore_cmd = { 0 }; + struct parallel_thread_data *thread_datas = NULL; + int ret; + int error_occurred = 0, join_ret = 0, created_threads = 0; + + ret = recv_parallel_restore_cmd(&restore_cmd); + if (ret) { + if (ret == 1) { + *(int *)arg = 0; + goto exit; + } + goto err; + } + + thread_datas = xzalloc(sizeof(*thread_datas) * restore_cmd.cmd_head.gpu_num); + if (!thread_datas) { + ret = -ENOMEM; + goto err; + } + + for (; created_threads < restore_cmd.cmd_head.gpu_num; created_threads++) { + thread_datas[created_threads].gpu_id = restore_cmd.gpu_ids[created_threads].gpu_id; + thread_datas[created_threads].minor = restore_cmd.gpu_ids[created_threads].minor; + thread_datas[created_threads].restore_cmd = &restore_cmd; + + ret = 
pthread_create(&thread_datas[created_threads].thread, NULL, parallel_restore_bo_contents, + (void *)&thread_datas[created_threads]); + if (ret) { + pr_err("Failed to create thread[0x%x] ret:%d\n", thread_datas[created_threads].gpu_id, ret); + error_occurred = 1; + break; + } + } + + for (int i = 0; i < created_threads; i++) { + join_ret = pthread_join(thread_datas[i].thread, NULL); + if (join_ret != 0) { + pr_err("pthread_join failed for Thread[0x%x] ret:%d\n", + thread_datas[i].gpu_id, join_ret); + if (!error_occurred) { + ret = join_ret; + error_occurred = 1; + } + } + + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); + + /* Check thread return value */ + if (thread_datas[i].ret && !error_occurred) { + ret = thread_datas[i].ret; + error_occurred = 1; + } + } + + if (thread_datas) + xfree(thread_datas); +err: + free_parallel_restore_cmd(&restore_cmd); + + if (ret) { + *(int *)arg = ret; + return NULL; + } + } +exit: + return NULL; +} + +/* + * While the background thread is running, some processing functions (e.g., stop_cgroupd) + * in the main thread need to block SIGCHLD. To prevent interference from this background + * thread, SIGCHLD is blocked in this thread. 
+ */ +static int back_thread_create(pthread_t *newthread, void *(*f)(void *), void *arg) +{ + int ret = 0; + sigset_t blockmask, oldmask; + + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + sigprocmask(SIG_BLOCK, &blockmask, &oldmask); + + ret = pthread_create(newthread, NULL, f, arg); + if (ret) { + pr_err("Create worker thread fail: %d\n", ret); + return -1; + } + + sigprocmask(SIG_SETMASK, &oldmask, NULL); + return 0; +} + +int amdgpu_plugin_post_forking(void) +{ + if (plugin_disabled) + return -ENOTSUP; + + if (parallel_disabled) + return 0; + + return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result); +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking) \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index 5b4396a0c..730f2e028 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -45,7 +45,7 @@ bool kfd_capability_check = true; */ int fd_next = -1; -static int open_drm_render_device(int minor) +int open_drm_render_device(int minor) { char path[128]; int fd, ret_fd; diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h index c890e3dda..e19f8e7ce 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.h +++ b/plugins/amdgpu/amdgpu_plugin_topology.h @@ -118,6 +118,7 @@ struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32 struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor); struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index); +int open_drm_render_device(int minor); int node_get_drm_render_device(struct tp_node *node); void sys_close_drm_render_devices(struct tp_system *sys); From 920437205c4f5359e4c54765c9e23d2d57c2f4ec Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Tue, 10 Dec 2024 12:44:35 +0000 Subject: [PATCH 
729/775] plugins/amdgpu: Update `README.md` and `criu-amdgpu-plugin.txt` Signed-off-by: Yanning Yang --- Documentation/criu-amdgpu-plugin.txt | 1 + plugins/amdgpu/README.md | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/Documentation/criu-amdgpu-plugin.txt b/Documentation/criu-amdgpu-plugin.txt index 68803f3db..fe76fc3bc 100644 --- a/Documentation/criu-amdgpu-plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -15,6 +15,7 @@ Checkpoint / Restore inside a docker container Pytorch Tensorflow Using CRIU Image Streamer +Parallel Restore DESCRIPTION ----------- diff --git a/plugins/amdgpu/README.md b/plugins/amdgpu/README.md index 1078eafe6..b808fbc4f 100644 --- a/plugins/amdgpu/README.md +++ b/plugins/amdgpu/README.md @@ -3,7 +3,8 @@ Supporting ROCm with CRIU _Felix Kuehling _
_Rajneesh Bardwaj _
-_David Yat Sin _ +_David Yat Sin _
+_Yanning Yang _ # Introduction @@ -224,6 +225,26 @@ to resume execution on the GPUs. *This new plugin is enabled by the new hook `__RESUME_DEVICES_LATE` in our RFC patch series.* +## Restoring BO content in parallel + +Restoring the BO content is an important part in the restore of GPU state and +usually takes a significant amount of time. A possible location for this +procedure is the `cr_plugin_restore_file` hook. However, restoring in this hook +blocks the target process from performing other restore operations, which +hinders further optimization of the restore process. + +Therefore, a new plugin hook that runs in the master restore process is +introduced, and it interacts with the `cr_plugin_restore_file` hook to complete +the restore of BO content. Specifically, the target process only needs to send +the relevant BOs to the master restore process, while this new hook handles all +the restore of buffer objects. Through this method, during the restore of the BO +content, the target process can perform other restore operations, thus +accelerating the restore procedure. This is an implementation of the gCROP +method proposed in the ACM SoCC'24 paper: [On-demand and Parallel +Checkpoint/Restore for GPU Applications](https://dl.acm.org/doi/10.1145/3698038.3698510). + +*This optimization technique is enabled by the `__POST_FORKING` hook.* + ## Other CRIU changes In addition to the new plugins, we need to make some changes to CRIU itself to From 7a4ee0ae8effdbf475804b72995912b0911ad28a Mon Sep 17 00:00:00 2001 From: David Francis Date: Wed, 12 Feb 2025 09:26:21 -0500 Subject: [PATCH 730/775] restorer: Skip non-regular VMAs amdgpu represents allocated device memory as a memory mapping of the device file. This is a non-standard VMA that must be handled by the plugin, not the normal VMA code. Ignore all VMAs on device files. 
Signed-off-by: David Francis --- criu/pie/restorer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 5c40b0e93..008e1398d 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1989,6 +1989,9 @@ __visible long __export_restore_task(struct task_restore_args *args) for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) { if (vma_entry->madv & (1ul << m)) { + if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR))) + continue; + ret = sys_madvise(vma_entry->start, vma_entry_len(vma_entry), m); if (ret) { pr_err("madvise(%" PRIx64 ", %" PRIu64 ", %ld) " From fb02dbf68582c6589724d7aa3bb06ce3d588cc71 Mon Sep 17 00:00:00 2001 From: David Francis Date: Wed, 12 Feb 2025 09:45:37 -0500 Subject: [PATCH 731/775] files-ext: Allow plugin files to retry amdgpu dmabuf CRIU requires the ability of the amdgpu plugin to retry. Change files_ext.c to read a response of 1 from a plugin restore function to mean retry. Signed-off-by: David Francis --- criu/files-ext.c | 10 +++++++--- criu/include/criu-plugin.h | 2 +- plugins/amdgpu/amdgpu_plugin.c | 4 +++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/criu/files-ext.c b/criu/files-ext.c index 95ec8e37c..4cc99d921 100644 --- a/criu/files-ext.c +++ b/criu/files-ext.c @@ -45,10 +45,11 @@ static int open_fd(struct file_desc *d, int *new_fd) { struct ext_file_info *xfi; int fd; + bool retry_needed; xfi = container_of(d, struct ext_file_info, d); - fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id); + fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id, &retry_needed); if (fd < 0) { pr_err("Unable to restore %#x\n", xfi->xfe->id); return -1; @@ -57,8 +58,11 @@ static int open_fd(struct file_desc *d, int *new_fd) if (restore_fown(fd, xfi->xfe->fown)) return -1; - *new_fd = fd; - return 0; + if (!retry_needed) + *new_fd = fd; + else + *new_fd = -1; + return retry_needed; } static struct file_desc_ops ext_desc_ops = { diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h 
index 9fb21a449..ee84ccdf6 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -70,7 +70,7 @@ enum { DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_UNIX_SK, int fd, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_UNIX_SK, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_FILE, int fd, int id); -DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id, bool *retry_needed); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_MOUNT, char *mountpoint, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT, int id, char *mountpoint, char *old_root, int *is_file); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_LINK, int index, int type, char *kind); diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 69194fbc7..e3b4ead3f 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1614,7 +1614,7 @@ exit: return ret; } -int amdgpu_plugin_restore_file(int id) +int amdgpu_plugin_restore_file(int id, bool *retry_needed) { int ret = 0, fd; char img_path[PATH_MAX]; @@ -1625,6 +1625,8 @@ int amdgpu_plugin_restore_file(int id) size_t img_size; FILE *img_fp = NULL; + *retry_needed = false; + if (plugin_disabled) return -ENOTSUP; From 0b7ca29c1944a8021c22a8e7041f047facb52e48 Mon Sep 17 00:00:00 2001 From: David Francis Date: Wed, 19 Feb 2025 14:30:07 -0500 Subject: [PATCH 732/775] plugin/amdgpu: Add amdgpu drm header For amdgpu plugin to call the new amdgpu drm CRIU ioctls, it needs the amdgpu drm header file, copied from the kernel's includes. 
Signed-off-by: David Francis --- plugins/amdgpu/amdgpu_drm.h | 1688 +++++++++++++++++++++++++++++++++++ 1 file changed, 1688 insertions(+) create mode 100644 plugins/amdgpu/amdgpu_drm.h diff --git a/plugins/amdgpu/amdgpu_drm.h b/plugins/amdgpu/amdgpu_drm.h new file mode 100644 index 000000000..9cebd072a --- /dev/null +++ b/plugins/amdgpu/amdgpu_drm.h @@ -0,0 +1,1688 @@ +/* amdgpu_drm.h -- Public header for the amdgpu driver -*- linux-c -*- + * + * Copyright 2000 Precision Insight, Inc., Cedar Park, Texas. + * Copyright 2000 VA Linux Systems, Inc., Fremont, California. + * Copyright 2002 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Kevin E. 
Martin + * Gareth Hughes + * Keith Whitwell + */ + +#ifndef __AMDGPU_DRM_H__ +#define __AMDGPU_DRM_H__ + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_AMDGPU_GEM_CREATE 0x00 +#define DRM_AMDGPU_GEM_MMAP 0x01 +#define DRM_AMDGPU_CTX 0x02 +#define DRM_AMDGPU_BO_LIST 0x03 +#define DRM_AMDGPU_CS 0x04 +#define DRM_AMDGPU_INFO 0x05 +#define DRM_AMDGPU_GEM_METADATA 0x06 +#define DRM_AMDGPU_GEM_WAIT_IDLE 0x07 +#define DRM_AMDGPU_GEM_VA 0x08 +#define DRM_AMDGPU_WAIT_CS 0x09 +#define DRM_AMDGPU_GEM_OP 0x10 +#define DRM_AMDGPU_GEM_USERPTR 0x11 +#define DRM_AMDGPU_WAIT_FENCES 0x12 +#define DRM_AMDGPU_VM 0x13 +#define DRM_AMDGPU_FENCE_TO_HANDLE 0x14 +#define DRM_AMDGPU_SCHED 0x15 +#define DRM_AMDGPU_USERQ 0x16 +#define DRM_AMDGPU_USERQ_SIGNAL 0x17 +#define DRM_AMDGPU_USERQ_WAIT 0x18 +#define DRM_AMDGPU_GEM_LIST_HANDLES 0x19 + +#define DRM_IOCTL_AMDGPU_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create) +#define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap) +#define DRM_IOCTL_AMDGPU_CTX DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_CTX, union drm_amdgpu_ctx) +#define DRM_IOCTL_AMDGPU_BO_LIST DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_BO_LIST, union drm_amdgpu_bo_list) +#define DRM_IOCTL_AMDGPU_CS DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_CS, union drm_amdgpu_cs) +#define DRM_IOCTL_AMDGPU_INFO DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_INFO, struct drm_amdgpu_info) +#define DRM_IOCTL_AMDGPU_GEM_METADATA DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_METADATA, struct drm_amdgpu_gem_metadata) +#define DRM_IOCTL_AMDGPU_GEM_WAIT_IDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_WAIT_IDLE, union drm_amdgpu_gem_wait_idle) +#define DRM_IOCTL_AMDGPU_GEM_VA DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_VA, struct drm_amdgpu_gem_va) +#define DRM_IOCTL_AMDGPU_WAIT_CS DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_WAIT_CS, union drm_amdgpu_wait_cs) +#define DRM_IOCTL_AMDGPU_GEM_OP 
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_OP, struct drm_amdgpu_gem_op) +#define DRM_IOCTL_AMDGPU_GEM_USERPTR DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_USERPTR, struct drm_amdgpu_gem_userptr) +#define DRM_IOCTL_AMDGPU_WAIT_FENCES DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_WAIT_FENCES, union drm_amdgpu_wait_fences) +#define DRM_IOCTL_AMDGPU_VM DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_VM, union drm_amdgpu_vm) +#define DRM_IOCTL_AMDGPU_FENCE_TO_HANDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_FENCE_TO_HANDLE, union drm_amdgpu_fence_to_handle) +#define DRM_IOCTL_AMDGPU_SCHED DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_SCHED, union drm_amdgpu_sched) +#define DRM_IOCTL_AMDGPU_USERQ DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ, union drm_amdgpu_userq) +#define DRM_IOCTL_AMDGPU_USERQ_SIGNAL DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal) +#define DRM_IOCTL_AMDGPU_USERQ_WAIT DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait) +#define DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_LIST_HANDLES, struct drm_amdgpu_gem_list_handles) + +/** + * DOC: memory domains + * + * %AMDGPU_GEM_DOMAIN_CPU System memory that is not GPU accessible. + * Memory in this pool could be swapped out to disk if there is pressure. + * + * %AMDGPU_GEM_DOMAIN_GTT GPU accessible system memory, mapped into the + * GPU's virtual address space via gart. Gart memory linearizes non-contiguous + * pages of system memory, allows GPU access system memory in a linearized + * fashion. + * + * %AMDGPU_GEM_DOMAIN_VRAM Local video memory. For APUs, it is memory + * carved out by the BIOS. + * + * %AMDGPU_GEM_DOMAIN_GDS Global on-chip data storage used to share data + * across shader threads. + * + * %AMDGPU_GEM_DOMAIN_GWS Global wave sync, used to synchronize the + * execution of all the waves on a device. + * + * %AMDGPU_GEM_DOMAIN_OA Ordered append, used by 3D or Compute engines + * for appending data. 
+ * + * %AMDGPU_GEM_DOMAIN_DOORBELL Doorbell. It is an MMIO region for + * signalling user mode queues. + */ +#define AMDGPU_GEM_DOMAIN_CPU 0x1 +#define AMDGPU_GEM_DOMAIN_GTT 0x2 +#define AMDGPU_GEM_DOMAIN_VRAM 0x4 +#define AMDGPU_GEM_DOMAIN_GDS 0x8 +#define AMDGPU_GEM_DOMAIN_GWS 0x10 +#define AMDGPU_GEM_DOMAIN_OA 0x20 +#define AMDGPU_GEM_DOMAIN_DOORBELL 0x40 +#define AMDGPU_GEM_DOMAIN_MASK (AMDGPU_GEM_DOMAIN_CPU | \ + AMDGPU_GEM_DOMAIN_GTT | \ + AMDGPU_GEM_DOMAIN_VRAM | \ + AMDGPU_GEM_DOMAIN_GDS | \ + AMDGPU_GEM_DOMAIN_GWS | \ + AMDGPU_GEM_DOMAIN_OA | \ + AMDGPU_GEM_DOMAIN_DOORBELL) + +/* Flag that CPU access will be required for the case of VRAM domain */ +#define AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED (1 << 0) +/* Flag that CPU access will not work, this VRAM domain is invisible */ +#define AMDGPU_GEM_CREATE_NO_CPU_ACCESS (1 << 1) +/* Flag that USWC attributes should be used for GTT */ +#define AMDGPU_GEM_CREATE_CPU_GTT_USWC (1 << 2) +/* Flag that the memory should be in VRAM and cleared */ +#define AMDGPU_GEM_CREATE_VRAM_CLEARED (1 << 3) +/* Flag that allocating the BO should use linear VRAM */ +#define AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS (1 << 5) +/* Flag that BO is always valid in this VM */ +#define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID (1 << 6) +/* Flag that BO sharing will be explicitly synchronized */ +#define AMDGPU_GEM_CREATE_EXPLICIT_SYNC (1 << 7) +/* Flag that indicates allocating MQD gart on GFX9, where the mtype + * for the second page onward should be set to NC. It should never + * be used by user space applications. 
+ */ +#define AMDGPU_GEM_CREATE_CP_MQD_GFX9 (1 << 8) +/* Flag that BO may contain sensitive data that must be wiped before + * releasing the memory + */ +#define AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE (1 << 9) +/* Flag that BO will be encrypted and that the TMZ bit should be + * set in the PTEs when mapping this buffer via GPUVM or + * accessing it with various hw blocks + */ +#define AMDGPU_GEM_CREATE_ENCRYPTED (1 << 10) +/* Flag that BO will be used only in preemptible context, which does + * not require GTT memory accounting + */ +#define AMDGPU_GEM_CREATE_PREEMPTIBLE (1 << 11) +/* Flag that BO can be discarded under memory pressure without keeping the + * content. + */ +#define AMDGPU_GEM_CREATE_DISCARDABLE (1 << 12) +/* Flag that BO is shared coherently between multiple devices or CPU threads. + * May depend on GPU instructions to flush caches to system scope explicitly. + * + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. + */ +#define AMDGPU_GEM_CREATE_COHERENT (1 << 13) +/* Flag that BO should not be cached by GPU. Coherent without having to flush + * GPU caches explicitly + * + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. + */ +#define AMDGPU_GEM_CREATE_UNCACHED (1 << 14) +/* Flag that BO should be coherent across devices when using device-level + * atomics. May depend on GPU instructions to flush caches to device scope + * explicitly, promoting them to system scope automatically. + * + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. + */ +#define AMDGPU_GEM_CREATE_EXT_COHERENT (1 << 15) +/* Set PTE.D and recompress during GTT->VRAM moves according to TILING flags. 
*/ +#define AMDGPU_GEM_CREATE_GFX12_DCC (1 << 16) + +struct drm_amdgpu_gem_create_in { + /** the requested memory size */ + __u64 bo_size; + /** physical start_addr alignment in bytes for some HW requirements */ + __u64 alignment; + /** the requested memory domains */ + __u64 domains; + /** allocation flags */ + __u64 domain_flags; +}; + +struct drm_amdgpu_gem_create_out { + /** returned GEM object handle */ + __u32 handle; + __u32 _pad; +}; + +union drm_amdgpu_gem_create { + struct drm_amdgpu_gem_create_in in; + struct drm_amdgpu_gem_create_out out; +}; + +/** Opcode to create new residency list. */ +#define AMDGPU_BO_LIST_OP_CREATE 0 +/** Opcode to destroy previously created residency list */ +#define AMDGPU_BO_LIST_OP_DESTROY 1 +/** Opcode to update resource information in the list */ +#define AMDGPU_BO_LIST_OP_UPDATE 2 + +struct drm_amdgpu_bo_list_in { + /** Type of operation */ + __u32 operation; + /** Handle of list or 0 if we want to create one */ + __u32 list_handle; + /** Number of BOs in list */ + __u32 bo_number; + /** Size of each element describing BO */ + __u32 bo_info_size; + /** Pointer to array describing BOs */ + __u64 bo_info_ptr; +}; + +struct drm_amdgpu_bo_list_entry { + /** Handle of BO */ + __u32 bo_handle; + /** New (if specified) BO priority to be used during migration */ + __u32 bo_priority; +}; + +struct drm_amdgpu_bo_list_out { + /** Handle of resource list */ + __u32 list_handle; + __u32 _pad; +}; + +union drm_amdgpu_bo_list { + struct drm_amdgpu_bo_list_in in; + struct drm_amdgpu_bo_list_out out; +}; + +/* context related */ +#define AMDGPU_CTX_OP_ALLOC_CTX 1 +#define AMDGPU_CTX_OP_FREE_CTX 2 +#define AMDGPU_CTX_OP_QUERY_STATE 3 +#define AMDGPU_CTX_OP_QUERY_STATE2 4 +#define AMDGPU_CTX_OP_GET_STABLE_PSTATE 5 +#define AMDGPU_CTX_OP_SET_STABLE_PSTATE 6 + +/* GPU reset status */ +#define AMDGPU_CTX_NO_RESET 0 +/* this the context caused it */ +#define AMDGPU_CTX_GUILTY_RESET 1 +/* some other context caused it */ +#define 
AMDGPU_CTX_INNOCENT_RESET 2 +/* unknown cause */ +#define AMDGPU_CTX_UNKNOWN_RESET 3 + +/* indicate gpu reset occurred after ctx created */ +#define AMDGPU_CTX_QUERY2_FLAGS_RESET (1<<0) +/* indicate vram lost occurred after ctx created */ +#define AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST (1<<1) +/* indicate some job from this context once cause gpu hang */ +#define AMDGPU_CTX_QUERY2_FLAGS_GUILTY (1<<2) +/* indicate some errors are detected by RAS */ +#define AMDGPU_CTX_QUERY2_FLAGS_RAS_CE (1<<3) +#define AMDGPU_CTX_QUERY2_FLAGS_RAS_UE (1<<4) +/* indicate that the reset hasn't completed yet */ +#define AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS (1<<5) + +/* Context priority level */ +#define AMDGPU_CTX_PRIORITY_UNSET -2048 +#define AMDGPU_CTX_PRIORITY_VERY_LOW -1023 +#define AMDGPU_CTX_PRIORITY_LOW -512 +#define AMDGPU_CTX_PRIORITY_NORMAL 0 +/* + * When used in struct drm_amdgpu_ctx_in, a priority above NORMAL requires + * CAP_SYS_NICE or DRM_MASTER +*/ +#define AMDGPU_CTX_PRIORITY_HIGH 512 +#define AMDGPU_CTX_PRIORITY_VERY_HIGH 1023 + +/* select a stable profiling pstate for perfmon tools */ +#define AMDGPU_CTX_STABLE_PSTATE_FLAGS_MASK 0xf +#define AMDGPU_CTX_STABLE_PSTATE_NONE 0 +#define AMDGPU_CTX_STABLE_PSTATE_STANDARD 1 +#define AMDGPU_CTX_STABLE_PSTATE_MIN_SCLK 2 +#define AMDGPU_CTX_STABLE_PSTATE_MIN_MCLK 3 +#define AMDGPU_CTX_STABLE_PSTATE_PEAK 4 + +struct drm_amdgpu_ctx_in { + /** AMDGPU_CTX_OP_* */ + __u32 op; + /** Flags */ + __u32 flags; + __u32 ctx_id; + /** AMDGPU_CTX_PRIORITY_* */ + __s32 priority; +}; + +union drm_amdgpu_ctx_out { + struct { + __u32 ctx_id; + __u32 _pad; + } alloc; + + struct { + /** For future use, no flags defined so far */ + __u64 flags; + /** Number of resets caused by this context so far. */ + __u32 hangs; + /** Reset status since the last call of the ioctl. 
*/ + __u32 reset_status; + } state; + + struct { + __u32 flags; + __u32 _pad; + } pstate; +}; + +union drm_amdgpu_ctx { + struct drm_amdgpu_ctx_in in; + union drm_amdgpu_ctx_out out; +}; + +/* user queue IOCTL operations */ +#define AMDGPU_USERQ_OP_CREATE 1 +#define AMDGPU_USERQ_OP_FREE 2 + +/* queue priority levels */ +/* low < normal low < normal high < high */ +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK 0x3 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT 0 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_NORMAL_LOW 0 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_LOW 1 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_NORMAL_HIGH 2 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_HIGH 3 /* admin only */ +/* for queues that need access to protected content */ +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE (1 << 2) + +/* + * This structure is a container to pass input configuration + * info for all supported userqueue related operations. + * For operation AMDGPU_USERQ_OP_CREATE: user is expected + * to set all fields, except the parameter 'queue_id'. + * For operation AMDGPU_USERQ_OP_FREE: the only input parameter expected + * to be set is 'queue_id', everything else is ignored. + */ +struct drm_amdgpu_userq_in { + /** AMDGPU_USERQ_OP_* */ + __u32 op; + /** Queue id passed for operation USERQ_OP_FREE */ + __u32 queue_id; + /** the target GPU engine to execute workload (AMDGPU_HW_IP_*) */ + __u32 ip_type; + /** + * @doorbell_handle: the handle of doorbell GEM object + * associated with this userqueue client. + */ + __u32 doorbell_handle; + /** + * @doorbell_offset: 32-bit offset of the doorbell in the doorbell bo. + * Kernel will generate absolute doorbell offset using doorbell_handle + * and doorbell_offset in the doorbell bo. + */ + __u32 doorbell_offset; + /** + * @flags: flags used for queue parameters + */ + __u32 flags; + /** + * @queue_va: Virtual address of the GPU memory which holds the queue + * object. 
The queue holds the workload packets. + */ + __u64 queue_va; + /** + * @queue_size: Size of the queue in bytes, this needs to be 256-byte + * aligned. + */ + __u64 queue_size; + /** + * @rptr_va : Virtual address of the GPU memory which holds the ring RPTR. + * This object must be at least 8 byte in size and aligned to 8-byte offset. + */ + __u64 rptr_va; + /** + * @wptr_va : Virtual address of the GPU memory which holds the ring WPTR. + * This object must be at least 8 byte in size and aligned to 8-byte offset. + * + * Queue, RPTR and WPTR can come from the same object, as long as the size + * and alignment related requirements are met. + */ + __u64 wptr_va; + /** + * @mqd: MQD (memory queue descriptor) is a set of parameters which allow + * the GPU to uniquely define and identify a usermode queue. + * + * MQD data can be of different size for different GPU IP/engine and + * their respective versions/revisions, so this points to a __u64 * + * which holds IP specific MQD of this usermode queue. + */ + __u64 mqd; + /** + * @size: size of MQD data in bytes, it must match the MQD structure + * size of the respective engine/revision defined in UAPI for ex, for + * gfx11 workloads, size = sizeof(drm_amdgpu_userq_mqd_gfx11). + */ + __u64 mqd_size; +}; + +/* The structure to carry output of userqueue ops */ +struct drm_amdgpu_userq_out { + /** + * For operation AMDGPU_USERQ_OP_CREATE: This field contains a unique + * queue ID to represent the newly created userqueue in the system, otherwise + * it should be ignored. + */ + __u32 queue_id; + __u32 _pad; +}; + +union drm_amdgpu_userq { + struct drm_amdgpu_userq_in in; + struct drm_amdgpu_userq_out out; +}; + +/* GFX V11 IP specific MQD parameters */ +struct drm_amdgpu_userq_mqd_gfx11 { + /** + * @shadow_va: Virtual address of the GPU memory to hold the shadow buffer. + * Use AMDGPU_INFO_IOCTL to find the exact size of the object. 
+ */ + __u64 shadow_va; + /** + * @csa_va: Virtual address of the GPU memory to hold the CSA buffer. + * Use AMDGPU_INFO_IOCTL to find the exact size of the object. + */ + __u64 csa_va; +}; + +/* GFX V11 SDMA IP specific MQD parameters */ +struct drm_amdgpu_userq_mqd_sdma_gfx11 { + /** + * @csa_va: Virtual address of the GPU memory to hold the CSA buffer. + * This must be from a separate GPU object, and use AMDGPU_INFO IOCTL + * to get the size. + */ + __u64 csa_va; +}; + +/* GFX V11 Compute IP specific MQD parameters */ +struct drm_amdgpu_userq_mqd_compute_gfx11 { + /** + * @eop_va: Virtual address of the GPU memory to hold the EOP buffer. + * This must be from a separate GPU object, and use AMDGPU_INFO IOCTL + * to get the size. + */ + __u64 eop_va; +}; + +/* userq signal/wait ioctl */ +struct drm_amdgpu_userq_signal { + /** + * @queue_id: Queue handle used by the userq fence creation function + * to retrieve the WPTR. + */ + __u32 queue_id; + __u32 pad; + /** + * @syncobj_handles: The list of syncobj handles submitted by the user queue + * job to be signaled. + */ + __u64 syncobj_handles; + /** + * @num_syncobj_handles: A count that represents the number of syncobj handles in + * @syncobj_handles. + */ + __u64 num_syncobj_handles; + /** + * @bo_read_handles: The list of BO handles that the submitted user queue job + * is using for read only. This will update BO fences in the kernel. + */ + __u64 bo_read_handles; + /** + * @bo_write_handles: The list of BO handles that the submitted user queue job + * is using for write only. This will update BO fences in the kernel. + */ + __u64 bo_write_handles; + /** + * @num_bo_read_handles: A count that represents the number of read BO handles in + * @bo_read_handles. + */ + __u32 num_bo_read_handles; + /** + * @num_bo_write_handles: A count that represents the number of write BO handles in + * @bo_write_handles. 
+ */ + __u32 num_bo_write_handles; +}; + +struct drm_amdgpu_userq_fence_info { + /** + * @va: A gpu address allocated for each queue which stores the + * read pointer (RPTR) value. + */ + __u64 va; + /** + * @value: A 64 bit value represents the write pointer (WPTR) of the + * queue commands which compared with the RPTR value to signal the + * fences. + */ + __u64 value; +}; + +struct drm_amdgpu_userq_wait { + /** + * @waitq_id: Queue handle used by the userq wait IOCTL to retrieve the + * wait queue and maintain the fence driver references in it. + */ + __u32 waitq_id; + __u32 pad; + /** + * @syncobj_handles: The list of syncobj handles submitted by the user queue + * job to get the va/value pairs. + */ + __u64 syncobj_handles; + /** + * @syncobj_timeline_handles: The list of timeline syncobj handles submitted by + * the user queue job to get the va/value pairs at given @syncobj_timeline_points. + */ + __u64 syncobj_timeline_handles; + /** + * @syncobj_timeline_points: The list of timeline syncobj points submitted by the + * user queue job for the corresponding @syncobj_timeline_handles. + */ + __u64 syncobj_timeline_points; + /** + * @bo_read_handles: The list of read BO handles submitted by the user queue + * job to get the va/value pairs. + */ + __u64 bo_read_handles; + /** + * @bo_write_handles: The list of write BO handles submitted by the user queue + * job to get the va/value pairs. + */ + __u64 bo_write_handles; + /** + * @num_syncobj_timeline_handles: A count that represents the number of timeline + * syncobj handles in @syncobj_timeline_handles. + */ + __u16 num_syncobj_timeline_handles; + /** + * @num_fences: This field can be used both as input and output. As input it defines + * the maximum number of fences that can be returned and as output it will specify + * how many fences were actually returned from the ioctl. + */ + __u16 num_fences; + /** + * @num_syncobj_handles: A count that represents the number of syncobj handles in + * @syncobj_handles. 
+ */ + __u32 num_syncobj_handles; + /** + * @num_bo_read_handles: A count that represents the number of read BO handles in + * @bo_read_handles. + */ + __u32 num_bo_read_handles; + /** + * @num_bo_write_handles: A count that represents the number of write BO handles in + * @bo_write_handles. + */ + __u32 num_bo_write_handles; + /** + * @out_fences: The field is a return value from the ioctl containing the list of + * address/value pairs to wait for. + */ + __u64 out_fences; +}; + +/* vm ioctl */ +#define AMDGPU_VM_OP_RESERVE_VMID 1 +#define AMDGPU_VM_OP_UNRESERVE_VMID 2 + +struct drm_amdgpu_vm_in { + /** AMDGPU_VM_OP_* */ + __u32 op; + __u32 flags; +}; + +struct drm_amdgpu_vm_out { + /** For future use, no flags defined so far */ + __u64 flags; +}; + +union drm_amdgpu_vm { + struct drm_amdgpu_vm_in in; + struct drm_amdgpu_vm_out out; +}; + +/* sched ioctl */ +#define AMDGPU_SCHED_OP_PROCESS_PRIORITY_OVERRIDE 1 +#define AMDGPU_SCHED_OP_CONTEXT_PRIORITY_OVERRIDE 2 + +struct drm_amdgpu_sched_in { + /* AMDGPU_SCHED_OP_* */ + __u32 op; + __u32 fd; + /** AMDGPU_CTX_PRIORITY_* */ + __s32 priority; + __u32 ctx_id; +}; + +union drm_amdgpu_sched { + struct drm_amdgpu_sched_in in; +}; + +/* + * This is not a reliable API and you should expect it to fail for any + * number of reasons and have fallback path that do not use userptr to + * perform any operation. 
+ */ +#define AMDGPU_GEM_USERPTR_READONLY (1 << 0) +#define AMDGPU_GEM_USERPTR_ANONONLY (1 << 1) +#define AMDGPU_GEM_USERPTR_VALIDATE (1 << 2) +#define AMDGPU_GEM_USERPTR_REGISTER (1 << 3) + +struct drm_amdgpu_gem_userptr { + __u64 addr; + __u64 size; + /* AMDGPU_GEM_USERPTR_* */ + __u32 flags; + /* Resulting GEM handle */ + __u32 handle; +}; + +/* SI-CI-VI: */ +/* same meaning as the GB_TILE_MODE and GL_MACRO_TILE_MODE fields */ +#define AMDGPU_TILING_ARRAY_MODE_SHIFT 0 +#define AMDGPU_TILING_ARRAY_MODE_MASK 0xf +#define AMDGPU_TILING_PIPE_CONFIG_SHIFT 4 +#define AMDGPU_TILING_PIPE_CONFIG_MASK 0x1f +#define AMDGPU_TILING_TILE_SPLIT_SHIFT 9 +#define AMDGPU_TILING_TILE_SPLIT_MASK 0x7 +#define AMDGPU_TILING_MICRO_TILE_MODE_SHIFT 12 +#define AMDGPU_TILING_MICRO_TILE_MODE_MASK 0x7 +#define AMDGPU_TILING_BANK_WIDTH_SHIFT 15 +#define AMDGPU_TILING_BANK_WIDTH_MASK 0x3 +#define AMDGPU_TILING_BANK_HEIGHT_SHIFT 17 +#define AMDGPU_TILING_BANK_HEIGHT_MASK 0x3 +#define AMDGPU_TILING_MACRO_TILE_ASPECT_SHIFT 19 +#define AMDGPU_TILING_MACRO_TILE_ASPECT_MASK 0x3 +#define AMDGPU_TILING_NUM_BANKS_SHIFT 21 +#define AMDGPU_TILING_NUM_BANKS_MASK 0x3 + +/* GFX9 - GFX11: */ +#define AMDGPU_TILING_SWIZZLE_MODE_SHIFT 0 +#define AMDGPU_TILING_SWIZZLE_MODE_MASK 0x1f +#define AMDGPU_TILING_DCC_OFFSET_256B_SHIFT 5 +#define AMDGPU_TILING_DCC_OFFSET_256B_MASK 0xFFFFFF +#define AMDGPU_TILING_DCC_PITCH_MAX_SHIFT 29 +#define AMDGPU_TILING_DCC_PITCH_MAX_MASK 0x3FFF +#define AMDGPU_TILING_DCC_INDEPENDENT_64B_SHIFT 43 +#define AMDGPU_TILING_DCC_INDEPENDENT_64B_MASK 0x1 +#define AMDGPU_TILING_DCC_INDEPENDENT_128B_SHIFT 44 +#define AMDGPU_TILING_DCC_INDEPENDENT_128B_MASK 0x1 +#define AMDGPU_TILING_SCANOUT_SHIFT 63 +#define AMDGPU_TILING_SCANOUT_MASK 0x1 + +/* GFX12 and later: */ +#define AMDGPU_TILING_GFX12_SWIZZLE_MODE_SHIFT 0 +#define AMDGPU_TILING_GFX12_SWIZZLE_MODE_MASK 0x7 +/* These are DCC recompression settings for memory management: */ +#define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_SHIFT 3 
+#define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_MASK 0x3 /* 0:64B, 1:128B, 2:256B */ +#define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_SHIFT 5 +#define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_MASK 0x7 /* CB_COLOR0_INFO.NUMBER_TYPE */ +#define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_SHIFT 8 +#define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_MASK 0x3f /* [0:4]:CB_COLOR0_INFO.FORMAT, [5]:MM */ +/* When clearing the buffer or moving it from VRAM to GTT, don't compress and set DCC metadata + * to uncompressed. Set when parts of an allocation bypass DCC and read raw data. */ +#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_SHIFT 14 +#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_MASK 0x1 +/* bit gap */ +#define AMDGPU_TILING_GFX12_SCANOUT_SHIFT 63 +#define AMDGPU_TILING_GFX12_SCANOUT_MASK 0x1 + +/* Set/Get helpers for tiling flags. */ +#define AMDGPU_TILING_SET(field, value) \ + (((__u64)(value) & AMDGPU_TILING_##field##_MASK) << AMDGPU_TILING_##field##_SHIFT) +#define AMDGPU_TILING_GET(value, field) \ + (((__u64)(value) >> AMDGPU_TILING_##field##_SHIFT) & AMDGPU_TILING_##field##_MASK) + +#define AMDGPU_GEM_METADATA_OP_SET_METADATA 1 +#define AMDGPU_GEM_METADATA_OP_GET_METADATA 2 + +/** The same structure is shared for input/output */ +struct drm_amdgpu_gem_metadata { + /** GEM Object handle */ + __u32 handle; + /** Do we want get or set metadata */ + __u32 op; + struct { + /** For future use, no flags defined so far */ + __u64 flags; + /** family specific tiling info */ + __u64 tiling_info; + __u32 data_size_bytes; + __u32 data[64]; + } data; +}; + +struct drm_amdgpu_gem_mmap_in { + /** the GEM object handle */ + __u32 handle; + __u32 _pad; +}; + +struct drm_amdgpu_gem_mmap_out { + /** mmap offset from the vma offset manager */ + __u64 addr_ptr; +}; + +union drm_amdgpu_gem_mmap { + struct drm_amdgpu_gem_mmap_in in; + struct drm_amdgpu_gem_mmap_out out; +}; + +struct drm_amdgpu_gem_wait_idle_in { + /** GEM object handle */ + __u32 handle; + /** For future use, no flags 
defined so far */ + __u32 flags; + /** Absolute timeout to wait */ + __u64 timeout; +}; + +struct drm_amdgpu_gem_wait_idle_out { + /** BO status: 0 - BO is idle, 1 - BO is busy */ + __u32 status; + /** Returned current memory domain */ + __u32 domain; +}; + +union drm_amdgpu_gem_wait_idle { + struct drm_amdgpu_gem_wait_idle_in in; + struct drm_amdgpu_gem_wait_idle_out out; +}; + +struct drm_amdgpu_wait_cs_in { + /* Command submission handle + * handle equals 0 means none to wait for + * handle equals ~0ull means wait for the latest sequence number + */ + __u64 handle; + /** Absolute timeout to wait */ + __u64 timeout; + __u32 ip_type; + __u32 ip_instance; + __u32 ring; + __u32 ctx_id; +}; + +struct drm_amdgpu_wait_cs_out { + /** CS status: 0 - CS completed, 1 - CS still busy */ + __u64 status; +}; + +union drm_amdgpu_wait_cs { + struct drm_amdgpu_wait_cs_in in; + struct drm_amdgpu_wait_cs_out out; +}; + +struct drm_amdgpu_fence { + __u32 ctx_id; + __u32 ip_type; + __u32 ip_instance; + __u32 ring; + __u64 seq_no; +}; + +struct drm_amdgpu_wait_fences_in { + /** This points to uint64_t * which points to fences */ + __u64 fences; + __u32 fence_count; + __u32 wait_all; + __u64 timeout_ns; +}; + +struct drm_amdgpu_wait_fences_out { + __u32 status; + __u32 first_signaled; +}; + +union drm_amdgpu_wait_fences { + struct drm_amdgpu_wait_fences_in in; + struct drm_amdgpu_wait_fences_out out; +}; + +#define AMDGPU_GEM_OP_GET_GEM_CREATE_INFO 0 +#define AMDGPU_GEM_OP_SET_PLACEMENT 1 +#define AMDGPU_GEM_OP_GET_MAPPING_INFO 2 + +struct drm_amdgpu_gem_vm_entry { + /* Start of mapping (in bytes) */ + __u64 addr; + + /* Size of mapping (in bytes) */ + __u64 size; + + /* Mapping offset */ + __u64 offset; + + /* flags needed to recreate mapping */ + __u64 flags; +}; + +/* Sets or returns a value associated with a buffer. */ +struct drm_amdgpu_gem_op { + /** GEM object handle */ + __u32 handle; + /** AMDGPU_GEM_OP_* */ + __u32 op; + /** Input or return value. 
For MAPPING_INFO op: pointer to array of struct drm_amdgpu_gem_vm_entry */ + __u64 value; + /** For MAPPING_INFO op: number of mappings (in/out) */ + __u32 num_entries; + + __u32 padding; +}; + +#define AMDGPU_GEM_LIST_HANDLES_FLAG_IS_IMPORT (1 << 0) + +struct drm_amdgpu_gem_list_handles { + /* User pointer to array of drm_amdgpu_gem_bo_info_entry */ + __u64 entries; + + /* Size of entries buffer / Number of handles in process (if larger than size of buffer, must retry) */ + __u32 num_entries; + + __u32 padding; +}; + +struct drm_amdgpu_gem_list_handles_entry { + /* gem handle of buffer object */ + __u32 gem_handle; + + /* Currently just one flag: IS_IMPORT */ + __u32 flags; + + /* Size of bo */ + __u64 size; + + /* Preferred domains for GEM_CREATE */ + __u64 preferred_domains; + + /* GEM_CREATE flags for re-creation of buffer */ + __u64 alloc_flags; + + /* physical start_addr alignment in bytes for some HW requirements */ + __u64 alignment; +}; + +#define AMDGPU_VA_OP_MAP 1 +#define AMDGPU_VA_OP_UNMAP 2 +#define AMDGPU_VA_OP_CLEAR 3 +#define AMDGPU_VA_OP_REPLACE 4 + +/* Delay the page table update till the next CS */ +#define AMDGPU_VM_DELAY_UPDATE (1 << 0) + +/* Mapping flags */ +/* readable mapping */ +#define AMDGPU_VM_PAGE_READABLE (1 << 1) +/* writable mapping */ +#define AMDGPU_VM_PAGE_WRITEABLE (1 << 2) +/* executable mapping, new for VI */ +#define AMDGPU_VM_PAGE_EXECUTABLE (1 << 3) +/* partially resident texture */ +#define AMDGPU_VM_PAGE_PRT (1 << 4) +/* MTYPE flags use bit 5 to 8 */ +#define AMDGPU_VM_MTYPE_MASK (0xf << 5) +/* Default MTYPE. Pre-AI must use this. Recommended for newer ASICs. 
*/ +#define AMDGPU_VM_MTYPE_DEFAULT (0 << 5) +/* Use Non Coherent MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_NC (1 << 5) +/* Use Write Combine MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_WC (2 << 5) +/* Use Cache Coherent MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_CC (3 << 5) +/* Use UnCached MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_UC (4 << 5) +/* Use Read Write MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_RW (5 << 5) +/* don't allocate MALL */ +#define AMDGPU_VM_PAGE_NOALLOC (1 << 9) + +struct drm_amdgpu_gem_va { + /** GEM object handle */ + __u32 handle; + __u32 _pad; + /** AMDGPU_VA_OP_* */ + __u32 operation; + /** AMDGPU_VM_PAGE_* */ + __u32 flags; + /** va address to assign . Must be correctly aligned.*/ + __u64 va_address; + /** Specify offset inside of BO to assign. Must be correctly aligned.*/ + __u64 offset_in_bo; + /** Specify mapping size. Must be correctly aligned. */ + __u64 map_size; + /** + * vm_timeline_point is a sequence number used to add new timeline point. + */ + __u64 vm_timeline_point; + /** + * The vm page table update fence is installed in given vm_timeline_syncobj_out + * at vm_timeline_point. + */ + __u32 vm_timeline_syncobj_out; + /** the number of syncobj handles in @input_fence_syncobj_handles */ + __u32 num_syncobj_handles; + /** Array of sync object handle to wait for given input fences */ + __u64 input_fence_syncobj_handles; +}; + +#define AMDGPU_HW_IP_GFX 0 +#define AMDGPU_HW_IP_COMPUTE 1 +#define AMDGPU_HW_IP_DMA 2 +#define AMDGPU_HW_IP_UVD 3 +#define AMDGPU_HW_IP_VCE 4 +#define AMDGPU_HW_IP_UVD_ENC 5 +#define AMDGPU_HW_IP_VCN_DEC 6 +/* + * From VCN4, AMDGPU_HW_IP_VCN_ENC is re-used to support + * both encoding and decoding jobs. 
+ */ +#define AMDGPU_HW_IP_VCN_ENC 7 +#define AMDGPU_HW_IP_VCN_JPEG 8 +#define AMDGPU_HW_IP_VPE 9 +#define AMDGPU_HW_IP_NUM 10 + +#define AMDGPU_HW_IP_INSTANCE_MAX_COUNT 1 + +#define AMDGPU_CHUNK_ID_IB 0x01 +#define AMDGPU_CHUNK_ID_FENCE 0x02 +#define AMDGPU_CHUNK_ID_DEPENDENCIES 0x03 +#define AMDGPU_CHUNK_ID_SYNCOBJ_IN 0x04 +#define AMDGPU_CHUNK_ID_SYNCOBJ_OUT 0x05 +#define AMDGPU_CHUNK_ID_BO_HANDLES 0x06 +#define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES 0x07 +#define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT 0x08 +#define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL 0x09 +#define AMDGPU_CHUNK_ID_CP_GFX_SHADOW 0x0a + +struct drm_amdgpu_cs_chunk { + __u32 chunk_id; + __u32 length_dw; + __u64 chunk_data; +}; + +struct drm_amdgpu_cs_in { + /** Rendering context id */ + __u32 ctx_id; + /** Handle of resource list associated with CS */ + __u32 bo_list_handle; + __u32 num_chunks; + __u32 flags; + /** this points to __u64 * which point to cs chunks */ + __u64 chunks; +}; + +struct drm_amdgpu_cs_out { + __u64 handle; +}; + +union drm_amdgpu_cs { + struct drm_amdgpu_cs_in in; + struct drm_amdgpu_cs_out out; +}; + +/* Specify flags to be used for IB */ + +/* This IB should be submitted to CE */ +#define AMDGPU_IB_FLAG_CE (1<<0) + +/* Preamble flag, which means the IB could be dropped if no context switch */ +#define AMDGPU_IB_FLAG_PREAMBLE (1<<1) + +/* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */ +#define AMDGPU_IB_FLAG_PREEMPT (1<<2) + +/* The IB fence should do the L2 writeback but not invalidate any shader + * caches (L2/vL1/sL1/I$). */ +#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3) + +/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER. + * This will reset wave ID counters for the IB. 
+ */ +#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4) + +/* Flag the IB as secure (TMZ) + */ +#define AMDGPU_IB_FLAGS_SECURE (1 << 5) + +/* Tell KMD to flush and invalidate caches + */ +#define AMDGPU_IB_FLAG_EMIT_MEM_SYNC (1 << 6) + +struct drm_amdgpu_cs_chunk_ib { + __u32 _pad; + /** AMDGPU_IB_FLAG_* */ + __u32 flags; + /** Virtual address to begin IB execution */ + __u64 va_start; + /** Size of submission */ + __u32 ib_bytes; + /** HW IP to submit to */ + __u32 ip_type; + /** HW IP index of the same type to submit to */ + __u32 ip_instance; + /** Ring index to submit to */ + __u32 ring; +}; + +struct drm_amdgpu_cs_chunk_dep { + __u32 ip_type; + __u32 ip_instance; + __u32 ring; + __u32 ctx_id; + __u64 handle; +}; + +struct drm_amdgpu_cs_chunk_fence { + __u32 handle; + __u32 offset; +}; + +struct drm_amdgpu_cs_chunk_sem { + __u32 handle; +}; + +struct drm_amdgpu_cs_chunk_syncobj { + __u32 handle; + __u32 flags; + __u64 point; +}; + +#define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ 0 +#define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD 1 +#define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD 2 + +union drm_amdgpu_fence_to_handle { + struct { + struct drm_amdgpu_fence fence; + __u32 what; + __u32 pad; + } in; + struct { + __u32 handle; + } out; +}; + +struct drm_amdgpu_cs_chunk_data { + union { + struct drm_amdgpu_cs_chunk_ib ib_data; + struct drm_amdgpu_cs_chunk_fence fence_data; + }; +}; + +#define AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW 0x1 + +struct drm_amdgpu_cs_chunk_cp_gfx_shadow { + __u64 shadow_va; + __u64 csa_va; + __u64 gds_va; + __u64 flags; +}; + +/* + * Query h/w info: Flag that this is integrated (a.h.a. 
fusion) GPU + * + */ +#define AMDGPU_IDS_FLAGS_FUSION 0x1 +#define AMDGPU_IDS_FLAGS_PREEMPTION 0x2 +#define AMDGPU_IDS_FLAGS_TMZ 0x4 +#define AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD 0x8 + +/* + * Query h/w info: Flag identifying VF/PF/PT mode + * + */ +#define AMDGPU_IDS_FLAGS_MODE_MASK 0x300 +#define AMDGPU_IDS_FLAGS_MODE_SHIFT 0x8 +#define AMDGPU_IDS_FLAGS_MODE_PF 0x0 +#define AMDGPU_IDS_FLAGS_MODE_VF 0x1 +#define AMDGPU_IDS_FLAGS_MODE_PT 0x2 + +/* indicate if acceleration can be working */ +#define AMDGPU_INFO_ACCEL_WORKING 0x00 +/* get the crtc_id from the mode object id? */ +#define AMDGPU_INFO_CRTC_FROM_ID 0x01 +/* query hw IP info */ +#define AMDGPU_INFO_HW_IP_INFO 0x02 +/* query hw IP instance count for the specified type */ +#define AMDGPU_INFO_HW_IP_COUNT 0x03 +/* timestamp for GL_ARB_timer_query */ +#define AMDGPU_INFO_TIMESTAMP 0x05 +/* Query the firmware version */ +#define AMDGPU_INFO_FW_VERSION 0x0e + /* Subquery id: Query VCE firmware version */ + #define AMDGPU_INFO_FW_VCE 0x1 + /* Subquery id: Query UVD firmware version */ + #define AMDGPU_INFO_FW_UVD 0x2 + /* Subquery id: Query GMC firmware version */ + #define AMDGPU_INFO_FW_GMC 0x03 + /* Subquery id: Query GFX ME firmware version */ + #define AMDGPU_INFO_FW_GFX_ME 0x04 + /* Subquery id: Query GFX PFP firmware version */ + #define AMDGPU_INFO_FW_GFX_PFP 0x05 + /* Subquery id: Query GFX CE firmware version */ + #define AMDGPU_INFO_FW_GFX_CE 0x06 + /* Subquery id: Query GFX RLC firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC 0x07 + /* Subquery id: Query GFX MEC firmware version */ + #define AMDGPU_INFO_FW_GFX_MEC 0x08 + /* Subquery id: Query SMC firmware version */ + #define AMDGPU_INFO_FW_SMC 0x0a + /* Subquery id: Query SDMA firmware version */ + #define AMDGPU_INFO_FW_SDMA 0x0b + /* Subquery id: Query PSP SOS firmware version */ + #define AMDGPU_INFO_FW_SOS 0x0c + /* Subquery id: Query PSP ASD firmware version */ + #define AMDGPU_INFO_FW_ASD 0x0d + /* Subquery id: Query VCN firmware 
version */ + #define AMDGPU_INFO_FW_VCN 0x0e + /* Subquery id: Query GFX RLC SRLC firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_CNTL 0x0f + /* Subquery id: Query GFX RLC SRLG firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_GPM_MEM 0x10 + /* Subquery id: Query GFX RLC SRLS firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_SRM_MEM 0x11 + /* Subquery id: Query DMCU firmware version */ + #define AMDGPU_INFO_FW_DMCU 0x12 + #define AMDGPU_INFO_FW_TA 0x13 + /* Subquery id: Query DMCUB firmware version */ + #define AMDGPU_INFO_FW_DMCUB 0x14 + /* Subquery id: Query TOC firmware version */ + #define AMDGPU_INFO_FW_TOC 0x15 + /* Subquery id: Query CAP firmware version */ + #define AMDGPU_INFO_FW_CAP 0x16 + /* Subquery id: Query GFX RLCP firmware version */ + #define AMDGPU_INFO_FW_GFX_RLCP 0x17 + /* Subquery id: Query GFX RLCV firmware version */ + #define AMDGPU_INFO_FW_GFX_RLCV 0x18 + /* Subquery id: Query MES_KIQ firmware version */ + #define AMDGPU_INFO_FW_MES_KIQ 0x19 + /* Subquery id: Query MES firmware version */ + #define AMDGPU_INFO_FW_MES 0x1a + /* Subquery id: Query IMU firmware version */ + #define AMDGPU_INFO_FW_IMU 0x1b + /* Subquery id: Query VPE firmware version */ + #define AMDGPU_INFO_FW_VPE 0x1c + +/* number of bytes moved for TTM migration */ +#define AMDGPU_INFO_NUM_BYTES_MOVED 0x0f +/* the used VRAM size */ +#define AMDGPU_INFO_VRAM_USAGE 0x10 +/* the used GTT size */ +#define AMDGPU_INFO_GTT_USAGE 0x11 +/* Information about GDS, etc. resource configuration */ +#define AMDGPU_INFO_GDS_CONFIG 0x13 +/* Query information about VRAM and GTT domains */ +#define AMDGPU_INFO_VRAM_GTT 0x14 +/* Query information about register in MMR address space*/ +#define AMDGPU_INFO_READ_MMR_REG 0x15 +/* Query information about device: rev id, family, etc. 
*/ +#define AMDGPU_INFO_DEV_INFO 0x16 +/* visible vram usage */ +#define AMDGPU_INFO_VIS_VRAM_USAGE 0x17 +/* number of TTM buffer evictions */ +#define AMDGPU_INFO_NUM_EVICTIONS 0x18 +/* Query memory about VRAM and GTT domains */ +#define AMDGPU_INFO_MEMORY 0x19 +/* Query vce clock table */ +#define AMDGPU_INFO_VCE_CLOCK_TABLE 0x1A +/* Query vbios related information */ +#define AMDGPU_INFO_VBIOS 0x1B + /* Subquery id: Query vbios size */ + #define AMDGPU_INFO_VBIOS_SIZE 0x1 + /* Subquery id: Query vbios image */ + #define AMDGPU_INFO_VBIOS_IMAGE 0x2 + /* Subquery id: Query vbios info */ + #define AMDGPU_INFO_VBIOS_INFO 0x3 +/* Query UVD handles */ +#define AMDGPU_INFO_NUM_HANDLES 0x1C +/* Query sensor related information */ +#define AMDGPU_INFO_SENSOR 0x1D + /* Subquery id: Query GPU shader clock */ + #define AMDGPU_INFO_SENSOR_GFX_SCLK 0x1 + /* Subquery id: Query GPU memory clock */ + #define AMDGPU_INFO_SENSOR_GFX_MCLK 0x2 + /* Subquery id: Query GPU temperature */ + #define AMDGPU_INFO_SENSOR_GPU_TEMP 0x3 + /* Subquery id: Query GPU load */ + #define AMDGPU_INFO_SENSOR_GPU_LOAD 0x4 + /* Subquery id: Query average GPU power */ + #define AMDGPU_INFO_SENSOR_GPU_AVG_POWER 0x5 + /* Subquery id: Query northbridge voltage */ + #define AMDGPU_INFO_SENSOR_VDDNB 0x6 + /* Subquery id: Query graphics voltage */ + #define AMDGPU_INFO_SENSOR_VDDGFX 0x7 + /* Subquery id: Query GPU stable pstate shader clock */ + #define AMDGPU_INFO_SENSOR_STABLE_PSTATE_GFX_SCLK 0x8 + /* Subquery id: Query GPU stable pstate memory clock */ + #define AMDGPU_INFO_SENSOR_STABLE_PSTATE_GFX_MCLK 0x9 + /* Subquery id: Query GPU peak pstate shader clock */ + #define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_SCLK 0xa + /* Subquery id: Query GPU peak pstate memory clock */ + #define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_MCLK 0xb + /* Subquery id: Query input GPU power */ + #define AMDGPU_INFO_SENSOR_GPU_INPUT_POWER 0xc +/* Number of VRAM page faults on CPU access. 
*/ +#define AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS 0x1E +#define AMDGPU_INFO_VRAM_LOST_COUNTER 0x1F +/* query ras mask of enabled features*/ +#define AMDGPU_INFO_RAS_ENABLED_FEATURES 0x20 +/* RAS MASK: UMC (VRAM) */ +#define AMDGPU_INFO_RAS_ENABLED_UMC (1 << 0) +/* RAS MASK: SDMA */ +#define AMDGPU_INFO_RAS_ENABLED_SDMA (1 << 1) +/* RAS MASK: GFX */ +#define AMDGPU_INFO_RAS_ENABLED_GFX (1 << 2) +/* RAS MASK: MMHUB */ +#define AMDGPU_INFO_RAS_ENABLED_MMHUB (1 << 3) +/* RAS MASK: ATHUB */ +#define AMDGPU_INFO_RAS_ENABLED_ATHUB (1 << 4) +/* RAS MASK: PCIE */ +#define AMDGPU_INFO_RAS_ENABLED_PCIE (1 << 5) +/* RAS MASK: HDP */ +#define AMDGPU_INFO_RAS_ENABLED_HDP (1 << 6) +/* RAS MASK: XGMI */ +#define AMDGPU_INFO_RAS_ENABLED_XGMI (1 << 7) +/* RAS MASK: DF */ +#define AMDGPU_INFO_RAS_ENABLED_DF (1 << 8) +/* RAS MASK: SMN */ +#define AMDGPU_INFO_RAS_ENABLED_SMN (1 << 9) +/* RAS MASK: SEM */ +#define AMDGPU_INFO_RAS_ENABLED_SEM (1 << 10) +/* RAS MASK: MP0 */ +#define AMDGPU_INFO_RAS_ENABLED_MP0 (1 << 11) +/* RAS MASK: MP1 */ +#define AMDGPU_INFO_RAS_ENABLED_MP1 (1 << 12) +/* RAS MASK: FUSE */ +#define AMDGPU_INFO_RAS_ENABLED_FUSE (1 << 13) +/* query video encode/decode caps */ +#define AMDGPU_INFO_VIDEO_CAPS 0x21 + /* Subquery id: Decode */ + #define AMDGPU_INFO_VIDEO_CAPS_DECODE 0 + /* Subquery id: Encode */ + #define AMDGPU_INFO_VIDEO_CAPS_ENCODE 1 +/* Query the max number of IBs per gang per submission */ +#define AMDGPU_INFO_MAX_IBS 0x22 +/* query last page fault info */ +#define AMDGPU_INFO_GPUVM_FAULT 0x23 +/* query FW object size and alignment */ +#define AMDGPU_INFO_UQ_FW_AREAS 0x24 + +#define AMDGPU_INFO_MMR_SE_INDEX_SHIFT 0 +#define AMDGPU_INFO_MMR_SE_INDEX_MASK 0xff +#define AMDGPU_INFO_MMR_SH_INDEX_SHIFT 8 +#define AMDGPU_INFO_MMR_SH_INDEX_MASK 0xff + +struct drm_amdgpu_query_fw { + /** AMDGPU_INFO_FW_* */ + __u32 fw_type; + /** + * Index of the IP if there are more IPs of + * the same type. + */ + __u32 ip_instance; + /** + * Index of the engine. 
Whether this is used depends + * on the firmware type. (e.g. MEC, SDMA) + */ + __u32 index; + __u32 _pad; +}; + +/* Input structure for the INFO ioctl */ +struct drm_amdgpu_info { + /* Where the return value will be stored */ + __u64 return_pointer; + /* The size of the return value. Just like "size" in "snprintf", + * it limits how many bytes the kernel can write. */ + __u32 return_size; + /* The query request id. */ + __u32 query; + + union { + struct { + __u32 id; + __u32 _pad; + } mode_crtc; + + struct { + /** AMDGPU_HW_IP_* */ + __u32 type; + /** + * Index of the IP if there are more IPs of the same + * type. Ignored by AMDGPU_INFO_HW_IP_COUNT. + */ + __u32 ip_instance; + } query_hw_ip; + + struct { + __u32 dword_offset; + /** number of registers to read */ + __u32 count; + __u32 instance; + /** For future use, no flags defined so far */ + __u32 flags; + } read_mmr_reg; + + struct drm_amdgpu_query_fw query_fw; + + struct { + __u32 type; + __u32 offset; + } vbios_info; + + struct { + __u32 type; + } sensor_info; + + struct { + __u32 type; + } video_cap; + }; +}; + +struct drm_amdgpu_info_gds { + /** GDS GFX partition size */ + __u32 gds_gfx_partition_size; + /** GDS compute partition size */ + __u32 compute_partition_size; + /** total GDS memory size */ + __u32 gds_total_size; + /** GWS size per GFX partition */ + __u32 gws_per_gfx_partition; + /** GSW size per compute partition */ + __u32 gws_per_compute_partition; + /** OA size per GFX partition */ + __u32 oa_per_gfx_partition; + /** OA size per compute partition */ + __u32 oa_per_compute_partition; + __u32 _pad; +}; + +struct drm_amdgpu_info_vram_gtt { + __u64 vram_size; + __u64 vram_cpu_accessible_size; + __u64 gtt_size; +}; + +struct drm_amdgpu_heap_info { + /** max. physical memory */ + __u64 total_heap_size; + + /** Theoretical max. available memory in the given heap */ + __u64 usable_heap_size; + + /** + * Number of bytes allocated in the heap. 
This includes all processes + * and private allocations in the kernel. It changes when new buffers + * are allocated, freed, and moved. It cannot be larger than + * heap_size. + */ + __u64 heap_usage; + + /** + * Theoretical possible max. size of buffer which + * could be allocated in the given heap + */ + __u64 max_allocation; +}; + +struct drm_amdgpu_memory_info { + struct drm_amdgpu_heap_info vram; + struct drm_amdgpu_heap_info cpu_accessible_vram; + struct drm_amdgpu_heap_info gtt; +}; + +struct drm_amdgpu_info_firmware { + __u32 ver; + __u32 feature; +}; + +struct drm_amdgpu_info_vbios { + __u8 name[64]; + __u8 vbios_pn[64]; + __u32 version; + __u32 pad; + __u8 vbios_ver_str[32]; + __u8 date[32]; +}; + +#define AMDGPU_VRAM_TYPE_UNKNOWN 0 +#define AMDGPU_VRAM_TYPE_GDDR1 1 +#define AMDGPU_VRAM_TYPE_DDR2 2 +#define AMDGPU_VRAM_TYPE_GDDR3 3 +#define AMDGPU_VRAM_TYPE_GDDR4 4 +#define AMDGPU_VRAM_TYPE_GDDR5 5 +#define AMDGPU_VRAM_TYPE_HBM 6 +#define AMDGPU_VRAM_TYPE_DDR3 7 +#define AMDGPU_VRAM_TYPE_DDR4 8 +#define AMDGPU_VRAM_TYPE_GDDR6 9 +#define AMDGPU_VRAM_TYPE_DDR5 10 +#define AMDGPU_VRAM_TYPE_LPDDR4 11 +#define AMDGPU_VRAM_TYPE_LPDDR5 12 +#define AMDGPU_VRAM_TYPE_HBM3E 13 + +struct drm_amdgpu_info_device { + /** PCI Device ID */ + __u32 device_id; + /** Internal chip revision: A0, A1, etc.) */ + __u32 chip_rev; + __u32 external_rev; + /** Revision id in PCI Config space */ + __u32 pci_rev; + __u32 family; + __u32 num_shader_engines; + __u32 num_shader_arrays_per_engine; + /* in KHz */ + __u32 gpu_counter_freq; + __u64 max_engine_clock; + __u64 max_memory_clock; + /* cu information */ + __u32 cu_active_number; + /* NOTE: cu_ao_mask is INVALID, DON'T use it */ + __u32 cu_ao_mask; + __u32 cu_bitmap[4][4]; + /** Render backend pipe mask. One render backend is CB+DB. 
*/ + __u32 enabled_rb_pipes_mask; + __u32 num_rb_pipes; + __u32 num_hw_gfx_contexts; + /* PCIe version (the smaller of the GPU and the CPU/motherboard) */ + __u32 pcie_gen; + __u64 ids_flags; + /** Starting virtual address for UMDs. */ + __u64 virtual_address_offset; + /** The maximum virtual address */ + __u64 virtual_address_max; + /** Required alignment of virtual addresses. */ + __u32 virtual_address_alignment; + /** Page table entry - fragment size */ + __u32 pte_fragment_size; + __u32 gart_page_size; + /** constant engine ram size*/ + __u32 ce_ram_size; + /** video memory type info*/ + __u32 vram_type; + /** video memory bit width*/ + __u32 vram_bit_width; + /* vce harvesting instance */ + __u32 vce_harvest_config; + /* gfx double offchip LDS buffers */ + __u32 gc_double_offchip_lds_buf; + /* NGG Primitive Buffer */ + __u64 prim_buf_gpu_addr; + /* NGG Position Buffer */ + __u64 pos_buf_gpu_addr; + /* NGG Control Sideband */ + __u64 cntl_sb_buf_gpu_addr; + /* NGG Parameter Cache */ + __u64 param_buf_gpu_addr; + __u32 prim_buf_size; + __u32 pos_buf_size; + __u32 cntl_sb_buf_size; + __u32 param_buf_size; + /* wavefront size*/ + __u32 wave_front_size; + /* shader visible vgprs*/ + __u32 num_shader_visible_vgprs; + /* CU per shader array*/ + __u32 num_cu_per_sh; + /* number of tcc blocks*/ + __u32 num_tcc_blocks; + /* gs vgt table depth*/ + __u32 gs_vgt_table_depth; + /* gs primitive buffer depth*/ + __u32 gs_prim_buffer_depth; + /* max gs wavefront per vgt*/ + __u32 max_gs_waves_per_vgt; + /* PCIe number of lanes (the smaller of the GPU and the CPU/motherboard) */ + __u32 pcie_num_lanes; + /* always on cu bitmap */ + __u32 cu_ao_bitmap[4][4]; + /** Starting high virtual address for UMDs. 
*/ + __u64 high_va_offset; + /** The maximum high virtual address */ + __u64 high_va_max; + /* gfx10 pa_sc_tile_steering_override */ + __u32 pa_sc_tile_steering_override; + /* disabled TCCs */ + __u64 tcc_disabled_mask; + __u64 min_engine_clock; + __u64 min_memory_clock; + /* The following fields are only set on gfx11+, older chips set 0. */ + __u32 tcp_cache_size; /* AKA GL0, VMEM cache */ + __u32 num_sqc_per_wgp; + __u32 sqc_data_cache_size; /* AKA SMEM cache */ + __u32 sqc_inst_cache_size; + __u32 gl1c_cache_size; + __u32 gl2c_cache_size; + __u64 mall_size; /* AKA infinity cache */ + /* high 32 bits of the rb pipes mask */ + __u32 enabled_rb_pipes_mask_hi; + /* shadow area size for gfx11 */ + __u32 shadow_size; + /* shadow area base virtual alignment for gfx11 */ + __u32 shadow_alignment; + /* context save area size for gfx11 */ + __u32 csa_size; + /* context save area base virtual alignment for gfx11 */ + __u32 csa_alignment; + /* Userq IP mask (1 << AMDGPU_HW_IP_*) */ + __u32 userq_ip_mask; + __u32 pad; +}; + +struct drm_amdgpu_info_hw_ip { + /** Version of h/w IP */ + __u32 hw_ip_version_major; + __u32 hw_ip_version_minor; + /** Capabilities */ + __u64 capabilities_flags; + /** command buffer address start alignment*/ + __u32 ib_start_alignment; + /** command buffer size alignment*/ + __u32 ib_size_alignment; + /** Bitmask of available rings. Bit 0 means ring 0, etc. 
*/ + __u32 available_rings; + /** version info: bits 23:16 major, 15:8 minor, 7:0 revision */ + __u32 ip_discovery_version; + /* Userq available slots */ + __u32 userq_num_slots; +}; + +/* GFX metadata BO sizes and alignment info (in bytes) */ +struct drm_amdgpu_info_uq_fw_areas_gfx { + /* shadow area size */ + __u32 shadow_size; + /* shadow area base virtual mem alignment */ + __u32 shadow_alignment; + /* context save area size */ + __u32 csa_size; + /* context save area base virtual mem alignment */ + __u32 csa_alignment; +}; + +/* IP specific fw related information used in the + * subquery AMDGPU_INFO_UQ_FW_AREAS + */ +struct drm_amdgpu_info_uq_fw_areas { + union { + struct drm_amdgpu_info_uq_fw_areas_gfx gfx; + }; +}; + +struct drm_amdgpu_info_num_handles { + /** Max handles as supported by firmware for UVD */ + __u32 uvd_max_handles; + /** Handles currently in use for UVD */ + __u32 uvd_used_handles; +}; + +#define AMDGPU_VCE_CLOCK_TABLE_ENTRIES 6 + +struct drm_amdgpu_info_vce_clock_table_entry { + /** System clock */ + __u32 sclk; + /** Memory clock */ + __u32 mclk; + /** VCE clock */ + __u32 eclk; + __u32 pad; +}; + +struct drm_amdgpu_info_vce_clock_table { + struct drm_amdgpu_info_vce_clock_table_entry entries[AMDGPU_VCE_CLOCK_TABLE_ENTRIES]; + __u32 num_valid_entries; + __u32 pad; +}; + +/* query video encode/decode caps */ +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2 0 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4 1 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1 2 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC 3 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC 4 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG 5 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9 6 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1 7 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_COUNT 8 + +struct drm_amdgpu_info_video_codec_info { + __u32 valid; + __u32 max_width; + __u32 max_height; + __u32 max_pixels_per_frame; + __u32 max_level; + __u32 pad; +}; + +struct 
drm_amdgpu_info_video_caps { + struct drm_amdgpu_info_video_codec_info codec_info[AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_COUNT]; +}; + +#define AMDGPU_VMHUB_TYPE_MASK 0xff +#define AMDGPU_VMHUB_TYPE_SHIFT 0 +#define AMDGPU_VMHUB_TYPE_GFX 0 +#define AMDGPU_VMHUB_TYPE_MM0 1 +#define AMDGPU_VMHUB_TYPE_MM1 2 +#define AMDGPU_VMHUB_IDX_MASK 0xff00 +#define AMDGPU_VMHUB_IDX_SHIFT 8 + +struct drm_amdgpu_info_gpuvm_fault { + __u64 addr; + __u32 status; + __u32 vmhub; +}; + +struct drm_amdgpu_info_uq_metadata_gfx { + /* shadow area size for gfx11 */ + __u32 shadow_size; + /* shadow area base virtual alignment for gfx11 */ + __u32 shadow_alignment; + /* context save area size for gfx11 */ + __u32 csa_size; + /* context save area base virtual alignment for gfx11 */ + __u32 csa_alignment; +}; + +struct drm_amdgpu_info_uq_metadata { + union { + struct drm_amdgpu_info_uq_metadata_gfx gfx; + }; +}; + +/* + * Supported GPU families + */ +#define AMDGPU_FAMILY_UNKNOWN 0 +#define AMDGPU_FAMILY_SI 110 /* Hainan, Oland, Verde, Pitcairn, Tahiti */ +#define AMDGPU_FAMILY_CI 120 /* Bonaire, Hawaii */ +#define AMDGPU_FAMILY_KV 125 /* Kaveri, Kabini, Mullins */ +#define AMDGPU_FAMILY_VI 130 /* Iceland, Tonga */ +#define AMDGPU_FAMILY_CZ 135 /* Carrizo, Stoney */ +#define AMDGPU_FAMILY_AI 141 /* Vega10 */ +#define AMDGPU_FAMILY_RV 142 /* Raven */ +#define AMDGPU_FAMILY_NV 143 /* Navi10 */ +#define AMDGPU_FAMILY_VGH 144 /* Van Gogh */ +#define AMDGPU_FAMILY_GC_11_0_0 145 /* GC 11.0.0 */ +#define AMDGPU_FAMILY_YC 146 /* Yellow Carp */ +#define AMDGPU_FAMILY_GC_11_0_1 148 /* GC 11.0.1 */ +#define AMDGPU_FAMILY_GC_10_3_6 149 /* GC 10.3.6 */ +#define AMDGPU_FAMILY_GC_10_3_7 151 /* GC 10.3.7 */ +#define AMDGPU_FAMILY_GC_11_5_0 150 /* GC 11.5.0 */ +#define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 */ + +/* FIXME wrong namespace! */ +struct drm_color_ctm_3x4 { + /* + * Conversion matrix with 3x4 dimensions in S31.32 sign-magnitude + * (not two's complement!) format. 
+ */ + __u64 matrix[12]; +}; + +#if defined(__cplusplus) +} +#endif + +#endif From 5eb61e1b14959acb858fea69d45bf5c8f7f53ee5 Mon Sep 17 00:00:00 2001 From: David Francis Date: Thu, 15 May 2025 09:49:24 -0400 Subject: [PATCH 733/775] plugin/amdgpu: Add drm header The amdgpu plugin usually calls drm ioctls through the libdrm wrappers. However, amdgpu restore requires dealing with dmabufs and gem handles directly, which means drm ioctls must be called directly. Add the drm.h header (from the kernel's uapi). Signed-off-by: David Francis --- plugins/amdgpu/drm.h | 1450 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1450 insertions(+) create mode 100644 plugins/amdgpu/drm.h diff --git a/plugins/amdgpu/drm.h b/plugins/amdgpu/drm.h new file mode 100644 index 000000000..84c819c17 --- /dev/null +++ b/plugins/amdgpu/drm.h @@ -0,0 +1,1450 @@ +/* + * Header for the Direct Rendering Manager + * + * Author: Rickard E. (Rik) Faith + * + * Acknowledgments: + * Dec 1999, Richard Henderson , move to generic cmpxchg. + */ + +/* + * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. + * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. + * All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _DRM_H_ +#define _DRM_H_ + +#if defined(__KERNEL__) + +#include <linux/types.h> +#include <asm/ioctl.h> +typedef unsigned int drm_handle_t; + +#elif defined(__linux__) + +#include <linux/types.h> +#include <asm/ioctl.h> +typedef unsigned int drm_handle_t; + +#else /* One of the BSDs */ + +#include <stdint.h> +#include <sys/ioccom.h> +#include <sys/types.h> +typedef int8_t __s8; +typedef uint8_t __u8; +typedef int16_t __s16; +typedef uint16_t __u16; +typedef int32_t __s32; +typedef uint32_t __u32; +typedef int64_t __s64; +typedef uint64_t __u64; +typedef size_t __kernel_size_t; +typedef unsigned long drm_handle_t; + +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_NAME "drm" /**< Name in kernel, /dev, and /proc */ +#define DRM_MIN_ORDER 5 /**< At least 2^5 bytes = 32 bytes */ +#define DRM_MAX_ORDER 22 /**< Up to 2^22 bytes = 4MB */ +#define DRM_RAM_PERCENT 10 /**< How much system ram can we lock? */ + +#define _DRM_LOCK_HELD 0x80000000U /**< Hardware lock is held */ +#define _DRM_LOCK_CONT 0x40000000U /**< Hardware lock is contended */ +#define _DRM_LOCK_IS_HELD(lock) ((lock) & _DRM_LOCK_HELD) +#define _DRM_LOCK_IS_CONT(lock) ((lock) & _DRM_LOCK_CONT) +#define _DRM_LOCKING_CONTEXT(lock) ((lock) & ~(_DRM_LOCK_HELD|_DRM_LOCK_CONT)) + +typedef unsigned int drm_context_t; +typedef unsigned int drm_drawable_t; +typedef unsigned int drm_magic_t; + +/* + * Cliprect.
+ * + * \warning: If you change this structure, make sure you change + * XF86DRIClipRectRec in the server as well + * + * \note KW: Actually it's illegal to change either for + * backwards-compatibility reasons. + */ +struct drm_clip_rect { + unsigned short x1; + unsigned short y1; + unsigned short x2; + unsigned short y2; +}; + +/* + * Drawable information. + */ +struct drm_drawable_info { + unsigned int num_rects; + struct drm_clip_rect *rects; +}; + +/* + * Texture region, + */ +struct drm_tex_region { + unsigned char next; + unsigned char prev; + unsigned char in_use; + unsigned char padding; + unsigned int age; +}; + +/* + * Hardware lock. + * + * The lock structure is a simple cache-line aligned integer. To avoid + * processor bus contention on a multiprocessor system, there should not be any + * other data stored in the same cache line. + */ +struct drm_hw_lock { + __volatile__ unsigned int lock; /**< lock variable */ + char padding[60]; /**< Pad to cache line */ +}; + +/* + * DRM_IOCTL_VERSION ioctl argument type. + * + * \sa drmGetVersion(). + */ +struct drm_version { + int version_major; /**< Major version */ + int version_minor; /**< Minor version */ + int version_patchlevel; /**< Patch level */ + __kernel_size_t name_len; /**< Length of name buffer */ + char __user *name; /**< Name of driver */ + __kernel_size_t date_len; /**< Length of date buffer */ + char __user *date; /**< User-space buffer to hold date */ + __kernel_size_t desc_len; /**< Length of desc buffer */ + char __user *desc; /**< User-space buffer to hold desc */ +}; + +/* + * DRM_IOCTL_GET_UNIQUE ioctl argument type. + * + * \sa drmGetBusid() and drmSetBusId(). 
+ */ +struct drm_unique { + __kernel_size_t unique_len; /**< Length of unique */ + char __user *unique; /**< Unique name for driver instantiation */ +}; + +struct drm_list { + int count; /**< Length of user-space structures */ + struct drm_version __user *version; +}; + +struct drm_block { + int unused; +}; + +/* + * DRM_IOCTL_CONTROL ioctl argument type. + * + * \sa drmCtlInstHandler() and drmCtlUninstHandler(). + */ +struct drm_control { + enum { + DRM_ADD_COMMAND, + DRM_RM_COMMAND, + DRM_INST_HANDLER, + DRM_UNINST_HANDLER + } func; + int irq; +}; + +/* + * Type of memory to map. + */ +enum drm_map_type { + _DRM_FRAME_BUFFER = 0, /**< WC (no caching), no core dump */ + _DRM_REGISTERS = 1, /**< no caching, no core dump */ + _DRM_SHM = 2, /**< shared, cached */ + _DRM_AGP = 3, /**< AGP/GART */ + _DRM_SCATTER_GATHER = 4, /**< Scatter/gather memory for PCI DMA */ + _DRM_CONSISTENT = 5 /**< Consistent memory for PCI DMA */ +}; + +/* + * Memory mapping flags. + */ +enum drm_map_flags { + _DRM_RESTRICTED = 0x01, /**< Cannot be mapped to user-virtual */ + _DRM_READ_ONLY = 0x02, + _DRM_LOCKED = 0x04, /**< shared, cached, locked */ + _DRM_KERNEL = 0x08, /**< kernel requires access */ + _DRM_WRITE_COMBINING = 0x10, /**< use write-combining if available */ + _DRM_CONTAINS_LOCK = 0x20, /**< SHM page that contains lock */ + _DRM_REMOVABLE = 0x40, /**< Removable mapping */ + _DRM_DRIVER = 0x80 /**< Managed by driver */ +}; + +struct drm_ctx_priv_map { + unsigned int ctx_id; /**< Context requesting private mapping */ + void *handle; /**< Handle of map */ +}; + +/* + * DRM_IOCTL_GET_MAP, DRM_IOCTL_ADD_MAP and DRM_IOCTL_RM_MAP ioctls + * argument type. + * + * \sa drmAddMap(). 
+ */ +struct drm_map { + unsigned long offset; /**< Requested physical address (0 for SAREA)*/ + unsigned long size; /**< Requested physical size (bytes) */ + enum drm_map_type type; /**< Type of memory to map */ + enum drm_map_flags flags; /**< Flags */ + void *handle; /**< User-space: "Handle" to pass to mmap() */ + /**< Kernel-space: kernel-virtual address */ + int mtrr; /**< MTRR slot used */ + /* Private data */ +}; + +/* + * DRM_IOCTL_GET_CLIENT ioctl argument type. + */ +struct drm_client { + int idx; /**< Which client desired? */ + int auth; /**< Is client authenticated? */ + unsigned long pid; /**< Process ID */ + unsigned long uid; /**< User ID */ + unsigned long magic; /**< Magic */ + unsigned long iocs; /**< Ioctl count */ +}; + +enum drm_stat_type { + _DRM_STAT_LOCK, + _DRM_STAT_OPENS, + _DRM_STAT_CLOSES, + _DRM_STAT_IOCTLS, + _DRM_STAT_LOCKS, + _DRM_STAT_UNLOCKS, + _DRM_STAT_VALUE, /**< Generic value */ + _DRM_STAT_BYTE, /**< Generic byte counter (1024bytes/K) */ + _DRM_STAT_COUNT, /**< Generic non-byte counter (1000/k) */ + + _DRM_STAT_IRQ, /**< IRQ */ + _DRM_STAT_PRIMARY, /**< Primary DMA bytes */ + _DRM_STAT_SECONDARY, /**< Secondary DMA bytes */ + _DRM_STAT_DMA, /**< DMA */ + _DRM_STAT_SPECIAL, /**< Special DMA (e.g., priority or polled) */ + _DRM_STAT_MISSED /**< Missed DMA opportunity */ + /* Add to the *END* of the list */ +}; + +/* + * DRM_IOCTL_GET_STATS ioctl argument type. + */ +struct drm_stats { + unsigned long count; + struct { + unsigned long value; + enum drm_stat_type type; + } data[15]; +}; + +/* + * Hardware locking flags. + */ +enum drm_lock_flags { + _DRM_LOCK_READY = 0x01, /**< Wait until hardware is ready for DMA */ + _DRM_LOCK_QUIESCENT = 0x02, /**< Wait until hardware quiescent */ + _DRM_LOCK_FLUSH = 0x04, /**< Flush this context's DMA queue first */ + _DRM_LOCK_FLUSH_ALL = 0x08, /**< Flush all DMA queues first */ + /* These *HALT* flags aren't supported yet + -- they will be used to support the + full-screen DGA-like mode. 
*/ + _DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */ + _DRM_HALT_CUR_QUEUES = 0x20 /**< Halt all current queues */ +}; + +/* + * DRM_IOCTL_LOCK, DRM_IOCTL_UNLOCK and DRM_IOCTL_FINISH ioctl argument type. + * + * \sa drmGetLock() and drmUnlock(). + */ +struct drm_lock { + int context; + enum drm_lock_flags flags; +}; + +/* + * DMA flags + * + * \warning + * These values \e must match xf86drm.h. + * + * \sa drm_dma. + */ +enum drm_dma_flags { + /* Flags for DMA buffer dispatch */ + _DRM_DMA_BLOCK = 0x01, /**< + * Block until buffer dispatched. + * + * \note The buffer may not yet have + * been processed by the hardware -- + * getting a hardware lock with the + * hardware quiescent will ensure + * that the buffer has been + * processed. + */ + _DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */ + _DRM_DMA_PRIORITY = 0x04, /**< High priority dispatch */ + + /* Flags for DMA buffer request */ + _DRM_DMA_WAIT = 0x10, /**< Wait for free buffers */ + _DRM_DMA_SMALLER_OK = 0x20, /**< Smaller-than-requested buffers OK */ + _DRM_DMA_LARGER_OK = 0x40 /**< Larger-than-requested buffers OK */ +}; + +/* + * DRM_IOCTL_ADD_BUFS and DRM_IOCTL_MARK_BUFS ioctl argument type. + * + * \sa drmAddBufs(). + */ +struct drm_buf_desc { + int count; /**< Number of buffers of this size */ + int size; /**< Size in bytes */ + int low_mark; /**< Low water mark */ + int high_mark; /**< High water mark */ + enum { + _DRM_PAGE_ALIGN = 0x01, /**< Align on page boundaries for DMA */ + _DRM_AGP_BUFFER = 0x02, /**< Buffer is in AGP space */ + _DRM_SG_BUFFER = 0x04, /**< Scatter/gather memory buffer */ + _DRM_FB_BUFFER = 0x08, /**< Buffer is in frame buffer */ + _DRM_PCI_BUFFER_RO = 0x10 /**< Map PCI DMA buffer read-only */ + } flags; + unsigned long agp_start; /**< + * Start address of where the AGP buffers are + * in the AGP aperture + */ +}; + +/* + * DRM_IOCTL_INFO_BUFS ioctl argument type. 
+ */ +struct drm_buf_info { + int count; /**< Entries in list */ + struct drm_buf_desc __user *list; +}; + +/* + * DRM_IOCTL_FREE_BUFS ioctl argument type. + */ +struct drm_buf_free { + int count; + int __user *list; +}; + +/* + * Buffer information + * + * \sa drm_buf_map. + */ +struct drm_buf_pub { + int idx; /**< Index into the master buffer list */ + int total; /**< Buffer size */ + int used; /**< Amount of buffer in use (for DMA) */ + void __user *address; /**< Address of buffer */ +}; + +/* + * DRM_IOCTL_MAP_BUFS ioctl argument type. + */ +struct drm_buf_map { + int count; /**< Length of the buffer list */ +#ifdef __cplusplus + void __user *virt; +#else + void __user *virtual; /**< Mmap'd area in user-virtual */ +#endif + struct drm_buf_pub __user *list; /**< Buffer information */ +}; + +/* + * DRM_IOCTL_DMA ioctl argument type. + * + * Indices here refer to the offset into the buffer list in drm_buf_get. + * + * \sa drmDMA(). + */ +struct drm_dma { + int context; /**< Context handle */ + int send_count; /**< Number of buffers to send */ + int __user *send_indices; /**< List of handles to buffers */ + int __user *send_sizes; /**< Lengths of data to send */ + enum drm_dma_flags flags; /**< Flags */ + int request_count; /**< Number of buffers requested */ + int request_size; /**< Desired size for buffers */ + int __user *request_indices; /**< Buffer information */ + int __user *request_sizes; + int granted_count; /**< Number of buffers granted */ +}; + +enum drm_ctx_flags { + _DRM_CONTEXT_PRESERVED = 0x01, + _DRM_CONTEXT_2DONLY = 0x02 +}; + +/* + * DRM_IOCTL_ADD_CTX ioctl argument type. + * + * \sa drmCreateContext() and drmDestroyContext(). + */ +struct drm_ctx { + drm_context_t handle; + enum drm_ctx_flags flags; +}; + +/* + * DRM_IOCTL_RES_CTX ioctl argument type. + */ +struct drm_ctx_res { + int count; + struct drm_ctx __user *contexts; +}; + +/* + * DRM_IOCTL_ADD_DRAW and DRM_IOCTL_RM_DRAW ioctl argument type. 
+ */ +struct drm_draw { + drm_drawable_t handle; +}; + +/* + * DRM_IOCTL_UPDATE_DRAW ioctl argument type. + */ +typedef enum { + DRM_DRAWABLE_CLIPRECTS +} drm_drawable_info_type_t; + +struct drm_update_draw { + drm_drawable_t handle; + unsigned int type; + unsigned int num; + unsigned long long data; +}; + +/* + * DRM_IOCTL_GET_MAGIC and DRM_IOCTL_AUTH_MAGIC ioctl argument type. + */ +struct drm_auth { + drm_magic_t magic; +}; + +/* + * DRM_IOCTL_IRQ_BUSID ioctl argument type. + * + * \sa drmGetInterruptFromBusID(). + */ +struct drm_irq_busid { + int irq; /**< IRQ number */ + int busnum; /**< bus number */ + int devnum; /**< device number */ + int funcnum; /**< function number */ +}; + +enum drm_vblank_seq_type { + _DRM_VBLANK_ABSOLUTE = 0x0, /**< Wait for specific vblank sequence number */ + _DRM_VBLANK_RELATIVE = 0x1, /**< Wait for given number of vblanks */ + /* bits 1-6 are reserved for high crtcs */ + _DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e, + _DRM_VBLANK_EVENT = 0x4000000, /**< Send event instead of blocking */ + _DRM_VBLANK_FLIP = 0x8000000, /**< Scheduled buffer swap should flip */ + _DRM_VBLANK_NEXTONMISS = 0x10000000, /**< If missed, wait for next vblank */ + _DRM_VBLANK_SECONDARY = 0x20000000, /**< Secondary display controller */ + _DRM_VBLANK_SIGNAL = 0x40000000 /**< Send signal instead of blocking, unsupported */ +}; +#define _DRM_VBLANK_HIGH_CRTC_SHIFT 1 + +#define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE) +#define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \ + _DRM_VBLANK_SECONDARY | _DRM_VBLANK_NEXTONMISS) + +struct drm_wait_vblank_request { + enum drm_vblank_seq_type type; + unsigned int sequence; + unsigned long signal; +}; + +struct drm_wait_vblank_reply { + enum drm_vblank_seq_type type; + unsigned int sequence; + long tval_sec; + long tval_usec; +}; + +/* + * DRM_IOCTL_WAIT_VBLANK ioctl argument type. + * + * \sa drmWaitVBlank(). 
+ */ +union drm_wait_vblank { + struct drm_wait_vblank_request request; + struct drm_wait_vblank_reply reply; +}; + +#define _DRM_PRE_MODESET 1 +#define _DRM_POST_MODESET 2 + +/* + * DRM_IOCTL_MODESET_CTL ioctl argument type + * + * \sa drmModesetCtl(). + */ +struct drm_modeset_ctl { + __u32 crtc; + __u32 cmd; +}; + +/* + * DRM_IOCTL_AGP_ENABLE ioctl argument type. + * + * \sa drmAgpEnable(). + */ +struct drm_agp_mode { + unsigned long mode; /**< AGP mode */ +}; + +/* + * DRM_IOCTL_AGP_ALLOC and DRM_IOCTL_AGP_FREE ioctls argument type. + * + * \sa drmAgpAlloc() and drmAgpFree(). + */ +struct drm_agp_buffer { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for binding / unbinding */ + unsigned long type; /**< Type of memory to allocate */ + unsigned long physical; /**< Physical used by i810 */ +}; + +/* + * DRM_IOCTL_AGP_BIND and DRM_IOCTL_AGP_UNBIND ioctls argument type. + * + * \sa drmAgpBind() and drmAgpUnbind(). + */ +struct drm_agp_binding { + unsigned long handle; /**< From drm_agp_buffer */ + unsigned long offset; /**< In bytes -- will round to page boundary */ +}; + +/* + * DRM_IOCTL_AGP_INFO ioctl argument type. + * + * \sa drmAgpVersionMajor(), drmAgpVersionMinor(), drmAgpGetMode(), + * drmAgpBase(), drmAgpSize(), drmAgpMemoryUsed(), drmAgpMemoryAvail(), + * drmAgpVendorId() and drmAgpDeviceId(). + */ +struct drm_agp_info { + int agp_version_major; + int agp_version_minor; + unsigned long mode; + unsigned long aperture_base; /* physical address */ + unsigned long aperture_size; /* bytes */ + unsigned long memory_allowed; /* bytes */ + unsigned long memory_used; + + /* PCI information */ + unsigned short id_vendor; + unsigned short id_device; +}; + +/* + * DRM_IOCTL_SG_ALLOC ioctl argument type. 
+ */ +struct drm_scatter_gather { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for mapping / unmapping */ +}; + +/* + * DRM_IOCTL_SET_VERSION ioctl argument type. + */ +struct drm_set_version { + int drm_di_major; + int drm_di_minor; + int drm_dd_major; + int drm_dd_minor; +}; + +/* DRM_IOCTL_GEM_CLOSE ioctl argument type */ +struct drm_gem_close { + /** Handle of the object to be closed. */ + __u32 handle; + __u32 pad; +}; + +/* DRM_IOCTL_GEM_FLINK ioctl argument type */ +struct drm_gem_flink { + /** Handle for the object being named */ + __u32 handle; + + /** Returned global name */ + __u32 name; +}; + +/* DRM_IOCTL_GEM_OPEN ioctl argument type */ +struct drm_gem_open { + /** Name of object being opened */ + __u32 name; + + /** Returned handle for the object */ + __u32 handle; + + /** Returned size of the object */ + __u64 size; +}; + +/* DRM_IOCTL_GEM_CHANGE_HANDLE ioctl argument type */ +struct drm_gem_change_handle { + /** Current handle of object */ + __u32 handle; + + /** Handle to change that object to */ + __u32 new_handle; +}; + +/** + * DRM_CAP_DUMB_BUFFER + * + * If set to 1, the driver supports creating dumb buffers via the + * &DRM_IOCTL_MODE_CREATE_DUMB ioctl. + */ +#define DRM_CAP_DUMB_BUFFER 0x1 +/** + * DRM_CAP_VBLANK_HIGH_CRTC + * + * If set to 1, the kernel supports specifying a :ref:`CRTC index` + * in the high bits of &drm_wait_vblank_request.type. + * + * Starting kernel version 2.6.39, this capability is always set to 1. + */ +#define DRM_CAP_VBLANK_HIGH_CRTC 0x2 +/** + * DRM_CAP_DUMB_PREFERRED_DEPTH + * + * The preferred bit depth for dumb buffers. + * + * The bit depth is the number of bits used to indicate the color of a single + * pixel excluding any padding. This is different from the number of bits per + * pixel. For instance, XRGB8888 has a bit depth of 24 but has 32 bits per + * pixel. 
+ * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3 +/** + * DRM_CAP_DUMB_PREFER_SHADOW + * + * If set to 1, the driver prefers userspace to render to a shadow buffer + * instead of directly rendering to a dumb buffer. For best speed, userspace + * should do streaming ordered memory copies into the dumb buffer and never + * read from it. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFER_SHADOW 0x4 +/** + * DRM_CAP_PRIME + * + * Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT + * and &DRM_PRIME_CAP_EXPORT. + * + * Starting from kernel version 6.6, both &DRM_PRIME_CAP_IMPORT and + * &DRM_PRIME_CAP_EXPORT are always advertised. + * + * PRIME buffers are exposed as dma-buf file descriptors. + * See :ref:`prime_buffer_sharing`. + */ +#define DRM_CAP_PRIME 0x5 +/** + * DRM_PRIME_CAP_IMPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME + * buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. + */ +#define DRM_PRIME_CAP_IMPORT 0x1 +/** + * DRM_PRIME_CAP_EXPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME + * buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. + */ +#define DRM_PRIME_CAP_EXPORT 0x2 +/** + * DRM_CAP_TIMESTAMP_MONOTONIC + * + * If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in + * struct drm_event_vblank. If set to 1, the kernel will report timestamps with + * ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these + * clocks. + * + * Starting from kernel version 2.6.39, the default value for this capability + * is 1. 
Starting kernel version 4.15, this capability is always set to 1. + */ +#define DRM_CAP_TIMESTAMP_MONOTONIC 0x6 +/** + * DRM_CAP_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for legacy + * page-flips. + */ +#define DRM_CAP_ASYNC_PAGE_FLIP 0x7 +/** + * DRM_CAP_CURSOR_WIDTH + * + * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid + * width x height combination for the hardware cursor. The intention is that a + * hardware agnostic userspace can query a cursor plane size to use. + * + * Note that the cross-driver contract is to merely return a valid size; + * drivers are free to attach another meaning on top, eg. i915 returns the + * maximum plane size. + */ +#define DRM_CAP_CURSOR_WIDTH 0x8 +/** + * DRM_CAP_CURSOR_HEIGHT + * + * See &DRM_CAP_CURSOR_WIDTH. + */ +#define DRM_CAP_CURSOR_HEIGHT 0x9 +/** + * DRM_CAP_ADDFB2_MODIFIERS + * + * If set to 1, the driver supports supplying modifiers in the + * &DRM_IOCTL_MODE_ADDFB2 ioctl. + */ +#define DRM_CAP_ADDFB2_MODIFIERS 0x10 +/** + * DRM_CAP_PAGE_FLIP_TARGET + * + * If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and + * &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in + * &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP + * ioctl. + */ +#define DRM_CAP_PAGE_FLIP_TARGET 0x11 +/** + * DRM_CAP_CRTC_IN_VBLANK_EVENT + * + * If set to 1, the kernel supports reporting the CRTC ID in + * &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and + * &DRM_EVENT_FLIP_COMPLETE events. + * + * Starting kernel version 4.12, this capability is always set to 1. + */ +#define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12 +/** + * DRM_CAP_SYNCOBJ + * + * If set to 1, the driver supports sync objects. See :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ 0x13 +/** + * DRM_CAP_SYNCOBJ_TIMELINE + * + * If set to 1, the driver supports timeline operations on sync objects. See + * :ref:`drm_sync_objects`. 
+ */ +#define DRM_CAP_SYNCOBJ_TIMELINE 0x14 +/** + * DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for atomic + * commits. + */ +#define DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP 0x15 + +/* DRM_IOCTL_GET_CAP ioctl argument type */ +struct drm_get_cap { + __u64 capability; + __u64 value; +}; + +/** + * DRM_CLIENT_CAP_STEREO_3D + * + * If set to 1, the DRM core will expose the stereo 3D capabilities of the + * monitor by advertising the supported 3D layouts in the flags of struct + * drm_mode_modeinfo. See ``DRM_MODE_FLAG_3D_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 3.13. + */ +#define DRM_CLIENT_CAP_STEREO_3D 1 + +/** + * DRM_CLIENT_CAP_UNIVERSAL_PLANES + * + * If set to 1, the DRM core will expose all planes (overlay, primary, and + * cursor) to userspace. + * + * This capability has been introduced in kernel version 3.15. Starting from + * kernel version 3.17, this capability is always supported for all drivers. + */ +#define DRM_CLIENT_CAP_UNIVERSAL_PLANES 2 + +/** + * DRM_CLIENT_CAP_ATOMIC + * + * If set to 1, the DRM core will expose atomic properties to userspace. This + * implicitly enables &DRM_CLIENT_CAP_UNIVERSAL_PLANES and + * &DRM_CLIENT_CAP_ASPECT_RATIO. + * + * If the driver doesn't support atomic mode-setting, enabling this capability + * will fail with -EOPNOTSUPP. + * + * This capability has been introduced in kernel version 4.0. Starting from + * kernel version 4.2, this capability is always supported for atomic-capable + * drivers. + */ +#define DRM_CLIENT_CAP_ATOMIC 3 + +/** + * DRM_CLIENT_CAP_ASPECT_RATIO + * + * If set to 1, the DRM core will provide aspect ratio information in modes. + * See ``DRM_MODE_FLAG_PIC_AR_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 4.18. 
+ */ +#define DRM_CLIENT_CAP_ASPECT_RATIO 4 + +/** + * DRM_CLIENT_CAP_WRITEBACK_CONNECTORS + * + * If set to 1, the DRM core will expose special connectors to be used for + * writing back to memory the scene setup in the commit. The client must enable + * &DRM_CLIENT_CAP_ATOMIC first. + * + * This capability is always supported for atomic-capable drivers starting from + * kernel version 4.19. + */ +#define DRM_CLIENT_CAP_WRITEBACK_CONNECTORS 5 + +/** + * DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT + * + * Drivers for para-virtualized hardware (e.g. vmwgfx, qxl, virtio and + * virtualbox) have additional restrictions for cursor planes (thus + * making cursor planes on those drivers not truly universal,) e.g. + * they need cursor planes to act like one would expect from a mouse + * cursor and have correctly set hotspot properties. + * If this client cap is not set the DRM core will hide cursor plane on + * those virtualized drivers because not setting it implies that the + * client is not capable of dealing with those extra restrictions. + * Clients which do set cursor hotspot and treat the cursor plane + * like a mouse cursor should set this property. + * The client must enable &DRM_CLIENT_CAP_ATOMIC first. + * + * Setting this property on drivers which do not special case + * cursor planes (i.e. non-virtualized drivers) will return + * EOPNOTSUPP, which can be used by userspace to gauge + * requirements of the hardware/drivers they're running on. + * + * This capability is always supported for atomic-capable virtualized + * drivers starting from kernel version 6.6. + */ +#define DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT 6 + +/* DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */ +struct drm_set_client_cap { + __u64 capability; + __u64 value; +}; + +#define DRM_RDWR O_RDWR +#define DRM_CLOEXEC O_CLOEXEC +struct drm_prime_handle { + __u32 handle; + + /** Flags..
only applicable for handle->fd */ + __u32 flags; + + /** Returned dmabuf file descriptor */ + __s32 fd; +}; + +struct drm_syncobj_create { + __u32 handle; +#define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0) + __u32 flags; +}; + +struct drm_syncobj_destroy { + __u32 handle; + __u32 pad; +}; + +#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0) +struct drm_syncobj_handle { + __u32 handle; + __u32 flags; + + __s32 fd; + __u32 pad; +}; + +struct drm_syncobj_transfer { + __u32 src_handle; + __u32 dst_handle; + __u64 src_point; + __u64 dst_point; + __u32 flags; + __u32 pad; +}; + +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL (1 << 0) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (1 << 1) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE (1 << 2) /* wait for time point to become available */ +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE (1 << 3) /* set fence deadline to deadline_nsec */ +struct drm_syncobj_wait { + __u64 handles; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. + */ + __u64 deadline_nsec; +}; + +struct drm_syncobj_timeline_wait { + __u64 handles; + /* wait on specific timeline point for every handles*/ + __u64 points; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. + */ + __u64 deadline_nsec; +}; + +/** + * struct drm_syncobj_eventfd + * @handle: syncobj handle. 
+ * @flags: Zero to wait for the point to be signalled, or + * &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE to wait for a fence to be + * available for the point. + * @point: syncobj timeline point (set to zero for binary syncobjs). + * @fd: Existing eventfd to send events to. + * @pad: Must be zero. + * + * Register an eventfd to be signalled by a syncobj. The eventfd counter will + * be incremented by one. + */ +struct drm_syncobj_eventfd { + __u32 handle; + __u32 flags; + __u64 point; + __s32 fd; + __u32 pad; +}; + + +struct drm_syncobj_array { + __u64 handles; + __u32 count_handles; + __u32 pad; +}; + +#define DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED (1 << 0) /* last available point on timeline syncobj */ +struct drm_syncobj_timeline_array { + __u64 handles; + __u64 points; + __u32 count_handles; + __u32 flags; +}; + + +/* Query current scanout sequence number */ +struct drm_crtc_get_sequence { + __u32 crtc_id; /* requested crtc_id */ + __u32 active; /* return: crtc output is active */ + __u64 sequence; /* return: most recent vblank sequence */ + __s64 sequence_ns; /* return: most recent time of first pixel out */ +}; + +/* Queue event to be delivered at specified sequence. Time stamp marks + * when the first pixel of the refresh cycle leaves the display engine + * for the display + */ +#define DRM_CRTC_SEQUENCE_RELATIVE 0x00000001 /* sequence is relative to current */ +#define DRM_CRTC_SEQUENCE_NEXT_ON_MISS 0x00000002 /* Use next sequence if we've missed */ + +struct drm_crtc_queue_sequence { + __u32 crtc_id; + __u32 flags; + __u64 sequence; /* on input, target sequence.
on output, actual sequence */ + __u64 user_data; /* user data passed to event */ +}; + +#define DRM_CLIENT_NAME_MAX_LEN 64 +struct drm_set_client_name { + __u64 name_len; + __u64 name; +}; + + +#if defined(__cplusplus) +} +#endif + +#include "drm_mode.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_IOCTL_BASE 'd' +#define DRM_IO(nr) _IO(DRM_IOCTL_BASE,nr) +#define DRM_IOR(nr,type) _IOR(DRM_IOCTL_BASE,nr,type) +#define DRM_IOW(nr,type) _IOW(DRM_IOCTL_BASE,nr,type) +#define DRM_IOWR(nr,type) _IOWR(DRM_IOCTL_BASE,nr,type) + +#define DRM_IOCTL_VERSION DRM_IOWR(0x00, struct drm_version) +#define DRM_IOCTL_GET_UNIQUE DRM_IOWR(0x01, struct drm_unique) +#define DRM_IOCTL_GET_MAGIC DRM_IOR( 0x02, struct drm_auth) +#define DRM_IOCTL_IRQ_BUSID DRM_IOWR(0x03, struct drm_irq_busid) +#define DRM_IOCTL_GET_MAP DRM_IOWR(0x04, struct drm_map) +#define DRM_IOCTL_GET_CLIENT DRM_IOWR(0x05, struct drm_client) +#define DRM_IOCTL_GET_STATS DRM_IOR( 0x06, struct drm_stats) +#define DRM_IOCTL_SET_VERSION DRM_IOWR(0x07, struct drm_set_version) +#define DRM_IOCTL_MODESET_CTL DRM_IOW(0x08, struct drm_modeset_ctl) +/** + * DRM_IOCTL_GEM_CLOSE - Close a GEM handle. + * + * GEM handles are not reference-counted by the kernel. User-space is + * responsible for managing their lifetime. For example, if user-space imports + * the same memory object twice on the same DRM file description, the same GEM + * handle is returned by both imports, and user-space needs to ensure + * &DRM_IOCTL_GEM_CLOSE is performed once only. The same situation can happen + * when a memory object is allocated, then exported and imported again on the + * same DRM file description. The &DRM_IOCTL_MODE_GETFB2 IOCTL is an exception + * and always returns fresh new GEM handles even if an existing GEM handle + * already refers to the same memory object before the IOCTL is performed. 
+ */ +#define DRM_IOCTL_GEM_CLOSE DRM_IOW (0x09, struct drm_gem_close) +#define DRM_IOCTL_GEM_FLINK DRM_IOWR(0x0a, struct drm_gem_flink) +#define DRM_IOCTL_GEM_OPEN DRM_IOWR(0x0b, struct drm_gem_open) +#define DRM_IOCTL_GET_CAP DRM_IOWR(0x0c, struct drm_get_cap) +#define DRM_IOCTL_SET_CLIENT_CAP DRM_IOW( 0x0d, struct drm_set_client_cap) + +#define DRM_IOCTL_SET_UNIQUE DRM_IOW( 0x10, struct drm_unique) +#define DRM_IOCTL_AUTH_MAGIC DRM_IOW( 0x11, struct drm_auth) +#define DRM_IOCTL_BLOCK DRM_IOWR(0x12, struct drm_block) +#define DRM_IOCTL_UNBLOCK DRM_IOWR(0x13, struct drm_block) +#define DRM_IOCTL_CONTROL DRM_IOW( 0x14, struct drm_control) +#define DRM_IOCTL_ADD_MAP DRM_IOWR(0x15, struct drm_map) +#define DRM_IOCTL_ADD_BUFS DRM_IOWR(0x16, struct drm_buf_desc) +#define DRM_IOCTL_MARK_BUFS DRM_IOW( 0x17, struct drm_buf_desc) +#define DRM_IOCTL_INFO_BUFS DRM_IOWR(0x18, struct drm_buf_info) +#define DRM_IOCTL_MAP_BUFS DRM_IOWR(0x19, struct drm_buf_map) +#define DRM_IOCTL_FREE_BUFS DRM_IOW( 0x1a, struct drm_buf_free) + +#define DRM_IOCTL_RM_MAP DRM_IOW( 0x1b, struct drm_map) + +#define DRM_IOCTL_SET_SAREA_CTX DRM_IOW( 0x1c, struct drm_ctx_priv_map) +#define DRM_IOCTL_GET_SAREA_CTX DRM_IOWR(0x1d, struct drm_ctx_priv_map) + +#define DRM_IOCTL_SET_MASTER DRM_IO(0x1e) +#define DRM_IOCTL_DROP_MASTER DRM_IO(0x1f) + +#define DRM_IOCTL_ADD_CTX DRM_IOWR(0x20, struct drm_ctx) +#define DRM_IOCTL_RM_CTX DRM_IOWR(0x21, struct drm_ctx) +#define DRM_IOCTL_MOD_CTX DRM_IOW( 0x22, struct drm_ctx) +#define DRM_IOCTL_GET_CTX DRM_IOWR(0x23, struct drm_ctx) +#define DRM_IOCTL_SWITCH_CTX DRM_IOW( 0x24, struct drm_ctx) +#define DRM_IOCTL_NEW_CTX DRM_IOW( 0x25, struct drm_ctx) +#define DRM_IOCTL_RES_CTX DRM_IOWR(0x26, struct drm_ctx_res) +#define DRM_IOCTL_ADD_DRAW DRM_IOWR(0x27, struct drm_draw) +#define DRM_IOCTL_RM_DRAW DRM_IOWR(0x28, struct drm_draw) +#define DRM_IOCTL_DMA DRM_IOWR(0x29, struct drm_dma) +#define DRM_IOCTL_LOCK DRM_IOW( 0x2a, struct drm_lock) +#define DRM_IOCTL_UNLOCK 
DRM_IOW( 0x2b, struct drm_lock) +#define DRM_IOCTL_FINISH DRM_IOW( 0x2c, struct drm_lock) + +/** + * DRM_IOCTL_PRIME_HANDLE_TO_FD - Convert a GEM handle to a DMA-BUF FD. + * + * User-space sets &drm_prime_handle.handle with the GEM handle to export and + * &drm_prime_handle.flags, and gets back a DMA-BUF file descriptor in + * &drm_prime_handle.fd. + * + * The export can fail for any driver-specific reason, e.g. because export is + * not supported for this specific GEM handle (but might be for others). + * + * Support for exporting DMA-BUFs is advertised via &DRM_PRIME_CAP_EXPORT. + */ +#define DRM_IOCTL_PRIME_HANDLE_TO_FD DRM_IOWR(0x2d, struct drm_prime_handle) +/** + * DRM_IOCTL_PRIME_FD_TO_HANDLE - Convert a DMA-BUF FD to a GEM handle. + * + * User-space sets &drm_prime_handle.fd with a DMA-BUF file descriptor to + * import, and gets back a GEM handle in &drm_prime_handle.handle. + * &drm_prime_handle.flags is unused. + * + * If an existing GEM handle refers to the memory object backing the DMA-BUF, + * that GEM handle is returned. Therefore user-space which needs to handle + * arbitrary DMA-BUFs must have a user-space lookup data structure to manually + * reference-count duplicated GEM handles. For more information see + * &DRM_IOCTL_GEM_CLOSE. + * + * The import can fail for any driver-specific reason, e.g. because import is + * only supported for DMA-BUFs allocated on this DRM device. + * + * Support for importing DMA-BUFs is advertised via &DRM_PRIME_CAP_IMPORT. 
+ */ +#define DRM_IOCTL_PRIME_FD_TO_HANDLE DRM_IOWR(0x2e, struct drm_prime_handle) + +#define DRM_IOCTL_AGP_ACQUIRE DRM_IO( 0x30) +#define DRM_IOCTL_AGP_RELEASE DRM_IO( 0x31) +#define DRM_IOCTL_AGP_ENABLE DRM_IOW( 0x32, struct drm_agp_mode) +#define DRM_IOCTL_AGP_INFO DRM_IOR( 0x33, struct drm_agp_info) +#define DRM_IOCTL_AGP_ALLOC DRM_IOWR(0x34, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_FREE DRM_IOW( 0x35, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_BIND DRM_IOW( 0x36, struct drm_agp_binding) +#define DRM_IOCTL_AGP_UNBIND DRM_IOW( 0x37, struct drm_agp_binding) + +#define DRM_IOCTL_SG_ALLOC DRM_IOWR(0x38, struct drm_scatter_gather) +#define DRM_IOCTL_SG_FREE DRM_IOW( 0x39, struct drm_scatter_gather) + +#define DRM_IOCTL_WAIT_VBLANK DRM_IOWR(0x3a, union drm_wait_vblank) + +#define DRM_IOCTL_CRTC_GET_SEQUENCE DRM_IOWR(0x3b, struct drm_crtc_get_sequence) +#define DRM_IOCTL_CRTC_QUEUE_SEQUENCE DRM_IOWR(0x3c, struct drm_crtc_queue_sequence) + +#define DRM_IOCTL_UPDATE_DRAW DRM_IOW(0x3f, struct drm_update_draw) + +#define DRM_IOCTL_MODE_GETRESOURCES DRM_IOWR(0xA0, struct drm_mode_card_res) +#define DRM_IOCTL_MODE_GETCRTC DRM_IOWR(0xA1, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_SETCRTC DRM_IOWR(0xA2, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_CURSOR DRM_IOWR(0xA3, struct drm_mode_cursor) +#define DRM_IOCTL_MODE_GETGAMMA DRM_IOWR(0xA4, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_SETGAMMA DRM_IOWR(0xA5, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_GETENCODER DRM_IOWR(0xA6, struct drm_mode_get_encoder) +#define DRM_IOCTL_MODE_GETCONNECTOR DRM_IOWR(0xA7, struct drm_mode_get_connector) +#define DRM_IOCTL_MODE_ATTACHMODE DRM_IOWR(0xA8, struct drm_mode_mode_cmd) /* deprecated (never worked) */ +#define DRM_IOCTL_MODE_DETACHMODE DRM_IOWR(0xA9, struct drm_mode_mode_cmd) /* deprecated (never worked) */ + +#define DRM_IOCTL_MODE_GETPROPERTY DRM_IOWR(0xAA, struct drm_mode_get_property) +#define DRM_IOCTL_MODE_SETPROPERTY DRM_IOWR(0xAB, struct 
drm_mode_connector_set_property) +#define DRM_IOCTL_MODE_GETPROPBLOB DRM_IOWR(0xAC, struct drm_mode_get_blob) +#define DRM_IOCTL_MODE_GETFB DRM_IOWR(0xAD, struct drm_mode_fb_cmd) +#define DRM_IOCTL_MODE_ADDFB DRM_IOWR(0xAE, struct drm_mode_fb_cmd) +/** + * DRM_IOCTL_MODE_RMFB - Remove a framebuffer. + * + * This removes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * Warning: removing a framebuffer currently in-use on an enabled plane will + * disable that plane. The CRTC the plane is linked to may also be disabled + * (depending on driver capabilities). + */ +#define DRM_IOCTL_MODE_RMFB DRM_IOWR(0xAF, unsigned int) +#define DRM_IOCTL_MODE_PAGE_FLIP DRM_IOWR(0xB0, struct drm_mode_crtc_page_flip) +#define DRM_IOCTL_MODE_DIRTYFB DRM_IOWR(0xB1, struct drm_mode_fb_dirty_cmd) + +/** + * DRM_IOCTL_MODE_CREATE_DUMB - Create a new dumb buffer object. + * + * KMS dumb buffers provide a very primitive way to allocate a buffer object + * suitable for scanout and map it for software rendering. KMS dumb buffers are + * not suitable for hardware-accelerated rendering nor video decoding. KMS dumb + * buffers are not suitable to be displayed on any other device than the KMS + * device where they were allocated from. Also see + * :ref:`kms_dumb_buffer_objects`. + * + * The IOCTL argument is a struct drm_mode_create_dumb. + * + * User-space is expected to create a KMS dumb buffer via this IOCTL, then add + * it as a KMS framebuffer via &DRM_IOCTL_MODE_ADDFB and map it via + * &DRM_IOCTL_MODE_MAP_DUMB. + * + * &DRM_CAP_DUMB_BUFFER indicates whether this IOCTL is supported. + * &DRM_CAP_DUMB_PREFERRED_DEPTH and &DRM_CAP_DUMB_PREFER_SHADOW indicate + * driver preferences for dumb buffers. 
+ */ +#define DRM_IOCTL_MODE_CREATE_DUMB DRM_IOWR(0xB2, struct drm_mode_create_dumb) +#define DRM_IOCTL_MODE_MAP_DUMB DRM_IOWR(0xB3, struct drm_mode_map_dumb) +#define DRM_IOCTL_MODE_DESTROY_DUMB DRM_IOWR(0xB4, struct drm_mode_destroy_dumb) +#define DRM_IOCTL_MODE_GETPLANERESOURCES DRM_IOWR(0xB5, struct drm_mode_get_plane_res) +#define DRM_IOCTL_MODE_GETPLANE DRM_IOWR(0xB6, struct drm_mode_get_plane) +#define DRM_IOCTL_MODE_SETPLANE DRM_IOWR(0xB7, struct drm_mode_set_plane) +#define DRM_IOCTL_MODE_ADDFB2 DRM_IOWR(0xB8, struct drm_mode_fb_cmd2) +#define DRM_IOCTL_MODE_OBJ_GETPROPERTIES DRM_IOWR(0xB9, struct drm_mode_obj_get_properties) +#define DRM_IOCTL_MODE_OBJ_SETPROPERTY DRM_IOWR(0xBA, struct drm_mode_obj_set_property) +#define DRM_IOCTL_MODE_CURSOR2 DRM_IOWR(0xBB, struct drm_mode_cursor2) +#define DRM_IOCTL_MODE_ATOMIC DRM_IOWR(0xBC, struct drm_mode_atomic) +#define DRM_IOCTL_MODE_CREATEPROPBLOB DRM_IOWR(0xBD, struct drm_mode_create_blob) +#define DRM_IOCTL_MODE_DESTROYPROPBLOB DRM_IOWR(0xBE, struct drm_mode_destroy_blob) + +#define DRM_IOCTL_SYNCOBJ_CREATE DRM_IOWR(0xBF, struct drm_syncobj_create) +#define DRM_IOCTL_SYNCOBJ_DESTROY DRM_IOWR(0xC0, struct drm_syncobj_destroy) +#define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD DRM_IOWR(0xC1, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE DRM_IOWR(0xC2, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_WAIT DRM_IOWR(0xC3, struct drm_syncobj_wait) +#define DRM_IOCTL_SYNCOBJ_RESET DRM_IOWR(0xC4, struct drm_syncobj_array) +#define DRM_IOCTL_SYNCOBJ_SIGNAL DRM_IOWR(0xC5, struct drm_syncobj_array) + +#define DRM_IOCTL_MODE_CREATE_LEASE DRM_IOWR(0xC6, struct drm_mode_create_lease) +#define DRM_IOCTL_MODE_LIST_LESSEES DRM_IOWR(0xC7, struct drm_mode_list_lessees) +#define DRM_IOCTL_MODE_GET_LEASE DRM_IOWR(0xC8, struct drm_mode_get_lease) +#define DRM_IOCTL_MODE_REVOKE_LEASE DRM_IOWR(0xC9, struct drm_mode_revoke_lease) + +#define DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT DRM_IOWR(0xCA, struct drm_syncobj_timeline_wait) 
+#define DRM_IOCTL_SYNCOBJ_QUERY DRM_IOWR(0xCB, struct drm_syncobj_timeline_array) +#define DRM_IOCTL_SYNCOBJ_TRANSFER DRM_IOWR(0xCC, struct drm_syncobj_transfer) +#define DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL DRM_IOWR(0xCD, struct drm_syncobj_timeline_array) + +/** + * DRM_IOCTL_MODE_GETFB2 - Get framebuffer metadata. + * + * This queries metadata about a framebuffer. User-space fills + * &drm_mode_fb_cmd2.fb_id as the input, and the kernels fills the rest of the + * struct as the output. + * + * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles + * will be filled with GEM buffer handles. Fresh new GEM handles are always + * returned, even if another GEM handle referring to the same memory object + * already exists on the DRM file description. The caller is responsible for + * removing the new handles, e.g. via the &DRM_IOCTL_GEM_CLOSE IOCTL. The same + * new handle will be returned for multiple planes in case they use the same + * memory object. Planes are valid until one has a zero handle -- this can be + * used to compute the number of planes. + * + * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid + * until one has a zero &drm_mode_fb_cmd2.pitches. + * + * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set + * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the + * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier. + * + * To obtain DMA-BUF FDs for each plane without leaking GEM handles, user-space + * can export each handle via &DRM_IOCTL_PRIME_HANDLE_TO_FD, then immediately + * close each unique handle via &DRM_IOCTL_GEM_CLOSE, making sure to not + * double-close handles which are specified multiple times in the array. + */ +#define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2) + +#define DRM_IOCTL_SYNCOBJ_EVENTFD DRM_IOWR(0xCF, struct drm_syncobj_eventfd) + +/** + * DRM_IOCTL_MODE_CLOSEFB - Close a framebuffer. 
+ * + * This closes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * This IOCTL is similar to &DRM_IOCTL_MODE_RMFB, except it doesn't disable + * planes and CRTCs. As long as the framebuffer is used by a plane, it's kept + * alive. When the plane no longer uses the framebuffer (because the + * framebuffer is replaced with another one, or the plane is disabled), the + * framebuffer is cleaned up. + * + * This is useful to implement flicker-free transitions between two processes. + * + * Depending on the threat model, user-space may want to ensure that the + * framebuffer doesn't expose any sensitive user information: closed + * framebuffers attached to a plane can be read back by the next DRM master. + */ +#define DRM_IOCTL_MODE_CLOSEFB DRM_IOWR(0xD0, struct drm_mode_closefb) + +/** + * DRM_IOCTL_SET_CLIENT_NAME - Attach a name to a drm_file + * + * Having a name allows for easier tracking and debugging. + * The length of the name (without null ending char) must be + * <= DRM_CLIENT_NAME_MAX_LEN. + * The call will fail if the name contains whitespaces or non-printable chars. + */ +#define DRM_IOCTL_SET_CLIENT_NAME DRM_IOWR(0xD1, struct drm_set_client_name) + +/** + * DRM_IOCTL_GEM_CHANGE_HANDLE - Move an object to a different handle + * + * Some applications (notably CRIU) need objects to have specific gem handles. + * This ioctl changes the object at one gem handle to use a new gem handle. + */ +#define DRM_IOCTL_GEM_CHANGE_HANDLE DRM_IOWR(0xD2, struct drm_gem_change_handle) + +/* + * Device specific ioctls should only be in their respective headers + * The device specific ioctl range is from 0x40 to 0x9f. + * Generic IOCTLS restart at 0xA0. + * + * \sa drmCommandNone(), drmCommandRead(), drmCommandWrite(), and + * drmCommandReadWrite(). + */ +#define DRM_COMMAND_BASE 0x40 +#define DRM_COMMAND_END 0xA0 + +/** + * struct drm_event - Header for DRM events + * @type: event type. 
+ * @length: total number of payload bytes (including header). + * + * This struct is a header for events written back to user-space on the DRM FD. + * A read on the DRM FD will always only return complete events: e.g. if the + * read buffer is 100 bytes large and there are two 64 byte events pending, + * only one will be returned. + * + * Event types 0 - 0x7fffffff are generic DRM events, 0x80000000 and + * up are chipset specific. Generic DRM events include &DRM_EVENT_VBLANK, + * &DRM_EVENT_FLIP_COMPLETE and &DRM_EVENT_CRTC_SEQUENCE. + */ +struct drm_event { + __u32 type; + __u32 length; +}; + +/** + * DRM_EVENT_VBLANK - vertical blanking event + * + * This event is sent in response to &DRM_IOCTL_WAIT_VBLANK with the + * &_DRM_VBLANK_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. + */ +#define DRM_EVENT_VBLANK 0x01 +/** + * DRM_EVENT_FLIP_COMPLETE - page-flip completion event + * + * This event is sent in response to an atomic commit or legacy page-flip with + * the &DRM_MODE_PAGE_FLIP_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. + */ +#define DRM_EVENT_FLIP_COMPLETE 0x02 +/** + * DRM_EVENT_CRTC_SEQUENCE - CRTC sequence event + * + * This event is sent in response to &DRM_IOCTL_CRTC_QUEUE_SEQUENCE. + * + * The event payload is a struct drm_event_crtc_sequence. + */ +#define DRM_EVENT_CRTC_SEQUENCE 0x03 + +struct drm_event_vblank { + struct drm_event base; + __u64 user_data; + __u32 tv_sec; + __u32 tv_usec; + __u32 sequence; + __u32 crtc_id; /* 0 on older kernels that do not support this */ +}; + +/* Event delivered at sequence. 
Time stamp marks when the first pixel + * of the refresh cycle leaves the display engine for the display + */ +struct drm_event_crtc_sequence { + struct drm_event base; + __u64 user_data; + __s64 time_ns; + __u64 sequence; +}; + +/* typedef area */ +#ifndef __KERNEL__ +typedef struct drm_clip_rect drm_clip_rect_t; +typedef struct drm_drawable_info drm_drawable_info_t; +typedef struct drm_tex_region drm_tex_region_t; +typedef struct drm_hw_lock drm_hw_lock_t; +typedef struct drm_version drm_version_t; +typedef struct drm_unique drm_unique_t; +typedef struct drm_list drm_list_t; +typedef struct drm_block drm_block_t; +typedef struct drm_control drm_control_t; +typedef enum drm_map_type drm_map_type_t; +typedef enum drm_map_flags drm_map_flags_t; +typedef struct drm_ctx_priv_map drm_ctx_priv_map_t; +typedef struct drm_map drm_map_t; +typedef struct drm_client drm_client_t; +typedef enum drm_stat_type drm_stat_type_t; +typedef struct drm_stats drm_stats_t; +typedef enum drm_lock_flags drm_lock_flags_t; +typedef struct drm_lock drm_lock_t; +typedef enum drm_dma_flags drm_dma_flags_t; +typedef struct drm_buf_desc drm_buf_desc_t; +typedef struct drm_buf_info drm_buf_info_t; +typedef struct drm_buf_free drm_buf_free_t; +typedef struct drm_buf_pub drm_buf_pub_t; +typedef struct drm_buf_map drm_buf_map_t; +typedef struct drm_dma drm_dma_t; +typedef union drm_wait_vblank drm_wait_vblank_t; +typedef struct drm_agp_mode drm_agp_mode_t; +typedef enum drm_ctx_flags drm_ctx_flags_t; +typedef struct drm_ctx drm_ctx_t; +typedef struct drm_ctx_res drm_ctx_res_t; +typedef struct drm_draw drm_draw_t; +typedef struct drm_update_draw drm_update_draw_t; +typedef struct drm_auth drm_auth_t; +typedef struct drm_irq_busid drm_irq_busid_t; +typedef enum drm_vblank_seq_type drm_vblank_seq_type_t; + +typedef struct drm_agp_buffer drm_agp_buffer_t; +typedef struct drm_agp_binding drm_agp_binding_t; +typedef struct drm_agp_info drm_agp_info_t; +typedef struct drm_scatter_gather 
drm_scatter_gather_t; +typedef struct drm_set_version drm_set_version_t; +#endif + +#if defined(__cplusplus) +} +#endif + +#endif From db0ec806d12d1435fbf2ccbcac05ec878fe0f401 Mon Sep 17 00:00:00 2001 From: David Francis Date: Wed, 12 Feb 2025 09:29:21 -0500 Subject: [PATCH 734/775] plugin/amdgpu: Add handling for amdgpu drm buffer objects Buffer objects held by the amdgpu drm driver are checkpointed with the new BO_INFO and MAPPING_INFO ioctls/ioctl options. Handling is in amdgpu_plugin_drm.h Handling of imported buffer objects may require dmabuf fds to be transferred between processes. These occur over fdstore, with the handle-fdstore id relationships kept in shared memory. There is a new plugin callback: RESTORE_INIT to create the shared memory. During checkpoint, track shared buffer objects, so that buffer objects that are shared across processes can be identified. During restore, track which buffer objects have been restored. Retry restore of a drm file if a buffer object is imported and the original has not been exported yet. Skip buffer objects that have already been completed or cannot be completed in the current restore. So drm code can use sdma_copy_bo, that function no longer requires kfd bo structs Update the protobuf messages with new amdgpu drm information.
Signed-off-by: David Francis --- criu/include/criu-plugin.h | 3 + criu/plugin.c | 13 +- criu/servicefd.c | 2 +- plugins/amdgpu/amdgpu_plugin.c | 281 ++++++++++++++-- plugins/amdgpu/amdgpu_plugin_drm.c | 487 +++++++++++++++++++++++++++- plugins/amdgpu/amdgpu_plugin_drm.h | 12 + plugins/amdgpu/amdgpu_plugin_util.c | 84 +++++ plugins/amdgpu/amdgpu_plugin_util.h | 39 ++- plugins/amdgpu/criu-amdgpu.proto | 25 ++ 9 files changed, 900 insertions(+), 46 deletions(-) diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index ee84ccdf6..977dad655 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -62,6 +62,8 @@ enum { CR_PLUGIN_HOOK__POST_FORKING = 12, + CR_PLUGIN_HOOK__RESTORE_INIT = 13, + CR_PLUGIN_HOOK__MAX }; @@ -81,6 +83,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_INIT, void); enum { CR_PLUGIN_STAGE__DUMP, diff --git a/criu/plugin.c b/criu/plugin.c index 18da0499d..a2057e9c1 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -60,6 +60,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) __assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices"); __assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices"); __assign_hook(POST_FORKING, "cr_plugin_post_forking"); + __assign_hook(RESTORE_INIT, "cr_plugin_restore_init"); #undef __assign_hook @@ -257,8 +258,16 @@ int cr_plugin_init(int stage) goto err; } - if (stage == CR_PLUGIN_STAGE__RESTORE && check_inventory_plugins()) - goto err; + if (stage == CR_PLUGIN_STAGE__RESTORE) { + int ret; + + if (check_inventory_plugins()) + goto err; + + ret = run_plugins(RESTORE_INIT); + if (ret < 0 && ret != -ENOTSUP) + goto err; + } exit_code = 0; err: diff --git a/criu/servicefd.c b/criu/servicefd.c 
index 06a8d3eba..dfb019066 100644 --- a/criu/servicefd.c +++ b/criu/servicefd.c @@ -313,4 +313,4 @@ int clone_service_fd(struct pstree_item *me) ret = 0; return ret; -} +} \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index e3b4ead3f..4be8421a0 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include @@ -23,12 +25,17 @@ #include "criu-plugin.h" #include "plugin.h" #include "criu-amdgpu.pb-c.h" +#include "util.h" +#include "util-pie.h" +#include "fdstore.h" #include "kfd_ioctl.h" #include "xmalloc.h" #include "criu-log.h" #include "files.h" #include "pstree.h" +#include "sockets.h" +#include "rst-malloc.h" #include "common/list.h" #include "amdgpu_plugin_drm.h" @@ -66,6 +73,19 @@ bool plugin_added_to_inventory = false; bool plugin_disabled = false; +struct handle_id { + int handle; + int fdstore_id; +}; +struct shared_handle_ids { + int num_handles; + struct handle_id *handles; +}; +struct shared_handle_ids *shared_memory = NULL; + +static mutex_t *shared_memory_mutex; + +int current_pid; /* * In the case of a single process (common case), this optimization can effectively * reduce the restore latency with parallel restore. 
In the case of multiple processes, @@ -526,11 +546,11 @@ void free_and_unmap(uint64_t size, amdgpu_bo_handle h_bo, amdgpu_va_handle h_va, amdgpu_bo_free(h_bo); } -static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp, - void *buffer, size_t buffer_size, amdgpu_device_handle h_dev, - uint64_t max_copy_size, enum sdma_op_type type) +int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp, + void *buffer, size_t buffer_size, amdgpu_device_handle h_dev, + uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free) { - uint64_t size, src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain; + uint64_t src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain; uint64_t gpu_addr_src, gpu_addr_dst, gpu_addr_ib, copy_src, copy_dst, copy_size; amdgpu_va_handle h_va_src, h_va_dst, h_va_ib; amdgpu_bo_handle h_bo_src, h_bo_dst, h_bo_ib; @@ -543,10 +563,8 @@ static int sdma_copy_bo(struct kfd_criu_bo_bucket bo_bucket, FILE *storage_fp, uint32_t expired; amdgpu_context_handle h_ctx; uint32_t *ib = NULL; - int j, err, shared_fd, packets_per_buffer; + int j, err, packets_per_buffer; - shared_fd = bo_bucket.dmabuf_fd; - size = bo_bucket.size; buffer_bo_size = min(size, buffer_size); packets_per_buffer = ((buffer_bo_size - 1) / max_copy_size) + 1; src_bo_size = (type == SDMA_OP_VRAM_WRITE) ? 
buffer_bo_size : size; @@ -757,7 +775,8 @@ err_dst_bo_map: if (err) pr_perror("dest range free failed"); err_dst_va: - err = amdgpu_bo_free(h_bo_dst); + if (!do_not_free) + err = amdgpu_bo_free(h_bo_dst); if (err) pr_perror("dest bo free failed"); err_dst_bo_prep: @@ -845,8 +864,9 @@ void *dump_bo_contents(void *_thread_data) num_bos++; /* perform sDMA based vram copy */ - ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, - SDMA_OP_VRAM_READ); + ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_READ, false); + if (ret) { pr_err("Failed to drain the BO using sDMA: bo_buckets[%d]\n", i); break; @@ -943,8 +963,8 @@ void *restore_bo_contents(void *_thread_data) num_bos++; - ret = sdma_copy_bo(bo_buckets[i], bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, - SDMA_OP_VRAM_WRITE); + ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_WRITE, false); if (ret) { pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); break; @@ -1053,6 +1073,134 @@ exit: return ret; } +int store_dmabuf_fd(int handle, int fd) +{ + int id; + + id = fdstore_add(fd); + mutex_lock(shared_memory_mutex); + for (int i = 0; i < shared_memory->num_handles; i++) { + if (shared_memory->handles[i].handle == handle) { + mutex_unlock(shared_memory_mutex); + return 0; + } + if (shared_memory->handles[i].handle == -1) { + shared_memory->handles[i].handle = handle; + shared_memory->handles[i].fdstore_id = id; + mutex_unlock(shared_memory_mutex); + return 0; + } + } + mutex_unlock(shared_memory_mutex); + + return -1; +} + +int amdgpu_id_for_handle(int handle) +{ + mutex_lock(shared_memory_mutex); + for (int i = 0; i < shared_memory->num_handles; i++) { + if (shared_memory->handles[i].handle == handle) { + mutex_unlock(shared_memory_mutex); + return 
shared_memory->handles[i].fdstore_id; + } + } + mutex_unlock(shared_memory_mutex); + return -1; +} + +int amdgpu_restore_init(void) +{ + if (!shared_memory) { + int protection = PROT_READ | PROT_WRITE; + int visibility = MAP_SHARED | MAP_ANONYMOUS; + size_t img_size; + FILE *img_fp = NULL; + int ret; + unsigned char *buf; + int num_handles = 0; + /* Sum the BO counts of every amdgpu image in the current directory to size the shared handle table. */ + CriuRenderNode *rd = NULL; + CriuKfd *e = NULL; + + DIR *d; + struct dirent *dir; + d = opendir("."); + if (d) { + while ((dir = readdir(d)) != NULL) { + if (strncmp("amdgpu-kfd-", dir->d_name, strlen("amdgpu-kfd-")) == 0) { + pr_info("CC3: Found kfd file\n"); + img_fp = open_img_file(dir->d_name, false, &img_size); + buf = xmalloc(img_size); + if (!buf) { + fclose(img_fp); + return -ENOMEM; + } + + ret = read_fp(img_fp, buf, img_size); + if (ret) { + pr_perror("Unable to read from %s", dir->d_name); + fclose(img_fp); + xfree(buf); + return ret; + } + + fclose(img_fp); + e = criu_kfd__unpack(NULL, img_size, buf); + num_handles += e->num_of_bos; + criu_kfd__free_unpacked(e, NULL); + xfree(buf); + } + if (strncmp("amdgpu-renderD-", dir->d_name, strlen("amdgpu-renderD-")) == 0) { + pr_info("CC3: Found drm file\n"); + img_fp = open_img_file(dir->d_name, false, &img_size); + buf = xmalloc(img_size); + if (!buf) { + fclose(img_fp); + return -ENOMEM; + } + + ret = read_fp(img_fp, buf, img_size); + if (ret) { + pr_perror("Unable to read from %s", dir->d_name); + fclose(img_fp); + xfree(buf); + return ret; + } + + fclose(img_fp); + rd = criu_render_node__unpack(NULL, img_size, buf); + num_handles += rd->num_of_bos; + criu_render_node__free_unpacked(rd, NULL); + xfree(buf); + } + } + closedir(d); + } + + if (num_handles > 0) { + shared_memory = mmap(NULL, sizeof(*shared_memory), protection, visibility, -1, 0); /* size of the struct, not of the pointer */ + shared_memory->num_handles = num_handles; + shared_memory->handles = mmap(NULL, sizeof(struct handle_id) * num_handles, protection, visibility, -1, 0); + + for (int i = 0; i < num_handles; i++) { +
shared_memory->handles[i].handle = -1; + shared_memory->handles[i].fdstore_id = -1; + } + + shared_memory_mutex = shmalloc(sizeof(*shared_memory_mutex)); + if (!shared_memory_mutex) { + pr_err("Can't create amdgpu mutex\n"); + return -1; + } + mutex_init(shared_memory_mutex); + } + } + + return 0; +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESTORE_INIT, amdgpu_restore_init) + static int save_devices(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_device_bucket *device_buckets, CriuKfd *e) { @@ -1095,6 +1243,8 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd { struct thread_data *thread_datas; int ret = 0, i; + amdgpu_device_handle h_dev; + uint32_t major, minor; pr_debug("Dumping %d BOs\n", args->num_bos); @@ -1118,6 +1268,19 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd boinfo->size = bo_bucket->size; boinfo->offset = bo_bucket->offset; boinfo->alloc_flags = bo_bucket->alloc_flags; + + ret = amdgpu_device_initialize(node_get_drm_render_device(sys_get_node_by_gpu_id(&src_topology, bo_bucket->gpu_id)), &major, &minor, &h_dev); + + boinfo->handle = get_gem_handle(h_dev, bo_bucket->dmabuf_fd); + + amdgpu_device_deinitialize(h_dev); + } + for (i = 0; i < e->num_of_bos; i++) { + KfdBoEntry *boinfo = e->bo_entries[i]; + + ret = record_shared_bo(boinfo->handle, false); + if (ret) + goto exit; } for (int i = 0; i < e->num_of_gpus; i++) { @@ -1457,6 +1620,29 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) } pr_info("Restore BOs Ok\n"); + + return 0; +} + +int save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int fd) +{ + struct vma_metadata *vma_md; + + vma_md = xmalloc(sizeof(*vma_md)); + if (!vma_md) { + return -ENOMEM; + } + + memset(vma_md, 0, sizeof(*vma_md)); + + vma_md->old_pgoff = offset; + vma_md->vma_entry = addr; + + vma_md->new_pgoff = restored_offset; + vma_md->fd = fd; + + list_add_tail(&vma_md->list, &update_vma_info_list); + 
return 0; } @@ -1691,8 +1877,18 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) pr_info("render node destination gpu_id = 0x%04x\n", tp_node->gpu_id); fd = node_get_drm_render_device(tp_node); - if (fd < 0) + if (fd < 0) { pr_err("Failed to open render device (minor:%d)\n", tp_node->drm_render_minor); + return -1; + } + + ret = amdgpu_plugin_drm_restore_file(fd, rd); + if (ret == 1) + *retry_needed = true; + if (ret < 0) { + fd = ret; + goto fail; + } fail: criu_render_node__free_unpacked(rd, NULL); xfree(buf); @@ -1704,12 +1900,20 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) * copy of the fd. CRIU core owns the duplicated returned fd, and amdgpu_plugin owns the fd stored in * tp_node. */ - fd = dup(fd); - if (fd == -1) { - pr_perror("unable to duplicate the render fd"); - return -1; + + if (fd < 0) + return fd; + + if (!(*retry_needed)) { + fd = dup(fd); + if (fd == -1) { + pr_perror("unable to duplicate the render fd"); + return -1; + } + return fd; } - return fd; + + return 0; } fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); @@ -1753,11 +1957,13 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) * This way, we know that the file descriptors we store will not conflict with file descriptors inside core * CRIU. 
*/ - fd_next = find_unused_fd_pid(e->pid); - if (fd_next <= 0) { - pr_err("Failed to find unused fd (fd:%d)\n", fd_next); - ret = -EINVAL; - goto exit; + if (fd_next == -1) { + fd_next = find_unused_fd_pid(e->pid); + if (fd_next <= 0) { + pr_err("Failed to find unused fd (fd:%d)\n", fd_next); + ret = -EINVAL; + goto exit; + } } ret = devinfo_to_topology(e->device_entries, e->num_of_gpus + e->num_of_cpus, &src_topology); @@ -1790,14 +1996,26 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) args.num_objects = e->num_of_objects; args.priv_data_size = e->priv_data.len; args.priv_data = (uintptr_t)e->priv_data.data; - args.op = KFD_CRIU_OP_RESTORE; + if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { pr_perror("Restore ioctl failed"); ret = -1; goto exit; } + if (ret < 0) + goto exit; + + for (int i = 0; i < args.num_bos; i++) { + struct kfd_criu_bo_bucket *bo_bucket = &((struct kfd_criu_bo_bucket *)args.bos)[i]; + KfdBoEntry *bo_entry = e->bo_entries[i]; + + if (bo_entry->handle != -1) { + store_dmabuf_fd(bo_entry->handle, bo_bucket->dmabuf_fd); + } + } + ret = restore_bo_data(id, (struct kfd_criu_bo_bucket *)args.bos, e); if (ret) goto exit; @@ -1940,19 +2158,14 @@ int amdgpu_plugin_resume_devices_late(int target_pid) } } + clear_restore_state(); + close(fd); return exit_code; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) -int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size, - amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type) -{ - return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer, - buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); -} - int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size) { int ret = 0; @@ -2061,8 +2274,10 @@ void *parallel_restore_bo_contents(void *_thread_data) entry = &restore_cmd->entries[i]; fseek(bo_contents_fp, 
entry->read_offset + offset, SEEK_SET); - ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer, - buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); + ret = sdma_copy_bo(restore_cmd->fds_write[entry->write_id], entry->size, bo_contents_fp, + buffer, buffer_size, h_dev, + max_copy_size, SDMA_OP_VRAM_WRITE, false); + if (ret) { pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); goto err_sdma; diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index d54cd937d..199dad21e 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -19,19 +19,112 @@ #include #include "common/list.h" +#include "files.h" +#include "fdstore.h" #include "criu-amdgpu.pb-c.h" +#define __user +#include "drm.h" #include #include #include "xmalloc.h" -#include "criu-log.h" -#include "kfd_ioctl.h" +#include "amdgpu_drm.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" +#include "util.h" +#include "common/scm.h" + +int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd) +{ + uint32_t handle = (uint32_t)-1; /* result stays at the -1 "no handle" sentinel unless the import below fills it in */ + int fd = amdgpu_device_get_fd(h_dev); + + if (dmabuf_fd == -1) { + return -1; + } + + drmPrimeFDToHandle(fd, dmabuf_fd, &handle); + + return handle; +} + +int drmIoctl(int fd, unsigned long request, void *arg) +{ + int ret, max_retries = 200; + + do { + ret = ioctl(fd, request, arg); + } while (ret == -1 && max_retries-- > 0 && (errno == EINTR || errno == EAGAIN)); + + if (ret == -1 && errno == EBADF) + /* In case pthread_atfork didn't catch it, this will + * make any subsequent hsaKmt calls fail in CHECK_KFD_OPEN.
+ */ + pr_perror("KFD file descriptor not valid in this process"); + return ret; +} + +static int allocate_bo_entries(CriuRenderNode *e, int num_bos) +{ + e->bo_entries = xmalloc(sizeof(DrmBoEntry *) * num_bos); + if (!e->bo_entries) { + pr_err("Failed to allocate bo_info\n"); + return -ENOMEM; + } + + for (int i = 0; i < num_bos; i++) { + DrmBoEntry *entry = xzalloc(sizeof(*entry)); + + if (!entry) { + pr_err("Failed to allocate botest\n"); + return -ENOMEM; + } + + drm_bo_entry__init(entry); + + e->bo_entries[i] = entry; + e->n_bo_entries++; + } + return 0; +} + +static int allocate_vm_entries(DrmBoEntry *e, int num_vms) +{ + e->vm_entries = xmalloc(sizeof(DrmVmEntry *) * num_vms); + if (!e->vm_entries) { + pr_err("Failed to allocate bo_info\n"); + return -ENOMEM; + } + + for (int i = 0; i < num_vms; i++) { + DrmVmEntry *entry = xzalloc(sizeof(*entry)); + + if (!entry) { + pr_err("Failed to allocate botest\n"); + return -ENOMEM; + } + + drm_vm_entry__init(entry); + + e->vm_entries[i] = entry; + e->n_vm_entries++; + } + return 0; +} + +static void free_e(CriuRenderNode *e) +{ + for (int i = 0; i < e->n_bo_entries; i++) { + if (e->bo_entries[i]) + xfree(e->bo_entries[i]); + } + + xfree(e); +} int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st) { @@ -60,19 +153,260 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st) return 0; } +static int restore_bo_contents_drm(int drm_render_minor, CriuRenderNode *rd, int drm_fd, int *dmabufs) +{ + size_t image_size = 0, max_bo_size = 0, buffer_size; + struct amdgpu_gpu_info gpu_info = { 0 }; + amdgpu_device_handle h_dev; + uint64_t max_copy_size; + uint32_t major, minor; + FILE *bo_contents_fp = NULL; + void *buffer = NULL; + char img_path[40]; + int num_bos = 0; + int i, ret = 0; + + ret = amdgpu_device_initialize(drm_fd, &major, &minor, &h_dev); + if (ret) { + pr_perror("failed to initialize device"); + goto exit; + } + plugin_log_msg("libdrm initialized successfully\n"); + + ret = 
amdgpu_query_gpu_info(h_dev, &gpu_info); + if (ret) { + pr_perror("failed to query gpuinfo via libdrm"); + goto exit; + } + + max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE : + SDMA_LINEAR_COPY_MAX_SIZE - 1; + + for (i = 0; i < rd->num_of_bos; i++) { + if (rd->bo_entries[i]->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT)) { + if (rd->bo_entries[i]->size > max_bo_size) + max_bo_size = rd->bo_entries[i]->size; + } + } + + buffer_size = max_bo_size; + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); + if (!buffer) { + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); + ret = -ENOMEM; + goto exit; + } + + for (i = 0; i < rd->num_of_bos; i++) { + if (!(rd->bo_entries[i]->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT))) + continue; + + if (rd->bo_entries[i]->num_of_vms == 0) + continue; + + num_bos++; + + snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, drm_render_minor, i); + + bo_contents_fp = open_img_file(img_path, false, &image_size); + + ret = sdma_copy_bo(dmabufs[i], rd->bo_entries[i]->size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_WRITE, true); + if (ret) { + pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); + break; + } + plugin_log_msg("** Successfully filled the BO using sDMA: bo_buckets[%d] **\n", i); + + if (bo_contents_fp) + fclose(bo_contents_fp); + } + +exit: + for (int i = 0; i < rd->num_of_bos; i++) { + if (dmabufs[i] != KFD_INVALID_FD) + close(dmabufs[i]); + } + + xfree(buffer); + + amdgpu_device_deinitialize(h_dev); + return ret; +} int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm) { - CriuRenderNode rd = CRIU_RENDER_NODE__INIT; - struct tp_node *tp_node; + CriuRenderNode *rd = NULL; char path[PATH_MAX]; unsigned char *buf; int minor; int len; int ret; + size_t image_size; + struct tp_node *tp_node; + struct drm_amdgpu_gem_list_handles 
list_handles_args = { 0 }; + struct drm_amdgpu_gem_list_handles_entry *list_handles_entries; + int num_bos; + + rd = xmalloc(sizeof(*rd)); + if (!rd) { + ret = -ENOMEM; + goto exit; + } + criu_render_node__init(rd); /* Get the topology node of the DRM device */ minor = minor(drm->st_rdev); + rd->drm_render_minor = minor; + rd->id = id; + + num_bos = 8; + list_handles_entries = xzalloc(sizeof(struct drm_amdgpu_gem_list_handles_entry) * num_bos); + list_handles_args.num_entries = num_bos; + list_handles_args.entries = (uintptr_t)list_handles_entries; + + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES, &list_handles_args); + if (ret && errno == EINVAL) { + pr_info("This kernel appears not to have AMDGPU_GEM_LIST_HANDLES ioctl. Consider disabling Dmabuf IPC or updating your kernel.\n"); + list_handles_args.num_entries = 0; + } else if (ret) { + pr_perror("Failed to call bo info ioctl"); + goto exit; + } + + if (list_handles_args.num_entries > num_bos) { + num_bos = list_handles_args.num_entries; + xfree(list_handles_entries); + list_handles_entries = xzalloc(sizeof(struct drm_amdgpu_gem_list_handles_entry) * num_bos); + list_handles_args.num_entries = num_bos; + list_handles_args.entries = (uintptr_t)list_handles_entries; + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES, &list_handles_args); + if (ret) { + pr_perror("Failed to call bo info ioctl"); + goto exit; + } + } else { + num_bos = list_handles_args.num_entries; + } + + rd->num_of_bos = num_bos; + ret = allocate_bo_entries(rd, num_bos); + if (ret) + goto exit; + + for (int i = 0; i < num_bos; i++) { + int num_vm_entries = 8; + struct drm_amdgpu_gem_vm_entry *vm_info_entries; + struct drm_amdgpu_gem_op vm_info_args = { 0 }; + DrmBoEntry *boinfo = rd->bo_entries[i]; + struct drm_amdgpu_gem_list_handles_entry handle_entry = list_handles_entries[i]; + union drm_amdgpu_gem_mmap mmap_args = { 0 }; + int dmabuf_fd; + uint32_t major, minor; + amdgpu_device_handle h_dev; + void *buffer = NULL; + char 
img_path[40]; + FILE *bo_contents_fp = NULL; + int device_fd; + + boinfo->size = handle_entry.size; + + boinfo->alloc_flags = handle_entry.alloc_flags; + boinfo->preferred_domains = handle_entry.preferred_domains; + boinfo->alignment = handle_entry.alignment; + boinfo->handle = handle_entry.gem_handle; + boinfo->is_import = (handle_entry.flags & AMDGPU_GEM_LIST_HANDLES_FLAG_IS_IMPORT) || shared_bo_has_exporter(boinfo->handle); + + mmap_args.in.handle = boinfo->handle; + + if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &mmap_args) == -1) { + pr_perror("Error Failed to call mmap ioctl"); + ret = -1; + goto exit; + } + + boinfo->offset = mmap_args.out.addr_ptr; + + vm_info_entries = xzalloc(sizeof(struct drm_amdgpu_gem_vm_entry) * num_vm_entries); + vm_info_args.handle = handle_entry.gem_handle; + vm_info_args.num_entries = num_vm_entries; + vm_info_args.value = (uintptr_t)vm_info_entries; + vm_info_args.op = AMDGPU_GEM_OP_GET_MAPPING_INFO; + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_OP, &vm_info_args); + if (ret) { + pr_perror("Failed to call vm info ioctl"); + goto exit; + } + + if (vm_info_args.num_entries > num_vm_entries) { + num_vm_entries = vm_info_args.num_entries; + xfree(vm_info_entries); + vm_info_entries = xzalloc(sizeof(struct drm_amdgpu_gem_vm_entry) * num_vm_entries); + vm_info_args.handle = handle_entry.gem_handle; + vm_info_args.num_entries = num_vm_entries; + vm_info_args.value = (uintptr_t)vm_info_entries; + vm_info_args.op = AMDGPU_GEM_OP_GET_MAPPING_INFO; + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_OP, &vm_info_args); + if (ret) { + pr_perror("Failed to call vm info ioctl"); + goto exit; + } + } else { + num_vm_entries = vm_info_args.num_entries; + } + + boinfo->num_of_vms = num_vm_entries; + ret = allocate_vm_entries(boinfo, num_vm_entries); + if (ret) + goto exit; + + for (int j = 0; j < num_vm_entries; j++) { + DrmVmEntry *vminfo = boinfo->vm_entries[j]; + + boinfo->addr = vm_info_entries[j].addr; + vminfo->addr = vm_info_entries[j].addr; + 
vminfo->size = vm_info_entries[j].size; + vminfo->offset = vm_info_entries[j].offset; + vminfo->flags = vm_info_entries[j].flags; + } + + ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev); + + device_fd = amdgpu_device_get_fd(h_dev); + + drmPrimeHandleToFD(device_fd, boinfo->handle, 0, &dmabuf_fd); + + snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, rd->drm_render_minor, i); + bo_contents_fp = open_img_file(img_path, true, &image_size); + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), handle_entry.size); + + ret = sdma_copy_bo(dmabuf_fd, handle_entry.size, bo_contents_fp, buffer, handle_entry.size, h_dev, 0x1000, + SDMA_OP_VRAM_READ, false); + + if (dmabuf_fd != KFD_INVALID_FD) + close(dmabuf_fd); + + if (bo_contents_fp) + fclose(bo_contents_fp); + + ret = amdgpu_device_deinitialize(h_dev); + if (ret) + goto exit; + + xfree(vm_info_entries); + } + xfree(list_handles_entries); + + for (int i = 0; i < num_bos; i++) { + DrmBoEntry *boinfo = rd->bo_entries[i]; + + ret = record_shared_bo(boinfo->handle, boinfo->is_import); + if (ret) + goto exit; + } + tp_node = sys_get_node_by_render_minor(&src_topology, minor); if (!tp_node) { pr_err("Failed to find a device with minor number = %d\n", minor); @@ -80,21 +414,156 @@ int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm) } /* Get the GPU_ID of the DRM device */ - rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id); - if (!rd.gpu_id) { - pr_err("Failed to find valid gpu_id for the device = %d\n", rd.gpu_id); + rd->gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id); + if (!rd->gpu_id) { + pr_err("Failed to find valid gpu_id for the device = %d\n", rd->gpu_id); return -ENODEV; } - len = criu_render_node__get_packed_size(&rd); + len = criu_render_node__get_packed_size(rd); buf = xmalloc(len); if (!buf) return -ENOMEM; - criu_render_node__pack(&rd, buf); + criu_render_node__pack(rd, buf); snprintf(path, sizeof(path), IMG_DRM_FILE, id); ret = write_img_file(path, 
buf, len); + xfree(buf); +exit: + free_e(rd); return ret; } + +int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) +{ + int ret = 0; + bool retry_needed = false; + uint32_t major, minor; + amdgpu_device_handle h_dev; + int device_fd; + int *dmabufs = xzalloc(sizeof(int) * rd->num_of_bos); + + ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev); + if (ret) { + pr_info("Error in init amdgpu device\n"); + goto exit; + } + + device_fd = amdgpu_device_get_fd(h_dev); + + for (int i = 0; i < rd->num_of_bos; i++) { + DrmBoEntry *boinfo = rd->bo_entries[i]; + int dmabuf_fd = -1; + uint32_t handle; + struct drm_gem_change_handle change_args = { 0 }; + union drm_amdgpu_gem_mmap mmap_args = { 0 }; + struct drm_amdgpu_gem_va va_args = { 0 }; + int fd_id; + + if (work_already_completed(boinfo->handle, rd->drm_render_minor)) { + continue; + } else if (boinfo->handle != -1) { + if (boinfo->is_import) { + fd_id = amdgpu_id_for_handle(boinfo->handle); + if (fd_id == -1) { + retry_needed = true; + continue; + } + dmabuf_fd = fdstore_get(fd_id); + } + } + + if (boinfo->is_import) { + drmPrimeFDToHandle(device_fd, dmabuf_fd, &handle); + } else { + union drm_amdgpu_gem_create create_args = { 0 }; + + create_args.in.bo_size = boinfo->size; + create_args.in.alignment = boinfo->alignment; + create_args.in.domains = boinfo->preferred_domains; + create_args.in.domain_flags = boinfo->alloc_flags; + + if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &create_args) == -1) { + pr_perror("Error Failed to call create ioctl"); + ret = -1; + goto exit; + } + handle = create_args.out.handle; + + drmPrimeHandleToFD(device_fd, handle, 0, &dmabuf_fd); + } + + change_args.handle = handle; + change_args.new_handle = boinfo->handle; + + if (drmIoctl(fd, DRM_IOCTL_GEM_CHANGE_HANDLE, &change_args) == -1) { + pr_perror("Error Failed to call change ioctl; check if the kernel has DRM_IOCTL_GEM_CHANGE_HANDLE support"); + ret = -1; + goto exit; + } + + if (!boinfo->is_import) + 
store_dmabuf_fd(boinfo->handle, dmabuf_fd); + + dmabufs[i] = dmabuf_fd; + + ret = record_completed_work(boinfo->handle, rd->drm_render_minor); + if (ret) + goto exit; + + mmap_args.in.handle = boinfo->handle; + + if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &mmap_args) == -1) { + pr_perror("Error Failed to call mmap ioctl"); + ret = -1; + goto exit; + } + + for (int j = 0; j < boinfo->num_of_vms; j++) { + DrmVmEntry *vminfo = boinfo->vm_entries[j]; + + va_args.handle = boinfo->handle; + va_args.operation = AMDGPU_VA_OP_MAP; + va_args.flags = vminfo->flags; + va_args.va_address = vminfo->addr; + va_args.offset_in_bo = vminfo->offset; + va_args.map_size = vminfo->size; + + if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_VA, &va_args) == -1) { + pr_perror("Error Failed to call gem va ioctl"); + ret = -1; + goto exit; + } + } + + ret = save_vma_updates(boinfo->offset, boinfo->addr, mmap_args.out.addr_ptr, fd); + if (ret < 0) + goto exit; + } + + if (ret) { + pr_info("Error in deinit amdgpu device\n"); + goto exit; + } + + ret = record_completed_work(-1, rd->drm_render_minor); + if (ret) + goto exit; + + ret = amdgpu_device_deinitialize(h_dev); + + if (rd->num_of_bos > 0) { + ret = restore_bo_contents_drm(rd->drm_render_minor, rd, fd, dmabufs); + if (ret) + goto exit; + } + +exit: + if (ret < 0) + return ret; + xfree(dmabufs); + + return retry_needed; +} diff --git a/plugins/amdgpu/amdgpu_plugin_drm.h b/plugins/amdgpu/amdgpu_plugin_drm.h index 6f0c1a9a6..c766def56 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.h +++ b/plugins/amdgpu/amdgpu_plugin_drm.h @@ -24,5 +24,17 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm); */ int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm); +int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd); + +int amdgpu_plugin_drm_unpause_file(int fd); + +int amdgpu_id_for_handle(int handle); + +int store_dmabuf_fd(int handle, int fd); + +int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd); + +int 
save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int gpu_id); + #endif /* __AMDGPU_PLUGIN_DRM_H__ */ diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c index a165fc9cd..491e7fc74 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.c +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -41,6 +41,9 @@ /* Tracks number of device files that need to be checkpointed */ static int dev_file_cnt = 0; +static LIST_HEAD(shared_bos); +static LIST_HEAD(completed_work); + /* Helper structures to encode device topology of SRC and DEST platforms */ struct tp_system src_topology; struct tp_system dest_topology; @@ -68,6 +71,87 @@ void init_gpu_count(struct tp_system *topo) dev_file_cnt = 1 + topology_gpu_count(topo); } +bool shared_bo_has_exporter(int handle) +{ + struct shared_bo *bo; + + if (handle == -1) + return false; + + list_for_each_entry(bo, &shared_bos, l) { + if (bo->handle == handle) { + return bo->has_exporter; + } + } + + return false; +} + +int record_shared_bo(int handle, bool is_imported) +{ + struct shared_bo *bo; + + if (handle == -1) + return 0; + + list_for_each_entry(bo, &shared_bos, l) { + if (bo->handle == handle) { + return 0; + } + } + bo = malloc(sizeof(struct shared_bo)); + if (!bo) + return -1; + bo->handle = handle; + bo->has_exporter = !is_imported; + list_add(&bo->l, &shared_bos); + + return 0; +} + +int record_completed_work(int handle, int id) +{ + struct restore_completed_work *work; + + work = malloc(sizeof(struct restore_completed_work)); + if (!work) + return -1; + work->handle = handle; + work->id = id; + list_add(&work->l, &completed_work); + + return 0; +} + +bool work_already_completed(int handle, int id) +{ + struct restore_completed_work *work; + + list_for_each_entry(work, &completed_work, l) { + if (work->handle == handle && work->id == id) { + return true; + } + } + + return false; +} + +void clear_restore_state() +{ + while (!list_empty(&shared_dmabuf_fds)) { + struct shared_dmabuf *st 
= list_first_entry(&shared_dmabuf_fds, struct shared_dmabuf, l); + list_del(&st->l); + close(st->dmabuf_fd); + free(st); + } + + while (!list_empty(&completed_work)) { + struct restore_completed_work *st = list_first_entry(&completed_work, struct restore_completed_work, l); + list_del(&st->l); + free(st); + } +} + int read_fp(FILE *fp, void *buf, const size_t buf_len) { size_t len_read; diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h index aacca3a28..046a82fb0 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.h +++ b/plugins/amdgpu/amdgpu_plugin_util.h @@ -1,6 +1,8 @@ #ifndef __AMDGPU_PLUGIN_UTIL_H__ #define __AMDGPU_PLUGIN_UTIL_H__ +#include + #ifndef _GNU_SOURCE #define _GNU_SOURCE 1 #endif @@ -52,7 +54,7 @@ #define IMG_DRM_FILE "amdgpu-renderD-%d.img" /* Name of file having serialized data of DRM device buffer objects (BOs) */ -#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%04x.img" +#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%d-%04x.img" /* Helper macros to Checkpoint and Restore a ROCm file */ #define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem" @@ -73,6 +75,24 @@ enum sdma_op_type { SDMA_OP_VRAM_WRITE, }; +struct dumped_fd { + struct list_head l; + int fd; + bool is_drm; +}; + +struct shared_bo { + struct list_head l; + int handle; + bool has_exporter; +}; + +struct restore_completed_work { + struct list_head l; + int handle; + int id; +}; + /* Helper structures to encode device topology of SRC and DEST platforms */ extern struct tp_system src_topology; extern struct tp_system dest_topology; @@ -101,6 +121,23 @@ bool checkpoint_is_complete(); void decrement_checkpoint_count(); void init_gpu_count(struct tp_system *topology); +bool shared_bo_has_exporter(int handle); +int record_shared_bo(int handle, bool is_imported); + +int record_shared_dmabuf_fd(int handle, int dmabuf_fd); +int dmabuf_fd_for_handle(int handle); + +int record_completed_work(int handle, int id); +bool work_already_completed(int handle, int id); + 
+void clear_restore_state(); + void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list); +int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp, + void *buffer, size_t buffer_size, amdgpu_device_handle h_dev, + uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free); + +int serve_out_dmabuf_fd(int handle, int fd); + #endif /* __AMDGPU_PLUGIN_UTIL_H__ */ diff --git a/plugins/amdgpu/criu-amdgpu.proto b/plugins/amdgpu/criu-amdgpu.proto index 078b67650..565413c34 100644 --- a/plugins/amdgpu/criu-amdgpu.proto +++ b/plugins/amdgpu/criu-amdgpu.proto @@ -46,6 +46,7 @@ message kfd_bo_entry { required uint64 offset = 3; required uint32 alloc_flags = 4; required uint32 gpu_id = 5; + required uint32 handle = 6; } message criu_kfd { @@ -61,6 +62,30 @@ message criu_kfd { required bytes priv_data = 10; } +message drm_bo_entry { + required uint64 addr = 1; + required uint64 size = 2; + required uint64 offset = 3; + required uint64 alloc_flags = 4; + required uint64 alignment = 5; + required uint32 preferred_domains = 6; + required uint32 handle = 7; + required uint32 is_import = 8; + required uint32 num_of_vms = 9; + repeated drm_vm_entry vm_entries = 10; +} + +message drm_vm_entry { + required uint64 addr = 1; + required uint64 size = 2; + required uint64 offset = 3; + required uint64 flags = 4; +} + message criu_render_node { required uint32 gpu_id = 1; + required uint32 id = 2; + required uint32 drm_render_minor = 3; + required uint64 num_of_bos = 4; + repeated drm_bo_entry bo_entries = 5; } From d43217dadb9764e0342306da84f45f7a85c78bbf Mon Sep 17 00:00:00 2001 From: David Francis Date: Thu, 30 Oct 2025 22:56:37 -0700 Subject: [PATCH 735/775] plugin: Add DUMP_DEVICES_LATE callback The amdgpu plugin was counting how many files were checkpointed to determine when it should close the device files. The number of device files is not consistent; a process may have multiple copies of the drm device files open. 
Instead of doing this counting, add a new callback after all files are checkpointed, so plugins can clean up their resources at an appropriate time. Signed-off-by: David Francis --- criu/cr-dump.c | 4 +++ criu/include/criu-plugin.h | 3 ++ criu/plugin.c | 1 + plugins/amdgpu/amdgpu_plugin.c | 55 ++++++++++++----------------- plugins/amdgpu/amdgpu_plugin_util.c | 42 +++++++++++++--------- plugins/amdgpu/amdgpu_plugin_util.h | 6 ++-- 6 files changed, 60 insertions(+), 51 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 60b8e793c..4df40e9b6 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2247,6 +2247,10 @@ int cr_dump_tasks(pid_t pid) goto err; } + ret = run_plugins(DUMP_DEVICES_LATE, pid); + if (ret && ret != -ENOTSUP) + goto err; + if (parent_ie) { inventory_entry__free_unpacked(parent_ie, NULL); parent_ie = NULL; diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index 977dad655..c3bea1385 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -64,6 +64,8 @@ enum { CR_PLUGIN_HOOK__RESTORE_INIT = 13, + CR_PLUGIN_HOOK__DUMP_DEVICES_LATE = 14, + CR_PLUGIN_HOOK__MAX }; @@ -84,6 +86,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_INIT, void); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, int id); enum { CR_PLUGIN_STAGE__DUMP, diff --git a/criu/plugin.c b/criu/plugin.c index a2057e9c1..f9322a3c2 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -61,6 +61,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) __assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices"); __assign_hook(POST_FORKING, "cr_plugin_post_forking"); __assign_hook(RESTORE_INIT, "cr_plugin_restore_init"); + __assign_hook(DUMP_DEVICES_LATE, "cr_plugin_dump_devices_late"); #undef __assign_hook diff --git 
a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 4be8421a0..11e410c31 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -58,13 +58,6 @@ struct vma_metadata { /************************************ Global Variables ********************************************/ -/** - * FD of KFD device used to checkpoint. On a multi-process - * tree the order of checkpointing goes from parent to child - * and so on - so saving the FD will not be overwritten - */ -static int kfd_checkpoint_fd; - static LIST_HEAD(update_vma_info_list); size_t kfd_max_buffer_size; @@ -1050,28 +1043,34 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha return 0; } -static int unpause_process(int fd) +int amdgpu_unpause_processes(int pid) { int ret = 0; struct kfd_ioctl_criu_args args = { 0 }; + struct list_head *l = get_dumped_fds(); + struct dumped_fd *st; - args.op = KFD_CRIU_OP_UNPAUSE; + list_for_each_entry(st, l, l) { + if (st->is_drm) { + close(st->fd); + } else { + args.op = KFD_CRIU_OP_UNPAUSE; - ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args); - if (ret) { - pr_perror("Failed to unpause process"); - goto exit; + ret = kmtIoctl(st->fd, AMDKFD_IOC_CRIU_OP, &args); + if (ret) { + pr_perror("Failed to unpause process"); + goto exit; + } + } } - // Reset the KFD FD - kfd_checkpoint_fd = -1; - sys_close_drm_render_devices(&src_topology); - exit: pr_info("Process unpaused %s (ret:%d)\n", ret ? 
"Failed" : "Ok", ret); + clear_dumped_fds(); return ret; } +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, amdgpu_unpause_processes) int store_dmabuf_fd(int handle, int fd) { @@ -1401,9 +1400,6 @@ int amdgpu_plugin_dump_file(int fd, int id) return -1; } - /* Initialize number of device files that will be checkpointed */ - init_gpu_count(&src_topology); - /* Check whether this plugin was called for kfd or render nodes */ if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) { @@ -1415,11 +1411,9 @@ int amdgpu_plugin_dump_file(int fd, int id) if (ret) return ret; - /* Invoke unpause process if needed */ - decrement_checkpoint_count(); - if (checkpoint_is_complete()) { - ret = unpause_process(kfd_checkpoint_fd); - } + ret = record_dumped_fd(fd, true); + if (ret) + return ret; /* Need to return success here so that criu can call plugins for renderD nodes */ return ret; @@ -1517,14 +1511,11 @@ int amdgpu_plugin_dump_file(int fd, int id) xfree(buf); -exit: - /* Restore all queues if conditions permit */ - kfd_checkpoint_fd = fd; - decrement_checkpoint_count(); - if (checkpoint_is_complete()) { - ret = unpause_process(fd); - } + ret = record_dumped_fd(fd, false); + if (ret) + goto exit; +exit: xfree((void *)args.devices); xfree((void *)args.bos); xfree((void *)args.priv_data); diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c index 491e7fc74..fd59c06ad 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.c +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -38,9 +38,7 @@ #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" -/* Tracks number of device files that need to be checkpointed */ -static int dev_file_cnt = 0; - +static LIST_HEAD(dumped_fds); static LIST_HEAD(shared_bos); static LIST_HEAD(completed_work); @@ -52,23 +50,25 @@ struct tp_system dest_topology; struct device_maps checkpoint_maps; struct device_maps restore_maps; -bool checkpoint_is_complete() +int record_dumped_fd(int fd, bool 
is_drm) { - return (dev_file_cnt == 0); + int newfd = dup(fd); + + if (newfd < 0) + return newfd; + struct dumped_fd *st = malloc(sizeof(struct dumped_fd)); + if (!st) + return -1; + st->fd = newfd; + st->is_drm = is_drm; + list_add(&st->l, &dumped_fds); + + return 0; } -void decrement_checkpoint_count() +struct list_head *get_dumped_fds() { - dev_file_cnt--; -} - -void init_gpu_count(struct tp_system *topo) -{ - if (dev_file_cnt != 0) - return; - - /* We add ONE to include checkpointing of KFD device */ - dev_file_cnt = 1 + topology_gpu_count(topo); + return &dumped_fds; } bool shared_bo_has_exporter(int handle) @@ -152,6 +152,16 @@ void clear_restore_state() } } +void clear_dumped_fds() +{ + while (!list_empty(&dumped_fds)) { + struct dumped_fd *st = list_first_entry(&dumped_fds, struct dumped_fd, l); + list_del(&st->l); + close(st->fd); + free(st); + } +} + int read_fp(FILE *fp, void *buf, const size_t buf_len) { size_t len_read; diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h index 046a82fb0..f20388efa 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.h +++ b/plugins/amdgpu/amdgpu_plugin_util.h @@ -117,9 +117,9 @@ int read_file(const char *file_path, void *buf, const size_t buf_len); int write_img_file(char *path, const void *buf, const size_t buf_len); FILE *open_img_file(char *path, bool write, size_t *size); -bool checkpoint_is_complete(); -void decrement_checkpoint_count(); -void init_gpu_count(struct tp_system *topology); +int record_dumped_fd(int fd, bool is_drm); +struct list_head *get_dumped_fds(); +void clear_dumped_fds(); bool shared_bo_has_exporter(int handle); int record_shared_bo(int handle, bool is_imported); From 9e404e2083913cde0bad2d0396e6cc7c311a8ba4 Mon Sep 17 00:00:00 2001 From: David Francis Date: Thu, 30 Oct 2025 22:57:04 -0700 Subject: [PATCH 736/775] plugin/amdgpu: Support for checkpoint of dmabuf fds amdgpu libraries that use dmabuf fd to share GPU memory between processes close the dmabuf fds 
immediately after using them. However, it is possible that checkpoint of a process catches one of the dmabuf fds open. In that case, the amdgpu plugin needs to handle it. The checkpoint of the dmabuf fd does require the device file it was exported from to have already been dumped. To identify which device this dmabuf fd was exported from, attempt to import it on each device, then record the dmabuf handle it imports as. This handle can be used to restore it. Signed-off-by: David Francis --- plugins/amdgpu/Makefile | 2 +- plugins/amdgpu/amdgpu_plugin.c | 34 ++++- plugins/amdgpu/amdgpu_plugin_dmabuf.c | 207 ++++++++++++++++++++++++++ plugins/amdgpu/amdgpu_plugin_dmabuf.h | 16 ++ plugins/amdgpu/amdgpu_plugin_drm.c | 7 +- plugins/amdgpu/amdgpu_plugin_util.c | 48 +++++- plugins/amdgpu/amdgpu_plugin_util.h | 8 +- plugins/amdgpu/criu-amdgpu.proto | 4 + 8 files changed, 306 insertions(+), 20 deletions(-) create mode 100644 plugins/amdgpu/amdgpu_plugin_dmabuf.c create mode 100644 plugins/amdgpu/amdgpu_plugin_dmabuf.h diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 870a039cd..31e177e4a 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -27,7 +27,7 @@ endif criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc --proto_path=. --c_out=.
criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_dmabuf.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 11e410c31..125aaef9a 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -38,6 +38,7 @@ #include "rst-malloc.h" #include "common/list.h" +#include "amdgpu_plugin_dmabuf.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" @@ -46,7 +47,7 @@ #include "img-streamer.h" #include "image.h" #include "cr_options.h" - +#include "util.h" struct vma_metadata { struct list_head list; uint64_t old_pgoff; @@ -1064,6 +1065,9 @@ int amdgpu_unpause_processes(int pid) } } + if (post_dump_dmabuf_check() < 0) + ret = -1; + exit: pr_info("Process unpaused %s (ret:%d)\n", ret ? 
"Failed" : "Ok", ret); clear_dumped_fds(); @@ -1400,7 +1404,17 @@ int amdgpu_plugin_dump_file(int fd, int id) return -1; } - /* Check whether this plugin was called for kfd or render nodes */ + /* Check whether this plugin was called for kfd, dmabuf or render nodes */ + ret = get_dmabuf_info(fd, &st); + if (ret < 0) { + pr_perror("Failed to get dmabuf info"); + return -1; + } else if (ret == 0) { + pr_info("Dumping dmabuf fd = %d\n", fd); + ret = amdgpu_plugin_dmabuf_dump(fd, id); + return ret; + } + if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) { /* This is RenderD dumper plugin, for now just save renderD @@ -1414,7 +1428,7 @@ int amdgpu_plugin_dump_file(int fd, int id) ret = record_dumped_fd(fd, true); if (ret) return ret; - + ret = try_dump_dmabuf_list(); /* Need to return success here so that criu can call plugins for renderD nodes */ return ret; } @@ -1538,7 +1552,6 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e) int ret = 0, bucket_index = 0; pr_debug("Restoring %d devices\n", e->num_of_gpus); - args->num_devices = e->num_of_gpus; device_buckets = xzalloc(sizeof(*device_buckets) * args->num_devices); if (!device_buckets) @@ -1822,12 +1835,17 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) * first as we assume restore_maps is already filled. Need to fix this later. 
*/ snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id); - pr_info("Restoring RenderD %s\n", img_path); img_fp = open_img_file(img_path, false, &img_size); - if (!img_fp) - return -EINVAL; - + if (!img_fp) { + ret = amdgpu_plugin_dmabuf_restore(id); + if (ret == 1) { + *retry_needed = true; + return 0; + } + return ret; + } + pr_info("Restoring RenderD %s\n", img_path); pr_debug("RenderD Image file size:%ld\n", img_size); buf = xmalloc(img_size); if (!buf) { diff --git a/plugins/amdgpu/amdgpu_plugin_dmabuf.c b/plugins/amdgpu/amdgpu_plugin_dmabuf.c new file mode 100644 index 000000000..74b5f9038 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_dmabuf.c @@ -0,0 +1,207 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "common/list.h" +#include "criu-amdgpu.pb-c.h" + +#include "xmalloc.h" +#include "criu-log.h" +#include "amdgpu_plugin_drm.h" +#include "amdgpu_plugin_util.h" +#include "amdgpu_plugin_dmabuf.h" +#include "fdstore.h" + +#include "util.h" +#include "common/scm.h" + +struct dmabuf { + int id; + int dmabuf_fd; + struct list_head node; +}; + +static LIST_HEAD(dmabuf_list); + +/* Return < 0 for error, > 0 for "not a dmabuf" and 0 "is a dmabuf" */ +int get_dmabuf_info(int fd, struct stat *st) +{ + char path[PATH_MAX]; + + if (read_fd_link(fd, path, sizeof(path)) < 0) + return -1; + + if (strncmp(path, DMABUF_LINK, strlen(DMABUF_LINK)) != 0) + return 1; + + return 0; +} + +int __amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id) +{ + int ret = 0; + char path[PATH_MAX]; + size_t len = 0; + unsigned char *buf = NULL; + int gem_handle; + + pr_info("TWI: Dumping dmabuf fd = %d\n", dmabuf_fd); + + gem_handle = handle_for_shared_bo_fd(dmabuf_fd); + if (gem_handle < 0) { + pr_err("Failed to get handle for dmabuf_fd = %d\n", dmabuf_fd); + return -EAGAIN; /* Retry needed */ + } + + CriuDmabufNode *node = xmalloc(sizeof(*node)); + if (!node) { + pr_err("Failed to allocate memory for 
dmabuf node\n"); + return -ENOMEM; + } + criu_dmabuf_node__init(node); + + node->gem_handle = gem_handle; + + if (node->gem_handle < 0) { + pr_err("Failed to get handle for dmabuf_fd\n"); + xfree(node); + return -EINVAL; + } + + /* Serialize metadata to a file */ + snprintf(path, sizeof(path), IMG_DMABUF_FILE, id); + len = criu_dmabuf_node__get_packed_size(node); + buf = xmalloc(len); + if (!buf) { + pr_err("Failed to allocate buffer for dmabuf metadata\n"); + xfree(node); + return -ENOMEM; + } + criu_dmabuf_node__pack(node, buf); + ret = write_img_file(path, buf, len); + + xfree(buf); + xfree(node); + return ret; +} + +int amdgpu_plugin_dmabuf_restore(int id) +{ + char path[PATH_MAX]; + size_t img_size; + FILE *img_fp = NULL; + int ret = 0; + CriuDmabufNode *rd = NULL; + unsigned char *buf = NULL; + int fd_id; + + snprintf(path, sizeof(path), IMG_DMABUF_FILE, id); + + pr_info("TWI: Restoring dmabuf fd, id = %d\n", id); + + /* Read serialized metadata */ + img_fp = open_img_file(path, false, &img_size); + if (!img_fp) { + pr_err("Failed to open dmabuf metadata file: %s\n", path); + return -EINVAL; + } + + pr_debug("dmabuf Image file size:%ld\n", img_size); + buf = xmalloc(img_size); + if (!buf) { + pr_perror("Failed to allocate memory"); + return -ENOMEM; + } + + ret = read_fp(img_fp, buf, img_size); + if (ret) { + pr_perror("Unable to read from %s", path); + xfree(buf); + return ret; + } + + rd = criu_dmabuf_node__unpack(NULL, img_size, buf); + if (rd == NULL) { + pr_perror("Unable to parse the dmabuf message %d", id); + xfree(buf); + fclose(img_fp); + return -1; + } + fclose(img_fp); + + pr_info("TWI: dmabuf node gem_handle = %d\n", rd->gem_handle); + + /* Match GEM handle with shared_dmabuf list */ + fd_id = amdgpu_id_for_handle(rd->gem_handle); + if (fd_id == -1) { + pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", + rd->gem_handle); + return 1; + } + int dmabuf_fd = fdstore_get(fd_id); + pr_info("TWI: dmabuf node fd_id = %d, dmabuf_fd = %d\n", fd_id, 
dmabuf_fd); + if (dmabuf_fd == -1) { + pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", + rd->gem_handle); + return 1; /* Retry needed */ + } else { + pr_info("Restored dmabuf_fd = %d for GEM handle = %d\n", + dmabuf_fd, rd->gem_handle); + } + ret = dmabuf_fd; + + pr_info("Successfully restored dmabuf_fd %d\n", + dmabuf_fd); + criu_dmabuf_node__free_unpacked(rd, NULL); + xfree(buf); + return ret; +} + +int amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id) +{ + int ret; + + ret = __amdgpu_plugin_dmabuf_dump(dmabuf_fd, id); + if (ret == -EAGAIN) { + struct dmabuf *b = xmalloc(sizeof(*b)); + b->id = id; + b->dmabuf_fd = dmabuf_fd; + list_add(&b->node, &dmabuf_list); + return 0; + } + return ret; +} + +int try_dump_dmabuf_list() +{ + struct dmabuf *b, *t; + list_for_each_entry_safe(b, t, &dmabuf_list, node) { + int ret = __amdgpu_plugin_dmabuf_dump(b->dmabuf_fd, b->id); + if (ret == -EAGAIN) + continue; + else if (ret) + return ret; + list_del(&b->node); + xfree(b); + } + return 0; +} + +int post_dump_dmabuf_check() +{ + if (!list_empty(&dmabuf_list)) { + pr_err("Not all dma buffers have been dumped\n"); + return -1; + } + return 1; +} \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_dmabuf.h b/plugins/amdgpu/amdgpu_plugin_dmabuf.h new file mode 100644 index 000000000..f07af7ee0 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_dmabuf.h @@ -0,0 +1,16 @@ + +#ifndef __AMDGPU_PLUGIN_DMABUF_H__ +#define __AMDGPU_PLUGIN_DMABUF_H__ + +#include "amdgpu_plugin_util.h" +#include "criu-amdgpu.pb-c.h" + +int amdgpu_plugin_dmabuf_dump(int fd, int id); +int amdgpu_plugin_dmabuf_restore(int id); + +int try_dump_dmabuf_list(); +int post_dump_dmabuf_check(); + +int get_dmabuf_info(int fd, struct stat *st); + +#endif /* __AMDGPU_PLUGIN_DMABUF_H__ */ \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index 199dad21e..8466ca40d 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ 
b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -47,7 +47,8 @@ int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd) return -1; } - drmPrimeFDToHandle(fd, dmabuf_fd, &handle); + if (drmPrimeFDToHandle(fd, dmabuf_fd, &handle)) + return -1; return handle; } @@ -465,6 +466,7 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) if (work_already_completed(boinfo->handle, rd->drm_render_minor)) { continue; } else if (boinfo->handle != -1) { + pr_info("TWI: restore bo %d\n", boinfo->handle); if (boinfo->is_import) { fd_id = amdgpu_id_for_handle(boinfo->handle); if (fd_id == -1) { @@ -472,11 +474,13 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) continue; } dmabuf_fd = fdstore_get(fd_id); + pr_info("TWI: restore bo %d: fd_id %d, dmabuf_fd %d\n", boinfo->handle, fd_id, dmabuf_fd); } } if (boinfo->is_import) { drmPrimeFDToHandle(device_fd, dmabuf_fd, &handle); + pr_info("TWI: restore bo imported to handle %d\n", handle); } else { union drm_amdgpu_gem_create create_args = { 0 }; @@ -493,6 +497,7 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) handle = create_args.out.handle; drmPrimeHandleToFD(device_fd, handle, 0, &dmabuf_fd); + pr_info("TWI: restore bo created at handle %d and exported to fd %d\n", handle, dmabuf_fd); } change_args.handle = handle; diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c index fd59c06ad..a2cafa4a3 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.c +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -37,6 +37,7 @@ #include "amdgpu_drm.h" #include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" +#include "amdgpu_plugin_drm.h" static LIST_HEAD(dumped_fds); static LIST_HEAD(shared_bos); @@ -109,6 +110,46 @@ int record_shared_bo(int handle, bool is_imported) return 0; } +int handle_for_shared_bo_fd(int fd) +{ + struct dumped_fd *df; + int trial_handle; + amdgpu_device_handle h_dev; + uint32_t major, minor; + struct shared_bo *bo; + + list_for_each_entry(df, 
&dumped_fds, l) { + /* see if the gem handle for fd using the hdev for df->fd is the + same as bo->handle. */ + + if (!df->is_drm) { + continue; + } + + if (amdgpu_device_initialize(df->fd, &major, &minor, &h_dev)) { + pr_err("Failed to initialize amdgpu device\n"); + continue; + } + + trial_handle = get_gem_handle(h_dev, fd); + if (trial_handle < 0) + continue; + + pr_info("TWI: Check device %d, got handle %d\n", df->fd, trial_handle); + + list_for_each_entry(bo, &shared_bos, l) { + if (bo->handle == trial_handle) { + pr_info("TWI: And that handle exists\n"); + return trial_handle; + } + } + + amdgpu_device_deinitialize(h_dev); + } + + return -1; +} + int record_completed_work(int handle, int id) { struct restore_completed_work *work; @@ -138,13 +179,6 @@ bool work_already_completed(int handle, int id) void clear_restore_state() { - while (!list_empty(&shared_dmabuf_fds)) { - struct shared_dmabuf *st = list_first_entry(&shared_dmabuf_fds, struct shared_dmabuf, l); - list_del(&st->l); - close(st->dmabuf_fd); - free(st); - } - while (!list_empty(&completed_work)) { struct restore_completed_work *st = list_first_entry(&completed_work, struct restore_completed_work, l); list_del(&st->l); diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h index f20388efa..f5f752d0b 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.h +++ b/plugins/amdgpu/amdgpu_plugin_util.h @@ -53,6 +53,9 @@ /* Name of file having serialized data of DRM device */ #define IMG_DRM_FILE "amdgpu-renderD-%d.img" +/* Name of file having serialized data of dmabuf meta */ +#define IMG_DMABUF_FILE "amdgpu-dmabuf_%d.img" + /* Name of file having serialized data of DRM device buffer objects (BOs) */ #define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%d-%04x.img" @@ -61,6 +64,7 @@ #define HSAKMT_SHM "/hsakmt_shared_mem" #define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore" #define HSAKMT_SEM "hsakmt_semaphore" +#define DMABUF_LINK "/dmabuf" /* Help macros to build sDMA command 
packets */ #define SDMA_PACKET(op, sub_op, e) ((((e)&0xFFFF) << 16) | (((sub_op)&0xFF) << 8) | (((op)&0xFF) << 0)) @@ -123,9 +127,7 @@ void clear_dumped_fds(); bool shared_bo_has_exporter(int handle); int record_shared_bo(int handle, bool is_imported); - -int record_shared_dmabuf_fd(int handle, int dmabuf_fd); -int dmabuf_fd_for_handle(int handle); +int handle_for_shared_bo_fd(int dmabuf_fd); int record_completed_work(int handle, int id); bool work_already_completed(int handle, int id); diff --git a/plugins/amdgpu/criu-amdgpu.proto b/plugins/amdgpu/criu-amdgpu.proto index 565413c34..7682a8f21 100644 --- a/plugins/amdgpu/criu-amdgpu.proto +++ b/plugins/amdgpu/criu-amdgpu.proto @@ -89,3 +89,7 @@ message criu_render_node { required uint64 num_of_bos = 4; repeated drm_bo_entry bo_entries = 5; } + +message criu_dmabuf_node { + required uint32 gem_handle = 1; +} From ff35a9126e3a2d4e6f5f9f2ca89b032f9ae5bc22 Mon Sep 17 00:00:00 2001 From: David Francis Date: Sun, 2 Nov 2025 07:26:44 -0800 Subject: [PATCH 737/775] plugins/amdgpu: remove excessive debug messages The pr_info lines that begin with "CC3" and "TWI" were not meant to be included in the patch.
Co-authored-by: Andrei Vagin Signed-off-by: David Francis Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 5 +---- plugins/amdgpu/amdgpu_plugin_dmabuf.c | 22 ++++++---------------- plugins/amdgpu/amdgpu_plugin_drm.c | 4 ---- plugins/amdgpu/amdgpu_plugin_util.c | 6 +----- 4 files changed, 8 insertions(+), 29 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 125aaef9a..4640ccf88 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -48,6 +48,7 @@ #include "image.h" #include "cr_options.h" #include "util.h" + struct vma_metadata { struct list_head list; uint64_t old_pgoff; @@ -327,8 +328,6 @@ void getenv_size_t(const char *var, size_t *value) int sh = 0; size_t size; - pr_info("Value str: %s\n", value_str); - if (value_str) { size = (size_t)strtoul(value_str, &endp, 0); if (errno || value_str == endp) { @@ -1132,7 +1131,6 @@ int amdgpu_restore_init(void) if (d) { while ((dir = readdir(d)) != NULL) { if (strncmp("amdgpu-kfd-", dir->d_name, strlen("amdgpu-kfd-")) == 0) { - pr_info("CC3: Found kfd file\n"); img_fp = open_img_file(dir->d_name, false, &img_size); buf = xmalloc(img_size); if (!buf) { @@ -1155,7 +1153,6 @@ int amdgpu_restore_init(void) xfree(buf); } if (strncmp("amdgpu-renderD-", dir->d_name, strlen("amdgpu-renderD-")) == 0) { - pr_info("CC3: Found drm file\n"); img_fp = open_img_file(dir->d_name, false, &img_size); buf = xmalloc(img_size); if (!buf) { diff --git a/plugins/amdgpu/amdgpu_plugin_dmabuf.c b/plugins/amdgpu/amdgpu_plugin_dmabuf.c index 74b5f9038..bdc107f64 100644 --- a/plugins/amdgpu/amdgpu_plugin_dmabuf.c +++ b/plugins/amdgpu/amdgpu_plugin_dmabuf.c @@ -55,8 +55,6 @@ int __amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id) unsigned char *buf = NULL; int gem_handle; - pr_info("TWI: Dumping dmabuf fd = %d\n", dmabuf_fd); - gem_handle = handle_for_shared_bo_fd(dmabuf_fd); if (gem_handle < 0) { pr_err("Failed to get handle for dmabuf_fd = %d\n", dmabuf_fd); 
@@ -107,8 +105,6 @@ int amdgpu_plugin_dmabuf_restore(int id) snprintf(path, sizeof(path), IMG_DMABUF_FILE, id); - pr_info("TWI: Restoring dmabuf fd, id = %d\n", id); - /* Read serialized metadata */ img_fp = open_img_file(path, false, &img_size); if (!img_fp) { @@ -139,29 +135,23 @@ int amdgpu_plugin_dmabuf_restore(int id) } fclose(img_fp); - pr_info("TWI: dmabuf node gem_handle = %d\n", rd->gem_handle); - /* Match GEM handle with shared_dmabuf list */ fd_id = amdgpu_id_for_handle(rd->gem_handle); if (fd_id == -1) { - pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", - rd->gem_handle); + pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", rd->gem_handle); return 1; } + int dmabuf_fd = fdstore_get(fd_id); - pr_info("TWI: dmabuf node fd_id = %d, dmabuf_fd = %d\n", fd_id, dmabuf_fd); if (dmabuf_fd == -1) { - pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", - rd->gem_handle); + pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", rd->gem_handle); return 1; /* Retry needed */ - } else { - pr_info("Restored dmabuf_fd = %d for GEM handle = %d\n", - dmabuf_fd, rd->gem_handle); } + + pr_info("Restored dmabuf_fd = %d for GEM handle = %d\n", dmabuf_fd, rd->gem_handle); ret = dmabuf_fd; - pr_info("Successfully restored dmabuf_fd %d\n", - dmabuf_fd); + pr_info("Successfully restored dmabuf_fd %d\n", dmabuf_fd); criu_dmabuf_node__free_unpacked(rd, NULL); xfree(buf); return ret; diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index 8466ca40d..00bcb7a29 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -466,7 +466,6 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) if (work_already_completed(boinfo->handle, rd->drm_render_minor)) { continue; } else if (boinfo->handle != -1) { - pr_info("TWI: restore bo %d\n", boinfo->handle); if (boinfo->is_import) { fd_id = amdgpu_id_for_handle(boinfo->handle); if (fd_id == -1) { @@ -474,13 +473,11 @@ int 
amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) continue; } dmabuf_fd = fdstore_get(fd_id); - pr_info("TWI: restore bo %d: fd_id %d, dmabuf_fd %d\n", boinfo->handle, fd_id, dmabuf_fd); } } if (boinfo->is_import) { drmPrimeFDToHandle(device_fd, dmabuf_fd, &handle); - pr_info("TWI: restore bo imported to handle %d\n", handle); } else { union drm_amdgpu_gem_create create_args = { 0 }; @@ -497,7 +494,6 @@ int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) handle = create_args.out.handle; drmPrimeHandleToFD(device_fd, handle, 0, &dmabuf_fd); - pr_info("TWI: restore bo created at handle %d and exported to fd %d\n", handle, dmabuf_fd); } change_args.handle = handle; diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c index a2cafa4a3..592562474 100644 --- a/plugins/amdgpu/amdgpu_plugin_util.c +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -135,13 +135,9 @@ int handle_for_shared_bo_fd(int fd) if (trial_handle < 0) continue; - pr_info("TWI: Check device %d, got handle %d\n", df->fd, trial_handle); - list_for_each_entry(bo, &shared_bos, l) { - if (bo->handle == trial_handle) { - pr_info("TWI: And that handle exists\n"); + if (bo->handle == trial_handle) return trial_handle; - } } amdgpu_device_deinitialize(h_dev); From 690b6104321dc64dd2ff0c9f6aa6f7c093b24f65 Mon Sep 17 00:00:00 2001 From: David Francis Date: Sun, 2 Nov 2025 07:29:35 -0800 Subject: [PATCH 738/775] plugins/amdgpu: return 0 in post_dump_dmabuf_check Use `return 0` on success in `post_dump_dmabuf_check()` for consistency with other functions. 
Co-authored-by: Andrei Vagin Signed-off-by: David Francis Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin_dmabuf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin_dmabuf.c b/plugins/amdgpu/amdgpu_plugin_dmabuf.c index bdc107f64..11c9792e3 100644 --- a/plugins/amdgpu/amdgpu_plugin_dmabuf.c +++ b/plugins/amdgpu/amdgpu_plugin_dmabuf.c @@ -179,7 +179,7 @@ int try_dump_dmabuf_list() int ret = __amdgpu_plugin_dmabuf_dump(b->dmabuf_fd, b->id); if (ret == -EAGAIN) continue; - else if (ret) + if (ret) return ret; list_del(&b->node); xfree(b); @@ -193,5 +193,5 @@ int post_dump_dmabuf_check() pr_err("Not all dma buffers have been dumped\n"); return -1; } - return 1; -} \ No newline at end of file + return 0; +} From 77e6558ddb134e0e8cfbeb6ce3341bf9b3116ccd Mon Sep 17 00:00:00 2001 From: David Francis Date: Sun, 2 Nov 2025 07:32:03 -0800 Subject: [PATCH 739/775] plugins/amdgpu: apply code-style fixes Co-authored-by: Andrei Vagin Signed-off-by: David Francis Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 4640ccf88..83fa41724 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1406,10 +1406,10 @@ int amdgpu_plugin_dump_file(int fd, int id) if (ret < 0) { pr_perror("Failed to get dmabuf info"); return -1; - } else if (ret == 0) { + } + if (ret == 0) { pr_info("Dumping dmabuf fd = %d\n", fd); - ret = amdgpu_plugin_dmabuf_dump(fd, id); - return ret; + return amdgpu_plugin_dmabuf_dump(fd, id); } if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) { @@ -1425,9 +1425,9 @@ int amdgpu_plugin_dump_file(int fd, int id) ret = record_dumped_fd(fd, true); if (ret) return ret; - ret = try_dump_dmabuf_list(); + /* Need to return success here so that criu can call plugins for renderD nodes */ - return ret; 
+ return try_dump_dmabuf_list(); } pr_info("%s() called for fd = %d\n", __func__, major(st.st_rdev)); From 6ed49894c5da4466cc89d2fc69afce29dedd6f2e Mon Sep 17 00:00:00 2001 From: David Francis Date: Sun, 2 Nov 2025 07:32:44 -0800 Subject: [PATCH 740/775] plugins/amdgpu: add a comment for retry_needed Add a comment that explains the purpose of `retry_needed`. Co-authored-by: Andrei Vagin Signed-off-by: David Francis Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 83fa41724..36dc0b6b0 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1837,6 +1837,10 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed) if (!img_fp) { ret = amdgpu_plugin_dmabuf_restore(id); if (ret == 1) { + /* This is a dmabuf fd, but the corresponding buffer object that was + * exported to make it has not yet been restored. Need to try again + * later when the buffer object exists, so it can be re-exported. 
+ */ *retry_needed = true; return 0; } From f56ccfd2d6815b499f321abf2c95a6c7cb3a1c40 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 2 Nov 2025 17:01:31 +0000 Subject: [PATCH 741/775] plugins/amdgpu: remove unused variable amdgpu_plugin_drm.c:167:6: error: variable 'num_bos' set but not used [-Werror,-Wunused-but-set-variable] 167 | int num_bos = 0; | Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin_drm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index 00bcb7a29..923bfcdd1 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -164,7 +164,6 @@ static int restore_bo_contents_drm(int drm_render_minor, CriuRenderNode *rd, int FILE *bo_contents_fp = NULL; void *buffer = NULL; char img_path[40]; - int num_bos = 0; int i, ret = 0; ret = amdgpu_device_initialize(drm_fd, &major, &minor, &h_dev); @@ -206,8 +205,6 @@ static int restore_bo_contents_drm(int drm_render_minor, CriuRenderNode *rd, int if (rd->bo_entries[i]->num_of_vms == 0) continue; - num_bos++; - snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, drm_render_minor, i); bo_contents_fp = open_img_file(img_path, false, &image_size); From e4a5e164b4ccad7e82cef638f9510f932daea00f Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 5 Nov 2025 15:12:06 +0000 Subject: [PATCH 742/775] plugins/amdgpu: update kernel headers This patch updates drm.h and amdgpu_drm.h kernel headers, and adds drm_mode.h (included by drm.h) from the rocm-7.1.0 release tag. 
Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_drm.h | 125 +++- plugins/amdgpu/drm.h | 58 +- plugins/amdgpu/drm_mode.h | 1362 +++++++++++++++++++++++++++++++++++ 3 files changed, 1523 insertions(+), 22 deletions(-) create mode 100644 plugins/amdgpu/drm_mode.h diff --git a/plugins/amdgpu/amdgpu_drm.h b/plugins/amdgpu/amdgpu_drm.h index 9cebd072a..69227a12b 100644 --- a/plugins/amdgpu/amdgpu_drm.h +++ b/plugins/amdgpu/amdgpu_drm.h @@ -58,6 +58,11 @@ extern "C" { #define DRM_AMDGPU_USERQ_SIGNAL 0x17 #define DRM_AMDGPU_USERQ_WAIT 0x18 #define DRM_AMDGPU_GEM_LIST_HANDLES 0x19 +/* not upstream */ +#define DRM_AMDGPU_GEM_DGMA 0x5c + +/* hybrid specific ioctls */ +#define DRM_AMDGPU_SEM 0x5b #define DRM_IOCTL_AMDGPU_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create) #define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap) @@ -80,6 +85,8 @@ extern "C" { #define DRM_IOCTL_AMDGPU_USERQ_WAIT DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait) #define DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_LIST_HANDLES, struct drm_amdgpu_gem_list_handles) +#define DRM_IOCTL_AMDGPU_GEM_DGMA DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_DGMA, struct drm_amdgpu_gem_dgma) + /** * DOC: memory domains * @@ -105,7 +112,12 @@ extern "C" { * * %AMDGPU_GEM_DOMAIN_DOORBELL Doorbell. It is an MMIO region for * signalling user mode queues. + * + * %AMDGPU_GEM_DOMAIN_MMIO_REMAP MMIO remap page (special mapping for HDP flushing). 
*/ +/* hybrid specific ioctls */ +#define DRM_IOCTL_AMDGPU_SEM DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_SEM, union drm_amdgpu_sem) + #define AMDGPU_GEM_DOMAIN_CPU 0x1 #define AMDGPU_GEM_DOMAIN_GTT 0x2 #define AMDGPU_GEM_DOMAIN_VRAM 0x4 @@ -113,13 +125,20 @@ extern "C" { #define AMDGPU_GEM_DOMAIN_GWS 0x10 #define AMDGPU_GEM_DOMAIN_OA 0x20 #define AMDGPU_GEM_DOMAIN_DOORBELL 0x40 +#define AMDGPU_GEM_DOMAIN_MMIO_REMAP 0x80 +#define AMDGPU_GEM_DOMAIN_DGMA 0x400 +#define AMDGPU_GEM_DOMAIN_DGMA_IMPORT 0x800 + #define AMDGPU_GEM_DOMAIN_MASK (AMDGPU_GEM_DOMAIN_CPU | \ AMDGPU_GEM_DOMAIN_GTT | \ AMDGPU_GEM_DOMAIN_VRAM | \ AMDGPU_GEM_DOMAIN_GDS | \ AMDGPU_GEM_DOMAIN_GWS | \ - AMDGPU_GEM_DOMAIN_OA | \ - AMDGPU_GEM_DOMAIN_DOORBELL) + AMDGPU_GEM_DOMAIN_OA |\ + AMDGPU_GEM_DOMAIN_DOORBELL |\ + AMDGPU_GEM_DOMAIN_MMIO_REMAP |\ + AMDGPU_GEM_DOMAIN_DGMA |\ + AMDGPU_GEM_DOMAIN_DGMA_IMPORT) /* Flag that CPU access will be required for the case of VRAM domain */ #define AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED (1 << 0) @@ -182,6 +201,14 @@ extern "C" { /* Set PTE.D and recompress during GTT->VRAM moves according to TILING flags. 
*/ #define AMDGPU_GEM_CREATE_GFX12_DCC (1 << 16) +/* hybrid specific */ +/* Flag that the memory should be in SPARSE resource */ +#define AMDGPU_GEM_CREATE_SPARSE (1ULL << 29) +/* Flag that the memory allocation should be from top of domain */ +#define AMDGPU_GEM_CREATE_TOP_DOWN (1ULL << 30) +/* Flag that the memory allocation should be pinned */ +#define AMDGPU_GEM_CREATE_NO_EVICT (1ULL << 31) + struct drm_amdgpu_gem_create_in { /** the requested memory size */ __u64 bo_size; @@ -581,6 +608,35 @@ struct drm_amdgpu_userq_wait { __u64 out_fences; }; +/* sem related */ +#define AMDGPU_SEM_OP_CREATE_SEM 1 +#define AMDGPU_SEM_OP_WAIT_SEM 2 +#define AMDGPU_SEM_OP_SIGNAL_SEM 3 +#define AMDGPU_SEM_OP_DESTROY_SEM 4 +#define AMDGPU_SEM_OP_IMPORT_SEM 5 +#define AMDGPU_SEM_OP_EXPORT_SEM 6 + +struct drm_amdgpu_sem_in { + /** AMDGPU_SEM_OP_* */ + uint32_t op; + uint32_t handle; + uint32_t ctx_id; + uint32_t ip_type; + uint32_t ip_instance; + uint32_t ring; + uint64_t seq; +}; + +union drm_amdgpu_sem_out { + int32_t fd; + uint32_t handle; +}; + +union drm_amdgpu_sem { + struct drm_amdgpu_sem_in in; + union drm_amdgpu_sem_out out; +}; + /* vm ioctl */ #define AMDGPU_VM_OP_RESERVE_VMID 1 #define AMDGPU_VM_OP_UNRESERVE_VMID 2 @@ -637,6 +693,15 @@ struct drm_amdgpu_gem_userptr { __u32 handle; }; +#define AMDGPU_GEM_DGMA_IMPORT 0 +#define AMDGPU_GEM_DGMA_QUERY_PHYS_ADDR 1 +struct drm_amdgpu_gem_dgma { + __u64 addr; + __u64 size; + __u32 op; + __u32 handle; +}; + /* SI-CI-VI: */ /* same meaning as the GB_TILE_MODE and GL_MACRO_TILE_MODE fields */ #define AMDGPU_TILING_ARRAY_MODE_SHIFT 0 @@ -1084,10 +1149,11 @@ struct drm_amdgpu_cs_chunk_cp_gfx_shadow { * Query h/w info: Flag that this is integrated (a.h.a. 
fusion) GPU * */ -#define AMDGPU_IDS_FLAGS_FUSION 0x1 -#define AMDGPU_IDS_FLAGS_PREEMPTION 0x2 -#define AMDGPU_IDS_FLAGS_TMZ 0x4 -#define AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD 0x8 +#define AMDGPU_IDS_FLAGS_FUSION 0x01 +#define AMDGPU_IDS_FLAGS_PREEMPTION 0x02 +#define AMDGPU_IDS_FLAGS_TMZ 0x04 +#define AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD 0x08 +#define AMDGPU_IDS_FLAGS_GANG_SUBMIT 0x10 /* * Query h/w info: Flag identifying VF/PF/PT mode @@ -1269,6 +1335,16 @@ struct drm_amdgpu_cs_chunk_cp_gfx_shadow { /* query FW object size and alignment */ #define AMDGPU_INFO_UQ_FW_AREAS 0x24 +/* Hybrid Stack Specific Defs*/ +/* gpu capability */ +#define AMDGPU_INFO_CAPABILITY 0x50 +/* virtual range */ +#define AMDGPU_INFO_VIRTUAL_RANGE 0x51 +/* query pin memory capability */ +#define AMDGPU_CAPABILITY_PIN_MEM_FLAG (1 << 0) +/* query direct gma capability */ +#define AMDGPU_CAPABILITY_DIRECT_GMA_FLAG (1 << 1) + #define AMDGPU_INFO_MMR_SE_INDEX_SHIFT 0 #define AMDGPU_INFO_MMR_SE_INDEX_MASK 0xff #define AMDGPU_INFO_MMR_SH_INDEX_SHIFT 8 @@ -1325,6 +1401,11 @@ struct drm_amdgpu_info { __u32 flags; } read_mmr_reg; + struct { + uint32_t aperture; + uint32_t _pad; + } virtual_range; + struct drm_amdgpu_query_fw query_fw; struct { @@ -1423,6 +1504,8 @@ struct drm_amdgpu_info_vbios { #define AMDGPU_VRAM_TYPE_LPDDR5 12 #define AMDGPU_VRAM_TYPE_HBM3E 13 +#define AMDGPU_VRAM_TYPE_HBM_WIDTH 4096 + struct drm_amdgpu_info_device { /** PCI Device ID */ __u32 device_id; @@ -1672,6 +1755,7 @@ struct drm_amdgpu_info_uq_metadata { #define AMDGPU_FAMILY_GC_11_5_0 150 /* GC 11.5.0 */ #define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 */ +#ifndef HAVE_DRM_COLOR_CTM_3X4 /* FIXME wrong namespace! 
*/ struct drm_color_ctm_3x4 { /* @@ -1680,6 +1764,35 @@ struct drm_color_ctm_3x4 { */ __u64 matrix[12]; }; +#endif + +/** + * Definition of System Unified Address (SUA) apertures + */ +#define AMDGPU_SUA_APERTURE_PRIVATE 1 +#define AMDGPU_SUA_APERTURE_SHARED 2 +struct drm_amdgpu_virtual_range { + uint64_t start; + uint64_t end; +}; + +struct drm_amdgpu_capability { + __u32 flag; + __u32 direct_gma_size; +}; + +/* + * Definition of free sync enter and exit signals + * We may have more options in the future + */ +#define AMDGPU_FREESYNC_FULLSCREEN_ENTER 1 +#define AMDGPU_FREESYNC_FULLSCREEN_EXIT 2 + +struct drm_amdgpu_freesync { + __u32 op; /* AMDGPU_FREESYNC_FULLSCREEN_ENTER or */ + /* AMDGPU_FREESYNC_FULLSCREEN_ENTER */ + __u32 spare[7]; +}; #if defined(__cplusplus) } diff --git a/plugins/amdgpu/drm.h b/plugins/amdgpu/drm.h index 84c819c17..3cd5cf15e 100644 --- a/plugins/amdgpu/drm.h +++ b/plugins/amdgpu/drm.h @@ -597,40 +597,62 @@ struct drm_set_version { int drm_dd_minor; }; -/* DRM_IOCTL_GEM_CLOSE ioctl argument type */ +/** + * struct drm_gem_close - Argument for &DRM_IOCTL_GEM_CLOSE ioctl. + * @handle: Handle of the object to be closed. + * @pad: Padding. + * + * Releases the handle to an mm object. + */ struct drm_gem_close { - /** Handle of the object to be closed. */ __u32 handle; __u32 pad; }; -/* DRM_IOCTL_GEM_FLINK ioctl argument type */ +/** + * struct drm_gem_flink - Argument for &DRM_IOCTL_GEM_FLINK ioctl. + * @handle: Handle for the object being named. + * @name: Returned global name. + * + * Create a global name for an object, returning the name. + * + * Note that the name does not hold a reference; when the object + * is freed, the name goes away. + */ struct drm_gem_flink { - /** Handle for the object being named */ __u32 handle; - - /** Returned global name */ __u32 name; }; -/* DRM_IOCTL_GEM_OPEN ioctl argument type */ +/** + * struct drm_gem_open - Argument for &DRM_IOCTL_GEM_OPEN ioctl. + * @name: Name of object being opened. 
+ * @handle: Returned handle for the object. + * @size: Returned size of the object + * + * Open an object using the global name, returning a handle and the size. + * + * This handle (of course) holds a reference to the object, so the object + * will not go away until the handle is deleted. + */ struct drm_gem_open { - /** Name of object being opened */ __u32 name; - - /** Returned handle for the object */ __u32 handle; - - /** Returned size of the object */ __u64 size; }; -/* DRM_IOCTL_GEM_CHANGE_HANDLE ioctl argument type */ +/** + * struct drm_gem_change_handle - Argument for &DRM_IOCTL_GEM_CHANGE_HANDLE ioctl. + * @handle: The handle of a gem object. + * @new_handle: An available gem handle. + * + * This ioctl changes the handle of a GEM object to the specified one. + * The new handle must be unused. On success the old handle is closed + * and all further IOCTL should refer to the new handle only. + * Calls to DRM_IOCTL_PRIME_FD_TO_HANDLE will return the new handle. + */ struct drm_gem_change_handle { - /** Current handle of object */ __u32 handle; - - /** Handle to change that object to */ __u32 new_handle; }; @@ -914,13 +936,17 @@ struct drm_syncobj_destroy { }; #define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_TIMELINE (1 << 1) #define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_TIMELINE (1 << 1) struct drm_syncobj_handle { __u32 handle; __u32 flags; __s32 fd; __u32 pad; + + __u64 point; }; struct drm_syncobj_transfer { diff --git a/plugins/amdgpu/drm_mode.h b/plugins/amdgpu/drm_mode.h new file mode 100644 index 000000000..c082810c0 --- /dev/null +++ b/plugins/amdgpu/drm_mode.h @@ -0,0 +1,1362 @@ +/* + * Copyright (c) 2007 Dave Airlie + * Copyright (c) 2007 Jakob Bornecrantz + * Copyright (c) 2008 Red Hat Inc. 
+ * Copyright (c) 2007-2008 Tungsten Graphics, Inc., Cedar Park, TX., USA + * Copyright (c) 2007-2008 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef _DRM_MODE_H +#define _DRM_MODE_H + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/** + * DOC: overview + * + * DRM exposes many UAPI and structure definitions to have a consistent + * and standardized interface with users. + * Userspace can refer to these structure definitions and UAPI formats + * to communicate to drivers. 
+ */ + +#define DRM_CONNECTOR_NAME_LEN 32 +#define DRM_DISPLAY_MODE_LEN 32 +#define DRM_PROP_NAME_LEN 32 + +#define DRM_MODE_TYPE_BUILTIN (1<<0) /* deprecated */ +#define DRM_MODE_TYPE_CLOCK_C ((1<<1) | DRM_MODE_TYPE_BUILTIN) /* deprecated */ +#define DRM_MODE_TYPE_CRTC_C ((1<<2) | DRM_MODE_TYPE_BUILTIN) /* deprecated */ +#define DRM_MODE_TYPE_PREFERRED (1<<3) +#define DRM_MODE_TYPE_DEFAULT (1<<4) /* deprecated */ +#define DRM_MODE_TYPE_USERDEF (1<<5) +#define DRM_MODE_TYPE_DRIVER (1<<6) + +#define DRM_MODE_TYPE_ALL (DRM_MODE_TYPE_PREFERRED | \ + DRM_MODE_TYPE_USERDEF | \ + DRM_MODE_TYPE_DRIVER) + +/* Video mode flags */ +/* bit compatible with the xrandr RR_ definitions (bits 0-13) + * + * ABI warning: Existing userspace really expects + * the mode flags to match the xrandr definitions. Any + * changes that don't match the xrandr definitions will + * likely need a new client cap or some other mechanism + * to avoid breaking existing userspace. This includes + * allocating new flags in the previously unused bits! + */ +#define DRM_MODE_FLAG_PHSYNC (1<<0) +#define DRM_MODE_FLAG_NHSYNC (1<<1) +#define DRM_MODE_FLAG_PVSYNC (1<<2) +#define DRM_MODE_FLAG_NVSYNC (1<<3) +#define DRM_MODE_FLAG_INTERLACE (1<<4) +#define DRM_MODE_FLAG_DBLSCAN (1<<5) +#define DRM_MODE_FLAG_CSYNC (1<<6) +#define DRM_MODE_FLAG_PCSYNC (1<<7) +#define DRM_MODE_FLAG_NCSYNC (1<<8) +#define DRM_MODE_FLAG_HSKEW (1<<9) /* hskew provided */ +#define DRM_MODE_FLAG_BCAST (1<<10) /* deprecated */ +#define DRM_MODE_FLAG_PIXMUX (1<<11) /* deprecated */ +#define DRM_MODE_FLAG_DBLCLK (1<<12) +#define DRM_MODE_FLAG_CLKDIV2 (1<<13) + /* + * When adding a new stereo mode don't forget to adjust DRM_MODE_FLAGS_3D_MAX + * (define not exposed to user space). 
+ */ +#define DRM_MODE_FLAG_3D_MASK (0x1f<<14) +#define DRM_MODE_FLAG_3D_NONE (0<<14) +#define DRM_MODE_FLAG_3D_FRAME_PACKING (1<<14) +#define DRM_MODE_FLAG_3D_FIELD_ALTERNATIVE (2<<14) +#define DRM_MODE_FLAG_3D_LINE_ALTERNATIVE (3<<14) +#define DRM_MODE_FLAG_3D_SIDE_BY_SIDE_FULL (4<<14) +#define DRM_MODE_FLAG_3D_L_DEPTH (5<<14) +#define DRM_MODE_FLAG_3D_L_DEPTH_GFX_GFX_DEPTH (6<<14) +#define DRM_MODE_FLAG_3D_TOP_AND_BOTTOM (7<<14) +#define DRM_MODE_FLAG_3D_SIDE_BY_SIDE_HALF (8<<14) + +/* Picture aspect ratio options */ +#define DRM_MODE_PICTURE_ASPECT_NONE 0 +#define DRM_MODE_PICTURE_ASPECT_4_3 1 +#define DRM_MODE_PICTURE_ASPECT_16_9 2 +#define DRM_MODE_PICTURE_ASPECT_64_27 3 +#define DRM_MODE_PICTURE_ASPECT_256_135 4 + +/* Content type options */ +#define DRM_MODE_CONTENT_TYPE_NO_DATA 0 +#define DRM_MODE_CONTENT_TYPE_GRAPHICS 1 +#define DRM_MODE_CONTENT_TYPE_PHOTO 2 +#define DRM_MODE_CONTENT_TYPE_CINEMA 3 +#define DRM_MODE_CONTENT_TYPE_GAME 4 + +/* Aspect ratio flag bitmask (4 bits 22:19) */ +#define DRM_MODE_FLAG_PIC_AR_MASK (0x0F<<19) +#define DRM_MODE_FLAG_PIC_AR_NONE \ + (DRM_MODE_PICTURE_ASPECT_NONE<<19) +#define DRM_MODE_FLAG_PIC_AR_4_3 \ + (DRM_MODE_PICTURE_ASPECT_4_3<<19) +#define DRM_MODE_FLAG_PIC_AR_16_9 \ + (DRM_MODE_PICTURE_ASPECT_16_9<<19) +#define DRM_MODE_FLAG_PIC_AR_64_27 \ + (DRM_MODE_PICTURE_ASPECT_64_27<<19) +#define DRM_MODE_FLAG_PIC_AR_256_135 \ + (DRM_MODE_PICTURE_ASPECT_256_135<<19) + +#define DRM_MODE_FLAG_ALL (DRM_MODE_FLAG_PHSYNC | \ + DRM_MODE_FLAG_NHSYNC | \ + DRM_MODE_FLAG_PVSYNC | \ + DRM_MODE_FLAG_NVSYNC | \ + DRM_MODE_FLAG_INTERLACE | \ + DRM_MODE_FLAG_DBLSCAN | \ + DRM_MODE_FLAG_CSYNC | \ + DRM_MODE_FLAG_PCSYNC | \ + DRM_MODE_FLAG_NCSYNC | \ + DRM_MODE_FLAG_HSKEW | \ + DRM_MODE_FLAG_DBLCLK | \ + DRM_MODE_FLAG_CLKDIV2 | \ + DRM_MODE_FLAG_3D_MASK) + +/* DPMS flags */ +/* bit compatible with the xorg definitions. 
*/ +#define DRM_MODE_DPMS_ON 0 +#define DRM_MODE_DPMS_STANDBY 1 +#define DRM_MODE_DPMS_SUSPEND 2 +#define DRM_MODE_DPMS_OFF 3 + +/* Scaling mode options */ +#define DRM_MODE_SCALE_NONE 0 /* Unmodified timing (display or + software can still scale) */ +#define DRM_MODE_SCALE_FULLSCREEN 1 /* Full screen, ignore aspect */ +#define DRM_MODE_SCALE_CENTER 2 /* Centered, no scaling */ +#define DRM_MODE_SCALE_ASPECT 3 /* Full screen, preserve aspect */ + +/* Dithering mode options */ +#define DRM_MODE_DITHERING_OFF 0 +#define DRM_MODE_DITHERING_ON 1 +#define DRM_MODE_DITHERING_AUTO 2 + +/* Dirty info options */ +#define DRM_MODE_DIRTY_OFF 0 +#define DRM_MODE_DIRTY_ON 1 +#define DRM_MODE_DIRTY_ANNOTATE 2 + +/* Link Status options */ +#define DRM_MODE_LINK_STATUS_GOOD 0 +#define DRM_MODE_LINK_STATUS_BAD 1 + +/* + * DRM_MODE_ROTATE_* + * + * Signals that a drm plane has been rotated by the given number of degrees in + * counter clockwise direction. + * + * This define is provided as a convenience, looking up the property id + * using the name->prop id lookup is the preferred method. + */ +#define DRM_MODE_ROTATE_0 (1<<0) +#define DRM_MODE_ROTATE_90 (1<<1) +#define DRM_MODE_ROTATE_180 (1<<2) +#define DRM_MODE_ROTATE_270 (1<<3) + +/* + * DRM_MODE_ROTATE_MASK + * + * Bitmask used to look for drm plane rotations. + */ +#define DRM_MODE_ROTATE_MASK (\ + DRM_MODE_ROTATE_0 | \ + DRM_MODE_ROTATE_90 | \ + DRM_MODE_ROTATE_180 | \ + DRM_MODE_ROTATE_270) + +/* + * DRM_MODE_REFLECT_* + * + * Signals that the contents of a drm plane are reflected along the given axis, + * in the same way as mirroring. + * See kerneldoc chapter "Plane Composition Properties" for more details. + * + * This define is provided as a convenience, looking up the property id + * using the name->prop id lookup is the preferred method. + */ +#define DRM_MODE_REFLECT_X (1<<4) +#define DRM_MODE_REFLECT_Y (1<<5) + +/* + * DRM_MODE_REFLECT_MASK + * + * Bitmask used to look for drm plane reflections.
+ */ +#define DRM_MODE_REFLECT_MASK (\ + DRM_MODE_REFLECT_X | \ + DRM_MODE_REFLECT_Y) + +/* Content Protection Flags */ +#define DRM_MODE_CONTENT_PROTECTION_UNDESIRED 0 +#define DRM_MODE_CONTENT_PROTECTION_DESIRED 1 +#define DRM_MODE_CONTENT_PROTECTION_ENABLED 2 + +/** + * struct drm_mode_modeinfo - Display mode information. + * @clock: pixel clock in kHz + * @hdisplay: horizontal display size + * @hsync_start: horizontal sync start + * @hsync_end: horizontal sync end + * @htotal: horizontal total size + * @hskew: horizontal skew + * @vdisplay: vertical display size + * @vsync_start: vertical sync start + * @vsync_end: vertical sync end + * @vtotal: vertical total size + * @vscan: vertical scan + * @vrefresh: approximate vertical refresh rate in Hz + * @flags: bitmask of misc. flags, see DRM_MODE_FLAG_* defines + * @type: bitmask of type flags, see DRM_MODE_TYPE_* defines + * @name: string describing the mode resolution + * + * This is the user-space API display mode information structure. For the + * kernel version see struct drm_display_mode. 
+ */ +struct drm_mode_modeinfo { + __u32 clock; + __u16 hdisplay; + __u16 hsync_start; + __u16 hsync_end; + __u16 htotal; + __u16 hskew; + __u16 vdisplay; + __u16 vsync_start; + __u16 vsync_end; + __u16 vtotal; + __u16 vscan; + + __u32 vrefresh; + + __u32 flags; + __u32 type; + char name[DRM_DISPLAY_MODE_LEN]; +}; + +struct drm_mode_card_res { + __u64 fb_id_ptr; + __u64 crtc_id_ptr; + __u64 connector_id_ptr; + __u64 encoder_id_ptr; + __u32 count_fbs; + __u32 count_crtcs; + __u32 count_connectors; + __u32 count_encoders; + __u32 min_width; + __u32 max_width; + __u32 min_height; + __u32 max_height; +}; + +struct drm_mode_crtc { + __u64 set_connectors_ptr; + __u32 count_connectors; + + __u32 crtc_id; /**< Id */ + __u32 fb_id; /**< Id of framebuffer */ + + __u32 x; /**< x Position on the framebuffer */ + __u32 y; /**< y Position on the framebuffer */ + + __u32 gamma_size; + __u32 mode_valid; + struct drm_mode_modeinfo mode; +}; + +#define DRM_MODE_PRESENT_TOP_FIELD (1<<0) +#define DRM_MODE_PRESENT_BOTTOM_FIELD (1<<1) + +/* Planes blend with or override other bits on the CRTC */ +struct drm_mode_set_plane { + __u32 plane_id; + __u32 crtc_id; + __u32 fb_id; /* fb object contains surface format type */ + __u32 flags; /* see above flags */ + + /* Signed dest location allows it to be partially off screen */ + __s32 crtc_x; + __s32 crtc_y; + __u32 crtc_w; + __u32 crtc_h; + + /* Source values are 16.16 fixed point */ + __u32 src_x; + __u32 src_y; + __u32 src_h; + __u32 src_w; +}; + +/** + * struct drm_mode_get_plane - Get plane metadata. + * + * Userspace can perform a GETPLANE ioctl to retrieve information about a + * plane. + * + * To retrieve the number of formats supported, set @count_format_types to zero + * and call the ioctl. @count_format_types will be updated with the value. + * + * To retrieve these formats, allocate an array with the memory needed to store + * @count_format_types formats. 
Point @format_type_ptr to this array and call + * the ioctl again (with @count_format_types still set to the value returned in + * the first ioctl call). + */ +struct drm_mode_get_plane { + /** + * @plane_id: Object ID of the plane whose information should be + * retrieved. Set by caller. + */ + __u32 plane_id; + + /** @crtc_id: Object ID of the current CRTC. */ + __u32 crtc_id; + /** @fb_id: Object ID of the current fb. */ + __u32 fb_id; + + /** + * @possible_crtcs: Bitmask of CRTC's compatible with the plane. CRTC's + * are created and they receive an index, which corresponds to their + * position in the bitmask. Bit N corresponds to + * :ref:`CRTC index` N. + */ + __u32 possible_crtcs; + /** @gamma_size: Never used. */ + __u32 gamma_size; + + /** @count_format_types: Number of formats. */ + __u32 count_format_types; + /** + * @format_type_ptr: Pointer to ``__u32`` array of formats that are + * supported by the plane. These formats do not require modifiers. + */ + __u64 format_type_ptr; +}; + +struct drm_mode_get_plane_res { + __u64 plane_id_ptr; + __u32 count_planes; +}; + +#define DRM_MODE_ENCODER_NONE 0 +#define DRM_MODE_ENCODER_DAC 1 +#define DRM_MODE_ENCODER_TMDS 2 +#define DRM_MODE_ENCODER_LVDS 3 +#define DRM_MODE_ENCODER_TVDAC 4 +#define DRM_MODE_ENCODER_VIRTUAL 5 +#define DRM_MODE_ENCODER_DSI 6 +#define DRM_MODE_ENCODER_DPMST 7 +#define DRM_MODE_ENCODER_DPI 8 + +struct drm_mode_get_encoder { + __u32 encoder_id; + __u32 encoder_type; + + __u32 crtc_id; /**< Id of crtc */ + + __u32 possible_crtcs; + __u32 possible_clones; +}; + +/* This is for connectors with multiple signal types. */ +/* Try to match DRM_MODE_CONNECTOR_X as closely as possible. 
*/ +enum drm_mode_subconnector { + DRM_MODE_SUBCONNECTOR_Automatic = 0, /* DVI-I, TV */ + DRM_MODE_SUBCONNECTOR_Unknown = 0, /* DVI-I, TV, DP */ + DRM_MODE_SUBCONNECTOR_VGA = 1, /* DP */ + DRM_MODE_SUBCONNECTOR_DVID = 3, /* DVI-I DP */ + DRM_MODE_SUBCONNECTOR_DVIA = 4, /* DVI-I */ + DRM_MODE_SUBCONNECTOR_Composite = 5, /* TV */ + DRM_MODE_SUBCONNECTOR_SVIDEO = 6, /* TV */ + DRM_MODE_SUBCONNECTOR_Component = 8, /* TV */ + DRM_MODE_SUBCONNECTOR_SCART = 9, /* TV */ + DRM_MODE_SUBCONNECTOR_DisplayPort = 10, /* DP */ + DRM_MODE_SUBCONNECTOR_HDMIA = 11, /* DP */ + DRM_MODE_SUBCONNECTOR_Native = 15, /* DP */ + DRM_MODE_SUBCONNECTOR_Wireless = 18, /* DP */ +}; + +#define DRM_MODE_CONNECTOR_Unknown 0 +#define DRM_MODE_CONNECTOR_VGA 1 +#define DRM_MODE_CONNECTOR_DVII 2 +#define DRM_MODE_CONNECTOR_DVID 3 +#define DRM_MODE_CONNECTOR_DVIA 4 +#define DRM_MODE_CONNECTOR_Composite 5 +#define DRM_MODE_CONNECTOR_SVIDEO 6 +#define DRM_MODE_CONNECTOR_LVDS 7 +#define DRM_MODE_CONNECTOR_Component 8 +#define DRM_MODE_CONNECTOR_9PinDIN 9 +#define DRM_MODE_CONNECTOR_DisplayPort 10 +#define DRM_MODE_CONNECTOR_HDMIA 11 +#define DRM_MODE_CONNECTOR_HDMIB 12 +#define DRM_MODE_CONNECTOR_TV 13 +#define DRM_MODE_CONNECTOR_eDP 14 +#define DRM_MODE_CONNECTOR_VIRTUAL 15 +#define DRM_MODE_CONNECTOR_DSI 16 +#define DRM_MODE_CONNECTOR_DPI 17 +#define DRM_MODE_CONNECTOR_WRITEBACK 18 +#define DRM_MODE_CONNECTOR_SPI 19 +#define DRM_MODE_CONNECTOR_USB 20 + +/** + * struct drm_mode_get_connector - Get connector metadata. + * + * User-space can perform a GETCONNECTOR ioctl to retrieve information about a + * connector. User-space is expected to retrieve encoders, modes and properties + * by performing this ioctl at least twice: the first time to retrieve the + * number of elements, the second time to retrieve the elements themselves. 
+ * + * To retrieve the number of elements, set @count_props and @count_encoders to + * zero, set @count_modes to 1, and set @modes_ptr to a temporary struct + * drm_mode_modeinfo element. + * + * To retrieve the elements, allocate arrays for @encoders_ptr, @modes_ptr, + * @props_ptr and @prop_values_ptr, then set @count_modes, @count_props and + * @count_encoders to their capacity. + * + * Performing the ioctl only twice may be racy: the number of elements may have + * changed with a hotplug event in-between the two ioctls. User-space is + * expected to retry the last ioctl until the number of elements stabilizes. + * The kernel won't fill any array which doesn't have the expected length. + * + * **Force-probing a connector** + * + * If the @count_modes field is set to zero and the DRM client is the current + * DRM master, the kernel will perform a forced probe on the connector to + * refresh the connector status, modes and EDID. A forced-probe can be slow, + * might cause flickering and the ioctl will block. + * + * User-space needs to force-probe connectors to ensure their metadata is + * up-to-date at startup and after receiving a hot-plug event. User-space + * may perform a forced-probe when the user explicitly requests it. User-space + * shouldn't perform a forced-probe in other situations. + */ +struct drm_mode_get_connector { + /** @encoders_ptr: Pointer to ``__u32`` array of object IDs. */ + __u64 encoders_ptr; + /** @modes_ptr: Pointer to struct drm_mode_modeinfo array. */ + __u64 modes_ptr; + /** @props_ptr: Pointer to ``__u32`` array of property IDs. */ + __u64 props_ptr; + /** @prop_values_ptr: Pointer to ``__u64`` array of property values. */ + __u64 prop_values_ptr; + + /** @count_modes: Number of modes. */ + __u32 count_modes; + /** @count_props: Number of properties. */ + __u32 count_props; + /** @count_encoders: Number of encoders. */ + __u32 count_encoders; + + /** @encoder_id: Object ID of the current encoder. 
*/ + __u32 encoder_id; + /** @connector_id: Object ID of the connector. */ + __u32 connector_id; + /** + * @connector_type: Type of the connector. + * + * See DRM_MODE_CONNECTOR_* defines. + */ + __u32 connector_type; + /** + * @connector_type_id: Type-specific connector number. + * + * This is not an object ID. This is a per-type connector number. Each + * (type, type_id) combination is unique across all connectors of a DRM + * device. + * + * The (type, type_id) combination is not a stable identifier: the + * type_id can change depending on the driver probe order. + */ + __u32 connector_type_id; + + /** + * @connection: Status of the connector. + * + * See enum drm_connector_status. + */ + __u32 connection; + /** @mm_width: Width of the connected sink in millimeters. */ + __u32 mm_width; + /** @mm_height: Height of the connected sink in millimeters. */ + __u32 mm_height; + /** + * @subpixel: Subpixel order of the connected sink. + * + * See enum subpixel_order. + */ + __u32 subpixel; + + /** @pad: Padding, must be zero. */ + __u32 pad; +}; + +#define DRM_MODE_PROP_PENDING (1<<0) /* deprecated, do not use */ +#define DRM_MODE_PROP_RANGE (1<<1) +#define DRM_MODE_PROP_IMMUTABLE (1<<2) +#define DRM_MODE_PROP_ENUM (1<<3) /* enumerated type with text strings */ +#define DRM_MODE_PROP_BLOB (1<<4) +#define DRM_MODE_PROP_BITMASK (1<<5) /* bitmask of enumerated types */ + +/* non-extended types: legacy bitmask, one bit per type: */ +#define DRM_MODE_PROP_LEGACY_TYPE ( \ + DRM_MODE_PROP_RANGE | \ + DRM_MODE_PROP_ENUM | \ + DRM_MODE_PROP_BLOB | \ + DRM_MODE_PROP_BITMASK) + +/* extended-types: rather than continue to consume a bit per type, + * grab a chunk of the bits to use as integer type id. 
+ */ +#define DRM_MODE_PROP_EXTENDED_TYPE 0x0000ffc0 +#define DRM_MODE_PROP_TYPE(n) ((n) << 6) +#define DRM_MODE_PROP_OBJECT DRM_MODE_PROP_TYPE(1) +#define DRM_MODE_PROP_SIGNED_RANGE DRM_MODE_PROP_TYPE(2) + +/* the PROP_ATOMIC flag is used to hide properties from userspace that + * is not aware of atomic properties. This is mostly to work around + * older userspace (DDX drivers) that read/write each prop they find, + * without being aware that this could be triggering a lengthy modeset. + */ +#define DRM_MODE_PROP_ATOMIC 0x80000000 + +/** + * struct drm_mode_property_enum - Description for an enum/bitfield entry. + * @value: numeric value for this enum entry. + * @name: symbolic name for this enum entry. + * + * See struct drm_property_enum for details. + */ +struct drm_mode_property_enum { + __u64 value; + char name[DRM_PROP_NAME_LEN]; +}; + +/** + * struct drm_mode_get_property - Get property metadata. + * + * User-space can perform a GETPROPERTY ioctl to retrieve information about a + * property. The same property may be attached to multiple objects, see + * "Modeset Base Object Abstraction". + * + * The meaning of the @values_ptr field changes depending on the property type. + * See &drm_property.flags for more details. + * + * The @enum_blob_ptr and @count_enum_blobs fields are only meaningful when the + * property has the type &DRM_MODE_PROP_ENUM or &DRM_MODE_PROP_BITMASK. For + * backwards compatibility, the kernel will always set @count_enum_blobs to + * zero when the property has the type &DRM_MODE_PROP_BLOB. User-space must + * ignore these two fields if the property has a different type. + * + * User-space is expected to retrieve values and enums by performing this ioctl + * at least twice: the first time to retrieve the number of elements, the + * second time to retrieve the elements themselves. + * + * To retrieve the number of elements, set @count_values and @count_enum_blobs + * to zero, then call the ioctl. 
@count_values will be updated with the number + * of elements. If the property has the type &DRM_MODE_PROP_ENUM or + * &DRM_MODE_PROP_BITMASK, @count_enum_blobs will be updated as well. + * + * To retrieve the elements themselves, allocate an array for @values_ptr and + * set @count_values to its capacity. If the property has the type + * &DRM_MODE_PROP_ENUM or &DRM_MODE_PROP_BITMASK, allocate an array for + * @enum_blob_ptr and set @count_enum_blobs to its capacity. Calling the ioctl + * again will fill the arrays. + */ +struct drm_mode_get_property { + /** @values_ptr: Pointer to a ``__u64`` array. */ + __u64 values_ptr; + /** @enum_blob_ptr: Pointer to a struct drm_mode_property_enum array. */ + __u64 enum_blob_ptr; + + /** + * @prop_id: Object ID of the property which should be retrieved. Set + * by the caller. + */ + __u32 prop_id; + /** + * @flags: ``DRM_MODE_PROP_*`` bitfield. See &drm_property.flags for + * a definition of the flags. + */ + __u32 flags; + /** + * @name: Symbolic property name. User-space should use this field to + * recognize properties. + */ + char name[DRM_PROP_NAME_LEN]; + + /** @count_values: Number of elements in @values_ptr. */ + __u32 count_values; + /** @count_enum_blobs: Number of elements in @enum_blob_ptr. 
*/ + __u32 count_enum_blobs; +}; + +struct drm_mode_connector_set_property { + __u64 value; + __u32 prop_id; + __u32 connector_id; +}; + +#define DRM_MODE_OBJECT_CRTC 0xcccccccc +#define DRM_MODE_OBJECT_CONNECTOR 0xc0c0c0c0 +#define DRM_MODE_OBJECT_ENCODER 0xe0e0e0e0 +#define DRM_MODE_OBJECT_MODE 0xdededede +#define DRM_MODE_OBJECT_PROPERTY 0xb0b0b0b0 +#define DRM_MODE_OBJECT_FB 0xfbfbfbfb +#define DRM_MODE_OBJECT_BLOB 0xbbbbbbbb +#define DRM_MODE_OBJECT_PLANE 0xeeeeeeee +#define DRM_MODE_OBJECT_ANY 0 + +struct drm_mode_obj_get_properties { + __u64 props_ptr; + __u64 prop_values_ptr; + __u32 count_props; + __u32 obj_id; + __u32 obj_type; +}; + +struct drm_mode_obj_set_property { + __u64 value; + __u32 prop_id; + __u32 obj_id; + __u32 obj_type; +}; + +struct drm_mode_get_blob { + __u32 blob_id; + __u32 length; + __u64 data; +}; + +struct drm_mode_fb_cmd { + __u32 fb_id; + __u32 width; + __u32 height; + __u32 pitch; + __u32 bpp; + __u32 depth; + /* driver specific handle */ + __u32 handle; +}; + +#define DRM_MODE_FB_INTERLACED (1<<0) /* for interlaced framebuffers */ +#define DRM_MODE_FB_MODIFIERS (1<<1) /* enables ->modifier[] */ + +/** + * struct drm_mode_fb_cmd2 - Frame-buffer metadata. + * + * This struct holds frame-buffer metadata. There are two ways to use it: + * + * - User-space can fill this struct and perform a &DRM_IOCTL_MODE_ADDFB2 + * ioctl to register a new frame-buffer. The new frame-buffer object ID will + * be set by the kernel in @fb_id. + * - User-space can set @fb_id and perform a &DRM_IOCTL_MODE_GETFB2 ioctl to + * fetch metadata about an existing frame-buffer. + * + * In case of planar formats, this struct allows up to 4 buffer objects with + * offsets and pitches per plane. The pitch and offset order are dictated by + * the format FourCC as defined by ``drm_fourcc.h``, e.g. 
NV12 is described as: + * + * YUV 4:2:0 image with a plane of 8-bit Y samples followed by an + * interleaved U/V plane containing 8-bit 2x2 subsampled colour difference + * samples. + * + * So it would consist of a Y plane at ``offsets[0]`` and a UV plane at + * ``offsets[1]``. + * + * To accommodate tiled, compressed, etc formats, a modifier can be specified. + * For more information see the "Format Modifiers" section. Note that even + * though it looks like we have a modifier per-plane, we in fact do not. The + * modifier for each plane must be identical. Thus all combinations of + * different data layouts for multi-plane formats must be enumerated as + * separate modifiers. + * + * All of the entries in @handles, @pitches, @offsets and @modifier must be + * zero when unused. Warning, for @offsets and @modifier zero can't be used to + * figure out whether the entry is used or not since it's a valid value (a zero + * offset is common, and a zero modifier is &DRM_FORMAT_MOD_LINEAR). + */ +struct drm_mode_fb_cmd2 { + /** @fb_id: Object ID of the frame-buffer. */ + __u32 fb_id; + /** @width: Width of the frame-buffer. */ + __u32 width; + /** @height: Height of the frame-buffer. */ + __u32 height; + /** + * @pixel_format: FourCC format code, see ``DRM_FORMAT_*`` constants in + * ``drm_fourcc.h``. + */ + __u32 pixel_format; + /** + * @flags: Frame-buffer flags (see &DRM_MODE_FB_INTERLACED and + * &DRM_MODE_FB_MODIFIERS). + */ + __u32 flags; + + /** + * @handles: GEM buffer handle, one per plane. Set to 0 if the plane is + * unused. The same handle can be used for multiple planes. + */ + __u32 handles[4]; + /** @pitches: Pitch (aka. stride) in bytes, one per plane. */ + __u32 pitches[4]; + /** @offsets: Offset into the buffer in bytes, one per plane. */ + __u32 offsets[4]; + /** + * @modifier: Format modifier, one per plane. See ``DRM_FORMAT_MOD_*`` + * constants in ``drm_fourcc.h``. All planes must use the same + * modifier. 
Ignored unless &DRM_MODE_FB_MODIFIERS is set in @flags. + */ + __u64 modifier[4]; +}; + +#define DRM_MODE_FB_DIRTY_ANNOTATE_COPY 0x01 +#define DRM_MODE_FB_DIRTY_ANNOTATE_FILL 0x02 +#define DRM_MODE_FB_DIRTY_FLAGS 0x03 + +#define DRM_MODE_FB_DIRTY_MAX_CLIPS 256 + +/* + * Mark a region of a framebuffer as dirty. + * + * Some hardware does not automatically update display contents + * when hardware or software draws to a framebuffer. This ioctl + * allows userspace to tell the kernel and the hardware what + * regions of the framebuffer have changed. + * + * The kernel or hardware is free to update more than just the + * region specified by the clip rects. The kernel or hardware + * may also delay and/or coalesce several calls to dirty into a + * single update. + * + * Userspace may annotate the updates, the annotations are a + * promise made by the caller that the change is either a copy + * of pixels or a fill of a single color in the region specified. + * + * If the DRM_MODE_FB_DIRTY_ANNOTATE_COPY flag is given then + * the number of updated regions is half of num_clips given, + * where the clip rects are paired in src and dst. The width and + * height of each one of the pairs must match. + * + * If the DRM_MODE_FB_DIRTY_ANNOTATE_FILL flag is given the caller + * promises that the region specified by the clip rects is filled + * completely with a single color as given in the color argument. + */ + +struct drm_mode_fb_dirty_cmd { + __u32 fb_id; + __u32 flags; + __u32 color; + __u32 num_clips; + __u64 clips_ptr; +}; + +struct drm_mode_mode_cmd { + __u32 connector_id; + struct drm_mode_modeinfo mode; +}; + +#define DRM_MODE_CURSOR_BO 0x01 +#define DRM_MODE_CURSOR_MOVE 0x02 +#define DRM_MODE_CURSOR_FLAGS 0x03 + +/* + * depending on the value in flags different members are used.
+ * + * CURSOR_BO uses + * crtc_id + * width + * height + * handle - if 0 turns the cursor off + * + * CURSOR_MOVE uses + * crtc_id + * x + * y + */ +struct drm_mode_cursor { + __u32 flags; + __u32 crtc_id; + __s32 x; + __s32 y; + __u32 width; + __u32 height; + /* driver specific handle */ + __u32 handle; +}; + +struct drm_mode_cursor2 { + __u32 flags; + __u32 crtc_id; + __s32 x; + __s32 y; + __u32 width; + __u32 height; + /* driver specific handle */ + __u32 handle; + __s32 hot_x; + __s32 hot_y; +}; + +struct drm_mode_crtc_lut { + __u32 crtc_id; + __u32 gamma_size; + + /* pointers to arrays */ + __u64 red; + __u64 green; + __u64 blue; +}; + +struct drm_color_ctm { + /* + * Conversion matrix in S31.32 sign-magnitude + * (not two's complement!) format. + * + * out matrix in + * |R| |0 1 2| |R| + * |G| = |3 4 5| x |G| + * |B| |6 7 8| |B| + */ + __u64 matrix[9]; +}; + +struct drm_color_lut { + /* + * Values are mapped linearly to 0.0 - 1.0 range, with 0x0 == 0.0 and + * 0xffff == 1.0. + */ + __u16 red; + __u16 green; + __u16 blue; + __u16 reserved; +}; + +/** + * struct drm_plane_size_hint - Plane size hints + * @width: The width of the plane in pixel + * @height: The height of the plane in pixel + * + * The plane SIZE_HINTS property blob contains an + * array of struct drm_plane_size_hint. + */ +struct drm_plane_size_hint { + __u16 width; + __u16 height; +}; + +/** + * struct hdr_metadata_infoframe - HDR Metadata Infoframe Data. + * + * HDR Metadata Infoframe as per CTA 861.G spec. This is expected + * to match exactly with the spec. + * + * Userspace is expected to pass the metadata information as per + * the format described in this structure. + */ +struct hdr_metadata_infoframe { + /** + * @eotf: Electro-Optical Transfer Function (EOTF) + * used in the stream. + */ + __u8 eotf; + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ + __u8 metadata_type; + /** + * @display_primaries: Color Primaries of the Data. 
+ * These are coded as unsigned 16-bit values in units of + * 0.00002, where 0x0000 represents zero and 0xC350 + * represents 1.0000. + * @display_primaries.x: X coordinate of color primary. + * @display_primaries.y: Y coordinate of color primary. + */ + struct { + __u16 x, y; + } display_primaries[3]; + /** + * @white_point: White Point of Colorspace Data. + * These are coded as unsigned 16-bit values in units of + * 0.00002, where 0x0000 represents zero and 0xC350 + * represents 1.0000. + * @white_point.x: X coordinate of whitepoint of color primary. + * @white_point.y: Y coordinate of whitepoint of color primary. + */ + struct { + __u16 x, y; + } white_point; + /** + * @max_display_mastering_luminance: Max Mastering Display Luminance. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_display_mastering_luminance; + /** + * @min_display_mastering_luminance: Min Mastering Display Luminance. + * This value is coded as an unsigned 16-bit value in units of + * 0.0001 cd/m2, where 0x0001 represents 0.0001 cd/m2 and 0xFFFF + * represents 6.5535 cd/m2. + */ + __u16 min_display_mastering_luminance; + /** + * @max_cll: Max Content Light Level. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_cll; + /** + * @max_fall: Max Frame Average Light Level. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_fall; +}; + +/** + * struct hdr_output_metadata - HDR output metadata + * + * Metadata Information to be passed from userspace + */ +struct hdr_output_metadata { + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ + __u32 metadata_type; + /** + * @hdmi_metadata_type1: HDR Metadata Infoframe. 
+ */ + union { + struct hdr_metadata_infoframe hdmi_metadata_type1; + }; +}; + +/** + * DRM_MODE_PAGE_FLIP_EVENT + * + * Request that the kernel sends back a vblank event (see + * struct drm_event_vblank) with the &DRM_EVENT_FLIP_COMPLETE type when the + * page-flip is done. + */ +#define DRM_MODE_PAGE_FLIP_EVENT 0x01 +/** + * DRM_MODE_PAGE_FLIP_ASYNC + * + * Request that the page-flip is performed as soon as possible, ie. with no + * delay due to waiting for vblank. This may cause tearing to be visible on + * the screen. + * + * When used with atomic uAPI, the driver will return an error if the hardware + * doesn't support performing an asynchronous page-flip for this update. + * User-space should handle this, e.g. by falling back to a regular page-flip. + * + * Note, some hardware might need to perform one last synchronous page-flip + * before being able to switch to asynchronous page-flips. As an exception, + * the driver will return success even though that first page-flip is not + * asynchronous. + */ +#define DRM_MODE_PAGE_FLIP_ASYNC 0x02 +#define DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE 0x4 +#define DRM_MODE_PAGE_FLIP_TARGET_RELATIVE 0x8 +#define DRM_MODE_PAGE_FLIP_TARGET (DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE | \ + DRM_MODE_PAGE_FLIP_TARGET_RELATIVE) +/** + * DRM_MODE_PAGE_FLIP_FLAGS + * + * Bitmask of flags suitable for &drm_mode_crtc_page_flip_target.flags. + */ +#define DRM_MODE_PAGE_FLIP_FLAGS (DRM_MODE_PAGE_FLIP_EVENT | \ + DRM_MODE_PAGE_FLIP_ASYNC | \ + DRM_MODE_PAGE_FLIP_TARGET) + +/* + * Request a page flip on the specified crtc. + * + * This ioctl will ask KMS to schedule a page flip for the specified + * crtc. Once any pending rendering targeting the specified fb (as of + * ioctl time) has completed, the crtc will be reprogrammed to display + * that fb after the next vertical refresh. The ioctl returns + * immediately, but subsequent rendering to the current fb will block + * in the execbuffer ioctl until the page flip happens. 
If a page + * flip is already pending as the ioctl is called, EBUSY will be + * returned. + * + * Flag DRM_MODE_PAGE_FLIP_EVENT requests that drm sends back a vblank + * event (see drm.h: struct drm_event_vblank) when the page flip is + * done. The user_data field passed in with this ioctl will be + * returned as the user_data field in the vblank event struct. + * + * Flag DRM_MODE_PAGE_FLIP_ASYNC requests that the flip happen + * 'as soon as possible', meaning that it will not delay waiting for vblank. + * This may cause tearing on the screen. + * + * The reserved field must be zero. + */ + +struct drm_mode_crtc_page_flip { + __u32 crtc_id; + __u32 fb_id; + __u32 flags; + __u32 reserved; + __u64 user_data; +}; + +/* + * Request a page flip on the specified crtc. + * + * Same as struct drm_mode_crtc_page_flip, but supports new flags and + * re-purposes the reserved field: + * + * The sequence field must be zero unless either of the + * DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags is specified. When + * the ABSOLUTE flag is specified, the sequence field denotes the absolute + * vblank sequence when the flip should take effect. When the RELATIVE + * flag is specified, the sequence field denotes the relative (to the + * current one when the ioctl is called) vblank sequence when the flip + * should take effect. NOTE: DRM_IOCTL_WAIT_VBLANK must still be used to + * make sure the vblank sequence before the target one has passed before + * calling this ioctl. The purpose of the + * DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags is merely to clarify + * the target for when code dealing with a page flip runs during a + * vertical blank period. + */ + +struct drm_mode_crtc_page_flip_target { + __u32 crtc_id; + __u32 fb_id; + __u32 flags; + __u32 sequence; + __u64 user_data; +}; + +/** + * struct drm_mode_create_dumb - Create a KMS dumb buffer for scanout.
+ * @height: buffer height in pixels + * @width: buffer width in pixels + * @bpp: bits per pixel + * @flags: must be zero + * @handle: buffer object handle + * @pitch: number of bytes between two consecutive lines + * @size: size of the whole buffer in bytes + * + * User-space fills @height, @width, @bpp and @flags. If the IOCTL succeeds, + * the kernel fills @handle, @pitch and @size. + */ +struct drm_mode_create_dumb { + __u32 height; + __u32 width; + __u32 bpp; + __u32 flags; + + __u32 handle; + __u32 pitch; + __u64 size; +}; + +/* set up for mmap of a dumb scanout buffer */ +struct drm_mode_map_dumb { + /** Handle for the object being mapped. */ + __u32 handle; + __u32 pad; + /** + * Fake offset to use for subsequent mmap call + * + * This is a fixed-size type for 32/64 compatibility. + */ + __u64 offset; +}; + +struct drm_mode_destroy_dumb { + __u32 handle; +}; + +/** + * DRM_MODE_ATOMIC_TEST_ONLY + * + * Do not apply the atomic commit, instead check whether the hardware supports + * this configuration. + * + * See &drm_mode_config_funcs.atomic_check for more details on test-only + * commits. + */ +#define DRM_MODE_ATOMIC_TEST_ONLY 0x0100 +/** + * DRM_MODE_ATOMIC_NONBLOCK + * + * Do not block while applying the atomic commit. The &DRM_IOCTL_MODE_ATOMIC + * IOCTL returns immediately instead of waiting for the changes to be applied + * in hardware. Note, the driver will still check that the update can be + * applied before returning. + */ +#define DRM_MODE_ATOMIC_NONBLOCK 0x0200 +/** + * DRM_MODE_ATOMIC_ALLOW_MODESET + * + * Allow the update to result in temporary or transient visible artifacts while + * the update is being applied. Applying the update may also take significantly + * more time than a page flip. All visual artifacts will disappear by the time + * the update is completed, as signalled through the vblank event's timestamp + * (see struct drm_event_vblank). + * + * This flag must be set when the KMS update might cause visible artifacts.
+ * Without this flag such KMS update will return a EINVAL error. What kind of + * update may cause visible artifacts depends on the driver and the hardware. + * User-space that needs to know beforehand if an update might cause visible + * artifacts can use &DRM_MODE_ATOMIC_TEST_ONLY without + * &DRM_MODE_ATOMIC_ALLOW_MODESET to see if it fails. + * + * To the best of the driver's knowledge, visual artifacts are guaranteed to + * not appear when this flag is not set. Some sinks might display visual + * artifacts outside of the driver's control. + */ +#define DRM_MODE_ATOMIC_ALLOW_MODESET 0x0400 + +/** + * DRM_MODE_ATOMIC_FLAGS + * + * Bitfield of flags accepted by the &DRM_IOCTL_MODE_ATOMIC IOCTL in + * &drm_mode_atomic.flags. + */ +#define DRM_MODE_ATOMIC_FLAGS (\ + DRM_MODE_PAGE_FLIP_EVENT |\ + DRM_MODE_PAGE_FLIP_ASYNC |\ + DRM_MODE_ATOMIC_TEST_ONLY |\ + DRM_MODE_ATOMIC_NONBLOCK |\ + DRM_MODE_ATOMIC_ALLOW_MODESET) + +struct drm_mode_atomic { + __u32 flags; + __u32 count_objs; + __u64 objs_ptr; + __u64 count_props_ptr; + __u64 props_ptr; + __u64 prop_values_ptr; + __u64 reserved; + __u64 user_data; +}; + +struct drm_format_modifier_blob { +#define FORMAT_BLOB_CURRENT 1 + /* Version of this blob format */ + __u32 version; + + /* Flags */ + __u32 flags; + + /* Number of fourcc formats supported */ + __u32 count_formats; + + /* Where in this blob the formats exist (in bytes) */ + __u32 formats_offset; + + /* Number of drm_format_modifiers */ + __u32 count_modifiers; + + /* Where in this blob the modifiers exist (in bytes) */ + __u32 modifiers_offset; + + /* __u32 formats[] */ + /* struct drm_format_modifier modifiers[] */ +}; + +struct drm_format_modifier { + /* Bitmask of formats in get_plane format list this info applies to. The + * offset allows a sliding window of which 64 formats (bits). 
+ * + * Some examples: + * In today's world with < 65 formats, and formats 0, and 2 are + * supported + * 0x0000000000000005 + * ^-offset = 0, formats = 5 + * + * If the number formats grew to 128, and formats 98-102 are + * supported with the modifier: + * + * 0x0000007c00000000 0000000000000000 + * ^ + * |__offset = 64, formats = 0x7c00000000 + * + */ + __u64 formats; + __u32 offset; + __u32 pad; + + /* The modifier that applies to the >get_plane format list bitmask. */ + __u64 modifier; +}; + +/** + * struct drm_mode_create_blob - Create New blob property + * + * Create a new 'blob' data property, copying length bytes from data pointer, + * and returning new blob ID. + */ +struct drm_mode_create_blob { + /** @data: Pointer to data to copy. */ + __u64 data; + /** @length: Length of data to copy. */ + __u32 length; + /** @blob_id: Return: new property ID. */ + __u32 blob_id; +}; + +/** + * struct drm_mode_destroy_blob - Destroy user blob + * @blob_id: blob_id to destroy + * + * Destroy a user-created blob property. + * + * User-space can release blobs as soon as they do not need to refer to them by + * their blob object ID. For instance, if you are using a MODE_ID blob in an + * atomic commit and you will not make another commit re-using the same ID, you + * can destroy the blob as soon as the commit has been issued, without waiting + * for it to complete. + */ +struct drm_mode_destroy_blob { + __u32 blob_id; +}; + +/** + * struct drm_mode_create_lease - Create lease + * + * Lease mode resources, creating another drm_master. + * + * The @object_ids array must reference at least one CRTC, one connector and + * one plane if &DRM_CLIENT_CAP_UNIVERSAL_PLANES is enabled. Alternatively, + * the lease can be completely empty. 
+ */ +struct drm_mode_create_lease { + /** @object_ids: Pointer to array of object ids (__u32) */ + __u64 object_ids; + /** @object_count: Number of object ids */ + __u32 object_count; + /** @flags: flags for new FD (O_CLOEXEC, etc) */ + __u32 flags; + + /** @lessee_id: Return: unique identifier for lessee. */ + __u32 lessee_id; + /** @fd: Return: file descriptor to new drm_master file */ + __u32 fd; +}; + +/** + * struct drm_mode_list_lessees - List lessees + * + * List lesses from a drm_master. + */ +struct drm_mode_list_lessees { + /** + * @count_lessees: Number of lessees. + * + * On input, provides length of the array. + * On output, provides total number. No + * more than the input number will be written + * back, so two calls can be used to get + * the size and then the data. + */ + __u32 count_lessees; + /** @pad: Padding. */ + __u32 pad; + + /** + * @lessees_ptr: Pointer to lessees. + * + * Pointer to __u64 array of lessee ids + */ + __u64 lessees_ptr; +}; + +/** + * struct drm_mode_get_lease - Get Lease + * + * Get leased objects. + */ +struct drm_mode_get_lease { + /** + * @count_objects: Number of leased objects. + * + * On input, provides length of the array. + * On output, provides total number. No + * more than the input number will be written + * back, so two calls can be used to get + * the size and then the data. + */ + __u32 count_objects; + /** @pad: Padding. */ + __u32 pad; + + /** + * @objects_ptr: Pointer to objects. + * + * Pointer to __u32 array of object ids. + */ + __u64 objects_ptr; +}; + +/** + * struct drm_mode_revoke_lease - Revoke lease + */ +struct drm_mode_revoke_lease { + /** @lessee_id: Unique ID of lessee */ + __u32 lessee_id; +}; + +/** + * struct drm_mode_rect - Two dimensional rectangle. + * @x1: Horizontal starting coordinate (inclusive). + * @y1: Vertical starting coordinate (inclusive). + * @x2: Horizontal ending coordinate (exclusive). + * @y2: Vertical ending coordinate (exclusive). 
+ * + * With drm subsystem using struct drm_rect to manage rectangular area this + * export it to user-space. + * + * Currently used by drm_mode_atomic blob property FB_DAMAGE_CLIPS. + */ +struct drm_mode_rect { + __s32 x1; + __s32 y1; + __s32 x2; + __s32 y2; +}; + +/** + * struct drm_mode_closefb + * @fb_id: Framebuffer ID. + * @pad: Must be zero. + */ +struct drm_mode_closefb { + __u32 fb_id; + __u32 pad; +}; + +#if defined(__cplusplus) +} +#endif + +#endif From 29525f8cb3fa244b3b5ecf9fc92e42b9587fd9ef Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 4 Nov 2025 14:34:12 +0000 Subject: [PATCH 743/775] codespell: skip amdgpu kernel headers These header files are copied directly from the Linux kernel and contain typos. We skip these files in codespell to simplify maintenance. Signed-off-by: Radostin Stoyanov --- .codespellrc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.codespellrc b/.codespellrc index e91a6d2eb..5def594b2 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,3 +1,3 @@ [codespell] -skip = ./.git,./test/pki,./tags +skip = ./.git,./test/pki,./tags,./plugins/amdgpu/amdgpu_drm.h,./plugins/amdgpu/drm.h,./plugins/amdgpu/drm_mode.h ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen,sems From 1db7eed69fa974563abc6d7348ee93b679c06cc3 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Tue, 4 Nov 2025 14:41:52 +0000 Subject: [PATCH 744/775] amdgpu: use local kernel headers instead of libdrm Use local copies of amdgpu and DRM headers for consistency. 
Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 2 +- plugins/amdgpu/amdgpu_plugin_drm.c | 2 ++ plugins/amdgpu/kfd_ioctl.h | 5 ++++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 36dc0b6b0..713ffed6e 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -20,7 +20,6 @@ #include #include -#include #include "criu-plugin.h" #include "plugin.h" @@ -38,6 +37,7 @@ #include "rst-malloc.h" #include "common/list.h" +#include "amdgpu_drm.h" #include "amdgpu_plugin_dmabuf.h" #include "amdgpu_plugin_drm.h" #include "amdgpu_plugin_util.h" diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c index 923bfcdd1..3520bca7a 100644 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -23,6 +23,8 @@ #include "fdstore.h" #include "criu-amdgpu.pb-c.h" + +/* Define __user as empty for kernel headers in user-space */ #define __user #include "drm.h" diff --git a/plugins/amdgpu/kfd_ioctl.h b/plugins/amdgpu/kfd_ioctl.h index 1a3bcea95..a63d453f0 100644 --- a/plugins/amdgpu/kfd_ioctl.h +++ b/plugins/amdgpu/kfd_ioctl.h @@ -23,9 +23,12 @@ #ifndef KFD_IOCTL_H_INCLUDED #define KFD_IOCTL_H_INCLUDED -#include #include +/* Define __user as empty for kernel headers in user-space */ +#define __user +#include "drm.h" + /* * - 1.1 - initial version * - 1.3 - Add SMI events support From 62aadb22ab1efeccef7fb322f525bd1b2cb6969c Mon Sep 17 00:00:00 2001 From: Yanning Yang Date: Fri, 14 Nov 2025 23:08:16 +0000 Subject: [PATCH 745/775] amdgpu: use 64-bit offsets for parallel restore On AMD Instinct MI300 systems, restoring a large GPU application can fail because the checkpoint size is too large and the maximum value of an offset (with integer type) is insufficient. 
This problem occurs when the total size of all buffer objects exceeds int max, not because any single buffer is too large, but it can also happen with a large number of small buffers. Fixes: #2812 Signed-off-by: Yanning Yang Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 713ffed6e..574d7b829 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1651,7 +1651,7 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf { struct thread_data *thread_datas = NULL; int thread_i, ret = 0; - int offset = 0; + uint64_t offset = 0; for (int i = 0; i < e->num_of_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; @@ -2283,7 +2283,7 @@ void *parallel_restore_bo_contents(void *_thread_data) continue; entry = &restore_cmd->entries[i]; - fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET); + fseeko64(bo_contents_fp, entry->read_offset + offset, SEEK_SET); ret = sdma_copy_bo(restore_cmd->fds_write[entry->write_id], entry->size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE, false); @@ -2410,4 +2410,4 @@ int amdgpu_plugin_post_forking(void) return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result); } -CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking) \ No newline at end of file +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking) From 2cf8f13ca1f11a0491977e438b262e646137256c Mon Sep 17 00:00:00 2001 From: Mark Polyakov Date: Tue, 11 Nov 2025 15:48:42 -0800 Subject: [PATCH 746/775] doc: update pipe/socket examples for --inherit-fd The syntax of the inherit-fd functionality for unix socket and pipe includes a colon.
Fixes: 0df3f79fc023 ("criu(8): fix --inherit-fd description") Fixes: c37324b6d0bc ("crtools: describe the inherit-fd option") Signed-off-by: Mark Polyakov Signed-off-by: Radostin Stoyanov --- Documentation/criu.txt | 4 ++-- criu/crtools.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 40ede84e2..0c9a9e527 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -502,8 +502,8 @@ Restores previously checkpointed processes. The 'resource' argument can be one of the following: + - **tty[**__rdev__**:**__dev__**]** - - **pipe[**__inode__**]** - - **socket[**__inode__*]* + - **pipe:[**__inode__**]** + - **socket:[**__inode__*]* - **file[**__mnt_id__**:**__inode__**]** - 'path/to/file' diff --git a/criu/crtools.c b/criu/crtools.c index e207133ac..4dc55a065 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -503,8 +503,8 @@ usage: " Inherit file descriptors, treating fd NUM as being\n" " already opened via an existing RES, which can be:\n" " tty[rdev:dev]\n" - " pipe[inode]\n" - " socket[inode]\n" + " pipe:[inode]\n" + " socket:[inode]\n" " file[mnt_id:inode]\n" " /memfd:name\n" " path/to/file\n" From bf82389de36ef940be3640229f5a68d0e9211b71 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 26 Nov 2025 07:48:02 +0000 Subject: [PATCH 747/775] dump: fix "Defect type: IDENTICAL_BRANCHES" Static code analysis reported: criu/cr-dump.c:2328:2: identical_branches: The same code is executed when the condition "ret" is true or false, because the code in the if-then branch and after the if statement is identical. Should the if statement be removed? This is a fix for the warning. 
Signed-off-by: Adrian Reber --- criu/cr-dump.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 4df40e9b6..98b4223ba 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2329,8 +2329,6 @@ int cr_dump_tasks(pid_t pid) } ret = write_img_inventory(&he); - if (ret) - goto err; err: if (parent_ie) inventory_entry__free_unpacked(parent_ie, NULL); From 09bb3626646f285a3c00c9d424df3028dba9a10b Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 26 Nov 2025 07:49:05 +0000 Subject: [PATCH 748/775] restore: fix "Defect type: UNINIT" Static code analysis reported: 1. criu/cr-restore.c:2438:2: var_decl: Declaring variable "end_vma" without initializer. 4. criu/cr-restore.c:2451:5: assign: Assigning: "s_vma" = "&end_vma", which points to uninitialized data. 7. criu/cr-restore.c:2449:4: uninit_use: Using uninitialized value "s_vma->list.next". This tries to fix it by initializing the variable. Signed-off-by: Adrian Reber --- criu/cr-restore.c | 1 + 1 file changed, 1 insertion(+) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 057ec0e93..a5eda8d60 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -2440,6 +2440,7 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he end_vma.e = &end_e; end_e.start = end_e.end = kdat.task_size; + INIT_LIST_HEAD(&end_vma.list); s_vma = list_first_entry(self_vma_list, struct vma_area, list); t_vma = list_first_entry(tgt_vma_list, struct vma_area, list); From 90300748effc1cf0fe56e35d3d1cc2ddfedab246 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 26 Nov 2025 13:21:07 +0000 Subject: [PATCH 749/775] tty: fix compiler error At least on tests running on Fedora rawhide following error could be seen: ``` criu/tty.c: In function 'pts_fd_get_index': criu/tty.c:262:21: error: initialization discards 'const' qualifier from pointer target type [-Werror=discarded-qualifiers] 262 | char *pos = strrchr(link->name, '/'); | ``` This fixes it. 
Signed-off-by: Adrian Reber --- criu/tty.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/tty.c b/criu/tty.c index ae23094b7..9a4520d53 100644 --- a/criu/tty.c +++ b/criu/tty.c @@ -259,7 +259,7 @@ static int pts_fd_get_index(int fd, const struct fd_parms *p) { int index; const struct fd_link *link = p->link; - char *pos = strrchr(link->name, '/'); + const char *pos = strrchr(link->name, '/'); if (!pos || pos == (link->name + link->len - 1)) { pr_err("Unexpected format on path %s\n", link->name + 1); From 501b714f76b121e66a6f91ffbd707a29bc9edd39 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Tue, 12 Aug 2025 14:22:14 +0200 Subject: [PATCH 750/775] compel/aarch64: refactor fpregs handling Refactor user_fpregs_struct_t to wrap user_fpsimd_state in a dedicated struct, preparing for future extending by just adding new members Signed-off-by: Igor Svilenkov Bozic [ alex: fixes ] Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Alexander Mikhalitsyn Acked-by: Mike Rapoport --- .../src/lib/include/uapi/asm/infect-types.h | 6 +++++- compel/arch/aarch64/src/lib/infect.c | 18 +++++++++--------- criu/arch/aarch64/crtools.c | 8 ++++---- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h index 9d4ce7e2e..39aed4ac5 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h @@ -16,7 +16,11 @@ */ typedef struct user_pt_regs user_regs_struct_t; -typedef struct user_fpsimd_state user_fpregs_struct_t; + +struct user_fpregs_struct { + struct user_fpsimd_state fpstate; +}; +typedef struct user_fpregs_struct user_fpregs_struct_t; #define __compel_arch_fetch_thread_area(tid, th) 0 #define compel_arch_fetch_thread_area(tctl) 0 diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index ec1d0d59e..503616df7 
100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -43,10 +43,10 @@ int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t * sigframe->uc.uc_mcontext.pc = regs->pc; sigframe->uc.uc_mcontext.pstate = regs->pstate; - memcpy(fpsimd->vregs, fpregs->vregs, 32 * sizeof(__uint128_t)); + memcpy(fpsimd->vregs, fpregs->fpstate.vregs, 32 * sizeof(__uint128_t)); - fpsimd->fpsr = fpregs->fpsr; - fpsimd->fpcr = fpregs->fpcr; + fpsimd->fpsr = fpregs->fpstate.fpsr; + fpsimd->fpcr = fpregs->fpstate.fpcr; fpsimd->head.magic = FPSIMD_MAGIC; fpsimd->head.size = sizeof(*fpsimd); @@ -59,7 +59,7 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { struct iovec iov; @@ -74,14 +74,14 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct goto err; } - iov.iov_base = fpsimd; - iov.iov_len = sizeof(*fpsimd); + iov.iov_base = &ext_regs->fpstate; + iov.iov_len = sizeof(ext_regs->fpstate); if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { pr_perror("Failed to obtain FPU registers for %d", pid); goto err; } - ret = save(pid, arg, regs, fpsimd); + ret = save(pid, arg, regs, ext_regs); err: return ret; } @@ -92,8 +92,8 @@ int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) pr_info("Restoring GP/FPU registers for %d\n", pid); - iov.iov_base = ext_regs; - iov.iov_len = sizeof(*ext_regs); + iov.iov_base = &ext_regs->fpstate; + iov.iov_len = sizeof(ext_regs->fpstate); if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { pr_perror("Failed to set FPU registers for %d", pid); return -1; diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index 
3ed5c9d63..3cd082a34 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -157,11 +157,11 @@ int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_str // Save the FP/SIMD state for (i = 0; i < 32; ++i) { - core->ti_aarch64->fpsimd->vregs[2 * i] = fpsimd->vregs[i]; - core->ti_aarch64->fpsimd->vregs[2 * i + 1] = fpsimd->vregs[i] >> 64; + core->ti_aarch64->fpsimd->vregs[2 * i] = fpsimd->fpstate.vregs[i]; + core->ti_aarch64->fpsimd->vregs[2 * i + 1] = fpsimd->fpstate.vregs[i] >> 64; } - assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpsr); - assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpcr); + assign_reg(core->ti_aarch64->fpsimd, &fpsimd->fpstate, fpsr); + assign_reg(core->ti_aarch64->fpsimd, &fpsimd->fpstate, fpcr); if (save_pac_keys(pid, core)) return -1; From 73ca07148398c58e344cd2be71933836f55d93f4 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Wed, 13 Aug 2025 17:48:56 +0200 Subject: [PATCH 751/775] gcs: add GCS constants and helper macros Introduce ARM64 Guarded Control Stack (GCS) constants and macros in a new uapi header for use in both CRIU and compel. 
Includes: - NT_ARM_GCS type - prctl(2) constants for GCS enable/write/push modes - Capability token helpers (GCS_CAP, GCS_SIGNAL_CAP) - HWCAP_GCS definition These are based on upstream Linux definitions Signed-off-by: Igor Svilenkov Bozic Reviewed-by: Alexander Mikhalitsyn Acked-by: Mike Rapoport --- .../src/lib/include/uapi/asm/gcs-types.h | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 compel/arch/aarch64/src/lib/include/uapi/asm/gcs-types.h diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/gcs-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/gcs-types.h new file mode 100644 index 000000000..9f9655e3b --- /dev/null +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/gcs-types.h @@ -0,0 +1,47 @@ +#ifndef __UAPI_ASM_GCS_TYPES_H__ +#define __UAPI_ASM_GCS_TYPES_H__ + +#ifndef NT_ARM_GCS +#define NT_ARM_GCS 0x410 /* ARM GCS state */ +#endif + +/* Shadow Stack/Guarded Control Stack interface */ +#define PR_GET_SHADOW_STACK_STATUS 74 +#define PR_SET_SHADOW_STACK_STATUS 75 +#define PR_LOCK_SHADOW_STACK_STATUS 76 + +/* When set PR_SHADOW_STACK_ENABLE flag allocates a Guarded Control Stack */ +#ifndef PR_SHADOW_STACK_ENABLE +#define PR_SHADOW_STACK_ENABLE (1UL << 0) +#endif + +/* Allows explicit GCS stores (eg. using GCSSTR) */ +#ifndef PR_SHADOW_STACK_WRITE +#define PR_SHADOW_STACK_WRITE (1UL << 1) +#endif + +/* Allows explicit GCS pushes (eg. 
using GCSPUSHM) */ +#ifndef PR_SHADOW_STACK_PUSH +#define PR_SHADOW_STACK_PUSH (1UL << 2) +#endif + +#ifndef SHADOW_STACK_SET_TOKEN +#define SHADOW_STACK_SET_TOKEN 0x1 /* Set up a restore token in the shadow stack */ +#endif + +#define PR_SHADOW_STACK_ALL_MODES \ + PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE | PR_SHADOW_STACK_PUSH + +/* copied from: arch/arm64/include/asm/sysreg.h */ +#define GCS_CAP_VALID_TOKEN 0x1 +#define GCS_CAP_ADDR_MASK 0xFFFFFFFFFFFFF000ULL +#define GCS_CAP(x) ((((unsigned long)x) & GCS_CAP_ADDR_MASK) | GCS_CAP_VALID_TOKEN) +#define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) + +#include + +#ifndef HWCAP_GCS +#define HWCAP_GCS (1UL << 32) +#endif + +#endif /* __UAPI_ASM_GCS_TYPES_H__ */ \ No newline at end of file From 6bb856b0af85fc6c1a90f2a6f28afd9f3b0db493 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Fri, 1 Aug 2025 12:09:57 +0200 Subject: [PATCH 752/775] compel: gcs: initial GCS support for signal frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add basic prerequisites for Guarded Control Stack (GCS) state on AArch64. This adds a gcs_context to the signal frame and extends user_fpregs_struct_t to carry GCS metadata, preparing the groundwork for GCS in the parasite. For now, the GCS fields are zeroed during compel_get_task_regs(), technically ignoring GCS since it does not reach the control logic yet; that will be introduced in the next commit. The code path is gated and does not affect normal tests. 
Can be explicitly enabled and tested via:     make -C infect GCS_ENABLE=1 && make -C infect run Signed-off-by: Igor Svilenkov Bozic [ alex: clean up fixes ] Signed-off-by: Alexander Mikhalitsyn Acked-by: Mike Rapoport --- .../src/lib/include/uapi/asm/infect-types.h | 14 +++++++ .../src/lib/include/uapi/asm/sigframe.h | 10 +++++ compel/arch/aarch64/src/lib/infect.c | 39 ++++++++++++++++++- compel/include/uapi/infect.h | 8 ++++ 4 files changed, 70 insertions(+), 1 deletion(-) diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h index 39aed4ac5..e11f2910f 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h @@ -2,6 +2,7 @@ #define UAPI_COMPEL_ASM_TYPES_H__ #include +#include #include #include #include @@ -17,8 +18,18 @@ typedef struct user_pt_regs user_regs_struct_t; +/* + * GCS (Guarded Control Stack) + */ +struct user_gcs { + __u64 features_enabled; + __u64 features_locked; + __u64 gcspr_el0; +}; + struct user_fpregs_struct { struct user_fpsimd_state fpstate; + struct user_gcs gcs; }; typedef struct user_fpregs_struct user_fpregs_struct_t; @@ -43,4 +54,7 @@ typedef struct user_fpregs_struct user_fpregs_struct_t; __NR_##syscall; \ }) +extern bool __compel_host_supports_gcs(void); +#define compel_host_supports_gcs __compel_host_supports_gcs + #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h index a3528500d..7efee528f 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h @@ -10,11 +10,20 @@ /* Copied from the kernel header arch/arm64/include/uapi/asm/sigcontext.h */ #define FPSIMD_MAGIC 0x46508001 +#define GCS_MAGIC 0x47435300 typedef struct fpsimd_context fpu_state_t; +struct gcs_context { + struct _aarch64_ctx 
head; + __u64 gcspr; + __u64 features_enabled; + __u64 reserved; +}; + struct aux_context { struct fpsimd_context fpsimd; + struct gcs_context gcs; /* additional context to be added before "end" */ struct _aarch64_ctx end; }; @@ -63,6 +72,7 @@ struct cr_sigcontext { #define RT_SIGFRAME_AUX_CONTEXT(rt_sigframe) ((struct aux_context *)&(RT_SIGFRAME_SIGCONTEXT(rt_sigframe)->__reserved)) #define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->fpsimd) #define RT_SIGFRAME_OFFSET(rt_sigframe) 0 +#define RT_SIGFRAME_GCS(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->gcs) #define rt_sigframe_erase_sigset(sigframe) memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) #define rt_sigframe_copy_sigset(sigframe, from) memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 503616df7..0f74a023a 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -2,8 +2,8 @@ #include #include #include +#include #include -#include #include #include "common/page.h" @@ -13,6 +13,8 @@ #include "infect.h" #include "infect-priv.h" #include "asm/breakpoints.h" +#include "asm/gcs-types.h" +#include unsigned __page_size = 0; unsigned __page_shift = 0; @@ -33,12 +35,32 @@ static inline void __always_unused __check_code_syscall(void) BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); } +bool __compel_host_supports_gcs(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + return (hwcap & HWCAP_GCS) != 0; +} + +static bool __compel_gcs_enabled(struct user_gcs *gcs) +{ + if (!compel_host_supports_gcs()) + return false; + + if (gcs->features_enabled & PR_SHADOW_STACK_ENABLE) + return true; + + return false; +} + int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { struct fpsimd_context *fpsimd = RT_SIGFRAME_FPU(sigframe); + struct gcs_context *gcs = RT_SIGFRAME_GCS(sigframe); 
memcpy(sigframe->uc.uc_mcontext.regs, regs->regs, sizeof(regs->regs)); + pr_debug("sigreturn_prep_regs_plain: sp %lx pc %lx\n", (long)regs->sp, (long)regs->pc); + sigframe->uc.uc_mcontext.sp = regs->sp; sigframe->uc.uc_mcontext.pc = regs->pc; sigframe->uc.uc_mcontext.pstate = regs->pstate; @@ -51,6 +73,19 @@ int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t * fpsimd->head.magic = FPSIMD_MAGIC; fpsimd->head.size = sizeof(*fpsimd); + if (__compel_gcs_enabled(&fpregs->gcs)) { + gcs->head.magic = GCS_MAGIC; + gcs->head.size = sizeof(*gcs); + gcs->reserved = 0; + gcs->gcspr = fpregs->gcs.gcspr_el0 - 8; + gcs->features_enabled = fpregs->gcs.features_enabled; + + pr_debug("sigframe gcspr=%llx features_enabled=%llx\n", fpregs->gcs.gcspr_el0 - 8, fpregs->gcs.features_enabled); + } else { + pr_debug("sigframe gcspr=[disabled]\n"); + memset(gcs, 0, sizeof(*gcs)); + } + return 0; } @@ -81,6 +116,8 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct goto err; } + memset(&ext_regs->gcs, 0, sizeof(ext_regs->gcs)); + ret = save(pid, arg, regs, ext_regs); err: return ret; diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 1f61876ff..d21c261b7 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -192,6 +192,14 @@ void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v); extern void compel_get_stack(struct parasite_ctl *ctl, void **rstack, void **r_thread_stack); +#ifndef compel_host_supports_gcs +static inline bool compel_host_supports_gcs(void) +{ + return false; +} +#define compel_host_supports_gcs +#endif + #ifndef compel_shstk_enabled static inline bool compel_shstk_enabled(user_fpregs_struct_t *ext_regs) { From 2f676d20e41337568403e9f8ac79f5cd3af620e3 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Tue, 12 Aug 2025 18:32:55 +0200 Subject: [PATCH 753/775] compel: gcs: set up GCS token/restorer for rt_sigreturn MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When GCS is enabled, the kernel expects a capability token at GCSPR_EL0-8 and sa_restorer at GCSPR_EL0-16 on rt_sigreturn. The sigframe must be consistent with the kernel’s expectations, with GCSPR_EL0 advanced by -8 having it point to the token on signal entry. On rt_sigreturn, the kernel verifies the cap at GCSPR_EL0, invalidates it and increments GCSPR_EL0 by 8 at the end of gcs_restore_signal() . Implement parasite_setup_gcs() to: - read NT_ARM_GCS via ptrace(PTRACE_GETREGSET) - write (via ptrace) the computed capability token and restorer address - update GCSPR_EL0 to point to the token's location Call parasite_setup_gcs() into parasite_start_daemon() so the sigreturn frame satisfies kernel's expectation Tests with GCS remain opt‑in: make -C compel/test/infect GCS_ENABLE=1 && make -C compel/test/infect run Signed-off-by: Igor Svilenkov Bozic [ alex: cleanup fixes ] Signed-off-by: Alexander Mikhalitsyn Acked-by: Mike Rapoport --- .../src/lib/include/uapi/asm/infect-types.h | 5 ++ compel/arch/aarch64/src/lib/infect.c | 82 ++++++++++++++++++- compel/arch/x86/src/lib/infect.c | 2 +- 3 files changed, 84 insertions(+), 5 deletions(-) diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h index e11f2910f..3a34ab4f6 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h @@ -57,4 +57,9 @@ typedef struct user_fpregs_struct user_fpregs_struct_t; extern bool __compel_host_supports_gcs(void); #define compel_host_supports_gcs __compel_host_supports_gcs +struct parasite_ctl; +extern int __parasite_setup_shstk(struct parasite_ctl *ctl, + user_fpregs_struct_t *ext_regs); +#define parasite_setup_shstk __parasite_setup_shstk + #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/aarch64/src/lib/infect.c 
b/compel/arch/aarch64/src/lib/infect.c index 0f74a023a..39be558ea 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -46,10 +46,7 @@ static bool __compel_gcs_enabled(struct user_gcs *gcs) if (!compel_host_supports_gcs()) return false; - if (gcs->features_enabled & PR_SHADOW_STACK_ENABLE) - return true; - - return false; + return gcs && (gcs->features_enabled & PR_SHADOW_STACK_ENABLE) != 0; } int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) @@ -118,6 +115,18 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct memset(&ext_regs->gcs, 0, sizeof(ext_regs->gcs)); + iov.iov_base = &ext_regs->gcs; + iov.iov_len = sizeof(ext_regs->gcs); + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &iov) == 0) { + pr_info("gcs: GCSPR_EL0 for %d: 0x%llx, features: 0x%llx\n", + pid, ext_regs->gcs.gcspr_el0, ext_regs->gcs.features_enabled); + + if (!__compel_gcs_enabled(&ext_regs->gcs)) + pr_info("gcs: GCS is NOT enabled\n"); + } else { + pr_info("gcs: GCS state not available for %d\n", pid); + } + ret = save(pid, arg, regs, ext_regs); err: return ret; @@ -323,3 +332,68 @@ int ptrace_flush_breakpoints(pid_t pid) return 0; } + +int inject_gcs_cap_token(struct parasite_ctl *ctl, pid_t pid, struct user_gcs *gcs) +{ + struct iovec gcs_iov = { .iov_base = gcs, .iov_len = sizeof(*gcs) }; + + uint64_t token_addr = gcs->gcspr_el0 - 8; + uint64_t sigtramp_addr = gcs->gcspr_el0 - 16; + + uint64_t cap_token = ALIGN_DOWN(GCS_SIGNAL_CAP(token_addr), 8); + unsigned long restorer_addr; + + pr_info("gcs: (setup) CAP token: 0x%lx at addr: 0x%lx\n", cap_token, token_addr); + + /* Inject capability token at gcspr_el0 - 8 */ + if (ptrace(PTRACE_POKEDATA, pid, (void *)token_addr, cap_token)) { + pr_perror("gcs: (setup) Inject GCS cap token failed"); + return -1; + } + + /* Inject restorer trampoline address (gcspr_el0 - 16) */ + restorer_addr = ctl->parasite_ip; + if 
(ptrace(PTRACE_POKEDATA, pid, (void *)sigtramp_addr, restorer_addr)) { + pr_perror("gcs: (setup) Inject GCS restorer failed"); + return -1; + } + + /* Update GCSPR_EL0 */ + gcs->gcspr_el0 = token_addr; + if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_GCS, &gcs_iov)) { + pr_perror("gcs: PTRACE_SETREGS FAILED"); + return -1; + } + + pr_debug("gcs: parasite_ip=%#lx sp=%#llx gcspr_el0=%#llx\n", + ctl->parasite_ip, ctl->orig.regs.sp, gcs->gcspr_el0); + + return 0; +} + +int parasite_setup_shstk(struct parasite_ctl *ctl, user_fpregs_struct_t *ext_regs) +{ + struct user_gcs gcs; + struct iovec gcs_iov = { .iov_base = &gcs, .iov_len = sizeof(gcs) }; + pid_t pid = ctl->rpid; + + if(!__compel_host_supports_gcs()) + return 0; + + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &gcs_iov) != 0) { + pr_perror("GCS state not available for %d", pid); + return -1; + } + + if (!__compel_gcs_enabled(&gcs)) + return 0; + + if (inject_gcs_cap_token(ctl, pid, &gcs)) { + pr_perror("Failed to inject GCS cap token for %d", pid); + return -1; + } + + pr_info("gcs: GCS enabled for %d\n", pid); + + return 0; +} diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index 644c483b4..afcf2c53b 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -761,7 +761,7 @@ bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs) return false; } -int parasite_setup_shstk(struct parasite_ctl *ctl, user_fpregs_struct_t *ext_regs) +int parasite_setup_shstk(struct parasite_ctl *ctl, __maybe_unused user_fpregs_struct_t *ext_regs) { pid_t pid = ctl->rpid; unsigned long sa_restorer = ctl->parasite_ip; From 92e6e523b51f342bdc6dbaf79d9c43e915f02af0 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Thu, 14 Aug 2025 14:39:52 +0200 Subject: [PATCH 754/775] compel: gcs: add opt-in GCS test support for AArch64 Introduce an opt-in mode for building and running compel tests with Guarded Control Stack (GCS) enabled on AArch64. 
Changes: - Extend compel/test/infect to support `GCS_ENABLE=1` builds, adding `-mbranch-protection=standard` and `-z experimental-gcs=check` to CFLAGS/LDFLAGS. - Export required GLIBC_TUNABLES at runtime via `TEST_ENV`. Usage: make -C compel/test/infect GCS_ENABLE=1 make -C compel/test/infect GCS_ENABLE=1 run By default (`GCS_ENABLE` unset or 0), builds and runs are unchanged. Signed-off-by: Igor Svilenkov Bozic --- compel/test/infect/Makefile | 7 ++++++- compel/test/infect/spy.c | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/compel/test/infect/Makefile b/compel/test/infect/Makefile index bacfad962..85efa5fd9 100644 --- a/compel/test/infect/Makefile +++ b/compel/test/infect/Makefile @@ -3,6 +3,11 @@ CFLAGS ?= -O2 -g -Wall -Werror COMPEL := ../../../compel/compel-host +ifeq ($(GCS_ENABLE),1) +CFLAGS += -mbranch-protection=standard -DGCS_TEST_ENABLE=1 +LDFLAGS += -z experimental-gcs=check +endif + all: victim spy run: @@ -17,7 +22,7 @@ clean: rm -f parasite.o victim: victim.c - $(CC) $(CFLAGS) -o $@ $^ + $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) spy: spy.c parasite.h $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) diff --git a/compel/test/infect/spy.c b/compel/test/infect/spy.c index b10db4d47..143946941 100644 --- a/compel/test/infect/spy.c +++ b/compel/test/infect/spy.c @@ -112,6 +112,9 @@ int main(int argc, char **argv) return -1; } +#ifdef GCS_TEST_ENABLE + setenv("GLIBC_TUNABLES", "glibc.cpu.aarch64_gcs=1:glibc.cpu.aarch64_gcs_policy=2", 1); +#endif pid = vfork(); if (pid == 0) { close(p_in[1]); From 41ecb7ac71f1396d7471467d21443d144000d6c2 Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Tue, 12 Aug 2025 18:52:36 +0200 Subject: [PATCH 755/775] images: aarch64: add user_aarch64_gcs_entry - Define user_aarch64_gcs_entry in core-aarch64.proto to store Guarded Control Stack state (gcspr_el0, features_enabled). 
- Extend thread_info_aarch64 with an optional gcs field Signed-off-by: Igor Svilenkov Bozic --- images/core-aarch64.proto | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/images/core-aarch64.proto b/images/core-aarch64.proto index 64b0ee9fb..a94911c0b 100644 --- a/images/core-aarch64.proto +++ b/images/core-aarch64.proto @@ -17,6 +17,11 @@ message user_aarch64_fpsimd_context_entry { required uint32 fpcr = 3; } +message user_aarch64_gcs_entry { + required uint64 gcspr_el0 = 1 [(criu).hex = true]; + required uint64 features_enabled = 2 [(criu).hex = true]; +} + message pac_address_keys { required uint64 apiakey_lo = 1; required uint64 apiakey_hi = 2; @@ -45,4 +50,5 @@ message thread_info_aarch64 { required user_aarch64_regs_entry gpregs = 3[(criu).hex = true]; required user_aarch64_fpsimd_context_entry fpsimd = 4; optional pac_keys pac_keys = 5; + optional user_aarch64_gcs_entry gcs = 6; } From 2429d49e677377575aa72cc35e18f96d671ad72e Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Tue, 12 Aug 2025 18:56:41 +0200 Subject: [PATCH 756/775] criu/dump: gcs: save GCS state during dump Add debug and info messages to log Guarded Control Stack state when dumping AArch64 threads. 
This includes the following values: - gcspr_el0 - features_enabled Signed-off-by: Igor Svilenkov Bozic [ alex: cleanup fixes ] Signed-off-by: Alexander Mikhalitsyn Acked-by: Mike Rapoport --- criu/arch/aarch64/crtools.c | 41 +++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index 3cd082a34..835a83400 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -12,6 +12,7 @@ #include "common/compiler.h" #include #include "asm/dump.h" +#include "asm/gcs-types.h" #include "protobuf.h" #include "images/core.pb-c.h" #include "images/creds.pb-c.h" @@ -22,6 +23,7 @@ #include "restorer.h" #include "compel/infect.h" #include "pstree.h" +#include /* * cr_user_pac_* are a copy of the corresponding uapi structs @@ -146,6 +148,11 @@ static int save_pac_keys(int pid, CoreEntry *core) int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) { int i; + struct user_gcs gcs_live; + struct iovec gcs_iov = { + .iov_base = &gcs_live, + .iov_len = sizeof(gcs_live), + }; CoreEntry *core = x; // Save the Aarch64 CPU state @@ -165,6 +172,17 @@ int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_str if (save_pac_keys(pid, core)) return -1; + + /* Save the GCS state */ + if (compel_host_supports_gcs()) { + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &gcs_iov) < 0) { + pr_perror("Failed to get GCS for %d", pid); + return -1; + } + core->ti_aarch64->gcs->gcspr_el0 = gcs_live.gcspr_el0; + core->ti_aarch64->gcs->features_enabled = gcs_live.features_enabled; + } + return 0; } @@ -173,6 +191,7 @@ int arch_alloc_thread_info(CoreEntry *core) ThreadInfoAarch64 *ti_aarch64; UserAarch64RegsEntry *gpregs; UserAarch64FpsimdContextEntry *fpsimd; + UserAarch64GcsEntry *gcs; ti_aarch64 = xmalloc(sizeof(*ti_aarch64)); if (!ti_aarch64) @@ -202,6 +221,15 @@ int arch_alloc_thread_info(CoreEntry *core) if (!fpsimd->vregs) goto err; + /* 
Allocate & init GCS */ + if (compel_host_supports_gcs()) { + gcs = xmalloc(sizeof(*gcs)); + if (!gcs) + goto err; + user_aarch64_gcs_entry__init(gcs); + ti_aarch64->gcs = gcs; + } + return 0; err: return -1; @@ -231,6 +259,7 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { int i; struct fpsimd_context *fpsimd = RT_SIGFRAME_FPU(sigframe); + struct gcs_context *gcs; if (core->ti_aarch64->fpsimd->n_vregs != 64) return 1; @@ -244,6 +273,18 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) fpsimd->head.magic = FPSIMD_MAGIC; fpsimd->head.size = sizeof(*fpsimd); + if (compel_host_supports_gcs()) { + gcs = RT_SIGFRAME_GCS(sigframe); + + pr_debug("sigframe gcspr %llx enabled %llx\n", gcs->gcspr, gcs->features_enabled); + + gcs->head.magic = GCS_MAGIC; + gcs->head.size = sizeof(*gcs); + gcs->reserved = 0; + gcs->gcspr = core->ti_aarch64->gcs->gcspr_el0 - 8; + gcs->features_enabled = core->ti_aarch64->gcs->features_enabled; + } + return 0; } From d591e320e0ef3dd816a2c61a46a074e21f2b769f Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Tue, 12 Aug 2025 20:13:28 +0200 Subject: [PATCH 757/775] criu/restore: gcs: adds restore implementation for Guarded Control Stack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit finalizes AArch64 Guarded Control Stack (GCS) support by wiring the full dump and restore flow. The restore path adds the following steps: - Define shared AArch64 GCS types and constants in a dedicated header for both compel and CRIU inclusion - compel: add get/set NT_ARM_GCS via ptrace, enabling user-space GCS state save and restore. - During restore switch to the new GCS (via GCSSTR) to place the capability token and sa_restorer address - arch_shstk_trampoline() — We enable GCS in a trampoline using prctl(PR_SET_SHADOW_STACK_STATUS, ...) via inline SVC. The trampoline is needed because we can’t RET without a valid GCS. 
- restorer: map the recorded GCS VMA, populate contents top-down with GCSSTR, write the signal capability at GCSPR_EL0 and the valid token at GCSPR_EL0-8, then switch to the rebuilt GCS (GCSSS1) - Save and restore registers via ptrace - Extend restorer argument structures to carry GCS state into post-restore execution - Add shstk_set_restorer_stack(): sets tmp_gcs to temporary restorer shadow stack start - Add gcs_vma_restore implementation (required for mremap of the GCS VMA) Tested with: GCS_ENABLE=1 ./zdtm.py run -t zdtm/static/env00 Signed-off-by: Igor Svilenkov Bozic --- compel/arch/aarch64/src/lib/infect.c | 30 +++ .../arch/arm/plugins/std/syscalls/syscall.def | 1 + compel/include/infect-priv.h | 1 + criu/arch/aarch64/Makefile | 1 + criu/arch/aarch64/gcs.c | 157 ++++++++++++++ criu/arch/aarch64/include/asm/gcs.h | 196 ++++++++++++++++++ criu/arch/aarch64/include/asm/restorer.h | 1 + 7 files changed, 387 insertions(+) create mode 100644 criu/arch/aarch64/gcs.c create mode 100644 criu/arch/aarch64/include/asm/gcs.h diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 39be558ea..7450ac026 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -136,6 +136,9 @@ int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) { struct iovec iov; + struct user_gcs gcs; + struct iovec gcs_iov = { .iov_base = &gcs, .iov_len = sizeof(gcs) }; + pr_info("Restoring GP/FPU registers for %d\n", pid); iov.iov_base = &ext_regs->fpstate; @@ -144,6 +147,33 @@ int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) pr_perror("Failed to set FPU registers for %d", pid); return -1; } + + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &gcs_iov) < 0) { + pr_warn("gcs: Failed to get GCS for %d\n", pid); + } else { + ext_regs->gcs = gcs; + compel_set_task_gcs_regs(pid, ext_regs); + } + + return 0; +} + +int compel_set_task_gcs_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + 
struct iovec iov; + + pr_info("gcs: restoring GCS registers for %d\n", pid); + pr_info("gcs: restoring GCS: gcspr=%llx features=%llx\n", + ext_regs->gcs.gcspr_el0, ext_regs->gcs.features_enabled); + + iov.iov_base = &ext_regs->gcs; + iov.iov_len = sizeof(ext_regs->gcs); + + if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_GCS, &iov)) { + pr_perror("gcs: Failed to set GCS registers for %d", pid); + return -1; + } + return 0; } diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index 9a33009eb..819678566 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -124,3 +124,4 @@ openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) rseq 293 398 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) membarrier 283 389 (int cmd, unsigned int flags, int cpu_id) +map_shadow_stack 453 ! (unsigned long addr, unsigned long size, unsigned int flags) \ No newline at end of file diff --git a/compel/include/infect-priv.h b/compel/include/infect-priv.h index 9d3442839..8e78a7f6c 100644 --- a/compel/include/infect-priv.h +++ b/compel/include/infect-priv.h @@ -72,6 +72,7 @@ extern bool arch_can_dump_task(struct parasite_ctl *ctl); extern int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, unsigned long flags); extern int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs); +extern int compel_set_task_gcs_regs(pid_t pid, user_fpregs_struct_t *ext_regs); extern int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s); extern int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs); diff --git a/criu/arch/aarch64/Makefile b/criu/arch/aarch64/Makefile index b26487367..b87fcaa5b 100644 --- a/criu/arch/aarch64/Makefile +++ 
b/criu/arch/aarch64/Makefile @@ -6,3 +6,4 @@ obj-y += cpu.o obj-y += crtools.o obj-y += sigframe.o obj-y += bitops.o +obj-y += gcs.o \ No newline at end of file diff --git a/criu/arch/aarch64/gcs.c b/criu/arch/aarch64/gcs.c new file mode 100644 index 000000000..4bdb9d2e4 --- /dev/null +++ b/criu/arch/aarch64/gcs.c @@ -0,0 +1,157 @@ +#include +#include + +#include +#include + +#include "asm/gcs-types.h" +#include "pstree.h" +#include "restorer.h" +#include "rst-malloc.h" +#include "vma.h" + +#include +#include + +static bool task_has_gcs_enabled(UserAarch64GcsEntry *gcs) +{ + return gcs && (gcs->features_enabled & PR_SHADOW_STACK_ENABLE) != 0; +} + +static bool host_supports_gcs(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + return (hwcap & HWCAP_GCS) != 0; +} + +static bool task_needs_gcs(struct pstree_item *item, CoreEntry *core) +{ + UserAarch64GcsEntry *gcs; + + if (!task_alive(item)) + return false; + + gcs = core->ti_aarch64->gcs; + + if (task_has_gcs_enabled(gcs)) { + if (!host_supports_gcs()) { + pr_warn_once("Restoring task with GCS on non-GCS host\n"); + return false; + } + + pr_info("Restoring task with GCS\n"); + return true; + } + + pr_info("Restoring a task without GCS\n"); + return false; +} + +static int gcs_prepare_task(struct vm_area_list *vmas, + struct rst_shstk_info *gcs) +{ + struct vma_area *vma; + + list_for_each_entry(vma, &vmas->h, list) { + if (vma_area_is(vma, VMA_AREA_SHSTK) && + in_vma_area(vma, gcs->gcspr_el0)) { + unsigned long premapped_addr = vma->premmaped_addr; + unsigned long size = vma_area_len(vma); + + gcs->vma_start = vma->e->start; + gcs->vma_size = size; + gcs->premapped_addr = premapped_addr; + + return 0; + } + } + + pr_err("Unable to find a shadow stack vma: %lx\n", gcs->gcspr_el0); + return -1; +} + +int arch_gcs_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta) +{ + int i; + struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]); + struct vm_area_list 
*vmas = &rsti(item)->vmas; + struct rst_shstk_info *gcs = &ta->shstk; + + if (!task_needs_gcs(item, core)) + return 0; + + gcs->gcspr_el0 = core->ti_aarch64->gcs->gcspr_el0; + gcs->features_enabled = core->ti_aarch64->gcs->features_enabled; + + if (gcs_prepare_task(vmas, gcs)) { + pr_err("gcs: failed to prepare shadow stack memory\n"); + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + struct thread_restore_args *thread_args = &args_array[i]; + + core = item->core[i]; + gcs = &thread_args->shstk; + + gcs->gcspr_el0 = core->ti_aarch64->gcs->gcspr_el0; + gcs->features_enabled = core->ti_aarch64->gcs->features_enabled; + + if (gcs_prepare_task(vmas, gcs)) { + pr_err("gcs: failed to prepare GCS memory\n"); + return -1; + } + } + + return 0; +} + +int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg) +{ + int fret; + unsigned long flags = PR_SHADOW_STACK_ENABLE | + PR_SHADOW_STACK_PUSH | + PR_SHADOW_STACK_WRITE; + + long ret, x1_after, x8_after; + + /* If task doesn't need GCS, just call func */ + if (!task_needs_gcs(item, core)) { + return func(arg); + } + + pr_debug("gcs: GCS enable SVC about to fire: x8=%d x0=%d x1=0x%lx\n", + __NR_prctl, PR_SET_SHADOW_STACK_STATUS, flags); + + asm volatile( + "mov x0, %3\n" // x0 = PR_SET_SHADOW_STACK_STATUS (75) + "mov x1, %4\n" // x1 = flags + "mov x2, xzr\n" // x2 = 0 + "mov x3, xzr\n" // x3 = 0 + "mov x4, xzr\n" // x4 = 0 + "mov x8, %5\n" // x8 = __NR_prctl (167) + "svc #0\n" // Invoke syscall + "mov %0, x0\n" // Capture return value + "mov %1, x1\n" // Capture x1 after + "mov %2, x8\n" // Capture x8 after + : "=r"(ret), "=r"(x1_after), "=r"(x8_after) + : "i"(PR_SET_SHADOW_STACK_STATUS), // x0 - %3rd + "r"(flags), // x1 - %4th + "i"(__NR_prctl) // x8 - %5th + : "x0", "x1", "x2", "x3", "x4", "x8", "memory", "cc"); + + pr_info("gcs: after SVC: ret=%ld x1=%ld x8=%ld\n", ret, x1_after, x8_after); + + if (ret != 0) { + int err = errno; + pr_err("gcs: failed to enable 
GCS: ret=%ld errno=%d (%s)\n", ret, err, strerror(err)); + return -1; + } + + fret = func(arg); + exit(fret); + + return -1; +} diff --git a/criu/arch/aarch64/include/asm/gcs.h b/criu/arch/aarch64/include/asm/gcs.h new file mode 100644 index 000000000..28faa23b7 --- /dev/null +++ b/criu/arch/aarch64/include/asm/gcs.h @@ -0,0 +1,196 @@ +#ifndef __CR_ASM_GCS_H__ +#define __CR_ASM_GCS_H__ + +#include + +struct rst_shstk_info { + unsigned long vma_start; /* start of GCS VMA */ + unsigned long vma_size; /* size of GCS VMA */ + unsigned long premapped_addr; /* premapped buffer */ + unsigned long tmp_gcs; /* temp area for GCS if needed */ + u64 gcspr_el0; /* GCS pointer */ + u64 features_enabled; /* GCS flags */ +}; + +#define rst_shstk_info rst_shstk_info + +struct task_restore_args; +struct pstree_item; + +int arch_gcs_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta); +#define arch_shstk_prepare arch_gcs_prepare + +int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg); +#define arch_shstk_trampoline arch_shstk_trampoline + +static always_inline void shstk_set_restorer_stack(struct rst_shstk_info *gcs, void *ptr) +{ + gcs->tmp_gcs = (long unsigned)ptr; +} +#define shstk_set_restorer_stack shstk_set_restorer_stack + +static always_inline long shstk_restorer_stack_size(void) +{ + return PAGE_SIZE; +} +#define shstk_restorer_stack_size shstk_restorer_stack_size + +#ifdef CR_NOGLIBC +#include +#include +#include "vma.h" + +static inline unsigned long gcs_map(unsigned long addr, unsigned long size, unsigned int flags) +{ + long gcspr = sys_map_shadow_stack(addr, size, flags); + pr_info("gcs: syscall: map_shadow_stack at=%lx size=%ld\n", addr, size); + + if (gcspr < 0) { + pr_err("gcs: failed to map GCS at %lx: %ld\n", addr, gcspr); + return -1; + } + + if (addr && gcspr != addr) { + pr_err("gcs: address mismatch: need %lx, got %lx\n", addr, gcspr); + return -1; + } + + pr_info("gcs: mmapped 
GCS at %lx\n", gcspr); + + return gcspr; +} + +/* clang-format off */ +static always_inline void gcsss1(unsigned long *Xt) +{ + asm volatile ( + "sys #3, C7, C7, #2, %0\n" + : + : "rZ" (Xt) + : "memory"); +} + +static always_inline unsigned long *gcsss2(void) +{ + unsigned long *Xt; + + asm volatile ( + "SYSL %0, #3, C7, C7, #3\n" + : "=r" (Xt) + : + : "memory"); + + return Xt; +} + +static inline void gcsstr(unsigned long addr, unsigned long val) +{ + asm volatile( + "mov x0, %0\n" + "mov x1, %1\n" + ".inst 0xd91f1c01\n" // GCSSTR x1, [x0] + "mov x0, #0\n" + : + : "r"(addr), "r"(val) + : "x0", "x1", "memory"); +} +/* clang-format on */ + +static always_inline int gcs_restore(struct rst_shstk_info *gcs) +{ + unsigned long gcspr, val; + + if (!(gcs && gcs->features_enabled & PR_SHADOW_STACK_ENABLE)) { + return 0; + } + + gcspr = gcs->gcspr_el0 - 8; + + val = ALIGN_DOWN(GCS_SIGNAL_CAP(gcspr), 8); + pr_debug("gcs: [0] GCSSTR VAL=%lx write at GCSPR=%lx\n", val, gcspr); + gcsstr(gcspr, val); + + val = ALIGN_DOWN(GCS_SIGNAL_CAP(gcspr), 8) | GCS_CAP_VALID_TOKEN; + gcspr -= 8; + pr_debug("gcs: [1] GCSSTR VAL=%lx write at GCSPR=%lx\n", val, gcspr); + gcsstr(gcspr, val); + + pr_debug("gcs: about to switch stacks via GCSSS1 to: %lx\n", gcspr); + gcsss1((unsigned long *)gcspr); + return 0; +} +#define arch_shstk_restore gcs_restore + +static always_inline int gcs_vma_restore(VmaEntry *vma_entry) +{ + unsigned long shstk, i, ret; + unsigned long *gcs_data = (void *)vma_premmaped_start(vma_entry); + unsigned long vma_size = vma_entry_len(vma_entry); + + shstk = gcs_map(0, vma_size, SHADOW_STACK_SET_TOKEN); + if (shstk < 0) { + pr_err("Failed to map shadow stack at %lx: %ld\n", shstk, shstk); + } + + /* restore shadow stack contents */ + for (i = 0; i < vma_size / 8; i++) + gcsstr(shstk + i * 8, gcs_data[i]); + + pr_debug("unmap %lx %ld\n", (unsigned long)gcs_data, vma_size); + ret = sys_munmap(gcs_data, vma_size); + if (ret < 0) { + pr_err("Failed to unmap premmaped shadow 
stack\n"); + return ret; + } + + vma_premmaped_start(vma_entry) = shstk; + + return 0; +} +#define shstk_vma_restore gcs_vma_restore + +static always_inline int gcs_switch_to_restorer(struct rst_shstk_info *gcs) +{ + int ret; + unsigned long *ssp; + unsigned long addr; + unsigned long gcspr; + + if (!(gcs && gcs->features_enabled & PR_SHADOW_STACK_ENABLE)) { + return 0; + } + + pr_debug("gcs->premapped_addr + gcs->vma_size = %lx\n", gcs->premapped_addr + gcs->vma_size); + pr_debug("gcs->tmp_gcs = %lx\n", gcs->tmp_gcs); + addr = gcs->tmp_gcs; + + if (addr % PAGE_SIZE != 0) { + pr_err("gcs: 0x%lx not page-aligned to size 0x%lx\n", addr, PAGE_SIZE); + return -1; + } + + ret = sys_munmap((void *)addr, PAGE_SIZE); + if (ret < 0) { + pr_err("gcs: Failed to unmap aarea for dumpee GCS VMAs\n"); + return -1; + } + + gcspr = gcs_map(addr, PAGE_SIZE, SHADOW_STACK_SET_TOKEN); + + if (gcspr == -1) { + pr_err("gcs: failed to gcs_map(%lx, %lx)\n", (unsigned long)addr, PAGE_SIZE); + return -1; + } + + ssp = (unsigned long *)(addr + PAGE_SIZE - 8); + gcsss1(ssp); + + return 0; +} +#define arch_shstk_switch_to_restorer gcs_switch_to_restorer + +#endif /* CR_NOGLIBC */ + +#endif /* __CR_ASM_GCS_H__ */ diff --git a/criu/arch/aarch64/include/asm/restorer.h b/criu/arch/aarch64/include/asm/restorer.h index 2174df4fa..8f3edc257 100644 --- a/criu/arch/aarch64/include/asm/restorer.h +++ b/criu/arch/aarch64/include/asm/restorer.h @@ -5,6 +5,7 @@ #include #include "asm/types.h" +#include "asm/gcs.h" #include "images/core.pb-c.h" #include From f78bea8d34cb54a53ae976f5abb2091f0a45a90a Mon Sep 17 00:00:00 2001 From: Igor Svilenkov Bozic Date: Sun, 23 Nov 2025 17:27:49 +0100 Subject: [PATCH 758/775] zdtm: gcs: add opt-in GCS test support for AArch64 Introduce an opt-in mode for building and running ZDTM static tests with Guarded Control Stack (GCS) enabled on AArch64. Changes: - Support `GCS_ENABLE=1` builds, adding `-mbranch-protection=standard` and `-z experimental-gcs=check` to CFLAGS/LDFLAGS. 
- Export required GLIBC_TUNABLES at runtime via `TEST_ENV`. - %.pid rules to prefix test binaries with `$(TEST_ENV)` so the tunables are set when running tests. - Makefile rules for selectively enabling GCS in tests Usage: # Build and run with GCS enabled make -C zdtm/static GCS_ENABLE=1 posix_timers GCS_ENABLE=1 ./zdtm.py run --keep-img=always \ -t zdtm/static/posix_timers By default (`GCS_ENABLE` unset or 0), test builds and runs are unchanged. NOTE: This assumes that the test victim was compiled also using GCS_ENABLE=1 so that the proper GCS AArch64 ELF headers are present Signed-off-by: Igor Svilenkov Bozic Reviewed-by: Alexander Mikhalitsyn aleksandr.mikhalitsyn@canonical.com --- test/zdtm/Makefile.inc | 8 ++++++++ test/zdtm/static/Makefile | 18 +++++++++--------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index 465285f08..c95b4ef6a 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -79,6 +79,14 @@ define pkg-cflags $(shell PKG_CONFIG_PATH="$(PKG_CONFIG_PATH)" $(PKG_CONFIG) --cflags $(1)) endef +ifeq ($(GCS_ENABLE),1) + CFLAGS += -mbranch-protection=standard + LDFLAGS += -z experimental-gcs=check + TEST_ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=1:glibc.cpu.aarch64_gcs_policy=2 +else + TEST_ENV = +endif + %.d: %.c $(E) " DEP " $@ $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -MM -MP $< -o $@ diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index ea901a805..e1df2e5fa 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -520,30 +520,30 @@ install: all .PHONY: all install $(TST_NOFILE:%=%.pid): %.pid: % - $( Date: Tue, 9 Dec 2025 19:26:03 +0000 Subject: [PATCH 759/775] cr-dump: fix error handling Commit "plugin: Add DUMP_DEVICES_LATE callback" introduced a new plugin callback that is invoked in cr_dump_tasks(). The return value of this callback was assigned to the variable ret. 
However, this variable is later used as the return value when goto err is triggered in subsequent conditions. As a result, CRIU exits with "Dumping finished successfully" even when some actions have failed and inventory.img has not been created. To fix this, we replace ret with exit_code and use it only when it is actually needed. Signed-off-by: Radostin Stoyanov --- criu/cr-dump.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 98b4223ba..a58aaf34a 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -2138,8 +2138,8 @@ int cr_dump_tasks(pid_t pid) InventoryEntry he = INVENTORY_ENTRY__INIT; InventoryEntry *parent_ie = NULL; struct pstree_item *item; - int pre_dump_ret = 0; - int ret = -1; + int ret; + int exit_code = -1; kerndat_warn_about_madv_guards(); @@ -2159,9 +2159,9 @@ int cr_dump_tasks(pid_t pid) goto err; root_item->pid->real = pid; - pre_dump_ret = run_scripts(ACT_PRE_DUMP); - if (pre_dump_ret != 0) { - pr_err("Pre dump script failed with %d!\n", pre_dump_ret); + ret = run_scripts(ACT_PRE_DUMP); + if (ret != 0) { + pr_err("Pre dump script failed with %d!\n", ret); goto err; } if (init_stats(DUMP_STATS)) @@ -2287,39 +2287,32 @@ int cr_dump_tasks(pid_t pid) * ipc shared memory, but an ipc namespace is dumped in a child * process. 
*/ - ret = cr_dump_shmem(); - if (ret) + if (cr_dump_shmem()) goto err; if (root_ns_mask) { - ret = dump_namespaces(root_item, root_ns_mask); - if (ret) + if (dump_namespaces(root_item, root_ns_mask)) goto err; } if ((root_ns_mask & CLONE_NEWTIME) == 0) { - ret = dump_time_ns(0); - if (ret) + if (dump_time_ns(0)) goto err; } if (dump_aa_namespaces() < 0) goto err; - ret = dump_cgroups(); - if (ret) + if (dump_cgroups()) goto err; - ret = fix_external_unix_sockets(); - if (ret) + if (fix_external_unix_sockets()) goto err; - ret = tty_post_actions(); - if (ret) + if (tty_post_actions()) goto err; - ret = inventory_save_uptime(&he); - if (ret) + if (inventory_save_uptime(&he)) goto err; he.has_pre_dump_mode = false; @@ -2328,10 +2321,10 @@ int cr_dump_tasks(pid_t pid) he.allow_uprobes = true; } - ret = write_img_inventory(&he); + exit_code = write_img_inventory(&he); err: if (parent_ie) inventory_entry__free_unpacked(parent_ie, NULL); - return cr_dump_finish(ret); + return cr_dump_finish(exit_code); } From 30acbabcddbad502660f2e4b1aab9f16f99bca4f Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Wed, 26 Nov 2025 13:49:25 +0000 Subject: [PATCH 760/775] ci: also exclude docker version 29 Docker version 28 broke container restore in combination with network namespaces. The workaround in the CI script was excluding Docker version 28. Now that there is also Docker version 29, which is still broken, this also excludes Docker version 29. Signed-off-by: Adrian Reber --- scripts/ci/docker-test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index bc5a74667..c1c745544 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -2,7 +2,7 @@ set -x -e -o pipefail -# Workaround: Docker 28.x has a known regression that breaks the checkpoint and +# Workaround: Docker 28.x and 29.x has a known regression that breaks the checkpoint and # restore (C/R) feature. 
Let's install previous, or next major version. See # https://github.com/moby/moby/issues/50750 for details on the bug. export DEBIAN_FRONTEND=noninteractive @@ -17,7 +17,7 @@ echo \ $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" > /etc/apt/sources.list.d/docker.list apt update -y apt-cache madison docker-ce | awk '{ print $3 }' -verstr="$(apt-cache madison docker-ce | awk '{ print $3 }' | sort | grep -v ':28\.'| tail -n 1)" +verstr="$(apt-cache madison docker-ce | awk '{ print $3 }' | sort | grep -Ev ':(28|29)\.'| tail -n 1)" ../../contrib/apt-install -y "docker-ce=$verstr" "docker-ce-cli=$verstr" # docker checkpoint and restore is an experimental feature From d4e8114130bd97ad280f85568e9feeb93c27cd53 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 17 Dec 2025 00:53:59 +0000 Subject: [PATCH 761/775] readme: use a local copy of the CRIU logo The README currently uses an external link to criu.org for the embedded CRIU logo. Loading this URL when viewing the README on GitHub sometimes fails with "Error Fetching Resource". Using a local copy of the logo fixes this issue. Signed-off-by: Radostin Stoyanov --- Documentation/logo.svg | 136 +++++++++++++++++++++++++++++++++++++++++ README.md | 2 +- 2 files changed, 137 insertions(+), 1 deletion(-) create mode 100644 Documentation/logo.svg diff --git a/Documentation/logo.svg b/Documentation/logo.svg new file mode 100644 index 000000000..f713e72b7 --- /dev/null +++ b/Documentation/logo.svg @@ -0,0 +1,136 @@ + + + + + + + diff --git a/README.md b/README.md index f578e745c..6e2a0de9e 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ [![CircleCI](https://circleci.com/gh/checkpoint-restore/criu.svg?style=svg)]( https://circleci.com/gh/checkpoint-restore/criu) -

+

## CRIU -- A project to implement checkpoint/restore functionality for Linux From 2e5f9facf92e90a81f06f3afd3fa214a9fa7bf1c Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Fri, 26 Dec 2025 12:28:45 +0800 Subject: [PATCH 762/775] util: Make close_safe() reset fd to -1 even on close() failure The "man 2 close":"Dealing with error returns from close()" says: "Retrying the close() after a failure return is the wrong thing to do" We should not leave the fd there, attempting to close it again on next close()/close_safe() may lead to accidentally closing something else. It confirms with the kernel code where sys_close() removes fd from fdtable in this stack: +-> sys_close +-> file_close_fd +-> file_close_fd_locked +-> rcu_assign_pointer(fdt->fd[fd], NULL) If there was an fd this stack is always reached and fd is always removed. Let's replace the fd with -1 after close no matter what. Signed-off-by: Pavel Tikhomirov --- criu/util.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/criu/util.c b/criu/util.c index e2f80e4c6..2eaad35bb 100644 --- a/criu/util.c +++ b/criu/util.c @@ -222,10 +222,9 @@ int close_safe(int *fd) if (*fd > -1) { ret = close(*fd); - if (!ret) - *fd = -1; - else - pr_perror("Unable to close fd %d", *fd); + if (ret) + pr_perror("Failed closing fd %d", *fd); + *fd = -1; } return ret; From fc1867c44d1b6b0771deb2ff317331fc30b7ac78 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Fri, 2 Jan 2026 13:15:43 +0800 Subject: [PATCH 763/775] kerndat: Fix error handling for kerndat_has_timer_cr_ids() fail After commit [1] we accidentally stopped reporting the errors from kerndat_has_timer_cr_ids(), let's fix that. 
Fixes: 1eaa870cc ("kerndat: check that hardware breakpoints work") [1] Signed-off-by: Pavel Tikhomirov --- criu/kerndat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/criu/kerndat.c b/criu/kerndat.c index 997181ce7..2dc2f77d5 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -2116,6 +2116,7 @@ int kerndat_init(void) } if (!ret && kerndat_has_timer_cr_ids()) { pr_err("kerndat_has_timer_cr_ids has failed when initializing kerndat.\n"); + ret = -1; } if (!ret && kerndat_breakpoints()) { pr_err("kerndat_breakpoints has failed when initializing kerndat.\n"); From b1a51489ddfce210ed6f64024fbd5ea823bacfd1 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 29 Dec 2025 11:00:15 +0000 Subject: [PATCH 764/775] compel: fix sys_clock_gettime function signature The initialization of the struct timespec used as clockid input parameter was removed in commit: b4441d1bd8a56ed9ec08603e1d4acf5c779fe935 ("restorer.c: rm unneded struct init") This causes the build to fail on Alpine with clang version 21.1.2: GEN criu/pie/parasite-blob.h criu/pie/restorer.c:1230:39: error: variable 'ts' is uninitialized when passed as a const pointer argument here [-Werror,-Wuninitialized-const-pointer] 1230 | if (sys_clock_gettime(t->clockid, &ts)) { | ^~ 1 error generated. make[2]: *** [/criu/scripts/nmk/scripts/build.mk:118: criu/pie/restorer.o] Error 1 make[1]: *** [criu/Makefile:59: pie] Error 2 make: *** [Makefile:278: criu] Error 2 To fix this, we remove the "const" from the declaration of clock_gettime. Since the kernel writes the current time into the struct timespec provided by the caller, the pointer must be writable. 
Suggested-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- compel/arch/arm/plugins/std/syscalls/syscall.def | 2 +- compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl | 2 +- compel/arch/mips/plugins/std/syscalls/syscall_64.tbl | 2 +- compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl | 2 +- compel/arch/riscv64/plugins/std/syscalls/syscall.def | 2 +- compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl | 2 +- compel/arch/x86/plugins/std/syscalls/syscall_64.tbl | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index 819678566..f4deb02b2 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -85,7 +85,7 @@ timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimer timer_gettime 108 259 (int timer_id, const struct itimerspec *setting) timer_getoverrun 109 260 (int timer_id) timer_delete 111 261 (kernel_timer_t timer_id) -clock_gettime 113 263 (const clockid_t which_clock, const struct timespec *tp) +clock_gettime 113 263 (clockid_t which_clock, struct timespec *tp) exit_group 94 248 (int error_code) set_robust_list 99 338 (struct robust_list_head *head, size_t len) get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) diff --git a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl index aa6ffb44d..83dcdab4a 100644 --- a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl @@ -46,7 +46,7 @@ __NR_sys_timer_gettime 108 sys_timer_gettime (int timer_id, const struct itimer __NR_sys_timer_getoverrun 109 sys_timer_getoverrun (int timer_id) __NR_sys_timer_settime 110 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec 
*old_setting) __NR_sys_timer_delete 111 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 113 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_clock_gettime 113 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) __NR_sched_setscheduler 119 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) __NR_restart_syscall 128 sys_restart_syscall (void) __NR_kill 129 sys_kill (long pid, int sig) diff --git a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl index 85faca5a9..ad3d44634 100644 --- a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl @@ -84,7 +84,7 @@ __NR_sys_timer_settime 5217 sys_timer_settime (kernel_timer_t timer_id, int fl __NR_sys_timer_gettime 5218 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 5219 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 5220 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 5222 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_clock_gettime 5222 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) __NR_exit_group 5205 sys_exit_group (int error_code) __NR_set_thread_area 5242 sys_set_thread_area (unsigned long *addr) __NR_openat 5247 sys_openat (int dfd, const char *filename, int flags, int mode) diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl index c56b4e6de..3deb41cf7 100644 --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl @@ -82,7 +82,7 @@ __NR_sys_timer_settime 241 sys_timer_settime (kernel_timer_t timer_id, int flag __NR_sys_timer_gettime 242 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 243 sys_timer_getoverrun (int timer_id) 
__NR_sys_timer_delete 244 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 246 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_clock_gettime 246 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) __NR_exit_group 234 sys_exit_group (int error_code) __NR_waitid 272 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_set_robust_list 300 sys_set_robust_list (struct robust_list_head *head, size_t len) diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall.def b/compel/arch/riscv64/plugins/std/syscalls/syscall.def index 17f763e90..967f097f9 100644 --- a/compel/arch/riscv64/plugins/std/syscalls/syscall.def +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall.def @@ -85,7 +85,7 @@ timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimer timer_gettime 108 259 (int timer_id, const struct itimerspec *setting) timer_getoverrun 109 260 (int timer_id) timer_delete 111 261 (kernel_timer_t timer_id) -clock_gettime 113 263 (const clockid_t which_clock, const struct timespec *tp) +clock_gettime 113 263 (clockid_t which_clock, struct timespec *tp) exit_group 94 248 (int error_code) set_robust_list 99 338 (struct robust_list_head *head, size_t len) get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl index 018d58a59..ff2f33006 100644 --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl @@ -82,7 +82,7 @@ __NR_sys_timer_settime 255 sys_timer_settime (kernel_timer_t timer_id, int flag __NR_sys_timer_gettime 256 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 257 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 258 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 260 
sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_clock_gettime 260 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) __NR_exit_group 248 sys_exit_group (int error_code) __NR_waitid 281 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_set_robust_list 304 sys_set_robust_list (struct robust_list_head *head, size_t len) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl index 7fbfd69ad..8c3620c2a 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -85,7 +85,7 @@ __NR_sys_timer_settime 223 sys_timer_settime (kernel_timer_t timer_id, int fla __NR_sys_timer_gettime 224 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 225 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 226 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 228 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_clock_gettime 228 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) __NR_exit_group 231 sys_exit_group (int error_code) __NR_openat 257 sys_openat (int dfd, const char *filename, int flags, int mode) __NR_waitid 247 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) From 974c1bc898bb4f5104a44d19029c5800ae431686 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 29 Dec 2025 14:05:20 +0000 Subject: [PATCH 765/775] zdtm/tempfs_subns: fix uninitialized variable DEP tempfs_subns.d CC tempfs_subns.o tempfs_subns.c:50:23: error: variable 'fd' is uninitialized when passed as a const pointer argument here [-Werror,-Wuninitialized-const-pointer] 50 | if (write(fds[1], &fd, sizeof(fd)) != sizeof(fd)) { | ^~ 1 error generated. 
make[1]: *** [../Makefile.inc:96: tempfs_subns.o] Error 1 Signed-off-by: Radostin Stoyanov --- test/zdtm/static/tempfs_subns.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/static/tempfs_subns.c b/test/zdtm/static/tempfs_subns.c index ed3ef9a3a..490fdad6e 100644 --- a/test/zdtm/static/tempfs_subns.c +++ b/test/zdtm/static/tempfs_subns.c @@ -20,7 +20,7 @@ int main(int argc, char **argv) { int fds[2], i; pid_t pid; - int fd, status; + int status, fd = -1; test_init(argc, argv); From 2dd66866e3e07c2cebeaa1713bc310c98d5027e7 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 29 Dec 2025 14:13:05 +0000 Subject: [PATCH 766/775] zdtm/cgroup_stray: fix uninitialized variable 51.04 DEP cgroup_stray.d 51.07 CC cgroup_stray.o 51.11 cgroup_stray.c:164:18: error: variable 'c' is uninitialized when passed as a const pointer argument here [-Werror,-Wuninitialized-const-pointer] 51.11 164 | if (write(sk, &c, 1) != 1) { 51.11 | ^ 51.11 1 error generated. 51.12 make[1]: *** [../Makefile.inc:96: cgroup_stray.o] Error 1 51.12 make[1]: Leaving directory '/criu/test/zdtm/static' 51.12 make: *** [Makefile:7: static] Error 2 Signed-off-by: Radostin Stoyanov --- test/zdtm/static/cgroup_stray.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/static/cgroup_stray.c b/test/zdtm/static/cgroup_stray.c index 0c0ed93cf..f5754410f 100644 --- a/test/zdtm/static/cgroup_stray.c +++ b/test/zdtm/static/cgroup_stray.c @@ -135,7 +135,7 @@ out: int main(int argc, char **argv) { int ret = -1, sk_pair[2], sk, status; - char path[PATH_MAX], c; + char path[PATH_MAX], c = 0; pid_t pid = 0; test_init(argc, argv); From ddf7a170ff74befbe7da2114054e4b1c85ea2d3d Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 29 Dec 2025 12:12:53 +0000 Subject: [PATCH 767/775] infect-types: fix user_gcs redefine error In file included from compel/arch/aarch64/src/lib/infect.c:10: compel/include/uapi/compel/asm/infect-types.h:24:8: error: redefinition of 
'user_gcs' 24 | struct user_gcs { | ^ /usr/include/asm/ptrace.h:329:8: note: previous definition is here 329 | struct user_gcs { | ^ 1 error generated. make[1]: *** [/criu/scripts/nmk/scripts/build.mk:215: compel/arch/aarch64/src/lib/infect.o] Error 1 Suggested-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- .../arch/aarch64/src/lib/include/uapi/asm/infect-types.h | 7 +++++-- compel/arch/aarch64/src/lib/infect.c | 8 ++++---- criu/arch/aarch64/crtools.c | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h index 3a34ab4f6..606c92ffe 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h @@ -20,8 +20,11 @@ typedef struct user_pt_regs user_regs_struct_t; /* * GCS (Guarded Control Stack) + * + * This mirrors the kernel definition but renamed to cr_user_gcs + * to avoid conflict with kernel headers (/usr/include/asm/ptrace.h). 
*/ -struct user_gcs { +struct cr_user_gcs { __u64 features_enabled; __u64 features_locked; __u64 gcspr_el0; @@ -29,7 +32,7 @@ struct user_gcs { struct user_fpregs_struct { struct user_fpsimd_state fpstate; - struct user_gcs gcs; + struct cr_user_gcs gcs; }; typedef struct user_fpregs_struct user_fpregs_struct_t; diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 7450ac026..42f593c79 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -41,7 +41,7 @@ bool __compel_host_supports_gcs(void) return (hwcap & HWCAP_GCS) != 0; } -static bool __compel_gcs_enabled(struct user_gcs *gcs) +static bool __compel_gcs_enabled(struct cr_user_gcs *gcs) { if (!compel_host_supports_gcs()) return false; @@ -136,7 +136,7 @@ int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) { struct iovec iov; - struct user_gcs gcs; + struct cr_user_gcs gcs; struct iovec gcs_iov = { .iov_base = &gcs, .iov_len = sizeof(gcs) }; pr_info("Restoring GP/FPU registers for %d\n", pid); @@ -363,7 +363,7 @@ int ptrace_flush_breakpoints(pid_t pid) return 0; } -int inject_gcs_cap_token(struct parasite_ctl *ctl, pid_t pid, struct user_gcs *gcs) +int inject_gcs_cap_token(struct parasite_ctl *ctl, pid_t pid, struct cr_user_gcs *gcs) { struct iovec gcs_iov = { .iov_base = gcs, .iov_len = sizeof(*gcs) }; @@ -403,7 +403,7 @@ int inject_gcs_cap_token(struct parasite_ctl *ctl, pid_t pid, struct user_gcs *g int parasite_setup_shstk(struct parasite_ctl *ctl, user_fpregs_struct_t *ext_regs) { - struct user_gcs gcs; + struct cr_user_gcs gcs; struct iovec gcs_iov = { .iov_base = &gcs, .iov_len = sizeof(gcs) }; pid_t pid = ctl->rpid; diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index 835a83400..2e89f9ce3 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -148,7 +148,7 @@ static int save_pac_keys(int pid, CoreEntry *core) int save_task_regs(pid_t pid, void *x, 
user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) { int i; - struct user_gcs gcs_live; + struct cr_user_gcs gcs_live; struct iovec gcs_iov = { .iov_base = &gcs_live, .iov_len = sizeof(gcs_live), From 36f1e9d38c3b697ca38f405df666929bcd483034 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 29 Dec 2025 15:26:22 +0000 Subject: [PATCH 768/775] amdgpu: use fseeko with large-file support instead of fseeko64 As of Alpine Linux 3.19, musl libc no longer contains separate fopen64(), fseeko64(), or ftello64() functions. This causes building CRIU with amdgpu plugin to fail with the following error: amdgpu_plugin.c: In function 'parallel_restore_bo_contents': amdgpu_plugin.c:2286:17: error: implicit declaration of function 'fseeko64'; did you mean 'fseeko'? [-Wimplicit-function-declaration] 2286 | fseeko64(bo_contents_fp, entry->read_offset + offset, SEEK_SET); | ^~~~~~~~ | fseeko make[2]: *** [Makefile:31: amdgpu_plugin.so] Error 1 make[1]: *** [Makefile:363: amdgpu_plugin] Error 2 To fix this, add the missing $(DEFINES) to plugin builds, and since we always compile with _FILE_OFFSET_BITS=64, we don't need the 64 suffix. Fixes: #2826 Suggested-by: Andrei Vagin Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/Makefile | 2 +- plugins/amdgpu/amdgpu_plugin.c | 2 +- plugins/cuda/Makefile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 31e177e4a..250e7b0e7 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -28,7 +28,7 @@ criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc --proto_path=. --c_out=. 
criu-amdgpu.proto amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_dmabuf.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c - $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) + $(CC) $(PLUGIN_CFLAGS) $(DEFINES) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: $(call msg-clean, $@) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 574d7b829..ee55bde0a 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -2283,7 +2283,7 @@ void *parallel_restore_bo_contents(void *_thread_data) continue; entry = &restore_cmd->entries[i]; - fseeko64(bo_contents_fp, entry->read_offset + offset, SEEK_SET); + fseeko(bo_contents_fp, entry->read_offset + offset, SEEK_SET); ret = sdma_copy_bo(restore_cmd->fds_write[entry->write_id], entry->size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE, false); diff --git a/plugins/cuda/Makefile b/plugins/cuda/Makefile index cc3d98ac9..2c1944a34 100644 --- a/plugins/cuda/Makefile +++ b/plugins/cuda/Makefile @@ -19,7 +19,7 @@ all: $(DEPS_CUDA) cuda_plugin.so: cuda_plugin.c $(call msg-gen, $@) - $(Q) $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) + $(Q) $(CC) $(PLUGIN_CFLAGS) $(DEFINES) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) clean: $(call msg-clean, $@) From 71fe85ec90871ffcfb57060f28760f607e0e6d08 Mon Sep 17 00:00:00 2001 From: ImranullahKhann Date: Thu, 8 Jan 2026 19:55:04 +0500 Subject: [PATCH 769/775] ci: add iproute2 to the list of packages in apt-packages.sh When running the command 'make docker-test', almost all zdtm tests fail, logging 'ip: not found'. 'ip' command of the iproute2 package was missing. So added the package to the list of dependencies in 'apt-packages.sh'. 
Now tests run Signed-off-by: ImranullahKhann --- contrib/dependencies/apt-packages.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/dependencies/apt-packages.sh b/contrib/dependencies/apt-packages.sh index 1fd42d4e6..7963be7b4 100755 --- a/contrib/dependencies/apt-packages.sh +++ b/contrib/dependencies/apt-packages.sh @@ -13,6 +13,7 @@ fi build-essential \ gdb \ git-core \ + iproute2 \ iptables \ kmod \ libaio-dev \ From 9885fb3c75ee6d18e554ec1ddddf6ec2c89ea848 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Sun, 11 Jan 2026 14:56:49 +0000 Subject: [PATCH 770/775] crit: fix incorrect task state decoding CRIU defines the following constants for task state in compel/include/uapi/task-state.h COMPEL_TASK_ALIVE = 0x01 COMPEL_TASK_STOPPED = 0x03 COMPEL_TASK_ZOMBIE = 0x06 Thus, we need to swap the values for "zombie" and "stopped" used in CRIT. Signed-off-by: Radostin Stoyanov --- lib/pycriu/images/pb2dict.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index a35dd3c3f..b80e3475f 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -154,8 +154,8 @@ flags_maps = { gen_maps = { 'task_state': { 1: 'Alive', - 3: 'Zombie', - 6: 'Stopped' + 3: 'Stopped', + 6: 'Zombie', }, } From b208bec12d5bc68b47b2a1f026f527e7f3141445 Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Mon, 12 Jan 2026 05:47:02 +0000 Subject: [PATCH 771/775] crit: show dead task_state In some cases, CRIU can observe tasks that exit during checkpointing, and sets the state of these tasks to COMPEL_TASK_DEAD. This patch adds a string representation of this value that can be used by CRIT when decoding the images. 
Signed-off-by: Radostin Stoyanov --- lib/pycriu/images/pb2dict.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/pycriu/images/pb2dict.py b/lib/pycriu/images/pb2dict.py index b80e3475f..f22887a52 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -154,6 +154,7 @@ flags_maps = { gen_maps = { 'task_state': { 1: 'Alive', + 2: 'Dead', 3: 'Stopped', 6: 'Zombie', }, From fb59ae504e90edf08c133b4d2d938f645421c396 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 9 Jan 2026 08:15:05 +0000 Subject: [PATCH 772/775] test: fix GCC 16 compile error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fedora rawhide ships a pre-release of GCC 16 which produces following error: uprobes.c:34:22: error: variable ‘dummy’ set but not used [-Werror=unused-but-set-variable=] 34 | volatile int dummy = 0; | ^~~~~ Marking this variable as "__maybe_unused" to fix the error. Signed-off-by: Adrian Reber --- test/zdtm/static/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/zdtm/static/uprobes.c b/test/zdtm/static/uprobes.c index 4164375b7..6ef9a56bc 100644 --- a/test/zdtm/static/uprobes.c +++ b/test/zdtm/static/uprobes.c @@ -31,7 +31,7 @@ const char *test_author = "Shashank Balaji "; * compiler optimization) and use it (to prevent "unused variable" warning) */ void UPROBED_FUNCTION(void) { - volatile int dummy = 0; + volatile int dummy __maybe_unused = 0; dummy += 1; } /* Calling via volatile function pointer ensures noinline at callsite */ From 07af3304fdce72b479d8670fced93604c295461f Mon Sep 17 00:00:00 2001 From: liqiang2020 Date: Tue, 16 Dec 2025 18:14:13 +0800 Subject: [PATCH 773/775] restore/pie: check return value of sys_rseq on unregister The return value of sys_rseq was previously ignored during unregistration, under the assumption that it would not fail if the rseq structure was properly registered. However, if sys_rseq fails, the kernel retains the registration. 
If the memory containing the rseq structure is subsequently unmapped or reused, kernel updates to the rseq area can cause the process to crash (e.g., via SIGSEGV). Check the return value of sys_rseq. If it fails, log the error code and abort the restoration process. This makes rseq unregistration failures fatal and explicit, aiding in debugging and preventing later obscure crashes. Signed-off-by: liqiang2020 --- criu/pie/restorer.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 008e1398d..0a8aba41b 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -1363,13 +1363,19 @@ __visible void __export_unmap(void) sys_munmap(bootstrap_start, bootstrap_len - vdso_rt_size); } -static void unregister_libc_rseq(struct rst_rseq_param *rseq) +static int unregister_libc_rseq(struct rst_rseq_param *rseq) { - if (!rseq->rseq_abi_pointer) - return; + long ret; - /* can't fail if rseq is registered */ - sys_rseq(decode_pointer(rseq->rseq_abi_pointer), rseq->rseq_abi_size, 1, rseq->signature); + if (!rseq->rseq_abi_pointer) + return 0; + + ret = sys_rseq(decode_pointer(rseq->rseq_abi_pointer), rseq->rseq_abi_size, 1, rseq->signature); + if (ret) { + pr_err("Failed to unregister libc rseq %ld\n", ret); + return -1; + } + return 0; } /* @@ -1803,7 +1809,8 @@ __visible long __export_restore_task(struct task_restore_args *args) * for instance once the kernel will want to update (struct rseq).cpu_id field: * https://github.com/torvalds/linux/blob/ce522ba9ef7e/kernel/rseq.c#L89 */ - unregister_libc_rseq(&args->libc_rseq); + if (unregister_libc_rseq(&args->libc_rseq)) + goto core_restore_end; if (unmap_old_vmas((void *)args->premmapped_addr, args->premmapped_len, bootstrap_start, bootstrap_len, args->task_size)) From 21a6758268e9230ae258d3c70aa12ae1a4da6750 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Fri, 16 Jan 2026 12:33:10 +0800 Subject: [PATCH 774/775] cr-restore/shstk: Make 
arch_shstk_unlock use correct pid In a simple case where the parent process and the child one are in one pid namespace we can safely use vpid(item) to ptrace the child. But, for the cases where the child is a pid namespace init, or the child is put into external pid namespace, the parent and the child have different pid namespaces and using pid vpid(item) (which e.g. for init will always be 1 here) to ptrace the child process is incorrect. Let's use the pid reported to us from clone as it's always the right pid of the child from the parent's point of view. Fixes: 7dd583002 ("restore: add infrastructure to enable shadow stack") Signed-off-by: Pavel Tikhomirov --- criu/cr-restore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index a5eda8d60..b92b92715 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -1238,7 +1238,7 @@ static inline int fork_with_pid(struct pstree_item *item) pr_debug("PID: real %d virt %d\n", item->pid->real, vpid(item)); } - arch_shstk_unlock(item, ca.core, pid); + arch_shstk_unlock(item, ca.core, ret); err_unlock: if (!(ca.clone_flags & CLONE_NEWPID)) From 9e5fbcd668709a79a876e4c6eff13b975b0f3631 Mon Sep 17 00:00:00 2001 From: unichronic Date: Sun, 18 Jan 2026 00:37:44 +0530 Subject: [PATCH 775/775] pycriu: Fix self-dump failure with explicit PID When `opts.pid` is explicitly set to `os.getpid()`, `pycriu` fails to daemonize the `criu` process. This causes `criu` to run as a child of the dumped process, leading to the error "The criu itself is within dumped tree". This can be fixed by modifying `_send_req_and_recv_resp` to check if the target PID matches the current process PID. If so, it enables daemon mode, ensuring `criu` is detached and the dump succeeds. 
Signed-off-by: unichronic --- lib/pycriu/criu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/pycriu/criu.py b/lib/pycriu/criu.py index 760d2be78..51a5c2902 100644 --- a/lib/pycriu/criu.py +++ b/lib/pycriu/criu.py @@ -242,7 +242,7 @@ class criu: # process resources from its own if criu is located in a same # process tree it is trying to dump. daemon = False - if req.type == rpc.DUMP and not req.opts.HasField('pid'): + if req.type == rpc.DUMP and (not req.opts.HasField('pid') or req.opts.pid == os.getpid()): daemon = True try: