diff --git a/.circleci/config.yml b/.circleci/config.yml index 47f7ad9b1..785b383e1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2.1 jobs: test-local-gcc: machine: - image: ubuntu-2004:202010-01 + image: default working_directory: ~/criu steps: - checkout @@ -11,7 +11,7 @@ jobs: command: sudo -E make -C scripts/ci local test-local-clang: machine: - image: ubuntu-2004:202010-01 + image: default working_directory: ~/criu steps: - checkout diff --git a/.cirrus.yml b/.cirrus.yml index 2b6903ddc..72dbb3898 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -13,12 +13,41 @@ task: nested_virtualization: true setup_script: | - scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-no-vdso +task: + name: CentOS Stream 9 based test + environment: + HOME: "/root" + CIRRUS_WORKING_DIR: "/tmp/criu" + + compute_engine_instance: + image_project: centos-cloud + image: family/centos-stream-9 + platform: linux + cpu: 4 + memory: 8G + + setup_script: | + dnf config-manager --set-enabled crb # Same as CentOS 8 powertools + dnf -y install epel-release epel-next-release + contrib/dependencies/dnf-packages.sh + # The image has a too old version of nettle which does not work with gnutls. + # Just upgrade to the latest to make the error go away. + dnf -y upgrade nettle nettle-devel + systemctl stop sssd + # Even with selinux in permissive mode the selinux tests will be executed. + # The Cirrus CI user runs as a service from selinux point of view and is + # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0). + # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode. + setenforce 0 + + build_script: | + make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" + task: name: Vagrant Fedora Rawhide based test environment: @@ -34,67 +63,39 @@ task: nested_virtualization: true setup_script: | - scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-rawhide task: - name: CentOS 8 based test + name: Vagrant Fedora based test (non-root) environment: HOME: "/root" CIRRUS_WORKING_DIR: "/tmp/criu" compute_engine_instance: - image_project: centos-cloud - image: family/centos-stream-8 + image_project: cirrus-images + image: family/docker-kvm platform: linux cpu: 4 - memory: 8G + memory: 16G + nested_virtualization: true setup_script: | - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto - yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm dnf-plugins-core - yum config-manager --set-enabled powertools - yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-future python3-protobuf xmlto - alternatives --set python /usr/bin/python3 - systemctl stop sssd - # Even with selinux in permissive mode the selinux tests will be executed - # The Cirrus CI user runs as a service from selinux point of view and is - # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0) - # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode - setenforce 0 - pip3 install junit_xml - + contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + sudo kvm-ok build_script: | - make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" + make -C scripts/ci vagrant-fedora-non-root task: - name: CentOS 7 based test - environment: - HOME: "/root" - CIRRUS_WORKING_DIR: "/tmp/criu" - - compute_engine_instance: - image_project: centos-cloud - image: family/centos-7 - platform: linux + name: aarch64 Fedora Rawhide + arm_container: + image: registry.fedoraproject.org/fedora:rawhide cpu: 4 - memory: 8G - - setup_script: | - ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto - yum install -y findutils gcc git gnutls-devel iproute iptables libaio-devel libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make procps-ng protobuf-c-devel protobuf-devel protobuf-python python python-flake8 python-ipaddress python2-future python2-junit_xml python-yaml python-six sudo tar which e2fsprogs python2-pip rubygem-asciidoctor libselinux-devel - # Even with selinux in permissive mode the selinux tests will be executed - # The Cirrus CI user runs as a service from selinux point of view and is - # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0) - # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode - setenforce 0 - # Enable user namespaces on CentOS 7 - echo 10000 > /proc/sys/user/max_user_namespaces - # Adapt sudoers to our needs - echo 'root ALL=(ALL:ALL) ALL' | EDITOR='tee -a' visudo - + memory: 4G + script: uname -a build_script: | - make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_IGNORE_TAINT=1 ZDTM_OPTS="-x zdtm/static/socket-raw -x zdtm/static/child_subreaper_existing_child -x zdtm/static/fifo_upon_unix_socket01 -x zdtm/static/overmount_sock -x zdtm/static/tempfs_overmounted" + scripts/ci/prepare-for-fedora-rawhide.sh + make -C scripts/ci/ local CC=gcc SKIP_CI_PREP=1 SKIP_CI_TEST=1 CD_TO_TOP=1 + make -C test/zdtm -j 4 diff --git a/.clang-format b/.clang-format index 96ba5909f..fb40bc613 100644 --- a/.clang-format +++ b/.clang-format @@ -53,7 +53,7 @@ BreakConstructorInitializersBeforeComma: false BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0 BreakAfterJavaFieldAnnotations: false BreakStringLiterals: false -ColumnLimit: 120 +ColumnLimit: 0 CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false # Unknown to clang-format-4.0 ConstructorInitializerAllOnOneLineOrOnePerLine: false @@ -71,6 +71,7 @@ FixNamespaceComments: false # Unknown to clang-format-4.0 # | sort | uniq ForEachMacros: - 'for_each_pstree_item' + - 'for_each_bit' - 'apei_estatus_for_each_section' - 'ata_for_each_dev' - 'ata_for_each_link' @@ -515,6 +516,7 @@ IncludeCategories: Priority: 1 IncludeIsMainRegex: '(Test)?$' IndentCaseLabels: false +IndentGotoLabels: false IndentPPDirectives: None # Unknown to clang-format-5.0 IndentWidth: 8 IndentWrappedFunctionNames: false diff --git a/.codespellrc b/.codespellrc index 765dacfab..5def594b2 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,3 +1,3 @@ [codespell] -skip = ./.git,./test/pki -ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng +skip = ./.git,./test/pki,./tags,./plugins/amdgpu/amdgpu_drm.h,./plugins/amdgpu/drm.h,./plugins/amdgpu/drm_mode.h +ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen,sems diff --git a/.drone.yml b/.drone.yml deleted file mode 100644 index 07eb8be65..000000000 --- a/.drone.yml +++ /dev/null @@ -1,82 +0,0 @@ ---- -kind: pipeline -type: docker -name: aarch64 build GCC (native) - -platform: - os: linux - arch: arm64 - -steps: -- name: build - image: ubuntu:focal - commands: - - scripts/ci/apt-install make - - make -C scripts/ci local - ---- -kind: pipeline -type: docker -name: aarch64 build CLANG (native) - -platform: - os: linux - arch: arm64 - -steps: -- name: build - image: ubuntu:focal - commands: - - scripts/ci/apt-install make - - make -C scripts/ci local CLANG=1 - ---- -kind: pipeline -type: docker -name: armhf build GCC (native) - -platform: - os: linux - arch: arm - -steps: -- name: build - # At the time of setting up focal did not work - image: ubuntu:bionic - commands: - - scripts/ci/apt-install make - - make -C scripts/ci local - ---- -kind: pipeline -type: docker -name: armhf build CLANG (native) - -platform: - os: linux - arch: arm - -steps: -- name: build - # At the time of setting up focal did not work - image: ubuntu:bionic - commands: - - scripts/ci/apt-install make - - make -C scripts/ci local CLANG=1 - ---- -kind: pipeline -type: docker -name: aarch64 Fedora Rawhide - -platform: - os: linux - arch: arm64 - -steps: -- name: build - image: registry.fedoraproject.org/fedora:rawhide - commands: - - scripts/ci/prepare-for-fedora-rawhide.sh - - make -C scripts/ci/ local CC=gcc SKIP_CI_PREP=1 SKIP_CI_TEST=1 CD_TO_TOP=1 - - make -C test/zdtm -j 4 diff --git a/.github/workflows/aarch64-test.yaml b/.github/workflows/aarch64-test.yaml new file mode 100644 index 000000000..ebbecadb3 --- /dev/null +++ b/.github/workflows/aarch64-test.yaml @@ -0,0 +1,34 @@ +name: aarch64 test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: aarch64-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + strategy: + matrix: + os: [ubuntu-24.04-arm, ubuntu-22.04-arm] + target: [GCC=1, CLANG=1] + + runs-on: ${{ matrix.os }} + + steps: + - uses: actions/checkout@v4 + - name: Run Tests ${{ matrix.target }} on ${{ matrix.os }} + # Following tests are failing on the VMs: + # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out + # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) + # + # In combination with '--remote-lazy-pages' following error occurs: + # 138: FAIL: maps05.c:84: Data corrupted at page 1639 (errno = 11 (Resource temporarily unavailable)) + run: | + # The 'sched_policy00' needs the following: + sudo sysctl -w kernel.sched_rt_runtime_us=-1 + # etc/hosts entry is needed for netns_lock_iptables + echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts + sudo -E make -C scripts/ci local ${{ matrix.target }} RUN_TESTS=1 \ + ZDTM_OPTS="-x zdtm/static/change_mnt_context -x zdtm/static/maps05" diff --git a/.github/workflows/alpine-test.yml b/.github/workflows/alpine-test.yml index 6fc546ff5..0f5c20f48 100644 --- a/.github/workflows/alpine-test.yml +++ b/.github/workflows/alpine-test.yml @@ -2,14 +2,20 @@ name: Alpine Test on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: alpine-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: - runs-on: ubuntu-20.04 strategy: matrix: + os: [ubuntu-22.04, ubuntu-22.04-arm] target: [GCC=1, CLANG=1] + runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Alpine ${{ matrix.target }} Test run: sudo -E make -C scripts/ci alpine ${{ matrix.target }} diff --git a/.github/workflows/archlinux-test.yml b/.github/workflows/archlinux-test.yml index bb98623a8..425f0662b 100644 --- a/.github/workflows/archlinux-test.yml +++ b/.github/workflows/archlinux-test.yml @@ -2,10 +2,15 @@ name: Arch Linux Test on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: archlinux-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Arch Linux Test run: sudo -E make -C scripts/ci archlinux diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml new file mode 100644 index 000000000..bf7d06697 --- /dev/null +++ b/.github/workflows/check-commits.yml @@ -0,0 +1,30 @@ +name: Verify self-contained commits + +on: pull_request + +# Cancel any preceding run on the pull request +concurrency: + group: commit-test-${{ github.event.pull_request.number }} + +jobs: + build: + runs-on: ubuntu-latest + # Check if pull request does not have label "not-selfcontained-ok" + if: "!contains(github.event.pull_request.labels.*.name, 'not-selfcontained-ok')" + steps: + - uses: actions/checkout@v4 + with: + # Needed to rebase against the base branch + fetch-depth: 0 + # Checkout pull request HEAD commit instead of merge commit + ref: ${{ github.event.pull_request.head.sha }} + - name: Install dependencies + run: sudo contrib/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev uuid-dev + - name: Configure git user details + run: | + git config --global user.email "checkpoint-restore@users.noreply.github.com" + git config --global user.name "checkpoint-restore" + - name: Configure base branch without switching current branch + run: git fetch origin ${{ github.base_ref }}:${{ github.base_ref }} + - name: Build each commit + run: git rebase ${{ github.base_ref }} -x "make -C scripts/ci check-commit" diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 000000000..9c9e46c1b --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,50 @@ +name: "CodeQL" + +on: + push: + branches: [ "criu-dev", "master" ] + pull_request: + branches: [ "criu-dev" ] + schedule: + - cron: "11 6 * * 3" + +# Cancel any preceding run on the pull request. +concurrency: + group: codeql-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ python, cpp ] + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Packages (cpp) + if: ${{ matrix.language == 'cpp' }} + run: | + sudo contrib/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev python3-yaml libnl-route-3-dev gnutls-dev + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + queries: +security-and-quality + + - name: Autobuild + uses: github/codeql-action/autobuild@v3 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{ matrix.language }}" diff --git a/.github/workflows/compat-test.yml b/.github/workflows/compat-test.yml index 5ae25fb73..8a64ce185 100644 --- a/.github/workflows/compat-test.yml +++ b/.github/workflows/compat-test.yml @@ -2,15 +2,20 @@ name: Compat Tests on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: compat-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 strategy: matrix: target: [GCC, CLANG] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Compat Tests (${{ matrix.target }}) run: sudo -E make -C scripts/ci local COMPAT_TEST=y ${{ matrix.target }}=1 diff --git a/.github/workflows/cross-compile-daily.yml b/.github/workflows/cross-compile-daily.yml index 927ddced2..c709cca00 100644 --- a/.github/workflows/cross-compile-daily.yml +++ b/.github/workflows/cross-compile-daily.yml @@ -10,11 +10,11 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - target: [armv7-stable-cross, aarch64-stable-cross, ppc64-stable-cross, mips64el-stable-cross] + target: [armv7-stable-cross, aarch64-stable-cross, ppc64-stable-cross, mips64el-stable-cross, riscv64-stable-cross] branches: [criu-dev, master] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: ref: ${{ matrix.branches }} - name: Run Cross Compilation Targets diff --git a/.github/workflows/cross-compile.yml b/.github/workflows/cross-compile.yml index be8e7f09c..96672b294 100644 --- a/.github/workflows/cross-compile.yml +++ b/.github/workflows/cross-compile.yml @@ -2,6 +2,11 @@ name: Cross Compile Tests on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: cross-compile-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: @@ -16,6 +21,7 @@ jobs: aarch64-stable-cross, ppc64-stable-cross, mips64el-stable-cross, + riscv64-stable-cross, ] include: - experimental: true @@ -28,7 +34,7 @@ jobs: target: mips64el-unstable-cross steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Cross Compilation Targets run: > sudo make -C scripts/ci ${{ matrix.target }} diff --git a/.github/workflows/docker-test.yml b/.github/workflows/docker-test.yml index 564691449..23696905a 100644 --- a/.github/workflows/docker-test.yml +++ b/.github/workflows/docker-test.yml @@ -2,13 +2,18 @@ name: Docker Test on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: docker-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-20.04] + os: [ubuntu-22.04] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Docker Test (${{ matrix.os }}) run: sudo make -C scripts/ci docker-test diff --git a/.github/workflows/fedora-asan-test.yml b/.github/workflows/fedora-asan-test.yml index 44b0f16d6..02dc9a1b3 100644 --- a/.github/workflows/fedora-asan-test.yml +++ b/.github/workflows/fedora-asan-test.yml @@ -2,11 +2,16 @@ name: Fedora ASAN Test on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: fedora-asan-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Fedora ASAN Test run: sudo -E make -C scripts/ci fedora-asan diff --git a/.github/workflows/fedora-rawhide-test.yml b/.github/workflows/fedora-rawhide-test.yml index 00bc3b2bd..83e2ead82 100644 --- a/.github/workflows/fedora-rawhide-test.yml +++ b/.github/workflows/fedora-rawhide-test.yml @@ -2,11 +2,20 @@ name: Fedora Rawhide Test on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: fedora-rawhide-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Fedora Rawhide Test - run: sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined" + # We need to pass environment variables from the CI environment to + # distinguish between CI environments. However, we need to make sure that + # XDG_RUNTIME_DIR environment variable is not set due to a bug in Podman. + # FIXME: https://github.com/containers/podman/issues/14920 + run: sudo -E XDG_RUNTIME_DIR= make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined" diff --git a/.github/workflows/gcov-test.yml b/.github/workflows/gcov-test.yml index f782c5b9d..cc4e1d44a 100644 --- a/.github/workflows/gcov-test.yml +++ b/.github/workflows/gcov-test.yml @@ -2,12 +2,17 @@ name: Coverage Tests on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: gcov-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Coverage Tests run: sudo -E make -C scripts/ci local GCOV=1 - name: Run gcov diff --git a/.github/workflows/java-test.yml b/.github/workflows/java-test.yml new file mode 100644 index 000000000..cbd3c1f23 --- /dev/null +++ b/.github/workflows/java-test.yml @@ -0,0 +1,16 @@ +name: Java Test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: java-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - name: Run Java Test + run: sudo make -C scripts/ci java-test diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c3886c707..f7da4f6f6 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -2,6 +2,11 @@ name: Run code linter on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: lint-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: runs-on: ubuntu-latest @@ -9,9 +14,9 @@ jobs: image: registry.fedoraproject.org/fedora:latest steps: - name: Install tools - run: sudo dnf -y install git make python3-flake8 ShellCheck clang-tools-extra which findutils codespell + run: sudo dnf -y install git make ruff xz clang-tools-extra codespell git-clang-format ShellCheck - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set git safe directory # https://github.com/actions/checkout/issues/760 @@ -21,13 +26,15 @@ jobs: run: make lint - name: Run make indent - run: > - make indent && - STATUS=$(git status --porcelain) && - if [ ! -z "$STATUS" ]; then - echo "FAIL: some files are not correctly formatted."; - echo "$STATUS" - git diff - echo "FAIL: please run 'make indent'"; - exit 1; + continue-on-error: true + run: | + if [ -z "${{github.base_ref}}" ]; then + git fetch --deepen=1 + make indent + else + git fetch origin ${{github.base_ref}} + make indent BASE=origin/${{github.base_ref}} fi + - name: Raise in-line make indent warnings + run: | + git diff | ./scripts/github-indent-warnings.py diff --git a/.github/workflows/loongarch64-qemu-test.yml b/.github/workflows/loongarch64-qemu-test.yml new file mode 100644 index 000000000..d7c554c87 --- /dev/null +++ b/.github/workflows/loongarch64-qemu-test.yml @@ -0,0 +1,15 @@ +name: LoongArch64 Qemu Test + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: loongarch64-qemu-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - run: sudo make -C scripts/ci loongarch64-qemu-test diff --git a/.github/workflows/manage-labels.yml b/.github/workflows/manage-labels.yml new file mode 100644 index 000000000..a2bcd8860 --- /dev/null +++ b/.github/workflows/manage-labels.yml @@ -0,0 +1,14 @@ +name: Remove labels +on: [issue_comment, pull_request_review_comment] +jobs: + remove-labels-on-comments: + name: Remove labels on comments + if: github.event_name == 'issue_comment' + runs-on: ubuntu-latest + steps: + - uses: mondeja/remove-labels-gh-action@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + labels: | + changes requested + awaiting reply diff --git a/.github/workflows/nftables-test.yml b/.github/workflows/nftables-test.yml new file mode 100644 index 000000000..7a7d8bd30 --- /dev/null +++ b/.github/workflows/nftables-test.yml @@ -0,0 +1,24 @@ +name: Nftables bases testing + +on: [push, pull_request] + +# Cancel any preceding run on the pull request. +concurrency: + group: nftables-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + +jobs: + build: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + - name: Remove iptables + run: sudo apt remove -y iptables + - name: Install libnftables-dev + run: sudo contrib/apt-install libnftables-dev + - name: chmod 755 /home/runner + # CRIU's tests are sometimes running as some random user and need + # to be able to access the test files. + run: sudo chmod 755 /home/runner + - name: Build with nftables network locking backend + run: sudo make -C scripts/ci local COMPILE_FLAGS="NETWORK_LOCK_DEFAULT=NETWORK_LOCK_NFTABLES" diff --git a/.github/workflows/openj9-test.yml b/.github/workflows/openj9-test.yml deleted file mode 100644 index 1d7a1eb6b..000000000 --- a/.github/workflows/openj9-test.yml +++ /dev/null @@ -1,11 +0,0 @@ -name: OpenJ9 Test - -on: [push, pull_request] - -jobs: - build: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - name: Run OpenJ9 Test - run: sudo make -C scripts/ci openj9-test diff --git a/.github/workflows/podman-test.yml b/.github/workflows/podman-test.yml index 447cbf0b6..a07edbe5b 100644 --- a/.github/workflows/podman-test.yml +++ b/.github/workflows/podman-test.yml @@ -2,10 +2,15 @@ name: Podman Test on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: podman-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run Podman Test run: sudo make -C scripts/ci podman-test diff --git a/.github/workflows/stream-test.yml b/.github/workflows/stream-test.yml index ecdd81e0a..76bd96edf 100644 --- a/.github/workflows/stream-test.yml +++ b/.github/workflows/stream-test.yml @@ -2,11 +2,16 @@ name: CRIU Image Streamer Test on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: stream-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run CRIU Image Streamer Test run: sudo -E make -C scripts/ci local STREAM_TEST=1 diff --git a/.github/workflows/x86-64-clang-test.yml b/.github/workflows/x86-64-clang-test.yml index e6e84ef52..1f0a469bd 100644 --- a/.github/workflows/x86-64-clang-test.yml +++ b/.github/workflows/x86-64-clang-test.yml @@ -2,10 +2,15 @@ name: X86_64 CLANG Test on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: clang-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run X86_64 CLANG Test run: sudo make -C scripts/ci x86_64 CLANG=1 diff --git a/.github/workflows/x86-64-gcc-test.yml b/.github/workflows/x86-64-gcc-test.yml index b8b81ef15..15e84a0df 100644 --- a/.github/workflows/x86-64-gcc-test.yml +++ b/.github/workflows/x86-64-gcc-test.yml @@ -2,10 +2,15 @@ name: X86_64 GCC Test on: [push, pull_request] +# Cancel any preceding run on the pull request. +concurrency: + group: gcc-test-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} + jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run X86_64 GCC Test run: sudo make -C scripts/ci x86_64 diff --git a/.gitignore b/.gitignore index d5135f5f8..94daa13ea 100644 --- a/.gitignore +++ b/.gitignore @@ -20,26 +20,16 @@ compel/compel compel/compel-host-bin images/*.c images/*.h -images/google/protobuf/*.c -images/google/protobuf/*.h .gitid criu/criu criu/unittest/unittest -crit/crit -criu/arch/*/sys-exec-tbl*.c -# x86 syscalls-table is not generated -!criu/arch/x86/sys-exec-tbl.c -criu/arch/*/syscalls*.S -criu/include/syscall-codes*.h -criu/include/syscall*.h criu/include/version.h criu/pie/restorer-blob.h criu/pie/parasite-blob.h criu/protobuf-desc-gen.h lib/build/ lib/c/criu.pc -lib/.crit-setup.files compel/include/asm include/common/asm include/common/config.h -build/ +build/** diff --git a/.lgtm.yml b/.lgtm.yml index a884a53ef..4beadcc63 100644 --- a/.lgtm.yml +++ b/.lgtm.yml @@ -22,10 +22,4 @@ extraction: - "libbsd-dev" - "python3-yaml" - "libnl-route-3-dev" - - "python-future" - "gnutls-dev" - configure: - command: - - "ls -laR images/google" - - "ln -s /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto" - - "ls -laR images/google" diff --git a/.mailmap b/.mailmap index 6f046b972..8076f0bc9 100644 --- a/.mailmap +++ b/.mailmap @@ -6,3 +6,5 @@ Andrei Vagin Andrei Vagin Andrei Vagin Cyrill Gorcunov +Alexander Mikhalitsyn +Alexander Mikhalitsyn diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 94841b3f3..000000000 --- a/.travis.yml +++ /dev/null @@ -1,35 +0,0 @@ -language: c -os: linux -dist: bionic -services: - - docker -jobs: - include: - - os: linux - arch: ppc64le - env: TR_ARCH=local - dist: bionic - - os: linux - arch: ppc64le - env: TR_ARCH=local CLANG=1 - dist: bionic - - os: linux - arch: s390x - env: TR_ARCH=local - dist: bionic - - os: linux - arch: arm64-graviton2 - env: TR_ARCH=local RUN_TESTS=1 - dist: focal - group: edge - virt: vm - - os: linux - arch: arm64-graviton2 - env: TR_ARCH=local CLANG=1 RUN_TESTS=1 - group: edge - virt: vm - dist: bionic -script: - - sudo make -C scripts/ci $TR_ARCH -after_success: - - make -C scripts/ci after_success diff --git a/CLAUDE.md b/CLAUDE.md new file mode 120000 index 000000000..e3c5a92d9 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +GEMINI.md \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 864caf93e..03875639d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -8,8 +8,8 @@ Here are some useful hints to get involved. * We have both -- [very simple](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement) and [more sophisticated](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3A%22new+feature%22) coding tasks; * CRIU does need [extensive testing](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Atesting); * Documentation is always hard, we have [some information](https://criu.org/Category:Empty_articles) that is to be extracted from people's heads into wiki pages as well as [some texts](https://criu.org/Category:Editor_help_needed) that all need to be converted into useful articles; -* Feedback is expected on the GitHub issues page and on the [mailing list](https://lists.openvz.org/mailman/listinfo/criu); -* We accept GitHub pull requests and this is the preferred way to contribute to CRIU. If you prefer to send patches by email, you are welcome to send them to [CRIU development mailing list](https://lists.openvz.org/mailman/listinfo/criu). +* Feedback is expected on the GitHub issues page and on the [mailing list](https://lore.kernel.org/criu); +* We accept GitHub pull requests and this is the preferred way to contribute to CRIU. If you prefer to send patches by email, you are welcome to send them to [CRIU development mailing list](https://lore.kernel.org/criu). Below we describe in more detail recommend practices for CRIU development. * Spread the word about CRIU in [social networks](http://criu.org/Contacts); * If you're giving a talk about CRIU -- let us know, we'll mention it on the [wiki main page](https://criu.org/News/events); @@ -27,54 +27,137 @@ The repository may contain multiple branches. Development happens in the **criu- To clone CRIU repo and switch to the proper branch, run: ``` - git clone https://github.com/checkpoint-restore/criu criu - cd criu - git checkout criu-dev +git clone https://github.com/checkpoint-restore/criu criu +cd criu +git checkout criu-dev ``` -### Compile +### Building from source -First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. +Follow these steps to compile CRIU from source code. -To compile CRIU, run: +#### Installing build dependencies + +First, you need to install the required build dependencies. We provide scripts to simplify this process for several Linux distributions in [contrib/dependencies](contrib/dependencies). For a complete list of dependencies, please refer to the [installation guide](https://criu.org/Installation). + +##### On Ubuntu/Debian-based systems: ``` - make +./contrib/dependencies/apt-packages.sh +``` + +##### On Fedora/CentOS-based systems: + +``` +./contrib/dependencies/dnf-packages.sh +``` + +##### Using Nix: + +``` +nix develop +``` + +#### Compiling CRIU + +Once the dependencies are installed, you can compile CRIU by running the `make` command from the root of the source directory: + +``` +make ``` This should create the `./criu/criu` executable. ## Edit the source code -If you use ctags, you can generate the ctags file by running - -``` - make tags -``` - When you change the source code, please keep in mind the following code conventions: +* code is written to be read, so the code readability is the most important thing you need to have in mind when preparing patches * we prefer tabs and indentations to be 8 characters width -* CRIU mostly follows [Linux kernel coding style](https://www.kernel.org/doc/Documentation/process/coding-style.rst), but we are less strict than the kernel community. +* we prefer line length of 80 characters or less, more is allowed if it helps with code readability +* CRIU mostly follows [Linux kernel coding style](https://www.kernel.org/doc/Documentation/process/coding-style.rst), but we are less strict than the kernel community -Other conventions can be learned from the source code itself. In short, make sure your new code -looks similar to what is already there. +Other conventions can be learned from the source code itself. In short, make sure your new code looks similar to what is already there. + +## Automatic tools to fix coding-style + +Important: These tools are there to advise you, but should not be considered as a "source of truth", as tools also make nasty mistakes from time to time which can completely break code readability. + +The following command can be used to automatically run a code linter for Python files (ruff), Shell scripts (shellcheck), +text spelling (codespell), and a number of CRIU-specific checks (usage of print macros and EOL whitespace for C files). + +``` +make lint +``` + +In addition, we have adopted a [clang-format configuration file](https://www.kernel.org/doc/Documentation/process/clang-format.rst) +based on the kernel source tree. However, compliance with the clang-format autoformat rules is optional. If the automatic code formatting +results in decreased readability, we may choose to ignore these errors. + +Run the following command to check if your changes are compliant with the clang-format rules: + +``` +make indent +``` + +This command is built upon the `git-clang-format` tool and supports two options `BASE` and `OPTS`. The `BASE` option allows you to +specify a range of commits to check for coding style issues. By default, it is set to `HEAD~1`, so that only the last commit is checked. +If you are developing on top of the criu-dev branch and want to check all your commits for compliance with the clang-format rules, you +can use `BASE=origin/criu-dev`. The `OPTS` option can be used to pass additional options to `git-clang-format`. For example, if you want +to check the last *N* commits for formatting errors, without applying the changes to the codebase you can use the following command. + +``` +make indent OPTS=--diff BASE=HEAD~N +``` + +Note that for pull requests, the "Run code linter" workflow runs these checks for all commits. If a clang-format error is detected +we need to review the suggested changes and decide if they should be fixed before merging. + +Here are some bad examples of clang-format-ing: + +* if clang-format tries to force 120 characters and breaks readability - it is wrong: + +``` +@@ -58,8 +59,7 @@ static int register_membarriers(void) + } + + if (!all_ok) { +- fail("can't register membarrier()s - tried %#x, kernel %#x", +- barriers_registered, barriers_supported); ++ fail("can't register membarrier()s - tried %#x, kernel %#x", barriers_registered, barriers_supported); + return -1; + } +``` + +* if clang-format breaks your beautiful readability friendly alignment in structures, comments or defines - it is wrong: + +``` +--- a/test/zdtm/static/membarrier.c ++++ b/test/zdtm/static/membarrier.c +@@ -27,9 +27,10 @@ static const struct { + int register_cmd; + int execute_cmd; + } membarrier_cmds[] = { +- { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, +- { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, +- { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, ++ { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, ++ { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, ++ MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, ++ { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, + }; +``` ## Test your changes CRIU comes with an extensive test suite. To check whether your changes introduce any regressions, run ``` - make test +make test ``` The command runs [ZDTM Test Suite](https://criu.org/ZDTM_Test_Suite). Check for any error messages produced by it. -In case you'd rather have someone else run the tests, you can use travis-ci for your -own GitHub fork of CRIU. It will check the compilation for various supported platforms, -as well as run most of the tests from the suite. See https://travis-ci.org/checkpoint-restore/criu -for more details. - ## Describe your changes Describe your problem. Whether your change is a one-line bug fix or @@ -102,21 +185,21 @@ If your change fixes a bug in a specific commit, e.g. you found an issue using the SHA-1 ID, and the one line summary. For example: ``` - Fixes: 9433b7b9db3e ("make: use cflags/ldflags for config.h detection mechanism") +Fixes: 9433b7b9db3e ("make: use cflags/ldflags for config.h detection mechanism") ``` The following `git config` settings can be used to add a pretty format for outputting the above style in the `git log` or `git show` commands: ``` - [pretty] - fixes = Fixes: %h (\"%s\") +[pretty] + fixes = Fixes: %h (\"%s\") ``` If your change address an issue listed in GitHub, please use `Fixes:` tag with the number of the issue. For instance: ``` - Fixes: #339 +Fixes: #339 ``` The `Fixes:` tags should be put at the end of the detailed description. @@ -199,7 +282,7 @@ can certify the below: then you just add a line saying ``` - Signed-off-by: Random J Developer +Signed-off-by: Random J Developer ``` using your real name (please, no pseudonyms or anonymous contributions if @@ -211,14 +294,14 @@ commit message. To append such line to a commit you already made, use ``` From: Random J Developer - Subject: [PATCH] component: Short patch description +Subject: [PATCH] component: Short patch description - Long patch description (could be skipped if patch - is trivial enough) +Long patch description (could be skipped if patch +is trivial enough) - Signed-off-by: Random J Developer - --- - Patch body here +Signed-off-by: Random J Developer +--- +Patch body here ``` ## Submit your work upstream @@ -252,8 +335,8 @@ contains the following: revisions should be listed. For example: ``` - v3: rebase on the current criu-dev - v2: add commit to foo() and update bar() coding style +v3: rebase on the current criu-dev +v2: add commit to foo() and update bar() coding style ``` If there are only minor updates to the commits in a pull request, it is @@ -271,7 +354,7 @@ Historically, CRIU worked with mailing lists and patches so if you still prefer To create a patch, run ``` - git format-patch --signoff origin/criu-dev +git format-patch --signoff origin/criu-dev ``` You might need to read GIT documentation on how to prepare patches @@ -282,8 +365,8 @@ at all. We recommend to post patches using `git send-email` ``` - git send-email --cover-letter --no-chain-reply-to --annotate \ - --confirm=always --to=criu@openvz.org criu-dev +git send-email --cover-letter --no-chain-reply-to --annotate \ + --confirm=always --to=criu@lists.linux.dev criu-dev ``` Note that the `git send-email` subcommand may not be in @@ -295,14 +378,14 @@ If this is your first time using git send-email, you might need to configure it to point it to your SMTP server with something like: ``` - git config --global sendemail.smtpServer stmp.example.net +git config --global sendemail.smtpServer stmp.example.net ``` -If you get tired of typing `--to=criu@openvz.org` all the time, +If you get tired of typing `--to=criu@lists.linux.dev` all the time, you can configure that to be automatically handled as well: ``` - git config sendemail.to criu@openvz.org +git config sendemail.to criu@lists.linux.dev ``` If a developer is sending another version of the patch (e.g. to address @@ -315,7 +398,7 @@ version if needed though). ### Mail patches -The patches should be sent to CRIU development mailing list, `criu AT openvz.org`. Note that you need to be subscribed first in order to post. The list web interface is available at https://openvz.org/mailman/listinfo/criu; you can also use standard mailman aliases to work with it. +The patches should be sent to CRIU development mailing list, `criu AT lists.linux.dev`. Note that you need to be subscribed first in order to post. The list web interface is available at https://lore.kernel.org/criu; you can also use standard mailman aliases to work with it. Please make sure the email client you're using doesn't screw your patch (line wrapping and so on). @@ -332,5 +415,3 @@ sometimes a patch may fly around a week before it gets reviewed. Wiki article: [Continuous integration](https://criu.org/Continuous_integration) CRIU tests are run for each series sent to the mailing list. If you get a message from our patchwork that patches failed to pass the tests, you have to investigate what is wrong. - -We also recommend you to [enable Travis CI for your repo](https://criu.org/Continuous_integration#Enable_Travis_CI_for_your_repo) to check patches in your git branch, before sending them to the mailing list. diff --git a/Documentation/Makefile b/Documentation/Makefile index 508551450..de0cc448d 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -12,11 +12,9 @@ endif FOOTER := footer.txt SRC1 += crit.txt -ifeq ($(PYTHON),python3) SRC1 += criu-ns.txt -endif SRC1 += compel.txt -SRC1 += amdgpu_plugin.txt +SRC1 += criu-amdgpu-plugin.txt SRC8 += criu.txt SRC := $(SRC1) $(SRC8) XMLS := $(patsubst %.txt,%.xml,$(SRC)) diff --git a/Documentation/compel.txt b/Documentation/compel.txt index a44ca22c6..506228f59 100644 --- a/Documentation/compel.txt +++ b/Documentation/compel.txt @@ -97,7 +97,10 @@ Following steps are performed to infect the victim process: - execute system call: *int compel_syscall(ctl, int syscall_nr, long *ret, int arg ...);* - infect victim: *int compel_infect(ctl, nr_thread, size_of_args_area);* - cure the victim: *int compel_cure(ctl);* //ctl pointer is freed by this call - - Resume victim: *int compel_resume_task(pid, orig_state, state);* + - Resume victim: *int compel_resume_task(pid, orig_state, state)* or + *int compel_resume_task_sig(pid, orig_state, state, stop_signo).* + //compel_resume_task_sig() could be used in case when victim is in stopped state. + stop_signo could be read by calling compel_parse_stop_signo(). *ctl* must be configured with blob information by calling *PREFIX_setup_c_header()*, with ctl as its argument. *PREFIX* is the argument given to *-p* when calling hgen, else it is deduced from file name. diff --git a/Documentation/amdgpu_plugin.txt b/Documentation/criu-amdgpu-plugin.txt similarity index 82% rename from Documentation/amdgpu_plugin.txt rename to Documentation/criu-amdgpu-plugin.txt index 0d490b429..fe76fc3bc 100644 --- a/Documentation/amdgpu_plugin.txt +++ b/Documentation/criu-amdgpu-plugin.txt @@ -3,7 +3,7 @@ ROCM Support(1) NAME ---- -amdgpu_plugin - A plugin extension to CRIU to support checkpoint/restore in +criu-amdgpu-plugin - A plugin extension to CRIU to support checkpoint/restore in userspace for AMD GPUs. @@ -15,6 +15,7 @@ Checkpoint / Restore inside a docker container Pytorch Tensorflow Using CRIU Image Streamer +Parallel Restore DESCRIPTION ----------- @@ -22,19 +23,15 @@ Though *criu* is a great tool for checkpointing and restoring running applications, it has certain limitations such as it cannot handle applications that have device files open. In order to support *ROCm* based workloads with *criu* we need to augment criu's core functionality with a -plugin based extension mechanism. *amdgpu_plugin* provides the necessary support +plugin based extension mechanism. *criu-amdgpu-plugin* provides the necessary support to criu to allow Checkpoint / Restore with ROCm. Dependencies -~~~~~~~~~~~~~~ +------------ *amdkfd support*:: In order to snapshot the *VRAM* and other *GPU* device states, we require - an updated version of amdkfd(amdgpu) driver. The kernel patches are under - review currently. - -*criu 3.16*:: - This work is rebased on latest criu release available at this time. + an updated version of amdkfd(amdgpu) driver. OPTIONS ------- @@ -97,6 +94,15 @@ executing criu command. E.g: KFD_CAPABILITY_CHECK=1 +*KFD_MAX_BUFFER_SIZE*:: + On some systems, VRAM sizes may exceed RAM sizes, and so buffers for dumping + and restoring VRAM may be unable to fit. Set to a nonzero value (in bytes) + to set a limit on the plugin's memory usage. + Default:0 (Disabled) + + E.g: + KFD_MAX_BUFFER_SIZE="2G" + AUTHOR ------ diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 8b128f63e..0c9a9e527 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -155,6 +155,17 @@ not compatible with *--external* *dev*. notification message contains a file descriptor for the master pty + *query-ext-files*::: + called after the process tree is stopped and network is locked. + This hook is used only in the RPC mode. The notification reply + contains file ids to be added to external file list (may be empty). + +*--unprivileged*:: + This option tells *criu* to accept the limitations when running + as non-root. Running as non-root requires *criu* at least to have + *CAP_SYS_ADMIN* or *CAP_CHECKPOINT_RESTORE*. For details about running + *criu* as non-root please consult the *NON-ROOT* section. + *-V*, *--version*:: Print program version and exit. @@ -378,6 +389,13 @@ mount -t cgroup -o devices,freezer none devices,freezer 'size' may be postfixed with a *K*, *M* or *G*, which stands for kilo-, mega, and gigabytes, accordingly. +*--ghost-fiemap*:: + Enable an optimization based on fiemap ioctl that can reduce the + number of system calls used when checkpointing highly sparse ghost + files. This option is enabled by default, and it can be disabled + with *--no-ghost-fiemap*. An automatic fallback to SEEK_HOLE/SEEK_DATA + is used when fiemap is not supported. + *-j*, *--shell-job*:: Allow one to dump shell jobs. This implies the restored task will inherit session and process group ID from the *criu* itself. @@ -444,6 +462,33 @@ The 'mode' may be one of the following: *nftables*::: Use nftables rules to drop the packets. + *skip*::: Don't lock the network. If *--tcp-close* is not used, the network + must be locked externally to allow CRIU to dump TCP connections. + +*--allow-uprobes*:: + Allow dumping when uprobes vma is present. When used on dump, this option is + required on restore as well. + + A uprobes vma is automatically created by the kernel once a uprobe is + triggered. This mapping is not removed even once the uprobe is deleted. So, + even if a process once had uprobes attached to it, and they're removed by + the time the process is dumped, this option is still required because criu + has no way of knowing whether there are active uprobes or not. + + When using this option on restore, make sure the uprobes (if any) active on + the dumped processes are still active. Otherwise, when execution reaches + a uprobe'd location in any of the restored processes, that process will be + sent a SIGTRAP. + + As an example, say a uprobe is set at function foo in the executable of the + process p_bar. Whenever execution in p_bar reaches function foo, the uprobe + is triggered. If the uprobe has been triggered at least once, then the kernel + will have created the uprobes vma. To dump p_bar, this option is + necessary. After dumping, say the uprobe is deleted. Now, on restoring with + this option, once execution reaches function foo, SIGTRAP will be sent to + the restored p_bar. Unless it has a signal handler installed for SIGTRAP, + it will be terminated and core dumped. + *restore* ~~~~~~~~~ Restores previously checkpointed processes. @@ -457,8 +502,8 @@ Restores previously checkpointed processes. The 'resource' argument can be one of the following: + - **tty[**__rdev__**:**__dev__**]** - - **pipe[**__inode__**]** - - **socket[**__inode__*]* + - **pipe:[**__inode__**]** + - **socket:[**__inode__*]* - **file[**__mnt_id__**:**__inode__**]** - 'path/to/file' @@ -668,6 +713,13 @@ The 'mode' may be one of the following: build-ID cannot be obtained, 'chksm-first' method will be used. This is the default if mode is unspecified. +*--skip-file-rwx-check*:: + Skip checking file permissions (r/w/x for u/g/o) on restore. + +*--allow-uprobes*:: + Required when dumped with this option. Refer to this option in the section + on dumping for more details. + *check* ~~~~~~~ Checks whether the kernel supports the features needed by *criu* to @@ -874,6 +926,42 @@ configuration file will overwrite all other configuration file settings or RPC options. *This can lead to undesired behavior of criu and should only be used carefully.* +NON-ROOT +-------- +*criu* can be used as non-root with either the *CAP_SYS_ADMIN* capability +or with the *CAP_CHECKPOINT_RESTORE* capability introduces in Linux kernel 5.9. +*CAP_CHECKPOINT_RESTORE* is the minimum that is required. + +*criu* also needs either *CAP_SYS_PTRACE* or a value of 0 in +*/proc/sys/kernel/yama/ptrace_scope* (see *ptrace*(2)) to be able to interrupt +the process for dumping. + +Running *criu* as non-root has many limitations and depending on the process +to checkpoint and restore it may not be possible. + +In addition to *CAP_CHECKPOINT_RESTORE* it is possible to give *criu* additional +capabilities to enable additional features in non-root mode. + +Currently *criu* can benefit from the following additional capabilities: + + - *CAP_NET_ADMIN* + - *CAP_SYS_CHROOT* + - *CAP_SETUID* + - *CAP_SYS_RESOURCE* + +Note that for some operations, having a capability in a namespace other than +the init namespace (i.e. the default/root namespace) is not sufficient. For +example, in order to read symlinks in proc/[pid]/map_files CRIU requires +CAP_CHECKPOINT_RESTORE in the init namespace; having CAP_CHECKPOINT_RESTORE +while running in another user namespace (e.g. in a container) does not allow +CRIU to read symlinks in /proc/[pid]/map_files. + +Without access to /proc/[pid]/map_files checkpointing/restoring processes +that have mapped deleted files may not be possible. + +Independent of the capabilities it is always necessary to use "*--unprivileged*" to +accept *criu*'s limitation in non-root mode. + EXAMPLES -------- To checkpoint a program with pid of *1234* and write all image files into diff --git a/Documentation/logo.svg b/Documentation/logo.svg new file mode 100644 index 000000000..f713e72b7 --- /dev/null +++ b/Documentation/logo.svg @@ -0,0 +1,136 @@ + + + + + + + diff --git a/GEMINI.md b/GEMINI.md new file mode 100644 index 000000000..e56c1de12 --- /dev/null +++ b/GEMINI.md @@ -0,0 +1,136 @@ +# CRIU (Checkpoint/Restore In User-space) + +CRIU is a tool for saving the state of a running application to a set of files +(checkpointing) and restoring it back to a live state. It is primarily used for +live migration of containers, in-place updates, and fast application startup. + +It is implemented as a command-line tool called `criu`. The two primary commands +are `dump` and `restore`. + +- `dump`: Saves a process tree and all its related resources (file + descriptors, IPC, sockets, namespaces, etc.) into a collection of image + files. +- `restore`: Restores processes from image files to the same state they were + in before the dump. + +## Quick Start + +To get a feel for `criu`, you can try checkpointing and restoring a simple +process. + +1. **Run a simple process:** + Open a terminal and run a command that will run for a while. Find its PID. + ```bash + sleep 1000 & + [1] 12345 + ``` + +2. **Dump the process:** + As root, use `criu dump` with the process ID (`-t`) and a directory for the + image files (`-D`). + ```bash + sudo criu dump -t 12345 -D /tmp/sleep_images -v4 --shell-job + ``` + The `sleep` process will no longer be running. + +3. **Restore the process:** + Use `criu restore` to bring the process back to life from the images. + ```bash + sudo criu restore -D /tmp/sleep_images -v4 --shell-job + ``` + The `sleep` process will be running again as if nothing happened. + +# For Developers and Contributors + +This section contains more technical details about CRIU's internals and +development process. + +## Dump Process + +On dump, CRIU uses available kernel interfaces to collect information about +processes. For properties that can only be retrieved from within the process +itself, CRIU injects a binary blob (called a "parasite") into the process's +address space and executes it in the context of one of the process's threads. +This injection is handled by a subproject called **Compel**. + +## Restore Process + +On restore, CRIU reads the image files to reconstruct the processes. The goal is +to restore them to the exact state they were in before the dump. The restore +process is divided into several stages (defined as `CR_STATE_*` in +`./criu/include/restorer.h`). + +The main `criu` process acts as a coordinator. It first restores resources with +inter-process dependencies (file descriptors, sockets, shared memory, +namespaces, etc.). It then forks the process tree and sets up namespaces. +Finally, it restores process-specific resources like file descriptors and memory +mappings. + +A key step involves a small, self-contained binary called the "restorer". All +restored processes switch to executing this code, which unmaps the CRIU-specific +memory and restores the application's original memory mappings. On the final +step, the restorer calls `sigreturn` on a prepared signal frame to resume the +process with the state it had at the moment of the dump. + +## Compel + +Compel is a subproject responsible for generating the binary blobs used for the +parasite code (for dumping) and the restorer code (for restoring). It provides a +library for injecting and executing this code within the target process's +address space. It is a separate project because the logic for generating and +injecting Position-Independent Executable (PIE) code is complex and +self-contained. + +## Coding Style + +The C code in the CRIU project follows the +[Linux Kernel Coding Style](https://www.kernel.org/doc/html/latest/process/coding-style.html). +Here are some of the main points: + +- **Indentation**: Use tabs, which are set to 8 characters. +- **Line Length**: The preferred line limit is 80 characters, but it can be + extended to 120 if it improves code readability. +- **Braces**: + - The opening brace for a function goes on a new line. + - The opening brace for a block (like `if`, `for`, `while`, `switch`) goes + on the same line. +- **Spaces**: Use spaces around operators (`+`, `-`, `*`, `/`, `%`, `<`, `>`, + `=`, etc.). +- **Naming**: Use descriptive names for functions and variables. +- **Comments**: Use C-style comments (`/* ... */`). For multi-line comments, + the preferred format is: + ```c + /* + * This is a multi-line + * comment. + */ + ``` + +## Code Layout + +The code is organized into the following directories: + +- `./compel`: The Compel sub-project. +- `./criu`: The main `criu` tool source code. +- `./images`: Protobuf descriptions for the image files. +- `./test`: All tests. +- `./test/zdtm`: The Zero-Downtime Migration (ZDTM) test suite. +- `./test/zdtm.py`: The executor script for ZDTM tests. +- `./scripts`: Helper scripts. +- `./scripts/build`: Docker image files used for CI and cross-compilation + checks. +- `./crit`: A tool to inspect and manipulate CRIU image files. +- `./soccr`: A library for TCP socket checkpoint/restore. + +## Tests + +The main test suite is ZDTM. Here is an example of how to run a single test: + +```bash +sudo ./test/zdtm.py run -t zdtm/static/env00 +``` + +Each ZDTM test has three stages: preparation, C/R, and results checks. During +the test, a process calls `test_daemon()` to signal it is ready for C/R, then +calls `test_waitsig()` to wait for the C/R stage to complete. After being +restored, the test checks that all its resources are still in a valid state. diff --git a/INSTALL.md b/INSTALL.md index d786d06eb..af0702518 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,11 +1,31 @@ +## Building CRIU from source code + +First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. + +To compile CRIU, run: +``` +make +``` +This should create the `./criu/criu` executable. + +To change the default behaviour of CRIU, the following variables can be passed +to the make command: + + * **NETWORK_LOCK_DEFAULT**, can be set to one of the following + values: `NETWORK_LOCK_IPTABLES`, `NETWORK_LOCK_NFTABLES`, + `NETWORK_LOCK_SKIP`. CRIU defaults to `NETWORK_LOCK_IPTABLES` + if nothing is specified. If another network locking backend is + needed, `make` can be called like this: + `make NETWORK_LOCK_DEFAULT=NETWORK_LOCK_NFTABLES` + ## Installing CRIU from source code Once CRIU is built one can easily setup the complete CRIU package (which includes executable itself, CRIT tool, libraries, manual and etc) simply typing - - make install - +``` +make install +``` this command accepts the following variables: * **DESTDIR**, to specify global root where all components will be placed under (empty by default); @@ -16,17 +36,17 @@ this command accepts the following variables: * **LIBDIR**, to specify directory where to put libraries (guess the correct path by default). Thus one can type - - make DESTDIR=/some/new/place install - +``` +make DESTDIR=/some/new/place install +``` and get everything installed under `/some/new/place`. ## Uninstalling CRIU To clean up previously installed CRIU instance one can type - - make uninstall - +``` +make uninstall +``` and everything should be removed. Note though that if some variable (**DESTDIR**, **BINDIR** and such) has been used during installation procedure, the same *must* be passed with uninstall action. diff --git a/MAINTAINERS b/MAINTAINERS index bb153f1ab..8fee8e571 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4,3 +4,5 @@ Mike Rapoport Dmitry Safonov <0x7f454c46@gmail.com> Adrian Reber Pavel Tikhomirov +Radostin Stoyanov +Alexander Mikhalitsyn diff --git a/Makefile b/Makefile index 436ebfd0d..e26807158 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ endif # # Supported Architectures -ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips,$(ARCH)),) +ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips loongarch64 riscv64,$(ARCH)),) $(error "The architecture $(ARCH) isn't supported") endif @@ -35,18 +35,18 @@ ifeq ($(ARCH),arm) ARMV := $(shell echo $(SUBARCH) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') ifeq ($(ARMV),6) - USERCFLAGS += -march=armv6 + ARCHCFLAGS += -march=armv6 endif ifeq ($(ARMV),7) - USERCFLAGS += -march=armv7-a+fp + ARCHCFLAGS += -march=armv7-a+fp endif ifeq ($(ARMV),8) - # Running 'setarch linux32 uname -m' returns armv8l on travis aarch64. + # Running 'setarch linux32 uname -m' returns armv8l on aarch64. # This tells CRIU to handle armv8l just as armv7hf. Right now this is # only used for compile testing. No further verification of armv8l exists. - USERCFLAGS += -march=armv7-a + ARCHCFLAGS += -march=armv7-a ARMV := 7 endif @@ -64,6 +64,8 @@ endif ifeq ($(ARCH),aarch64) DEFINES := -DCONFIG_AARCH64 + CC_MBRANCH_PROT := $(shell $(CC) -c -x c /dev/null -mbranch-protection=none -o /dev/null >/dev/null 2>&1 && echo "-mbranch-protection=none") + CFLAGS_PIE := $(CC_MBRANCH_PROT) endif ifeq ($(ARCH),ppc64) @@ -80,6 +82,14 @@ ifeq ($(ARCH),mips) DEFINES := -DCONFIG_MIPS endif +ifeq ($(ARCH),loongarch64) + DEFINES := -DCONFIG_LOONGARCH64 +endif + +ifeq ($(ARCH),riscv64) + DEFINES := -DCONFIG_RISCV64 +endif + # # CFLAGS_PIE: # @@ -102,10 +112,20 @@ export PROTOUFIX DEFINES # # Independent options for all tools. DEFINES += -D_FILE_OFFSET_BITS=64 +DEFINES += -D_LARGEFILE64_SOURCE DEFINES += -D_GNU_SOURCE WARNINGS := -Wall -Wformat-security -Wdeclaration-after-statement -Wstrict-prototypes +# -Wdangling-pointer results in false warning when we add a list element to +# local list head variable. It is false positive because before leaving the +# function we always check that local list head variable is empty, thus +# insuring that pointer to it is not dangling anywhere, but gcc can't +# understand it. +# Note: There is similar problem with kernel list, where this warning is also +# disabled: https://github.com/torvalds/linux/commit/49beadbd47c2 +WARNINGS += -Wno-dangling-pointer -Wno-unknown-warning-option + CFLAGS-GCOV := --coverage -fno-exceptions -fno-inline -fprofile-update=atomic export CFLAGS-GCOV @@ -113,11 +133,19 @@ ifeq ($(ARCH),mips) WARNINGS := -rdynamic endif +ifeq ($(ARCH),loongarch64) +WARNINGS += -Wno-implicit-function-declaration +endif + ifneq ($(GCOV),) LDFLAGS += -lgcov CFLAGS += $(CFLAGS-GCOV) endif +ifneq ($(NETWORK_LOCK_DEFAULT),) + CFLAGS += -DNETWORK_LOCK_DEFAULT=$(NETWORK_LOCK_DEFAULT) +endif + ifeq ($(ASAN),1) CFLAGS-ASAN := -fsanitize=address export CFLAGS-ASAN @@ -142,12 +170,12 @@ export GMON GMONLDOPT endif AFLAGS += -D__ASSEMBLY__ -CFLAGS += $(USERCFLAGS) $(WARNINGS) $(DEFINES) -iquote include/ +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) $(WARNINGS) $(DEFINES) -iquote include/ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS # Default target -all: criu lib crit +all: criu lib crit cuda_plugin .PHONY: all # @@ -250,26 +278,19 @@ criu: $(criu-deps) $(Q) $(MAKE) $(build)=criu all .PHONY: criu -crit/Makefile: ; -crit/%: criu .FORCE - $(Q) $(MAKE) $(build)=crit $@ -crit: criu - $(Q) $(MAKE) $(build)=crit all -.PHONY: crit - unittest: $(criu-deps) $(Q) $(MAKE) $(build)=criu unittest .PHONY: unittest # -# Libraries next once crit it ready +# Libraries next once criu is ready # (we might generate headers and such # when building criu itself). lib/Makefile: ; -lib/%: crit .FORCE +lib/%: criu .FORCE $(Q) $(MAKE) $(build)=lib $@ -lib: crit +lib: criu $(Q) $(MAKE) $(build)=lib all .PHONY: lib @@ -278,25 +299,28 @@ clean mrproper: $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=soccr $@ $(Q) $(MAKE) $(build)=lib $@ + $(Q) $(MAKE) $(build)=crit $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ - $(Q) $(MAKE) $(build)=lib $@ - $(Q) $(MAKE) $(build)=crit $@ .PHONY: clean mrproper clean-amdgpu_plugin: $(Q) $(MAKE) -C plugins/amdgpu clean .PHONY: clean-amdgpu_plugin +clean-cuda_plugin: + $(Q) $(MAKE) -C plugins/cuda clean +.PHONY: clean-cuda_plugin + clean-top: $(Q) $(MAKE) -C Documentation clean $(Q) $(MAKE) $(build)=test/compel clean $(Q) $(RM) .gitid .PHONY: clean-top -clean: clean-top clean-amdgpu_plugin +clean: clean-top clean-amdgpu_plugin clean-cuda_plugin -mrproper-top: clean-top clean-amdgpu_plugin +mrproper-top: clean-top clean-amdgpu_plugin clean-cuda_plugin $(Q) $(RM) $(CONFIG_HEADER) $(Q) $(RM) $(VERSION_HEADER) $(Q) $(RM) $(COMPEL_VERSION_HEADER) @@ -328,6 +352,14 @@ amdgpu_plugin: criu $(Q) $(MAKE) -C plugins/amdgpu all .PHONY: amdgpu_plugin +cuda_plugin: criu + $(Q) $(MAKE) -C plugins/cuda all +.PHONY: cuda_plugin + +crit: lib + $(Q) $(MAKE) -C crit +.PHONY: crit + # # Generating tar requires tag matched CRIU_VERSION. # If not found then simply use GIT's describe with @@ -393,6 +425,7 @@ help: @echo ' Targets:' @echo ' all - Build all [*] targets' @echo ' * criu - Build criu' + @echo ' * crit - Build crit' @echo ' zdtm - Build zdtm test-suite' @echo ' docs - Build documentation' @echo ' install - Install CRIU (see INSTALL.md)' @@ -409,38 +442,57 @@ help: @echo ' lint - Run code linters' @echo ' indent - Indent C code' @echo ' amdgpu_plugin - Make AMD GPU plugin' + @echo ' cuda_plugin - Make NVIDIA CUDA plugin' .PHONY: help -lint: - flake8 --version - flake8 --config=scripts/flake8.cfg test/zdtm.py - flake8 --config=scripts/flake8.cfg test/inhfd/*.py - flake8 --config=scripts/flake8.cfg test/others/rpc/config_file.py - flake8 --config=scripts/flake8.cfg lib/py/images/pb2dict.py - flake8 --config=scripts/flake8.cfg scripts/criu-ns - flake8 --config=scripts/flake8.cfg scripts/crit-setup.py - flake8 --config=scripts/flake8.cfg coredump/ +ruff: + @ruff --version + ruff check ${RUFF_FLAGS} --config=scripts/ruff.toml \ + test/zdtm.py \ + test/inhfd/*.py \ + test/others/rpc/config_file.py \ + test/others/action-script/check_actions.py \ + test/others/pycriu/*.py \ + lib/pycriu/criu.py \ + lib/pycriu/__init__.py \ + lib/pycriu/images/pb2dict.py \ + lib/pycriu/images/images.py \ + scripts/criu-ns \ + test/others/criu-ns/run.py \ + crit/*.py \ + crit/crit/*.py \ + scripts/uninstall_module.py \ + coredump/ coredump/coredump \ + scripts/github-indent-warnings.py + +shellcheck: shellcheck --version shellcheck scripts/*.sh - shellcheck scripts/ci/*.sh scripts/ci/apt-install - shellcheck test/others/crit/*.sh - shellcheck test/others/libcriu/*.sh - shellcheck test/others/crit/*.sh test/others/criu-coredump/*.sh - shellcheck test/others/config-file/*.sh + shellcheck scripts/ci/*.sh + shellcheck contrib/apt-install contrib/dependencies/*.sh + shellcheck -x test/others/crit/*.sh + shellcheck -x test/others/libcriu/*.sh + shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh + shellcheck -x test/others/config-file/*.sh + shellcheck -x test/others/action-script/*.sh + +codespell: codespell - # Do not append \n to pr_perror or fail - ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>.*\\n"' - # Do not use %m with pr_perror or fail - ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>.*%m' - # Do not use errno with pr_perror or fail - ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>\(".*".*errno' + +lint: ruff shellcheck codespell + # Do not append \n to pr_perror, pr_pwarn or fail + ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>.*\\n"' + # Do not use %m with pr_* or fail + ! git --no-pager grep -E '^\s*\<(pr_(err|perror|warn|pwarn|debug|info|msg)|fail)\>.*%m' + # Do not use errno with pr_perror, pr_pwarn or fail + ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>\(".*".*errno' # End pr_(err|warn|msg|info|debug) with \n ! git --no-pager grep -En '^\s*\.*);$$' | grep -v '\\n' # No EOL whitespace for C files ! git --no-pager grep -E '\s+$$' \*.c \*.h -.PHONY: lint +.PHONY: lint ruff shellcheck codespell -codecov: SHELL := $(shell which bash) +codecov: SHELL := $(shell command -v bash) codecov: curl -Os https://uploader.codecov.io/latest/linux/codecov chmod +x codecov @@ -451,8 +503,10 @@ fetch-clang-format: .FORCE $(E) ".clang-format" $(Q) scripts/fetch-clang-format.sh +BASE ?= "HEAD~1" +OPTS ?= "--quiet" indent: - find . -name '*.[ch]' -type f -print0 | xargs --null --max-args 128 --max-procs 4 clang-format -i + git clang-format --style file --extensions c,h $(OPTS) $(BASE) .PHONY: indent include Makefile.install diff --git a/Makefile.compel b/Makefile.compel index 764afadc8..a4209edc5 100644 --- a/Makefile.compel +++ b/Makefile.compel @@ -50,8 +50,8 @@ compel/plugins/%: $(compel-deps) .FORCE # # GNU make 4.x supports targets matching via wide -# match targeting, where GNU make 3.x series (used on -# Travis) is not, so we have to write them here explicitly. +# match targeting, where GNU make 3.x series is not, +# so we have to write them here explicitly. compel/plugins/std.lib.a: $(compel-deps) .FORCE $(Q) $(MAKE) $(build)=compel/plugins $@ diff --git a/Makefile.config b/Makefile.config index d46d84f2d..5cf4b8216 100644 --- a/Makefile.config +++ b/Makefile.config @@ -2,12 +2,15 @@ include $(__nmk_dir)utils.mk include $(__nmk_dir)msg.mk include scripts/feature-tests.mak +# This is a kludge for $(info ...) to not eat spaces. +S := + ifeq ($(call try-cc,$(FEATURE_TEST_LIBBSD_DEV),-lbsd),true) LIBS_FEATURES += -lbsd FEATURE_DEFINES += -DCONFIG_HAS_LIBBSD else - $(info Note: Building without setproctitle() and strlcpy() support.) - $(info $(info) To enable these features, please install libbsd-devel (RPM) / libbsd-dev (DEB).) + $(info Note: Building without setproctitle() support.) + $(info $S Install libbsd-devel (RPM) / libbsd-dev (DEB) to fix.) endif ifeq ($(call pkg-config-check,libselinux),y) @@ -23,10 +26,10 @@ endif ifeq ($(call pkg-config-check,libdrm),y) export CONFIG_AMDGPU := y - $(info Note: Building criu with amdgpu_plugin.) + $(info Note: Building with amdgpu_plugin.) else - $(info Note: Building criu without amdgpu_plugin.) - $(info Note: libdrm and libdrm_amdgpu are required to build amdgpu_plugin.) + $(info Note: Building without amdgpu_plugin.) + $(info $S Install libdrm-devel (RPM) or libdrm-dev (DEB) to fix.) endif ifeq ($(NO_GNUTLS)x$(call pkg-config-check,gnutls),xy) @@ -34,7 +37,8 @@ ifeq ($(NO_GNUTLS)x$(call pkg-config-check,gnutls),xy) export CONFIG_GNUTLS := y FEATURE_DEFINES += -DCONFIG_GNUTLS else - $(info Note: Building without GnuTLS support) + $(info Note: Building without GnuTLS support.) + $(info $S Install gnutls-devel (RPM) or gnutls-dev (DEB) to fix.) endif ifeq ($(call pkg-config-check,libnftables),y) @@ -46,16 +50,19 @@ ifeq ($(call pkg-config-check,libnftables),y) LIBS_FEATURES += $(LIB_NFTABLES) FEATURE_DEFINES += -DCONFIG_HAS_NFTABLES_LIB_API_1 else - $(warning Warn: you have libnftables installed but it has incompatible API) - $(warning Warn: Building without nftables support) + $(info Warn: Building without nftables support (incompatible API version).) endif else - $(warning Warn: you have no libnftables installed) - $(warning Warn: Building without nftables support) + $(info Warn: Building without nftables support.) + $(info $S Install nftables-devel (RPM) or libnftables-dev (DEB) to fix.) endif export LIBS += $(LIBS_FEATURES) +ifneq ($(PLUGINDIR),) + FEATURE_DEFINES += -DCR_PLUGIN_DEFAULT="\"$(PLUGINDIR)\"" +endif + CONFIG_FILE = .config $(CONFIG_FILE): @@ -67,24 +74,26 @@ ifeq ($(call try-asm,$(FEATURE_TEST_X86_COMPAT)),true) export CONFIG_COMPAT := y FEATURE_DEFINES += -DCONFIG_COMPAT else - $(info Note: Building without ia32 C/R, missed ia32 support in gcc) - $(info $(info) That may be related to missing gcc-multilib in your) - $(info $(info) distribution or you may have Debian with buggy toolchain) - $(info $(info) (issue https://github.com/checkpoint-restore/criu/issues/315)) + $(info Note: Building without ia32 C/R, missing ia32 support in gcc.) + $(info $S It may be related to missing gcc-multilib in your) + $(info $S distribution, or you may have Debian with buggy toolchain.) + $(info $S See https://github.com/checkpoint-restore/criu/issues/315.) endif endif export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) -FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ - SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW FSCONFIG MEMFD_CREATE OPENAT2 +FEATURES_LIST := TCP_REPAIR PTRACE_PEEKSIGINFO \ + SETPROCTITLE_INIT TCP_REPAIR_WINDOW MEMFD_CREATE \ + OPENAT2 NO_LIBC_RSEQ_DEFS # $1 - config name define gen-feature-test ifeq ($$(call try-cc,$$(FEATURE_TEST_$(1)),$$(LIBS_FEATURES),$$(DEFINES)),true) $(Q) echo '#define CONFIG_HAS_$(1)' >> $$@ - $(Q) echo '' >> $$@ +else + $(Q) echo '// CONFIG_HAS_$(1) is not set' >> $$@ endif endef diff --git a/Makefile.install b/Makefile.install index c798637be..70c607ec6 100644 --- a/Makefile.install +++ b/Makefile.install @@ -29,6 +29,33 @@ LIBDIR ?= $(PREFIX)/lib export PREFIX BINDIR SBINDIR MANDIR RUNDIR export LIBDIR INCLUDEDIR LIBEXECDIR PLUGINDIR +# Detect externally managed Python environment (PEP 668). +PYTHON_EXTERNALLY_MANAGED := $(shell $(PYTHON) -c 'import os, sysconfig; print(int(os.path.isfile(os.path.join(sysconfig.get_path("stdlib"), "EXTERNALLY-MANAGED"))))') +PIP_BREAK_SYSTEM_PACKAGES ?= 0 + +# If Python environment is externally managed and PIP_BREAK_SYSTEM_PACKAGES is not set, skip pip install. +SKIP_PIP_INSTALL := 0 +ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) +ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) + +SKIP_PIP_INSTALL := 1 +$(info Warn: Externally managed python environment) +$(info Consider using PIP_BREAK_SYSTEM_PACKAGES=1) + +endif +endif + +# Default flags for pip install: +# --ignore-installed: Overwrite already installed pycriu/crit packages +# --no-build-isolation: Use current Python environment to build pycriu/crit packages +# --no-deps: Don't install any dependencies +# --no-index: Don't use PyPI index to find packages +# --progress-bar: Cleaner output +# --upgrade: Treat the install as an upgrade when replacing the installed version +PIPFLAGS ?= --ignore-installed --no-build-isolation --no-deps --no-index --progress-bar off --upgrade + +export SKIP_PIP_INSTALL PIPFLAGS + install-man: $(Q) $(MAKE) -C Documentation install .PHONY: install-man @@ -37,6 +64,10 @@ install-lib: lib $(Q) $(MAKE) $(build)=lib install .PHONY: install-lib +install-crit: lib + $(Q) $(MAKE) $(build)=crit install +.PHONY: install-crit + install-criu: criu $(Q) $(MAKE) $(build)=criu install .PHONY: install-criu @@ -45,19 +76,25 @@ install-amdgpu_plugin: amdgpu_plugin $(Q) $(MAKE) -C plugins/amdgpu install .PHONY: install-amdgpu_plugin +install-cuda_plugin: cuda_plugin + $(Q) $(MAKE) -C plugins/cuda install +.PHONY: install-cuda_plugin + install-compel: $(compel-install-targets) $(Q) $(MAKE) $(build)=compel install $(Q) $(MAKE) $(build)=compel/plugins install .PHONY: install-compel -install: install-man install-lib install-criu install-compel install-amdgpu_plugin ; +install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin install-cuda_plugin ; .PHONY: install uninstall: $(Q) $(MAKE) -C Documentation $@ $(Q) $(MAKE) $(build)=lib $@ + $(Q) $(MAKE) $(build)=crit $@ $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ $(Q) $(MAKE) -C plugins/amdgpu $@ + $(Q) $(MAKE) -C plugins/cuda $@ .PHONY: uninstall diff --git a/Makefile.versions b/Makefile.versions index 73bc2d5fa..3e6c9ed22 100644 --- a/Makefile.versions +++ b/Makefile.versions @@ -1,10 +1,10 @@ # # CRIU version. -CRIU_VERSION_MAJOR := 3 -CRIU_VERSION_MINOR := 17 -CRIU_VERSION_SUBLEVEL := 1 +CRIU_VERSION_MAJOR := 4 +CRIU_VERSION_MINOR := 2 +CRIU_VERSION_SUBLEVEL := CRIU_VERSION_EXTRA := -CRIU_VERSION_NAME := Radiant Redstart +CRIU_VERSION_NAME := CRIUTIBILITY CRIU_VERSION := $(CRIU_VERSION_MAJOR)$(if $(CRIU_VERSION_MINOR),.$(CRIU_VERSION_MINOR))$(if $(CRIU_VERSION_SUBLEVEL),.$(CRIU_VERSION_SUBLEVEL))$(if $(CRIU_VERSION_EXTRA),.$(CRIU_VERSION_EXTRA)) export CRIU_VERSION_MAJOR CRIU_VERSION_MINOR CRIU_VERSION_SUBLEVEL diff --git a/README.md b/README.md index ff4aa1a23..6e2a0de9e 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ [![CircleCI](https://circleci.com/gh/checkpoint-restore/criu.svg?style=svg)]( https://circleci.com/gh/checkpoint-restore/criu) -

+

## CRIU -- A project to implement checkpoint/restore functionality for Linux @@ -35,10 +35,10 @@ Pages worth starting with are: - [Installation instructions](http://criu.org/Installation) - [A simple example of usage](http://criu.org/Simple_loop) - [Examples of more advanced usage](https://criu.org/Category:HOWTO) -- Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/FAQ) +- Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/index.php?title=FAQ) ### Checkpoint and restore of simple loop process -[

](https://asciinema.org/a/232445) +

## Advanced features diff --git a/compel/.gitignore b/compel/.gitignore index eab3337d6..5e770a86c 100644 --- a/compel/.gitignore +++ b/compel/.gitignore @@ -4,6 +4,9 @@ arch/arm/plugins/std/syscalls/syscalls.S arch/aarch64/plugins/std/syscalls/syscalls.S arch/s390/plugins/std/syscalls/syscalls.S arch/ppc64/plugins/std/syscalls/syscalls.S +arch/mips/plugins/std/syscalls/syscalls-64.S +arch/loongarch64/plugins/std/syscalls/syscalls-64.S +arch/riscv64/plugins/std/syscalls/syscalls.S include/version.h plugins/include/uapi/std/asm/syscall-types.h plugins/include/uapi/std/syscall-64.h diff --git a/compel/Makefile b/compel/Makefile index b79aee687..c0b8a82a0 100644 --- a/compel/Makefile +++ b/compel/Makefile @@ -32,8 +32,8 @@ ifeq ($(ARCH),x86) lib-y += arch/$(ARCH)/src/lib/thread_area.o endif -# handle_elf() has no support of ELF relocations on ARM (yet?) -ifneq ($(filter arm aarch64,$(ARCH)),) +# handle_elf() has no support of ELF relocations on ARM and RISCV64 (yet?) +ifneq ($(filter arm aarch64 loongarch64 riscv64,$(ARCH)),) CFLAGS += -DNO_RELOCS HOSTCFLAGS += -DNO_RELOCS endif diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h index 5f090490d..8a61b268f 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h @@ -2,14 +2,41 @@ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT -static inline int ptrace_set_breakpoint(pid_t pid, void *addr) -{ - return 0; -} +#include +#include -static inline int ptrace_flush_breakpoints(pid_t pid) -{ - return 0; -} +struct hwbp_cap { + char arch; + char bp_count; +}; + +/* copied from `linux/arch/arm64/include/asm/hw_breakpoint.h` */ +/* Lengths */ +#define ARM_BREAKPOINT_LEN_1 0x1 +#define ARM_BREAKPOINT_LEN_2 0x3 +#define ARM_BREAKPOINT_LEN_3 0x7 +#define ARM_BREAKPOINT_LEN_4 0xf +#define ARM_BREAKPOINT_LEN_5 0x1f +#define ARM_BREAKPOINT_LEN_6 0x3f +#define ARM_BREAKPOINT_LEN_7 0x7f +#define ARM_BREAKPOINT_LEN_8 0xff + +/* Privilege Levels */ +#define AARCH64_BREAKPOINT_EL1 1 +#define AARCH64_BREAKPOINT_EL0 2 + +/* Breakpoint */ +#define ARM_BREAKPOINT_EXECUTE 0 + +/* Watchpoints */ +#define ARM_BREAKPOINT_LOAD 1 +#define ARM_BREAKPOINT_STORE 2 +#define AARCH64_ESR_ACCESS_MASK (1 << 6) + +#define DISABLE_HBP 0 +#define ENABLE_HBP 1 + +int ptrace_set_breakpoint(pid_t pid, void *addr); +int ptrace_flush_breakpoints(pid_t pid); #endif diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/gcs-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/gcs-types.h new file mode 100644 index 000000000..9f9655e3b --- /dev/null +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/gcs-types.h @@ -0,0 +1,47 @@ +#ifndef __UAPI_ASM_GCS_TYPES_H__ +#define __UAPI_ASM_GCS_TYPES_H__ + +#ifndef NT_ARM_GCS +#define NT_ARM_GCS 0x410 /* ARM GCS state */ +#endif + +/* Shadow Stack/Guarded Control Stack interface */ +#define PR_GET_SHADOW_STACK_STATUS 74 +#define PR_SET_SHADOW_STACK_STATUS 75 +#define PR_LOCK_SHADOW_STACK_STATUS 76 + +/* When set PR_SHADOW_STACK_ENABLE flag allocates a Guarded Control Stack */ +#ifndef PR_SHADOW_STACK_ENABLE +#define PR_SHADOW_STACK_ENABLE (1UL << 0) +#endif + +/* Allows explicit GCS stores (eg. using GCSSTR) */ +#ifndef PR_SHADOW_STACK_WRITE +#define PR_SHADOW_STACK_WRITE (1UL << 1) +#endif + +/* Allows explicit GCS pushes (eg. using GCSPUSHM) */ +#ifndef PR_SHADOW_STACK_PUSH +#define PR_SHADOW_STACK_PUSH (1UL << 2) +#endif + +#ifndef SHADOW_STACK_SET_TOKEN +#define SHADOW_STACK_SET_TOKEN 0x1 /* Set up a restore token in the shadow stack */ +#endif + +#define PR_SHADOW_STACK_ALL_MODES \ + PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE | PR_SHADOW_STACK_PUSH + +/* copied from: arch/arm64/include/asm/sysreg.h */ +#define GCS_CAP_VALID_TOKEN 0x1 +#define GCS_CAP_ADDR_MASK 0xFFFFFFFFFFFFF000ULL +#define GCS_CAP(x) ((((unsigned long)x) & GCS_CAP_ADDR_MASK) | GCS_CAP_VALID_TOKEN) +#define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) + +#include + +#ifndef HWCAP_GCS +#define HWCAP_GCS (1UL << 32) +#endif + +#endif /* __UAPI_ASM_GCS_TYPES_H__ */ \ No newline at end of file diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h index 9d4ce7e2e..606c92ffe 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h @@ -2,6 +2,7 @@ #define UAPI_COMPEL_ASM_TYPES_H__ #include +#include #include #include #include @@ -16,7 +17,24 @@ */ typedef struct user_pt_regs user_regs_struct_t; -typedef struct user_fpsimd_state user_fpregs_struct_t; + +/* + * GCS (Guarded Control Stack) + * + * This mirrors the kernel definition but renamed to cr_user_gcs + * to avoid conflict with kernel headers (/usr/include/asm/ptrace.h). + */ +struct cr_user_gcs { + __u64 features_enabled; + __u64 features_locked; + __u64 gcspr_el0; +}; + +struct user_fpregs_struct { + struct user_fpsimd_state fpstate; + struct cr_user_gcs gcs; +}; +typedef struct user_fpregs_struct user_fpregs_struct_t; #define __compel_arch_fetch_thread_area(tid, th) 0 #define compel_arch_fetch_thread_area(tctl) 0 @@ -39,4 +57,12 @@ typedef struct user_fpsimd_state user_fpregs_struct_t; __NR_##syscall; \ }) +extern bool __compel_host_supports_gcs(void); +#define compel_host_supports_gcs __compel_host_supports_gcs + +struct parasite_ctl; +extern int __parasite_setup_shstk(struct parasite_ctl *ctl, + user_fpregs_struct_t *ext_regs); +#define parasite_setup_shstk __parasite_setup_shstk + #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h index f8ec55d6c..7efee528f 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h @@ -1,24 +1,34 @@ #ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ -#include +#include #include #include +#include /* Copied from the kernel header arch/arm64/include/uapi/asm/sigcontext.h */ #define FPSIMD_MAGIC 0x46508001 +#define GCS_MAGIC 0x47435300 typedef struct fpsimd_context fpu_state_t; +struct gcs_context { + struct _aarch64_ctx head; + __u64 gcspr; + __u64 features_enabled; + __u64 reserved; +}; + struct aux_context { struct fpsimd_context fpsimd; + struct gcs_context gcs; /* additional context to be added before "end" */ struct _aarch64_ctx end; }; -// XXX: the idetifier rt_sigcontext is expected to be struct by the CRIU code +// XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code #define rt_sigcontext sigcontext #include @@ -62,6 +72,7 @@ struct cr_sigcontext { #define RT_SIGFRAME_AUX_CONTEXT(rt_sigframe) ((struct aux_context *)&(RT_SIGFRAME_SIGCONTEXT(rt_sigframe)->__reserved)) #define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->fpsimd) #define RT_SIGFRAME_OFFSET(rt_sigframe) 0 +#define RT_SIGFRAME_GCS(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->gcs) #define rt_sigframe_erase_sigset(sigframe) memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) #define rt_sigframe_copy_sigset(sigframe, from) memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index bd1ed0da3..42f593c79 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -2,7 +2,9 @@ #include #include #include -#include +#include +#include + #include #include "common/page.h" #include "uapi/compel/asm/infect-types.h" @@ -10,6 +12,9 @@ #include "errno.h" #include "infect.h" #include "infect-priv.h" +#include "asm/breakpoints.h" +#include "asm/gcs-types.h" +#include unsigned __page_size = 0; unsigned __page_shift = 0; @@ -30,24 +35,54 @@ static inline void __always_unused __check_code_syscall(void) BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); } +bool __compel_host_supports_gcs(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + return (hwcap & HWCAP_GCS) != 0; +} + +static bool __compel_gcs_enabled(struct cr_user_gcs *gcs) +{ + if (!compel_host_supports_gcs()) + return false; + + return gcs && (gcs->features_enabled & PR_SHADOW_STACK_ENABLE) != 0; +} + int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { struct fpsimd_context *fpsimd = RT_SIGFRAME_FPU(sigframe); + struct gcs_context *gcs = RT_SIGFRAME_GCS(sigframe); memcpy(sigframe->uc.uc_mcontext.regs, regs->regs, sizeof(regs->regs)); + pr_debug("sigreturn_prep_regs_plain: sp %lx pc %lx\n", (long)regs->sp, (long)regs->pc); + sigframe->uc.uc_mcontext.sp = regs->sp; sigframe->uc.uc_mcontext.pc = regs->pc; sigframe->uc.uc_mcontext.pstate = regs->pstate; - memcpy(fpsimd->vregs, fpregs->vregs, 32 * sizeof(__uint128_t)); + memcpy(fpsimd->vregs, fpregs->fpstate.vregs, 32 * sizeof(__uint128_t)); - fpsimd->fpsr = fpregs->fpsr; - fpsimd->fpcr = fpregs->fpcr; + fpsimd->fpsr = fpregs->fpstate.fpsr; + fpsimd->fpcr = fpregs->fpstate.fpcr; fpsimd->head.magic = FPSIMD_MAGIC; fpsimd->head.size = sizeof(*fpsimd); + if (__compel_gcs_enabled(&fpregs->gcs)) { + gcs->head.magic = GCS_MAGIC; + gcs->head.size = sizeof(*gcs); + gcs->reserved = 0; + gcs->gcspr = fpregs->gcs.gcspr_el0 - 8; + gcs->features_enabled = fpregs->gcs.features_enabled; + + pr_debug("sigframe gcspr=%llx features_enabled=%llx\n", fpregs->gcs.gcspr_el0 - 8, fpregs->gcs.features_enabled); + } else { + pr_debug("sigframe gcspr=[disabled]\n"); + memset(gcs, 0, sizeof(*gcs)); + } + return 0; } @@ -59,7 +94,6 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t tmp, *fpsimd = ext_regs ? ext_regs : &tmp; struct iovec iov; int ret; @@ -72,14 +106,28 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct goto err; } - iov.iov_base = fpsimd; - iov.iov_len = sizeof(*fpsimd); + iov.iov_base = &ext_regs->fpstate; + iov.iov_len = sizeof(ext_regs->fpstate); if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { pr_perror("Failed to obtain FPU registers for %d", pid); goto err; } - ret = save(arg, regs, fpsimd); + memset(&ext_regs->gcs, 0, sizeof(ext_regs->gcs)); + + iov.iov_base = &ext_regs->gcs; + iov.iov_len = sizeof(ext_regs->gcs); + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &iov) == 0) { + pr_info("gcs: GCSPR_EL0 for %d: 0x%llx, features: 0x%llx\n", + pid, ext_regs->gcs.gcspr_el0, ext_regs->gcs.features_enabled); + + if (!__compel_gcs_enabled(&ext_regs->gcs)) + pr_info("gcs: GCS is NOT enabled\n"); + } else { + pr_info("gcs: GCS state not available for %d\n", pid); + } + + ret = save(pid, arg, regs, ext_regs); err: return ret; } @@ -88,14 +136,44 @@ int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) { struct iovec iov; + struct cr_user_gcs gcs; + struct iovec gcs_iov = { .iov_base = &gcs, .iov_len = sizeof(gcs) }; + pr_info("Restoring GP/FPU registers for %d\n", pid); - iov.iov_base = ext_regs; - iov.iov_len = sizeof(*ext_regs); + iov.iov_base = &ext_regs->fpstate; + iov.iov_len = sizeof(ext_regs->fpstate); if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { pr_perror("Failed to set FPU registers for %d", pid); return -1; } + + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &gcs_iov) < 0) { + pr_warn("gcs: Failed to get GCS for %d\n", pid); + } else { + ext_regs->gcs = gcs; + compel_set_task_gcs_regs(pid, ext_regs); + } + + return 0; +} + +int compel_set_task_gcs_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + struct iovec iov; + + pr_info("gcs: restoring GCS registers for %d\n", pid); + pr_info("gcs: restoring GCS: gcspr=%llx features=%llx\n", + ext_regs->gcs.gcspr_el0, ext_regs->gcs.features_enabled); + + iov.iov_base = &ext_regs->gcs; + iov.iov_len = sizeof(ext_regs->gcs); + + if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_GCS, &iov)) { + pr_perror("gcs: Failed to set GCS registers for %d", pid); + return -1; + } + return 0; } @@ -176,3 +254,176 @@ unsigned long compel_task_size(void) break; return task_size; } + +static struct hwbp_cap *ptrace_get_hwbp_cap(pid_t pid) +{ + static struct hwbp_cap info; + static int available = -1; + + if (available == -1) { + unsigned int val; + struct iovec iovec = { + .iov_base = &val, + .iov_len = sizeof(val), + }; + + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_HW_BREAK, &iovec) < 0) + available = 0; + else { + info.arch = (char)((val >> 8) & 0xff); + info.bp_count = (char)(val & 0xff); + + available = (info.arch != 0); + } + } + + return available == 1 ? &info : NULL; +} + +int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + k_rtsigset_t block; + struct hwbp_cap *info = ptrace_get_hwbp_cap(pid); + struct user_hwdebug_state regs = {}; + unsigned int ctrl = 0; + struct iovec iovec; + + if (info == NULL || info->bp_count == 0) + return 0; + + /* + * The struct is copied from `arch/arm64/include/asm/hw_breakpoint.h` in + * linux kernel: + * struct arch_hw_breakpoint_ctrl { + * __u32 __reserved : 19, + * len : 8, + * type : 2, + * privilege : 2, + * enabled : 1; + * }; + * + * The part of `struct arch_hw_breakpoint_ctrl` bits meaning is defined + * in <>, + * D13.3.2 DBGBCR_EL1, Debug Breakpoint Control Registers. + */ + ctrl = ARM_BREAKPOINT_LEN_4; + ctrl = (ctrl << 2) | ARM_BREAKPOINT_EXECUTE; + ctrl = (ctrl << 2) | AARCH64_BREAKPOINT_EL0; + ctrl = (ctrl << 1) | ENABLE_HBP; + regs.dbg_regs[0].addr = (__u64)addr; + regs.dbg_regs[0].ctrl = ctrl; + iovec.iov_base = ®s; + iovec.iov_len = (offsetof(struct user_hwdebug_state, dbg_regs) + sizeof(regs.dbg_regs[0])); + + if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_HW_BREAK, &iovec)) + return -1; + + /* + * FIXME(issues/1429): SIGTRAP can't be blocked, otherwise its handler + * will be reset to the default one. + */ + ksigfillset(&block); + ksigdelset(&block, SIGTRAP); + if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { + pr_perror("Can't block signals for %d", pid); + return -1; + } + + if (ptrace(PTRACE_CONT, pid, NULL, NULL) != 0) { + pr_perror("Unable to restart the stopped tracee process %d", pid); + return -1; + } + + return 1; +} + +int ptrace_flush_breakpoints(pid_t pid) +{ + struct hwbp_cap *info = ptrace_get_hwbp_cap(pid); + struct user_hwdebug_state regs = {}; + unsigned int ctrl = 0; + struct iovec iovec; + + if (info == NULL || info->bp_count == 0) + return 0; + + ctrl = ARM_BREAKPOINT_LEN_4; + ctrl = (ctrl << 2) | ARM_BREAKPOINT_EXECUTE; + ctrl = (ctrl << 2) | AARCH64_BREAKPOINT_EL0; + ctrl = (ctrl << 1) | DISABLE_HBP; + regs.dbg_regs[0].addr = 0ul; + regs.dbg_regs[0].ctrl = ctrl; + + iovec.iov_base = ®s; + iovec.iov_len = (offsetof(struct user_hwdebug_state, dbg_regs) + sizeof(regs.dbg_regs[0])); + + if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_HW_BREAK, &iovec)) + return -1; + + return 0; +} + +int inject_gcs_cap_token(struct parasite_ctl *ctl, pid_t pid, struct cr_user_gcs *gcs) +{ + struct iovec gcs_iov = { .iov_base = gcs, .iov_len = sizeof(*gcs) }; + + uint64_t token_addr = gcs->gcspr_el0 - 8; + uint64_t sigtramp_addr = gcs->gcspr_el0 - 16; + + uint64_t cap_token = ALIGN_DOWN(GCS_SIGNAL_CAP(token_addr), 8); + unsigned long restorer_addr; + + pr_info("gcs: (setup) CAP token: 0x%lx at addr: 0x%lx\n", cap_token, token_addr); + + /* Inject capability token at gcspr_el0 - 8 */ + if (ptrace(PTRACE_POKEDATA, pid, (void *)token_addr, cap_token)) { + pr_perror("gcs: (setup) Inject GCS cap token failed"); + return -1; + } + + /* Inject restorer trampoline address (gcspr_el0 - 16) */ + restorer_addr = ctl->parasite_ip; + if (ptrace(PTRACE_POKEDATA, pid, (void *)sigtramp_addr, restorer_addr)) { + pr_perror("gcs: (setup) Inject GCS restorer failed"); + return -1; + } + + /* Update GCSPR_EL0 */ + gcs->gcspr_el0 = token_addr; + if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_GCS, &gcs_iov)) { + pr_perror("gcs: PTRACE_SETREGS FAILED"); + return -1; + } + + pr_debug("gcs: parasite_ip=%#lx sp=%#llx gcspr_el0=%#llx\n", + ctl->parasite_ip, ctl->orig.regs.sp, gcs->gcspr_el0); + + return 0; +} + +int parasite_setup_shstk(struct parasite_ctl *ctl, user_fpregs_struct_t *ext_regs) +{ + struct cr_user_gcs gcs; + struct iovec gcs_iov = { .iov_base = &gcs, .iov_len = sizeof(gcs) }; + pid_t pid = ctl->rpid; + + if(!__compel_host_supports_gcs()) + return 0; + + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &gcs_iov) != 0) { + pr_perror("GCS state not available for %d", pid); + return -1; + } + + if (!__compel_gcs_enabled(&gcs)) + return 0; + + if (inject_gcs_cap_token(ctl, pid, &gcs)) { + pr_perror("Failed to inject GCS cap token for %d", pid); + return -1; + } + + pr_info("gcs: GCS enabled for %d\n", pid); + + return 0; +} diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index 8bcc3cc50..f4deb02b2 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -39,7 +39,7 @@ recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, str sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags) recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags) shutdown 210 293 (int sockfd, int how) -bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen) +bind 200 282 (int sockfd, const struct sockaddr *addr, int addrlen) setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen) getsockopt 209 295 (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) @@ -85,7 +85,7 @@ timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimer timer_gettime 108 259 (int timer_id, const struct itimerspec *setting) timer_getoverrun 109 260 (int timer_id) timer_delete 111 261 (kernel_timer_t timer_id) -clock_gettime 113 263 (const clockid_t which_clock, const struct timespec *tp) +clock_gettime 113 263 (clockid_t which_clock, struct timespec *tp) exit_group 94 248 (int error_code) set_robust_list 99 338 (struct robust_list_head *head, size_t len) get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) @@ -118,7 +118,10 @@ fsopen 430 430 (char *fsname, unsigned int flags) fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) clone3 435 435 (struct clone_args *uargs, size_t size) +close_range 436 436 (unsigned int fd, unsigned int max_fd, unsigned int flags) pidfd_open 434 434 (pid_t pid, unsigned int flags) openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size) pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) rseq 293 398 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +membarrier 283 389 (int cmd, unsigned int flags, int cpu_id) +map_shadow_stack 453 ! (unsigned long addr, unsigned long size, unsigned int flags) \ No newline at end of file diff --git a/compel/arch/arm/src/lib/infect.c b/compel/arch/arm/src/lib/infect.c index 7700f52ca..a9fb639e2 100644 --- a/compel/arch/arm/src/lib/infect.c +++ b/compel/arch/arm/src/lib/infect.c @@ -65,10 +65,9 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr } #define PTRACE_GETVFPREGS 27 -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *vfp, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t tmp, *vfp = ext_regs ? ext_regs : &tmp; int ret = -1; pr_info("Dumping GP/FPU registers for %d\n", pid); @@ -95,7 +94,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct } } - ret = save(arg, regs, vfp); + ret = save(pid, arg, regs, vfp); err: return ret; } diff --git a/compel/arch/loongarch64/plugins/include/asm/prologue.h b/compel/arch/loongarch64/plugins/include/asm/prologue.h new file mode 100644 index 000000000..c19ce54d7 --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/asm/prologue.h @@ -0,0 +1,35 @@ +#ifndef __ASM_PROLOGUE_H__ +#define __ASM_PROLOGUE_H__ + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +#include + +#define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) + +typedef struct prologue_init_args { + struct sockaddr_un ctl_sock_addr; + unsigned int ctl_sock_addr_len; + + unsigned int arg_s; + void *arg_p; + + void *sigframe; +} prologue_init_args_t; + +#endif /* __ASSEMBLY__ */ + +/* + * Reserve enough space for sigframe. + * + * FIXME It is rather should be taken from sigframe header. + */ +#define PROLOGUE_SGFRAME_SIZE 4096 + +#define PROLOGUE_INIT_ARGS_SIZE 1024 + +#endif /* __ASM_PROLOGUE_H__ */ diff --git a/compel/arch/loongarch64/plugins/include/asm/syscall-types.h b/compel/arch/loongarch64/plugins/include/asm/syscall-types.h new file mode 100644 index 000000000..b883bd8be --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/asm/syscall-types.h @@ -0,0 +1,30 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +#include +/* Types for sigaction, sigprocmask syscalls */ +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +/* refer to arch/loongarch/include/uapi/asm/signal.h */ +#define _KNSIG 64 +#define _NSIG_BPW BITS_PER_LONG +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +typedef struct { + uint64_t sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t; + +#define SA_RESTORER 0x04000000 + +#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ diff --git a/compel/arch/loongarch64/plugins/include/features.h b/compel/arch/loongarch64/plugins/include/features.h new file mode 100644 index 000000000..b4a3cded2 --- /dev/null +++ b/compel/arch/loongarch64/plugins/include/features.h @@ -0,0 +1,4 @@ +#ifndef __COMPEL_ARCH_FEATURES_H +#define __COMPEL_ARCH_FEATURES_H + +#endif /* __COMPEL_ARCH_FEATURES_H */ diff --git a/compel/arch/loongarch64/plugins/std/parasite-head.S b/compel/arch/loongarch64/plugins/std/parasite-head.S new file mode 100644 index 000000000..3a960490e --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/parasite-head.S @@ -0,0 +1,9 @@ + +#include "common/asm/linkage.h" + + .section .head.text, "ax" +ENTRY(__export_parasite_head_start) + bl parasite_service; + break 0; +END(__export_parasite_head_start) + diff --git a/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls b/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 000000000..0d08f34e1 --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1,117 @@ +std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/syscalls-64.o +sys-proto-types := $(obj)/include/uapi/std/syscall-types.h +sys-proto-generic := $(obj)/include/uapi/std/syscall.h +sys-codes-generic := $(obj)/include/uapi/std/syscall-codes.h +sys-codes = $(obj)/include/uapi/std/syscall-codes-$(1).h +sys-proto = $(obj)/include/uapi/std/syscall-$(1).h +sys-def = $(PLUGIN_ARCH_DIR)/std/syscalls/syscall_$(1).tbl +sys-asm = $(PLUGIN_ARCH_DIR)/std/syscalls-$(1).S +sys-asm-common-name = std/syscalls/syscall-common-loongarch-$(1).S +sys-asm-common = $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) +sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h +sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl-$(1).c + +sys-bits := 64 + +AV := $$$$ + +define gen-rule-sys-codes +$(sys-codes): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_CODES_H_$(1)__" >> $$@ + $(Q) cat $$< | awk '/^__NR/{SYSN=$(AV)1; \ + sub("^__NR", "SYS", SYSN); \ + print "\n#ifndef ", $(AV)1; \ + print "#define", $(AV)1, $(AV)2; \ + print "#endif"; \ + print "\n#ifndef ", SYSN; \ + print "#define ", SYSN, $(AV)1; \ + print "#endif";}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_CODES_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-proto +$(sys-proto): $(sys-def) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo "#ifndef ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo "#define ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include ' >> $$@ +ifeq ($(1),32) + $(Q) echo '#include "asm/syscall32.h"' >> $$@ +endif + $(Q) cat $$< | awk '/^__NR/{print "extern long", $(AV)3, \ + substr($(AV)0, index($(AV)0,$(AV)4)), ";"}' >> $$@ + $(Q) echo "#endif /* ASM_SYSCALL_PROTO_H_$(1)__ */" >> $$@ +endef + +define gen-rule-sys-asm +$(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) echo '#include ' >> $$@ + $(Q) echo '#include "$(sys-asm-common-name)"' >> $$@ + $(Q) cat $$< | awk '/^__NR/{print "SYSCALL(", $(AV)3, ",", $(AV)2, ")"}' >> $$@ +endef + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) $(sys-proto-types) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat $$< | awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ +endef + +$(sys-codes-generic): $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) cat $< | awk '/^__NR/{NR32=$$1; \ + sub("^__NR", "__NR32", NR32); \ + print "\n#ifndef ", NR32; \ + print "#define ", NR32, $$2; \ + print "#endif";}' >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ +mrproper-y += $(sys-codes-generic) + +$(sys-proto-generic): $(strip $(call map,sys-proto,$(sys-bits))) $(sys-proto-types) + $(call msg-gen, $@) + $(Q) echo "/* Autogenerated, don't edit */" > $@ + $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ + $(Q) echo "" >> $@ + $(Q) echo '#include ' >> $@ + $(Q) echo "" >> $@ + $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ +mrproper-y += $(sys-proto-generic) + +define gen-rule-sys-exec-tbl +$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) + $(call msg-gen, $$@) + $(Q) echo "/* Autogenerated, don't edit */" > $$@ + $(Q) cat $$< | awk '/^__NR/{print \ + "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ +endef + +$(eval $(call map,gen-rule-sys-codes,$(sys-bits))) +$(eval $(call map,gen-rule-sys-proto,$(sys-bits))) +$(eval $(call map,gen-rule-sys-asm,$(sys-bits))) +$(eval $(call map,gen-rule-sys-exec-tbl,$(sys-bits))) + +$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h + $(call msg-gen, $@) + $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) + +std-headers-deps += $(call sys-codes,$(sys-bits)) +std-headers-deps += $(call sys-proto,$(sys-bits)) +std-headers-deps += $(call sys-asm,$(sys-bits)) +std-headers-deps += $(call sys-exec-tbl,$(sys-bits)) +std-headers-deps += $(sys-codes-generic) +std-headers-deps += $(sys-proto-generic) +std-headers-deps += $(sys-asm-types) +mrproper-y += $(std-headers-deps) diff --git a/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S b/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S new file mode 100644 index 000000000..fff894466 --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S @@ -0,0 +1,44 @@ +#include "common/asm/linkage.h" + +#define SYSCALL(name, opcode) \ +ENTRY(name); \ + addi.d $a7, $zero, opcode; \ + syscall 0; \ + jirl $r0, $r1, 0; \ +END(name) + +#ifndef AT_FDCWD +#define AT_FDCWD -100 +#endif + +#ifndef AT_REMOVEDIR +#define AT_REMOVEDIR 0x200 +#endif + +ENTRY(sys_open) + or $a3, $zero, $a2 + or $a2, $zero, $a1 + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_openat +END(sys_open) + +ENTRY(sys_mkdir) + or $a3, $zero, $a2 + or $a2, $zero, $a1 + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_mkdirat +END(sys_mkdir) + +ENTRY(sys_rmdir) + addi.d $a2, $zero, AT_REMOVEDIR + or $a1, $zero, $a0 + addi.d $a0, $zero, AT_FDCWD + b sys_unlinkat +END(sys_rmdir) + +ENTRY(__cr_restore_rt) + addi.d $a7, $zero, __NR_rt_sigreturn + syscall 0 +END(__cr_restore_rt) diff --git a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl new file mode 100644 index 000000000..83dcdab4a --- /dev/null +++ b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl @@ -0,0 +1,122 @@ +# +# System calls table, please make sure the table consist only the syscalls +# really used somewhere in project. +# from kernel/linux-3.10.84/arch/mips/include/uapi/asm/unistd.h Linux 64-bit syscalls are in the range from 5000 to 5999. +# +# __NR_name code name arguments +# ------------------------------------------------------------------------------------------------------------------------------------------------------------- +__NR_io_setup 0 sys_io_setup (unsigned nr_events, aio_context_t *ctx) +__NR_io_submit 2 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) +__NR_io_getevents 4 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +__NR_fcntl 25 sys_fcntl (int fd, int type, long arg) +__NR_ioctl 29 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) +__NR_flock 32 sys_flock (int fd, unsigned long cmd) +__NR_mkdirat 34 sys_mkdirat (int dfd, const char *pathname, int flag) +__NR_unlinkat 35 sys_unlinkat (int dfd, const char *pathname, int flag) +__NR_umount2 39 sys_umount2 (char *name, int flags) +__NR_mount 40 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +__NR_fallocate 47 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) +__NR_close 57 sys_close (int fd) +__NR_openat 56 sys_openat (int dfd, const char *filename, int flags, int mode) +__NR_lseek 62 sys_lseek (int fd, unsigned long offset, unsigned long origin) +__NR_read 63 sys_read (int fd, void *buf, unsigned long count) +__NR_write 64 sys_write (int fd, const void *buf, unsigned long count) +__NR_pread64 67 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) +__NR_preadv 69 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +__NR_ppoll 73 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_signalfd4 74 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +__NR_vmsplice 75 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +__NR_readlinkat 78 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) +__NR_timerfd_settime 86 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +__NR_capget 90 sys_capget (struct cap_header *h, struct cap_data *d) +__NR_capset 91 sys_capset (struct cap_header *h, struct cap_data *d) +__NR_personality 92 sys_personality (unsigned int personality) +__NR_exit 93 sys_exit (unsigned long error_code) +__NR_exit_group 94 sys_exit_group (int error_code) +__NR_waitid 95 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +__NR_set_tid_address 96 sys_set_tid_address (int *tid_addr) +__NR_futex 98 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_set_robust_list 99 sys_set_robust_list (struct robust_list_head *head, size_t len) +__NR_get_robust_list 100 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +__NR_nanosleep 101 sys_nanosleep (struct timespec *req, struct timespec *rem) +__NR_getitimer 102 sys_getitimer (int which, const struct itimerval *val) +__NR_setitimer 103 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) +__NR_sys_timer_create 107 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +__NR_sys_timer_gettime 108 sys_timer_gettime (int timer_id, const struct itimerspec *setting) +__NR_sys_timer_getoverrun 109 sys_timer_getoverrun (int timer_id) +__NR_sys_timer_settime 110 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +__NR_sys_timer_delete 111 sys_timer_delete (kernel_timer_t timer_id) +__NR_clock_gettime 113 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) +__NR_sched_setscheduler 119 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) +__NR_restart_syscall 128 sys_restart_syscall (void) +__NR_kill 129 sys_kill (long pid, int sig) +__NR_sigaltstack 132 sys_sigaltstack (const void *uss, void *uoss) +__NR_rt_sigaction 134 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +__NR_rt_sigprocmask 135 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +__NR_rt_sigqueueinfo 138 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) +__NR_rt_sigreturn 139 sys_rt_sigreturn (void) +__NR_setpriority 140 sys_setpriority (int which, int who, int nice) +__NR_setresuid 147 sys_setresuid (int uid, int euid, int suid) +__NR_getresuid 148 sys_getresuid (int *uid, int *euid, int *suid) +__NR_setresgid 149 sys_setresgid (int gid, int egid, int sgid) +__NR_getresgid 150 sys_getresgid (int *gid, int *egid, int *sgid) +__NR_getpgid 155 sys_getpgid (pid_t pid) +__NR_setfsuid 151 sys_setfsuid (int fsuid) +__NR_setfsgid 152 sys_setfsgid (int fsgid) +__NR_getsid 156 sys_getsid (void) +__NR_getgroups 158 sys_getgroups (int gsize, unsigned int *groups) +__NR_setgroups 159 sys_setgroups (int gsize, unsigned int *groups) +__NR_setrlimit 164 sys_setrlimit (int resource, struct krlimit *rlim) +__NR_umask 166 sys_umask (int mask) +__NR_prctl 167 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +__NR_gettimeofday 169 sys_gettimeofday (struct timeval *tv, struct timezone *tz) +__NR_getpid 172 sys_getpid (void) +__NR_ptrace 177 sys_ptrace (long request, pid_t pid, void *addr, void *data) +__NR_gettid 178 sys_gettid (void) +__NR_shmat 196 sys_shmat (int shmid, void *shmaddr, int shmflag) +__NR_socket 198 sys_socket (int domain, int type, int protocol) +__NR_bind 200 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) +__NR_connect 203 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) +__NR_sendto 206 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +__NR_recvfrom 207 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +__NR_setsockopt 208 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +__NR_getsockopt 209 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) +__NR_shutdown 210 sys_shutdown (int sockfd, int how) +__NR_sendmsg 211 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) +__NR_recvmsg 212 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) +__NR_brk 214 sys_brk (void *addr) +__NR_munmap 215 sys_munmap (void *addr, unsigned long len) +__NR_mremap 216 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) +__NR_clone 220 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +__NR_mmap 222 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +__NR_mprotect 226 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) +__NR_mincore 232 sys_mincore (void *addr, unsigned long size, unsigned char *vec) +__NR_madvise 233 sys_madvise (unsigned long start, size_t len, int behavior) +__NR_rt_tgsigqueueinfo 240 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +__NR_wait4 260 sys_wait4 (int pid, int *status, int options, struct rusage *ru) +__NR_fanotify_init 262 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) +__NR_fanotify_mark 263 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +__NR_open_by_handle_at 265 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) +__NR_setns 268 sys_setns (int fd, int nstype) +__NR_kcmp 272 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +__NR_seccomp 277 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) +__NR_memfd_create 279 sys_memfd_create (const char *name, unsigned int flags) +__NR_userfaultfd 282 sys_userfaultfd (int flags) +__NR_membarrier 283 sys_membarrier (int cmd, unsigned int flags, int cpu_id) +__NR_rseq 293 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) +__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) +__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) +#__NR_dup2 ! sys_dup2 (int oldfd, int newfd) +#__NR_rmdir ! sys_rmdir (const char *name) +#__NR_unlink ! sys_unlink (char *pathname) +#__NR_cacheflush ! sys_cacheflush (char *addr, int nbytes, int cache) +#__NR_set_thread_area ! sys_set_thread_area (unsigned long *addr) +#__NR_mkdir ! sys_mkdir (const char *name, int mode) +#__NR_open ! sys_open (const char *filename, unsigned long flags, unsigned long mode) diff --git a/compel/arch/loongarch64/scripts/compel-pack.lds.S b/compel/arch/loongarch64/scripts/compel-pack.lds.S new file mode 100644 index 000000000..cfb7a2fb3 --- /dev/null +++ b/compel/arch/loongarch64/scripts/compel-pack.lds.S @@ -0,0 +1,32 @@ +OUTPUT_ARCH(loongarch) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .crblob 0x0 : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + . = ALIGN(32); + *(.data*) + . = ALIGN(32); + *(.rodata*) + . = ALIGN(32); + *(.bss*) + . = ALIGN(32); + *(.got*) + . = ALIGN(32); + *(.toc*) + . = ALIGN(32); + } =0x00000000, + + /DISCARD/ : { + *(.debug*) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + *(*) + } +} diff --git a/compel/arch/loongarch64/src/lib/cpu.c b/compel/arch/loongarch64/src/lib/cpu.c new file mode 100644 index 000000000..172b90e27 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/cpu.c @@ -0,0 +1,41 @@ +#include +#include + +#include "compel-cpu.h" +#include "common/bitops.h" +#include "common/compiler.h" +#include "log.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +static compel_cpuinfo_t rt_info; +static bool rt_info_done = false; + +void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ +} + +void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ +} + +int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) +{ + return 0; +} + +int compel_cpuid(compel_cpuinfo_t *c) +{ + return 0; +} + +bool compel_cpu_has_feature(unsigned int feature) +{ + if (!rt_info_done) { + compel_cpuid(&rt_info); + rt_info_done = true; + } + + return compel_test_cpu_cap(&rt_info, feature); +} diff --git a/compel/arch/loongarch64/src/lib/handle-elf-host.c b/compel/arch/loongarch64/src/lib/handle-elf-host.c new file mode 100644 index 000000000..a605a5a45 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/handle-elf-host.c @@ -0,0 +1,22 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +extern int __handle_elf(void *mem, size_t size); + +int handle_binary(void *mem, size_t size) +{ + if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) + return __handle_elf(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} diff --git a/compel/arch/loongarch64/src/lib/handle-elf.c b/compel/arch/loongarch64/src/lib/handle-elf.c new file mode 100644 index 000000000..a605a5a45 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/handle-elf.c @@ -0,0 +1,22 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +extern int __handle_elf(void *mem, size_t size); + +int handle_binary(void *mem, size_t size) +{ + if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) + return __handle_elf(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} diff --git a/compel/arch/loongarch64/src/lib/include/handle-elf.h b/compel/arch/loongarch64/src/lib/include/handle-elf.h new file mode 100644 index 000000000..b0a66ef87 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/handle-elf.h @@ -0,0 +1,8 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf64-types.h" + +#define arch_is_machine_supported(e_machine) (e_machine == EM_LOONGARCH) + +#endif /* COMPEL_HANDLE_ELF_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/syscall.h b/compel/arch/loongarch64/src/lib/include/syscall.h new file mode 100644 index 000000000..ac3e2799a --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/syscall.h @@ -0,0 +1,8 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ + +#ifndef SIGSTKFLT +#define SIGSTKFLT 16 +#endif + +#endif diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 000000000..21eb1309f --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,6 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT +extern int ptrace_set_breakpoint(pid_t pid, void *addr); +extern int ptrace_flush_breakpoints(pid_t pid); +#endif diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 000000000..e568df789 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_CPU_H__ +#define __CR_ASM_CPU_H__ + +typedef struct { +} compel_cpuinfo_t; +#endif /* __CR_ASM_CPU_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 000000000..7f476d541 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,4 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + +#endif /* __CR_ASM_FPU_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 000000000..0b047a5b0 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,67 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +/* + * From the Linux kernel header arch/loongarch/include/uapi/asm/ptrace.h + * + * A thread LoongArch CPU context + * + * struct user_fp_state { + * uint64_t fpr[32]; + * uint64_t fcc; + * uint32_t fcsr; + * }; + * + * struct user_pt_regs { + * unsigned long regs[32]; + * unsigned long csr_era; + * unsigned long csr_badv; + * unsigned long reserved[11]; + * }; + */ + +struct user_gp_regs { + uint64_t regs[32]; + uint64_t orig_a0; + uint64_t pc; + uint64_t csr_badv; + uint64_t reserved[10]; +} __attribute__((aligned(8))); + +struct user_fp_regs { + uint64_t regs[32]; + uint64_t fcc; + uint32_t fcsr; +}; + +typedef struct user_gp_regs user_regs_struct_t; +typedef struct user_fp_regs user_fpregs_struct_t; + +#define user_regs_native(regs) true + +#define __compel_arch_fetch_thread_area(tid, th) 0 +#define compel_arch_fetch_thread_area(tctl) 0 +#define compel_arch_get_tls_task(ctl, tls) +#define compel_arch_get_tls_thread(tctl, tls) + +#define REG_RES(r) ((uint64_t)(r).regs[4]) +#define REG_IP(r) ((uint64_t)(r).pc) +#define REG_SP(r) ((uint64_t)(r).regs[3]) +#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[11]) +#define SET_REG_IP(r, val) ((r).pc = (val)) + +#define GPR_NUM 32 +#define FPR_NUM 32 + +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 000000000..fcb545a1d --- /dev/null +++ b/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,86 @@ +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include +#include +#include + +#include +#include + +#include + +#define rt_sigcontext sigcontext +/* sigcontext defined in usr/include/uapi/asm/sigcontext.h*/ +#include +typedef __u32 u32; + +typedef struct sigcontext_t { + __u64 pc; + __u64 regs[32]; + __u32 flags; + __u64 extcontext[0] __attribute__((__aligned__(16))); +} sigcontext_t; + +typedef struct context_info_t { + __u32 magic; + __u32 size; + __u64 padding; +} context_info_t; + +#define FPU_CTX_MAGIC 0x46505501 +#define FPU_CTX_ALIGN 8 +typedef struct fpu_context_t { + __u64 regs[32]; + __u64 fcc; + __u64 fcsr; +} fpu_context_t; + +typedef struct ucontext { + unsigned long uc_flags; + struct ucontext *uc_link; + stack_t uc_stack; + sigset_t uc_sigmask; + __u8 __unused[1024 / 8 - sizeof(sigset_t)]; + sigcontext_t uc_mcontext; +} ucontext; + +/* Copy from the kernel source arch/loongarch/kernel/signal.c */ +struct rt_sigframe { + rt_siginfo_t rs_info; + ucontext rs_uc; +}; + +#define RT_SIGFRAME_UC(rt_sigframe) (&(rt_sigframe->rs_uc)) +#define RT_SIGFRAME_SIGMASK(rt_sigframe) ((k_rtsigset_t *)&RT_SIGFRAME_UC(rt_sigframe)->uc_sigmask) +#define RT_SIGFRAME_SIGCTX(rt_sigframe) (&(RT_SIGFRAME_UC(rt_sigframe)->uc_mcontext)) +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(RT_SIGFRAME_SIGCTX(rt_sigframe)->pc)) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) + +#define RT_SIGFRAME_FPU(rt_sigframe) \ + ({ \ + context_info_t *ctx = (context_info_t *)RT_SIGFRAME_SIGCTX(rt_sigframe)->extcontext; \ + ctx->magic = FPU_CTX_MAGIC; \ + ctx->size = sizeof(context_info_t) + sizeof(fpu_context_t); \ + (fpu_context_t *)((char *)ctx + sizeof(context_info_t)); \ + }) + +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 + +/* clang-format off */ +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ + asm volatile( \ + "addi.d $sp, %0, 0 \n" \ + "addi.d $a7, $zero, "__stringify(__NR_rt_sigreturn)" \n" \ + "syscall 0" \ + : \ + :"r"(new_sp) \ + : "$a7", "memory") +/* clang-format on */ + +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe); + +#define rt_sigframe_erase_sigset(sigframe) memset(RT_SIGFRAME_SIGMASK(sigframe), 0, sizeof(k_rtsigset_t)) +#define rt_sigframe_copy_sigset(sigframe, from) memcpy(RT_SIGFRAME_SIGMASK(sigframe), from, sizeof(k_rtsigset_t)) + +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ diff --git a/compel/arch/loongarch64/src/lib/infect.c b/compel/arch/loongarch64/src/lib/infect.c new file mode 100644 index 000000000..190c39227 --- /dev/null +++ b/compel/arch/loongarch64/src/lib/infect.c @@ -0,0 +1,204 @@ +#include +#include +#include +#include +#include + +#include +#include +#include "errno.h" +#include +#include +#include "common/err.h" +#include "common/page.h" +#include "asm/infect-types.h" +#include "ptrace.h" +#include "infect.h" +#include "infect-priv.h" +#include "log.h" +#include "common/bug.h" + +/* + * Injected syscall instruction + * loongarch64 is Little Endian + */ +const char code_syscall[] = { + 0x00, 0x00, 0x2b, 0x00, /* syscall */ + 0x00, 0x00, 0x2a, 0x00 /* break */ +}; + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + sigcontext_t *sc; + fpu_context_t *fpu; + + sc = RT_SIGFRAME_SIGCTX(sigframe); + memcpy(sc->regs, regs->regs, sizeof(regs->regs)); + sc->pc = regs->pc; + + fpu = RT_SIGFRAME_FPU(sigframe); + memcpy(fpu->regs, fpregs->regs, sizeof(fpregs->regs)); + fpu->fcc = fpregs->fcc; + fpu->fcsr = fpregs->fcsr; + return 0; +} + +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} + +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) +{ + user_fpregs_struct_t tmp, *fpregs = ext_regs ? ext_regs : &tmp; + struct iovec iov; + int ret; + + pr_info("Dumping GP/FPU registers for %d\n", pid); + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))) { + pr_perror("Failed to obtain CPU registers for %d", pid); + goto err; + } + + /* + * Refer to Linux kernel arch/loongarch/kernel/signal.c + */ + if (regs->regs[0]) { + switch (regs->regs[4]) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + regs->regs[4] = regs->orig_a0; + regs->pc -= 4; + break; + case -ERESTART_RESTARTBLOCK: + regs->regs[4] = regs->orig_a0; + regs->regs[11] = __NR_restart_syscall; + regs->pc -= 4; + break; + } + regs->regs[0] = 0; /* Don't deal with this again. */ + } + + iov.iov_base = fpregs; + iov.iov_len = sizeof(user_fpregs_struct_t); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { + pr_perror("Failed to obtain FPU registers for %d", pid); + goto err; + } + + ret = save(pid, arg, regs, fpregs); +err: + return 0; +} + +int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + struct iovec iov; + + pr_info("Restoring GP/FPU registers for %d\n", pid); + + iov.iov_base = ext_regs; + iov.iov_len = sizeof(*ext_regs); + if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { + pr_perror("Failed to set FPU registers for %d", pid); + return -1; + } + return 0; +} + +/* + * Registers $4 ~ $11 represents arguments a0 ~ a7, especially a7 is + * used as syscall number. + */ +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) +{ + int err; + user_regs_struct_t regs = ctl->orig.regs; + + regs.regs[11] = (unsigned long)nr; + regs.regs[4] = arg1; + regs.regs[5] = arg2; + regs.regs[6] = arg3; + regs.regs[7] = arg4; + regs.regs[8] = arg5; + regs.regs[9] = arg6; + err = compel_execute_syscall(ctl, ®s, code_syscall); + + *ret = regs.regs[4]; + + return err; +} + +void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + long map; + int err; + + err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset >> PAGE_SHIFT); + + if (err < 0 || IS_ERR_VALUE(map)) { + pr_err("remote mmap() failed: %s\n", strerror(-map)); + return NULL; + } + + return (void *)map; +} + +/* + * regs must be inited when calling this function from original context + */ +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->pc = new_ip; + if (stack) + regs->regs[4] = (unsigned long)stack; +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + return true; +} + +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + long ret; + int err; + + err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->rs_uc.uc_stack, 0, 0, 0, 0); + return err ? err : ret; +} + +/* + * TODO: add feature + */ +int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +/* + * Refer to Linux kernel arch/loongarch/include/asm/processor.h + */ +#define TASK_SIZE32 (1UL) << 31 +#define TASK_SIZE64_MIN (1UL) << 40 +#define TASK_SIZE64_MAX (1UL) << 48 + +unsigned long compel_task_size(void) +{ + unsigned long task_size; + for (task_size = TASK_SIZE64_MIN; task_size < TASK_SIZE64_MAX; task_size <<= 1) + if (munmap((void *)task_size, page_size())) + break; + return task_size; +} diff --git a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl index 505ec849d..ad3d44634 100644 --- a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl @@ -84,7 +84,7 @@ __NR_sys_timer_settime 5217 sys_timer_settime (kernel_timer_t timer_id, int fl __NR_sys_timer_gettime 5218 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 5219 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 5220 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 5222 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_clock_gettime 5222 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) __NR_exit_group 5205 sys_exit_group (int error_code) __NR_set_thread_area 5242 sys_set_thread_area (unsigned long *addr) __NR_openat 5247 sys_openat (int dfd, const char *filename, int flags, int mode) @@ -115,7 +115,9 @@ __NR_fsopen 5430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 5431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 5432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 5435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 5436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 5434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 5437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 5438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 5327 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 5318 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/mips/src/lib/handle-elf.c b/compel/arch/mips/src/lib/handle-elf.c index a605a5a45..e086761c2 100644 --- a/compel/arch/mips/src/lib/handle-elf.c +++ b/compel/arch/mips/src/lib/handle-elf.c @@ -5,18 +5,31 @@ #include "piegen.h" #include "log.h" -static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -}; - extern int __handle_elf(void *mem, size_t size); int handle_binary(void *mem, size_t size) { - if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) - return __handle_elf(mem, size); + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)mem; - pr_err("Unsupported Elf format detected\n"); - return -EINVAL; + /* check ELF magic */ + if (ehdr->e_ident[EI_MAG0] != ELFMAG0 || + ehdr->e_ident[EI_MAG1] != ELFMAG1 || + ehdr->e_ident[EI_MAG2] != ELFMAG2 || + ehdr->e_ident[EI_MAG3] != ELFMAG3) { + pr_err("Invalid ELF magic\n"); + return -EINVAL; + } + + /* check ELF class and data encoding */ + if (ehdr->e_ident[EI_CLASS] != ELFCLASS64 || + ehdr->e_ident[EI_DATA] != ELFDATA2LSB) { + pr_err("Unsupported ELF class or data encoding\n"); + return -EINVAL; + } + + if (ehdr->e_ident[EI_ABIVERSION] != 0) { + pr_warn("Unusual ABI version: %d\n", ehdr->e_ident[EI_ABIVERSION]); + } + + return __handle_elf(mem, size); } diff --git a/compel/arch/mips/src/lib/infect.c b/compel/arch/mips/src/lib/infect.c index afa0f5ed5..a1d4865cc 100644 --- a/compel/arch/mips/src/lib/infect.c +++ b/compel/arch/mips/src/lib/infect.c @@ -119,10 +119,9 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *xs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t xsave = {}, *xs = ext_regs ? ext_regs : &xsave; int ret = -1; pr_info("Dumping GP/FPU registers for %d\n", pid); @@ -150,7 +149,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct regs->regs[0] = 0; } - ret = save(arg, regs, xs); + ret = save(pid, arg, regs, xs); return ret; } diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl index af40d7104..3deb41cf7 100644 --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl @@ -82,7 +82,7 @@ __NR_sys_timer_settime 241 sys_timer_settime (kernel_timer_t timer_id, int flag __NR_sys_timer_gettime 242 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 243 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 244 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 246 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_clock_gettime 246 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) __NR_exit_group 234 sys_exit_group (int error_code) __NR_waitid 272 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_set_robust_list 300 sys_set_robust_list (struct robust_list_head *head, size_t len) @@ -114,7 +114,9 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 387 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 365 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h index eb12c9f7c..0c4ccb648 100644 --- a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h @@ -14,7 +14,7 @@ */ #include -// XXX: the idetifier rt_sigcontext is expected to be struct by the CRIU code +// XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code #define rt_sigcontext sigcontext #include @@ -23,6 +23,11 @@ /* Copied from the Linux kernel header arch/powerpc/include/asm/ptrace.h */ #define USER_REDZONE_SIZE 512 +#if _CALL_ELF != 2 +#error Only supporting ABIv2. +#else +#define STACK_FRAME_MIN_SIZE 32 +#endif /* Copied from the Linux kernel source file arch/powerpc/kernel/signal_64.c */ #define TRAMP_SIZE 6 diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c index 61cd6e985..54abd48a4 100644 --- a/compel/arch/ppc64/src/lib/infect.c +++ b/compel/arch/ppc64/src/lib/infect.c @@ -11,6 +11,7 @@ #include "log.h" #include "common/bug.h" #include "common/page.h" +#include "common/err.h" #include "infect.h" #include "infect-priv.h" @@ -303,34 +304,59 @@ out_free: return -1; /* still failing the checkpoint */ } -static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) -{ - pr_info("Dumping GP/FPU registers for %d\n", pid); +/* + * This is inspired by kernel function check_syscall_restart in + * arch/powerpc/kernel/signal.c + */ - /* - * This is inspired by kernel function check_syscall_restart in - * arch/powerpc/kernel/signal.c - */ #ifndef TRAP #define TRAP(r) ((r).trap & ~0xF) #endif - if (TRAP(*regs) == 0x0C00 && regs->ccr & 0x10000000) { - /* Restart the system call */ - switch (regs->gpr[3]) { - case ERESTARTNOHAND: - case ERESTARTSYS: - case ERESTARTNOINTR: - regs->gpr[3] = regs->orig_gpr3; - regs->nip -= 4; - break; - case ERESTART_RESTARTBLOCK: - pr_warn("Will restore %d with interrupted system call\n", pid); - regs->gpr[3] = EINTR; - break; - } +static bool trap_is_scv(user_regs_struct_t *regs) +{ + return TRAP(*regs) == 0x3000; +} + +static bool trap_is_syscall(user_regs_struct_t *regs) +{ + return trap_is_scv(regs) || TRAP(*regs) == 0x0C00; +} + +static void handle_syscall(pid_t pid, user_regs_struct_t *regs) +{ + unsigned long ret = regs->gpr[3]; + + if (trap_is_scv(regs)) { + if (!IS_ERR_VALUE(ret)) + return; + ret = -ret; + } else if (!(regs->ccr & 0x10000000)) { + return; } + /* Restart or interrupt the system call */ + switch (ret) { + case ERESTARTNOHAND: + case ERESTARTSYS: + case ERESTARTNOINTR: + regs->gpr[3] = regs->orig_gpr3; + regs->nip -= 4; + break; + case ERESTART_RESTARTBLOCK: + pr_warn("Will restore %d with interrupted system call\n", pid); + regs->gpr[3] = trap_is_scv(regs) ? -EINTR : EINTR; + break; + } +} + +static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + pr_info("Dumping GP/FPU registers for %d\n", pid); + + if (trap_is_syscall(regs)) + handle_syscall(pid, regs); + /* Resetting trap since we are now coming from user space. */ regs->trap = 0; @@ -365,17 +391,16 @@ static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_stru return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t tmp, *fpregs = ext_regs ? ext_regs : &tmp; int ret; ret = __get_task_regs(pid, regs, fpregs); if (ret) return ret; - return save(arg, regs, fpregs); + return save(pid, arg, regs, fpregs); } int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) @@ -441,13 +466,13 @@ void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { /* - * OpenPOWER ABI requires that r12 is set to the calling function addressi + * OpenPOWER ABI requires that r12 is set to the calling function address * to compute the TOC pointer. */ regs->gpr[12] = new_ip; regs->nip = new_ip; if (stack) - regs->gpr[1] = (unsigned long)stack; + regs->gpr[1] = (unsigned long)stack - STACK_FRAME_MIN_SIZE; regs->trap = 0; } diff --git a/compel/arch/riscv64/plugins/include/asm/prologue.h b/compel/arch/riscv64/plugins/include/asm/prologue.h new file mode 100644 index 000000000..5c22b7b06 --- /dev/null +++ b/compel/arch/riscv64/plugins/include/asm/prologue.h @@ -0,0 +1,35 @@ +#ifndef __ASM_PROLOGUE_H__ +#define __ASM_PROLOGUE_H__ + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +#include + +#define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) + +typedef struct prologue_init_args { + struct sockaddr_un ctl_sock_addr; + unsigned int ctl_sock_addr_len; + + unsigned int arg_s; + void *arg_p; + + void *sigframe; +} prologue_init_args_t; + +#endif /* __ASSEMBLY__ */ + +/* + * Reserve enough space for sigframe. + * + * FIXME It is rather should be taken from sigframe header. + */ +#define PROLOGUE_SGFRAME_SIZE 4096 + +#define PROLOGUE_INIT_ARGS_SIZE 1024 + +#endif /* __ASM_PROLOGUE_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/include/asm/syscall-types.h b/compel/arch/riscv64/plugins/include/asm/syscall-types.h new file mode 100644 index 000000000..b9740a9ee --- /dev/null +++ b/compel/arch/riscv64/plugins/include/asm/syscall-types.h @@ -0,0 +1,28 @@ +#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ +#define COMPEL_ARCH_SYSCALL_TYPES_H__ + +#define SA_RESTORER 0x04000000 + +typedef void rt_signalfn_t(int, siginfo_t *, void *); +typedef rt_signalfn_t *rt_sighandler_t; + +typedef void rt_restorefn_t(void); +typedef rt_restorefn_t *rt_sigrestore_t; + +#define _KNSIG 64 // number of signals +#define _NSIG_BPW 64 // number of signals per word + +#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) + +typedef struct { + unsigned long sig[_KNSIG_WORDS]; +} k_rtsigset_t; + +typedef struct { + rt_sighandler_t rt_sa_handler; + unsigned long rt_sa_flags; + rt_sigrestore_t rt_sa_restorer; + k_rtsigset_t rt_sa_mask; +} rt_sigaction_t; + +#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/include/features.h b/compel/arch/riscv64/plugins/include/features.h new file mode 100644 index 000000000..274cee52a --- /dev/null +++ b/compel/arch/riscv64/plugins/include/features.h @@ -0,0 +1,4 @@ +#ifndef __COMPEL_ARCH_FEATURES_H +#define __COMPEL_ARCH_FEATURES_H + +#endif /* __COMPEL_ARCH_FEATURES_H */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/parasite-head.S b/compel/arch/riscv64/plugins/std/parasite-head.S new file mode 100644 index 000000000..3e9d272e3 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/parasite-head.S @@ -0,0 +1,7 @@ +#include "common/asm/linkage.h" + + .section .head.text, "ax" +ENTRY(__export_parasite_head_start) + jal parasite_service + ebreak +END(__export_parasite_head_start) \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls b/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls new file mode 100644 index 000000000..5af35bcb4 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls @@ -0,0 +1,59 @@ +ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ +asflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ + +sys-types := $(obj)/include/uapi/std/syscall-types.h +sys-codes := $(obj)/include/uapi/std/syscall-codes.h +sys-proto := $(obj)/include/uapi/std/syscall.h + +sys-def := $(PLUGIN_ARCH_DIR)/std/syscalls/syscall.def +sys-asm-common-name := std/syscalls/syscall-common.S +sys-asm-common := $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) +sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h +sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl.c + +sys-gen := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-syscalls.pl +sys-gen-tbl := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-sys-exec-tbl.pl + +sys-asm := ./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls.S +std-lib-y += $(sys-asm:.S=).o + +ifeq ($(ARCH),arm) +arch_bits := 32 +else +arch_bits := 64 +endif + +sys-exec-tbl := sys-exec-tbl.c + +$(sys-asm) $(sys-types) $(sys-codes) $(sys-proto): $(sys-gen) $(sys-def) $(sys-asm-common) $(sys-asm-types) + $(E) " GEN " $@ + $(Q) perl \ + $(sys-gen) \ + $(sys-def) \ + $(sys-codes) \ + $(sys-proto) \ + $(sys-asm) \ + $(sys-asm-common-name) \ + $(sys-types) \ + $(arch_bits) + +$(sys-asm:.S=).o: $(sys-asm) + +$(sys-exec-tbl): $(sys-gen-tbl) $(sys-def) + $(E) " GEN " $@ + $(Q) perl \ + $(sys-gen-tbl) \ + $(sys-def) \ + $(sys-exec-tbl) \ + $(arch_bits) + +$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h + $(call msg-gen, $@) + $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) + $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.S $(obj)/include/uapi/std/syscall-aux.S + $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.h $(obj)/include/uapi/std/syscall-aux.h + +std-headers-deps += $(sys-asm) $(sys-codes) $(sys-proto) $(sys-asm-types) $(sys-codes) +mrproper-y += $(std-headers-deps) +mrproper-y += $(obj)/include/uapi/std/syscall-aux.S +mrproper-y += $(obj)/include/uapi/std/syscall-aux.h \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl b/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl new file mode 100755 index 000000000..61a807eb6 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl @@ -0,0 +1,43 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +my $in = $ARGV[0]; +my $tblout = $ARGV[1]; +my $bits = $ARGV[2]; + +my $code = "code$bits"; + +open TBLOUT, ">", $tblout or die $!; +open IN, "<", $in or die $!; + +print TBLOUT "/* Autogenerated, don't edit */\n"; +print TBLOUT "static struct syscall_exec_desc sc_exec_table[] = {\n"; + +for () { + if ($_ =~ /\#/) { + next; + } + + my $sys_name; + my $sys_num; + + if (/(?\S+)\s+(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { + $sys_name = $+{alias}; + } elsif (/(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { + $sys_name = $+{name}; + } else { + unlink $tblout; + die "Invalid syscall definition file: invalid entry $_\n"; + } + + $sys_num = $+{$code}; + + if ($sys_num ne "!") { + print TBLOUT "SYSCALL($sys_name, $sys_num)\n"; + } +} + +print TBLOUT " { }, /* terminator */"; +print TBLOUT "};" \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl b/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl new file mode 100755 index 000000000..a53f1962f --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl @@ -0,0 +1,99 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +my $in = $ARGV[0]; +my $codesout = $ARGV[1]; +my $codes = $ARGV[1]; +$codes =~ s/.*include\/uapi\//compel\/plugins\//g; +my $protosout = $ARGV[2]; +my $protos = $ARGV[2]; +$protos =~ s/.*include\/uapi\//compel\/plugins\//g; +my $asmout = $ARGV[3]; +my $asmcommon = $ARGV[4]; +my $prototypes = $ARGV[5]; +$prototypes =~ s/.*include\/uapi\//compel\/plugins\//g; +my $bits = $ARGV[6]; + +my $codesdef = $codes; +$codesdef =~ tr/.\-\//_/; +my $protosdef = $protos; +$protosdef =~ tr/.\-\//_/; +my $code = "code$bits"; +my $need_aux = 0; + +unlink $codesout; +unlink $protosout; +unlink $asmout; + +open CODESOUT, ">", $codesout or die $!; +open PROTOSOUT, ">", $protosout or die $!; +open ASMOUT, ">", $asmout or die $!; +open IN, "<", $in or die $!; + +print CODESOUT <<"END"; +/* Autogenerated, don't edit */ +#ifndef $codesdef +#define $codesdef +END + +print PROTOSOUT <<"END"; +/* Autogenerated, don't edit */ +#ifndef $protosdef +#define $protosdef +#include <$prototypes> +#include <$codes> +END + +print ASMOUT <<"END"; +/* Autogenerated, don't edit */ +#include <$codes> +#include "$asmcommon" +END + + +for () { + if ($_ =~ /\#/) { + next; + } + + my $code_macro; + my $sys_macro; + my $sys_name; + + if (/(?\S+)\s+(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { + $code_macro = "__NR_$+{name}"; + $sys_macro = "SYS_$+{name}"; + $sys_name = "sys_$+{alias}"; + } elsif (/(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { + $code_macro = "__NR_$+{name}"; + $sys_macro = "SYS_$+{name}"; + $sys_name = "sys_$+{name}"; + } else { + unlink $codesout; + unlink $protosout; + unlink $asmout; + + die "Invalid syscall definition file: invalid entry $_\n"; + } + + if ($+{$code} ne "!") { + print CODESOUT "#ifndef $code_macro\n#define $code_macro $+{$code}\n#endif\n"; + print CODESOUT "#ifndef $sys_macro\n#define $sys_macro $code_macro\n#endif\n"; + print ASMOUT "syscall $sys_name, $code_macro\n"; + + } else { + $need_aux = 1; + } + + print PROTOSOUT "extern long $sys_name($+{args});\n"; +} + +if ($need_aux == 1) { + print ASMOUT "#include \n"; + print CODESOUT "#include \n"; +} + +print CODESOUT "#endif /* $codesdef */"; +print PROTOSOUT "#endif /* $protosdef */"; \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S new file mode 100644 index 000000000..04160b7ac --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S @@ -0,0 +1,37 @@ +/** + * This source contains emulation of syscalls + * that are not implemented in the riscv64 Linux kernel + */ + +ENTRY(sys_open) + add a3, x0, a2 + add a2, x0, a1 + add a1, x0, a0 + addi a0, x0, -100 + j sys_openat +END(sys_open) + + +ENTRY(sys_mkdir) + add a3,x0, a2 + add a2, x0, a1 + add a1, x0, a0 + addi a0, x0, -100 + j sys_mkdirat +END(sys_mkdir) + + +ENTRY(sys_rmdir) + addi a2, x0, 0x200 // flags = AT_REMOVEDIR + add a1, x0, a0 + addi a0, x0, -100 + j sys_unlinkat +END(sys_rmdir) + + +ENTRY(sys_unlink) + addi a2, x0, 0 // flags = 0 + add a1, x0, a0 + addi a0, x0, -100 + j sys_unlinkat +END(sys_unlink) \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h new file mode 100644 index 000000000..881765bbb --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h @@ -0,0 +1,3 @@ +#ifndef __NR_openat +#define __NR_openat 56 +#endif \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S b/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S new file mode 100644 index 000000000..fdef3b47a --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S @@ -0,0 +1,17 @@ +#include "common/asm/linkage.h" + +syscall_common: + ecall + ret + +.macro syscall name, nr + ENTRY(\name) + li a7, \nr + j syscall_common + END(\name) +.endm + +ENTRY(__cr_restore_rt) + li a7, __NR_rt_sigreturn + ecall +END(__cr_restore_rt) \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall.def b/compel/arch/riscv64/plugins/std/syscalls/syscall.def new file mode 100644 index 000000000..967f097f9 --- /dev/null +++ b/compel/arch/riscv64/plugins/std/syscalls/syscall.def @@ -0,0 +1,125 @@ +# +# System calls table, please make sure the table consists of only the syscalls +# really used somewhere in the project. +# +# The template is (name and arguments are optional if you need only __NR_x +# defined, but no real entry point in syscalls lib). +# +# name/alias code64 code32 arguments +# ----------------------------------------------------------------------- +# +read 63 3 (int fd, void *buf, unsigned long count) +write 64 4 (int fd, const void *buf, unsigned long count) +open ! 5 (const char *filename, unsigned long flags, unsigned long mode) +close 57 6 (int fd) +lseek 62 19 (int fd, unsigned long offset, unsigned long origin) +mmap 222 ! (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) +mprotect 226 125 (const void *addr, unsigned long len, unsigned long prot) +munmap 215 91 (void *addr, unsigned long len) +brk 214 45 (void *addr) +rt_sigaction sigaction 134 174 (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) +rt_sigprocmask sigprocmask 135 175 (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) +rt_sigreturn 139 173 (void) +ioctl 29 54 (unsigned int fd, unsigned int cmd, unsigned long arg) +pread64 67 180 (unsigned int fd, char *buf, size_t count, loff_t pos) +ptrace 117 26 (long request, pid_t pid, void *addr, void *data) +mremap 216 163 (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flag, unsigned long new_addr) +mincore 232 219 (void *addr, unsigned long size, unsigned char *vec) +madvise 233 220 (unsigned long start, size_t len, int behavior) +shmat 196 305 (int shmid, void *shmaddr, int shmflag) +pause 1061 29 (void) +nanosleep 101 162 (struct timespec *req, struct timespec *rem) +getitimer 102 105 (int which, const struct itimerval *val) +setitimer 103 104 (int which, const struct itimerval *val, struct itimerval *old) +getpid 172 20 (void) +socket 198 281 (int domain, int type, int protocol) +connect 203 283 (int sockfd, struct sockaddr *addr, int addrlen) +sendto 206 290 (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) +recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) +sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags) +recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags) +shutdown 210 293 (int sockfd, int how) +bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen) +setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen) +getsockopt 209 295 (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) +clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) +exit 93 1 (unsigned long error_code) +wait4 260 114 (int pid, int *status, int options, struct rusage *ru) +waitid 95 280 (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) +kill 129 37 (long pid, int sig) +fcntl 25 55 (int fd, int type, long arg) +flock 32 143 (int fd, unsigned long cmd) +mkdir ! 39 (const char *name, int mode) +rmdir ! 40 (const char *name) +unlink ! 10 (char *pathname) +readlinkat 78 332 (int fd, const char *path, char *buf, int bufsize) +umask 166 60 (int mask) +getgroups 158 205 (int gsize, unsigned int *groups) +setgroups 159 206 (int gsize, unsigned int *groups) +setresuid 147 164 (int uid, int euid, int suid) +getresuid 148 165 (int *uid, int *euid, int *suid) +setresgid 149 170 (int gid, int egid, int sgid) +getresgid 150 171 (int *gid, int *egid, int *sgid) +getpgid 155 132 (pid_t pid) +setfsuid 151 138 (int fsuid) +setfsgid 152 139 (int fsgid) +getsid 156 147 (void) +capget 90 184 (struct cap_header *h, struct cap_data *d) +capset 91 185 (struct cap_header *h, struct cap_data *d) +rt_sigqueueinfo 138 178 (pid_t pid, int sig, siginfo_t *info) +setpriority 140 97 (int which, int who, int nice) +sched_setscheduler 119 156 (int pid, int policy, struct sched_param *p) +sigaltstack 132 186 (const void *uss, void *uoss) +personality 92 136 (unsigned int personality) +prctl 167 172 (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) +arch_prctl ! 17 (int option, unsigned long addr) +setrlimit 164 75 (int resource, struct krlimit *rlim) +mount 40 21 (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) +umount2 39 52 (char *name, int flags) +gettid 178 224 (void) +futex 98 240 (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +set_tid_address 96 256 (int *tid_addr) +restart_syscall 128 0 (void) +timer_create 107 257 (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) +timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) +timer_gettime 108 259 (int timer_id, const struct itimerspec *setting) +timer_getoverrun 109 260 (int timer_id) +timer_delete 111 261 (kernel_timer_t timer_id) +clock_gettime 113 263 (clockid_t which_clock, struct timespec *tp) +exit_group 94 248 (int error_code) +set_robust_list 99 338 (struct robust_list_head *head, size_t len) +get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) +signalfd4 74 355 (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) +rt_tgsigqueueinfo 240 363 (pid_t tgid, pid_t pid, int sig, siginfo_t *info) +vmsplice 75 343 (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) +timerfd_settime 86 353 (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) +fanotify_init 262 367 (unsigned int flags, unsigned int event_f_flags) +fanotify_mark 263 368 (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) +open_by_handle_at 265 371 (int mountdirfd, struct file_handle *handle, int flags) +setns 268 375 (int fd, int nstype) +kcmp 272 378 (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) +openat 56 322 (int dirfd, const char *pathname, int flags, mode_t mode) +mkdirat 34 323 (int dirfd, const char *pathname, mode_t mode) +unlinkat 35 328 (int dirfd, const char *pathname, int flags) +memfd_create 279 385 (const char *name, unsigned int flags) +io_setup 0 243 (unsigned nr_events, aio_context_t *ctx) +io_submit 2 246 (aio_context_t ctx_id, long nr, struct iocb **iocbpp) +io_getevents 4 245 (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) +seccomp 277 383 (unsigned int op, unsigned int flags, const char *uargs) +gettimeofday 169 78 (struct timeval *tv, struct timezone *tz) +preadv_raw 69 361 (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) +userfaultfd 282 388 (int flags) +fallocate 47 352 (int fd, int mode, loff_t offset, loff_t len) +cacheflush ! 983042 (void *start, void *end, int flags) +ppoll 73 336 (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +fsopen 430 430 (char *fsname, unsigned int flags) +fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) +fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) +clone3 435 435 (struct clone_args *uargs, size_t size) +pidfd_open 434 434 (pid_t pid, unsigned int flags) +pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) +rseq 293 293 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +move_mount 429 429 (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) +open_tree 428 428 (int dirfd, const char *pathname, unsigned int flags) +openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size) +membarrier 283 283 (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/riscv64/scripts/compel-pack.lds.S b/compel/arch/riscv64/scripts/compel-pack.lds.S new file mode 100644 index 000000000..a61235b44 --- /dev/null +++ b/compel/arch/riscv64/scripts/compel-pack.lds.S @@ -0,0 +1,32 @@ +OUTPUT_ARCH(riscv) +EXTERN(__export_parasite_head_start) + +SECTIONS +{ + .crblob 0x0 : { + *(.head.text) + ASSERT(DEFINED(__export_parasite_head_start), + "Symbol __export_parasite_head_start is missing"); + *(.text*) + . = ALIGN(32); + *(.data*) + . = ALIGN(32); + *(.rodata*) + . = ALIGN(32); + *(.bss*) + . = ALIGN(32); + *(.got*) + . = ALIGN(32); + *(.toc*) + . = ALIGN(32); + } =0x00000000, + + /DISCARD/ : { + *(.debug*) + *(.comment*) + *(.note*) + *(.group*) + *(.eh_frame*) + *(*) + } +} \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/cpu.c b/compel/arch/riscv64/src/lib/cpu.c new file mode 100644 index 000000000..9a0291f70 --- /dev/null +++ b/compel/arch/riscv64/src/lib/cpu.c @@ -0,0 +1,78 @@ +#include +#include + +#include "compel-cpu.h" + +#include "common/bitops.h" + +#include "log.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +static compel_cpuinfo_t rt_info; + +static void fetch_rt_cpuinfo(void) +{ + static bool rt_info_done = false; + + if (!rt_info_done) { + compel_cpuid(&rt_info); + rt_info_done = true; + } +} + +void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ +} +void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ +} +int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + return 0; +} +int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) +{ + return 0; +} +int compel_cpuid(compel_cpuinfo_t *info) +{ + return 0; +} + +bool compel_cpu_has_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_test_cpu_cap(&rt_info, feature); +} + +bool compel_fpu_has_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_test_fpu_cap(&rt_info, feature); +} + +uint32_t compel_fpu_feature_size(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return 0; +} + +uint32_t compel_fpu_feature_offset(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return 0; +} + +void compel_cpu_clear_feature(unsigned int feature) +{ + fetch_rt_cpuinfo(); + return compel_clear_cpu_cap(&rt_info, feature); +} + +void compel_cpu_copy_cpuinfo(compel_cpuinfo_t *c) +{ + fetch_rt_cpuinfo(); + memcpy(c, &rt_info, sizeof(rt_info)); +} \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/handle-elf-host.c b/compel/arch/riscv64/src/lib/handle-elf-host.c new file mode 120000 index 000000000..fe4611886 --- /dev/null +++ b/compel/arch/riscv64/src/lib/handle-elf-host.c @@ -0,0 +1 @@ +handle-elf.c \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/handle-elf.c b/compel/arch/riscv64/src/lib/handle-elf.c new file mode 100644 index 000000000..22420bc78 --- /dev/null +++ b/compel/arch/riscv64/src/lib/handle-elf.c @@ -0,0 +1,32 @@ +#include +#include + +#include "handle-elf.h" +#include "piegen.h" +#include "log.h" + +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +static const unsigned char __maybe_unused elf_ident_64_be[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +int handle_binary(void *mem, size_t size) +{ + const unsigned char *elf_ident = +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + elf_ident_64_le; +#else + elf_ident_64_be; +#endif + + if (memcmp(mem, elf_ident, sizeof(elf_ident_64_le)) == 0) + return handle_elf_riscv64(mem, size); + + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; +} \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/cpu.h b/compel/arch/riscv64/src/lib/include/cpu.h new file mode 100644 index 000000000..e69de29bb diff --git a/compel/arch/riscv64/src/lib/include/handle-elf.h b/compel/arch/riscv64/src/lib/include/handle-elf.h new file mode 100644 index 000000000..582770583 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/handle-elf.h @@ -0,0 +1,12 @@ +#ifndef COMPEL_HANDLE_ELF_H__ +#define COMPEL_HANDLE_ELF_H__ + +#include "elf64-types.h" + +#define __handle_elf handle_elf_riscv64 +#define ELF_RISCV +#define arch_is_machine_supported(e_machine) (e_machine == EM_RISCV) + +extern int handle_elf_riscv64(void *mem, size_t size); + +#endif /* COMPEL_HANDLE_ELF_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/syscall.h b/compel/arch/riscv64/src/lib/include/syscall.h new file mode 100644 index 000000000..53f10525d --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/syscall.h @@ -0,0 +1,8 @@ +#ifndef __COMPEL_SYSCALL_H__ +#define __COMPEL_SYSCALL_H__ +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) +#endif \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h new file mode 100644 index 000000000..f2ba799cb --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h @@ -0,0 +1,15 @@ +#ifndef __COMPEL_BREAKPOINTS_H__ +#define __COMPEL_BREAKPOINTS_H__ +#define ARCH_SI_TRAP TRAP_BRKPT + +static inline int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} + +static inline int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} + +#endif \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h b/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h new file mode 100644 index 000000000..ac58567e3 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h @@ -0,0 +1,7 @@ +#ifndef UAPI_COMPEL_ASM_CPU_H__ +#define UAPI_COMPEL_ASM_CPU_H__ + +typedef struct { +} compel_cpuinfo_t; + +#endif /* UAPI_COMPEL_ASM_CPU_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h b/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h new file mode 100644 index 000000000..a74decc23 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h @@ -0,0 +1,4 @@ +#ifndef __CR_ASM_FPU_H__ +#define __CR_ASM_FPU_H__ + +#endif /* __CR_ASM_FPU_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h new file mode 100644 index 000000000..192810cac --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h @@ -0,0 +1,52 @@ +#ifndef UAPI_COMPEL_ASM_TYPES_H__ +#define UAPI_COMPEL_ASM_TYPES_H__ + +#include +#include +#include +#include + +#define SIGMAX 64 +#define SIGMAX_OLD 31 + +/* + * Copied from the Linux kernel header arch/riscv/include/uapi/asm/ptrace.h + * + * A thread RISC-V CPU context + */ +typedef struct user_regs_struct user_regs_struct_t; +typedef struct __riscv_d_ext_state user_fpregs_struct_t; + +#define __compel_arch_fetch_thread_area(tid, th) 0 +#define compel_arch_fetch_thread_area(tctl) 0 +#define compel_arch_get_tls_task(ctl, tls) +#define compel_arch_get_tls_thread(tctl, tls) + +#define REG_RES(registers) ((uint64_t)(registers).a0) +#define REG_IP(registers) ((uint64_t)(registers).pc) +#define SET_REG_IP(registers, val) ((registers).pc = (val)) + +/* + * REG_SP is also defined in riscv64-linux-gnu/include/sys/ucontext.h + * with a different meaning, and it's not used in CRIU. So we have to + * undefine it here. + */ +#ifdef REG_SP +#undef REG_SP +#endif + +#define REG_SP(registers) ((uint64_t)((registers).sp)) + +#define REG_SYSCALL_NR(registers) ((uint64_t)(registers).a7) + +#define user_regs_native(pregs) true + +#define ARCH_SI_TRAP TRAP_BRKPT + +#define __NR(syscall, compat) \ + ({ \ + (void)compat; \ + __NR_##syscall; \ + }) + +#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h b/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h new file mode 100644 index 000000000..e231d0465 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h @@ -0,0 +1,26 @@ +#ifndef COMPEL_RELOCATIONS_H__ +#define COMPEL_RELOCATIONS_H__ + +#include + +static inline uint32_t riscv_b_imm(uint32_t val) +{ + return (val & 0x00001000) << 19 | (val & 0x000007e0) << 20 | (val & 0x0000001e) << 7 | (val & 0x00000800) >> 4; +} + +static inline uint32_t riscv_i_imm(uint32_t val) +{ + return val << 20; +} + +static inline uint32_t riscv_u_imm(uint32_t val) +{ + return val & 0xfffff000; +} + +static inline uint32_t riscv_j_imm(uint32_t val) +{ + return (val & 0x00100000) << 11 | (val & 0x000007fe) << 20 | (val & 0x00000800) << 9 | (val & 0x000ff000); +} + +#endif /* COMPEL_RELOCATIONS_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h b/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h new file mode 100644 index 000000000..e40fb6fce --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h @@ -0,0 +1,4 @@ +#ifndef UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ +#define UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ + +#endif /* UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h new file mode 100644 index 000000000..761a08f62 --- /dev/null +++ b/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h @@ -0,0 +1,68 @@ +#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ +#define UAPI_COMPEL_ASM_SIGFRAME_H__ + +#include + +#include + +#include + +/* Copied from the kernel header arch/riscv/include/uapi/asm/sigcontext.h */ +/* + * Signal context structure + * + * This contains the context saved before a signal handler is invoked; + * it is restored by sys_sigreturn / sys_rt_sigreturn. + */ +// struct sigcontext { +// struct user_regs_struct sc_regs; +// union __riscv_fp_state sc_fpregs; +// /* +// * 4K + 128 reserved for vector state and future expansion. +// * This space is enough to store the vector context whose VLENB +// * is less or equal to 128. +// * (The size of the vector context is 4144 byte as VLENB is 128) +// */ +// __u8 __reserved[4224] __attribute__((__aligned__(16))); +// }; + +#define rt_sigcontext sigcontext + +#include + +/* Copied from the kernel source arch/riscv/kernel/signal.c */ +struct rt_sigframe { + siginfo_t info; + ucontext_t uc; //ucontext_t structure holds the user context, e.g., the signal mask, GP regs +}; + +/* + generates inline assembly code for triggering the rt_sigreturn system call. + used to return from a signal handler back to the normal execution flow of the process. +*/ +/* clang-format off */ +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ + asm volatile( \ + "mv sp, %0\n" \ + "li a7, "__stringify(__NR_rt_sigreturn)" \n" \ + "ecall\n" \ + : \ + : "r"(new_sp) \ + : "a7", "memory") +/* clang-format on */ + +#define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) +#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.__gregs[REG_PC]) +#define RT_SIGFRAME_HAS_FPU(rt_sigframe) 1 +#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 + +// #define RT_SIGFRAME_SIGCONTEXT(rt_sigframe) ((struct cr_sigcontext *)&(rt_sigframe)->uc.uc_mcontext) +// #define RT_SIGFRAME_AUX_CONTEXT(rt_sigframe) ((struct sigcontext *)&(RT_SIGFRAME_SIGCONTEXT(rt_sigframe)->__reserved)) +// #define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->fpsimd) + +#define rt_sigframe_erase_sigset(sigframe) \ + memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) // erase the signal mask +#define rt_sigframe_copy_sigset(sigframe, from) \ + memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) // copy the signal mask + +#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/infect.c b/compel/arch/riscv64/src/lib/infect.c new file mode 100644 index 000000000..3f3a4b7ec --- /dev/null +++ b/compel/arch/riscv64/src/lib/infect.c @@ -0,0 +1,224 @@ +#include +#include +#include +#include +#include +#include +#include "common/page.h" +#include "uapi/compel/asm/infect-types.h" +#include "log.h" +#include "errno.h" +#include "infect.h" +#include "infect-priv.h" + +unsigned __page_size = 0; +unsigned __page_shift = 0; + +/* + * Injected syscall instruction + */ +const char code_syscall[] = { + 0x73, 0x00, 0x00, 0x00, /* ecall */ + 0x73, 0x00, 0x10, 0x00 /* ebreak */ +}; + +static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); + +static inline void __always_unused __check_code_syscall(void) +{ + BUILD_BUG_ON(code_syscall_aligned != BUILTIN_SYSCALL_SIZE); + BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); +} + +int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + sigframe->uc.uc_mcontext.__gregs[0] = regs->pc; + sigframe->uc.uc_mcontext.__gregs[1] = regs->ra; + sigframe->uc.uc_mcontext.__gregs[2] = regs->sp; + sigframe->uc.uc_mcontext.__gregs[3] = regs->gp; + sigframe->uc.uc_mcontext.__gregs[4] = regs->tp; + sigframe->uc.uc_mcontext.__gregs[5] = regs->t0; + sigframe->uc.uc_mcontext.__gregs[6] = regs->t1; + sigframe->uc.uc_mcontext.__gregs[7] = regs->t2; + sigframe->uc.uc_mcontext.__gregs[8] = regs->s0; + sigframe->uc.uc_mcontext.__gregs[9] = regs->s1; + sigframe->uc.uc_mcontext.__gregs[10] = regs->a0; + sigframe->uc.uc_mcontext.__gregs[11] = regs->a1; + sigframe->uc.uc_mcontext.__gregs[12] = regs->a2; + sigframe->uc.uc_mcontext.__gregs[13] = regs->a3; + sigframe->uc.uc_mcontext.__gregs[14] = regs->a4; + sigframe->uc.uc_mcontext.__gregs[15] = regs->a5; + sigframe->uc.uc_mcontext.__gregs[16] = regs->a6; + sigframe->uc.uc_mcontext.__gregs[17] = regs->a7; + sigframe->uc.uc_mcontext.__gregs[18] = regs->s2; + sigframe->uc.uc_mcontext.__gregs[19] = regs->s3; + sigframe->uc.uc_mcontext.__gregs[20] = regs->s4; + sigframe->uc.uc_mcontext.__gregs[21] = regs->s5; + sigframe->uc.uc_mcontext.__gregs[22] = regs->s6; + sigframe->uc.uc_mcontext.__gregs[23] = regs->s7; + sigframe->uc.uc_mcontext.__gregs[24] = regs->s8; + sigframe->uc.uc_mcontext.__gregs[25] = regs->s9; + sigframe->uc.uc_mcontext.__gregs[26] = regs->s10; + sigframe->uc.uc_mcontext.__gregs[27] = regs->s11; + sigframe->uc.uc_mcontext.__gregs[28] = regs->t3; + sigframe->uc.uc_mcontext.__gregs[29] = regs->t4; + sigframe->uc.uc_mcontext.__gregs[30] = regs->t5; + sigframe->uc.uc_mcontext.__gregs[31] = regs->t6; + + memcpy(sigframe->uc.uc_mcontext.__fpregs.__d.__f, fpregs->f, sizeof(fpregs->f)); + sigframe->uc.uc_mcontext.__fpregs.__d.__fcsr = fpregs->fcsr; + + return 0; +} + +int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} + +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, + void *arg, __maybe_unused unsigned long flags) +{ + user_fpregs_struct_t tmp, *fpsimd = ext_regs ? ext_regs : &tmp; + struct iovec iov; + int ret = -1; + + pr_info("Dumping FPU registers for %d\n", pid); + + iov.iov_base = fpsimd; + iov.iov_len = sizeof(*fpsimd); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { + pr_perror("Failed to obtain FPU registers for %d", pid); + return -1; + } + + ret = save(pid, arg, regs, fpsimd); + return ret; +} + +int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) +{ + struct iovec iov; + + pr_info("Restoring GP/FPU registers for %d\n", pid); + + iov.iov_base = ext_regs; + iov.iov_len = sizeof(*ext_regs); + if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { + pr_perror("Failed to set FPU registers for %d", pid); + return -1; + } + return 0; +} + +int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) +{ + user_regs_struct_t regs = ctl->orig.regs; + int err; + + regs.a7 = (unsigned long)nr; + regs.a0 = arg1; + regs.a1 = arg2; + regs.a2 = arg3; + regs.a3 = arg4; + regs.a4 = arg5; + regs.a5 = arg6; + regs.a6 = 0; + + err = compel_execute_syscall(ctl, ®s, code_syscall); + + *ret = regs.a0; + return err; +} + +/* + * Calling the mmap system call in the context of the target (victim) process using the compel_syscall function. + * Used during the infection process to allocate memory for the parasite code. +*/ +void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + long map; + int err; + + err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset); + if (err < 0 || (long)map < 0) + map = 0; + + return (void *)map; +} + +void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) +{ + regs->pc = new_ip; + if (stack) + regs->sp = (unsigned long)stack; +} + +bool arch_can_dump_task(struct parasite_ctl *ctl) +{ + /* + * TODO: Add proper check here. + */ + return true; +} + +/* + * Fetch the signal alternate stack (sigaltstack), + * sas is a separate memory area for the signal handler to run on, + * avoiding potential issues with the main process stack +*/ +int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) +{ + long ret; + int err; + + err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->uc.uc_stack, 0, 0, 0, 0); + return err ? err : ret; +} + +/* + * Task size is the maximum virtual address space size that a process can occupy in the memory + * Refer to linux kernel arch/riscv/include/asm/pgtable.h, + * task size is: + * - 0x9fc00000 (~2.5GB) for RV32. + * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu + * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu + * - 0x100000000000000 ( 64PB) for RV64 using SV57 mmu + */ +#define TASK_SIZE_MIN (1UL << 38) +#define TASK_SIZE_MAX (1UL << 56) + +unsigned long compel_task_size(void) +{ + unsigned long task_size; + + for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size <<= 1) + if (munmap((void *)task_size, page_size())) + break; + return task_size; +} + +/* + * Get task registers (overwrites weak function) + */ +int ptrace_get_regs(int pid, user_regs_struct_t *regs) +{ + struct iovec iov; + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + return ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov); +} + +/* + * Set task registers (overwrites weak function) + */ +int ptrace_set_regs(int pid, user_regs_struct_t *regs) +{ + struct iovec iov; + + iov.iov_base = regs; + iov.iov_len = sizeof(user_regs_struct_t); + return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); +} diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl index 6a349e1cb..ff2f33006 100644 --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl @@ -82,7 +82,7 @@ __NR_sys_timer_settime 255 sys_timer_settime (kernel_timer_t timer_id, int flag __NR_sys_timer_gettime 256 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 257 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 258 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 260 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_clock_gettime 260 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) __NR_exit_group 248 sys_exit_group (int error_code) __NR_waitid 281 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_set_robust_list 304 sys_set_robust_list (struct robust_list_head *head, size_t len) @@ -114,7 +114,9 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 383 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 356 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/s390/src/lib/infect.c b/compel/arch/s390/src/lib/infect.c index 3cd25e71d..a77b38917 100644 --- a/compel/arch/s390/src/lib/infect.c +++ b/compel/arch/s390/src/lib/infect.c @@ -293,10 +293,9 @@ static int s390_disable_ri_bit(pid_t pid, user_regs_struct_t *regs) /* * Prepare task registers for restart */ -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { - user_fpregs_struct_t tmp, *fpregs = ext_regs ? ext_regs : &tmp; struct iovec iov; int rewind; @@ -349,7 +348,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct } } /* Call save_task_regs() */ - return save(arg, regs, fpregs); + return save(pid, arg, regs, fpregs); } int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) diff --git a/compel/arch/x86/plugins/std/parasite-head.S b/compel/arch/x86/plugins/std/parasite-head.S index 4fb38d1f1..42cad4808 100644 --- a/compel/arch/x86/plugins/std/parasite-head.S +++ b/compel/arch/x86/plugins/std/parasite-head.S @@ -34,7 +34,21 @@ END(__export_parasite_head_start_compat) .code64 #endif +/* + * When parasite_service() runs in the daemon mode it will return the stack + * pointer for the sigreturn frame in %rax and we call sigreturn directly + * from here. + * Since a valid stack pointer is positive, it is safe to presume that + * return value <= 0 means that parasite_service() called parasite_trap_cmd() + * in non-daemon mode, and the parasite should stop at int3. + */ ENTRY(__export_parasite_head_start) call parasite_service + cmp $0, %rax + jle 1f + movq %rax, %rsp + movq $15, %rax + syscall +1: int $0x03 END(__export_parasite_head_start) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl index a119a59b2..cc23dc3f3 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl @@ -102,7 +102,9 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 386 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 375 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl index 16dd86e79..8c3620c2a 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -85,7 +85,7 @@ __NR_sys_timer_settime 223 sys_timer_settime (kernel_timer_t timer_id, int fla __NR_sys_timer_gettime 224 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 225 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 226 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 228 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) +__NR_clock_gettime 228 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) __NR_exit_group 231 sys_exit_group (int error_code) __NR_openat 257 sys_openat (int dfd, const char *filename, int flags, int mode) __NR_waitid 247 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) @@ -113,7 +113,10 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) __NR_rseq 334 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) +__NR_membarrier 324 sys_membarrier (int cmd, unsigned int flags, int cpu_id) +__NR_map_shadow_stack 453 sys_map_shadow_stack (unsigned long addr, unsigned long size, unsigned int flags) diff --git a/compel/arch/x86/src/lib/include/uapi/asm/cpu.h b/compel/arch/x86/src/lib/include/uapi/asm/cpu.h index 63ff83dbe..11c50e0e5 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/cpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/cpu.h @@ -244,6 +244,7 @@ enum cpuid_leafs { #define X86_FEATURE_PKU (11 * 32 + 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (11 * 32 + 4) /* OS Protection Keys Enable */ #define X86_FEATURE_AVX512_VBMI2 (11 * 32 + 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_SHSTK (11 * 32 + 7) /* Shadow Stack */ #define X86_FEATURE_GFNI (11 * 32 + 8) /* Galois Field New Instructions */ #define X86_FEATURE_VAES (11 * 32 + 9) /* Vector AES */ #define X86_FEATURE_VPCLMULQDQ (11 * 32 + 10) /* Carry-Less Multiplication Double Quadword */ diff --git a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h index bd3b0cbd5..d595a68fc 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h @@ -21,7 +21,28 @@ #define XSTATE_YMM 0x4 #define FXSAVE_SIZE 512 -#define XSAVE_SIZE 4096 +/* + * This used to be 4096 (one page). There is a comment below concerning + * this size: + * "One page should be enough for the whole xsave state ;-)" + * Which is kind of funny as it is no longer enough ;-) + * + * Older CPUs: + * # cpuid -1 -l 0xd -s 0 + * ... + * bytes required by XSAVE/XRSTOR area = 0x00000988 (2440) + * + * Newer CPUs (Sapphire Rapids): + * # cpuid -1 -l 0xd -s 0 + * ... + * bytes required by XSAVE/XRSTOR area = 0x00002b00 (11008) + * + * So one page is no longer enough... But: + * + * Four pages should be enough for the whole xsave state ;-) + */ + +#define XSAVE_SIZE 4*4096 #define XSAVE_HDR_SIZE 64 #define XSAVE_HDR_OFFSET FXSAVE_SIZE @@ -224,6 +245,14 @@ struct pkru_state { uint32_t pad; } __packed; +/* + * State component 11 is Control-flow Enforcement user states + */ +struct cet_user_state { + uint64_t cet; /* user control-flow settings */ + uint64_t ssp; /* user shadow stack pointer */ +}; + /* * This is our most modern FPU state format, as saved by the XSAVE * and restored by the XRSTOR instructions. @@ -235,8 +264,11 @@ struct pkru_state { * * * One page should be enough for the whole xsave state ;-) + * + * Of course it was not ;-) Now using four pages... + * */ -#define EXTENDED_STATE_AREA_SIZE (4096 - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct)) +#define EXTENDED_STATE_AREA_SIZE (XSAVE_SIZE - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct) - sizeof(struct cet_user_state)) /* * cpu requires it to be 64 byte aligned @@ -252,6 +284,7 @@ struct xsave_struct { struct ymmh_struct ymmh; uint8_t extended_state_area[EXTENDED_STATE_AREA_SIZE]; }; + struct cet_user_state cet; } __aligned(FP_MIN_ALIGN_BYTES) __packed; struct xsave_struct_ia32 { diff --git a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h index b35504ff8..b998c488c 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h @@ -143,4 +143,11 @@ typedef struct xsave_struct user_fpregs_struct_t; */ #define __NR32_mmap __NR32_mmap2 +extern bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs); +#define compel_shstk_enabled __compel_shstk_enabled + +extern int __parasite_setup_shstk(struct parasite_ctl *ctl, + user_fpregs_struct_t *ext_regs); +#define parasite_setup_shstk __parasite_setup_shstk + #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h index ec8c156fa..4a2e67559 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h @@ -177,6 +177,24 @@ static inline void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) #define USER32_CS 0x23 /* clang-format off */ +/* + * rst_sigreturn in resorer is noninline call which adds an entry to the + * shadow stack above the sigframe token; + * if shadow stack is enabled, increment the shadow stack pointer to remove + * that entry + */ +#define ARCH_SHSTK_POP() \ + asm volatile( \ + "xor %%rax, %%rax\n" \ + "rdsspq %%rax\n" \ + "cmpq $0, %%rax\n" \ + "jz 1f\n" \ + "movq $1, %%rax\n" \ + "incsspq %%rax\n" \ + "1:\n" \ + : : \ + : "rax") + #define ARCH_RT_SIGRETURN_NATIVE(new_sp) \ asm volatile( \ "movq %0, %%rax \n" \ @@ -203,10 +221,19 @@ static inline void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) : "rdi"(new_sp) \ : "eax", "r8", "r9", "r10", "r11", "memory") -#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ +#define ARCH_RT_SIGRETURN_RST(new_sp, rt_sigframe) \ +do { \ + if ((rt_sigframe)->is_native) { \ + ARCH_SHSTK_POP(); \ + ARCH_RT_SIGRETURN_NATIVE(new_sp); \ + } else \ + ARCH_RT_SIGRETURN_COMPAT(new_sp); \ +} while (0) + +#define ARCH_RT_SIGRETURN_DUMP(new_sp, rt_sigframe) \ do { \ if ((rt_sigframe)->is_native) \ - ARCH_RT_SIGRETURN_NATIVE(new_sp); \ + return new_sp; \ else \ ARCH_RT_SIGRETURN_COMPAT(new_sp); \ } while (0) diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index 98e2512e7..afcf2c53b 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -26,6 +26,16 @@ #ifndef NT_X86_XSTATE #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ #endif + +#ifndef NT_X86_SHSTK +#define NT_X86_SHSTK 0x204 /* x86 shstk state */ +#endif + +#ifndef ARCH_SHSTK_STATUS +#define ARCH_SHSTK_STATUS 0x5005 +#define ARCH_SHSTK_SHSTK (1ULL << 0) +#endif + #ifndef NT_PRSTATUS #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ #endif @@ -220,6 +230,16 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr #define get_signed_user_reg(pregs, name) \ ((user_regs_native(pregs)) ? (int64_t)((pregs)->native.name) : (int32_t)((pregs)->compat.name)) +static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) +{ + if (ptrace(PTRACE_GETFPREGS, pid, NULL, xsave)) { + pr_perror("Can't obtain FPU registers for %d", pid); + return -1; + } + + return 0; +} + static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) { struct iovec iov; @@ -232,19 +252,75 @@ static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) return -1; } - return 0; -} + if ((xsave->xsave_hdr.xstate_bv & 3) != 3) { + // Due to init-optimisation [1] x87 FPU or SSE state may not be filled in. + // Since those are restored unconditionally, make sure the init values are + // filled by retrying with old PTRACE_GETFPREGS. + // + // [1] Intel® 64 and IA-32 Architectures Software Developer's + // Manual Volume 1: Basic Architecture + // Section 13.6: Processor tracking of XSAVE-managed state + if (get_task_fpregs(pid, xsave)) + return -1; + } -static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) -{ - if (ptrace(PTRACE_GETFPREGS, pid, NULL, xsave)) { - pr_perror("Can't obtain FPU registers for %d", pid); - return -1; + /* + * xsave may be on stack, if we don't clear it explicitly we get + * funky shadow stack state + */ + memset(&xsave->cet, 0, sizeof(xsave->cet)); + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + unsigned long ssp = 0; + unsigned long features = 0; + + if (ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long)&features, ARCH_SHSTK_STATUS)) { + /* + * kernels that don't support shadow stack return + * -EINVAL + */ + if (errno == EINVAL) + return 0; + + pr_perror("shstk: can't get shadow stack status for %d", pid); + return -1; + } + + if (!(features & ARCH_SHSTK_SHSTK)) + return 0; + + iov.iov_base = &ssp; + iov.iov_len = sizeof(ssp); + + if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { + /* ENODEV means CET is not supported by the CPU */ + if (errno != ENODEV) { + pr_perror("shstk: can't get SSP for %d", pid); + return -1; + } + } + + xsave->cet.cet = features; + xsave->cet.ssp = ssp; + + pr_debug("%d: shstk: cet: %lx ssp: %lx\n", pid, xsave->cet.cet, xsave->cet.ssp); } return 0; } +static inline void fixup_mxcsr(struct xsave_struct *xsave) +{ + /* + * Right now xsave->i387.mxcsr filled with the random garbage, + * let's make it valid by applying mask which allows all + * features, except the denormals-are-zero feature bit. + * + * See also fpu__init_system_mxcsr function: + * https://github.com/torvalds/linux/blob/8cb1ae19/arch/x86/kernel/fpu/init.c#L117 + */ + xsave->i387.mxcsr &= 0x0000ffbf; +} + /* See arch/x86/kernel/fpu/xstate.c */ static void validate_random_xstate(struct xsave_struct *xsave) { @@ -272,17 +348,6 @@ static void validate_random_xstate(struct xsave_struct *xsave) /* No reserved bits may be set */ memset(&hdr->reserved, 0, sizeof(hdr->reserved)); - - /* - * While using PTRACE_SETREGSET the kernel checks that - * "Reserved bits in MXCSR must be zero." - * if (mxcsr[0] & ~mxcsr_feature_mask) - * return -EINVAL; - * - * As the mxcsr_feature_mask depends on the CPU the easiest solution for - * this error injection test is to set mxcsr just to zero. - */ - xsave->i387.mxcsr = 0; } /* @@ -309,6 +374,8 @@ static int corrupt_extregs(pid_t pid) */ pr_err("Corrupting %s for %d, seed %u\n", use_xsave ? "xsave" : "fpuregs", pid, init_seed); + fixup_mxcsr(&ext_regs); + if (!use_xsave) { if (ptrace(PTRACE_SETFPREGS, pid, NULL, &ext_regs)) { pr_perror("Can't set FPU registers for %d", pid); @@ -330,10 +397,9 @@ static int corrupt_extregs(pid_t pid) return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *xs, save_regs_t save, void *arg, unsigned long flags) { - user_fpregs_struct_t xsave = {}, *xs = ext_regs ? ext_regs : &xsave; int ret = -1; pr_info("Dumping general registers for %d in %s mode\n", pid, user_regs_native(regs) ? "native" : "compat"); @@ -387,7 +453,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct goto err; out: - ret = save(arg, regs, xs); + ret = save(pid, arg, regs, xs); err: return ret; } @@ -584,6 +650,7 @@ int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) int ptrace_set_breakpoint(pid_t pid, void *addr) { + k_rtsigset_t block; int ret; /* Set a breakpoint */ @@ -599,6 +666,16 @@ int ptrace_set_breakpoint(pid_t pid, void *addr) return -1; } + /* + * FIXME(issues/1429): SIGTRAP can't be blocked, otherwise its handler + * will be reset to the default one. + */ + ksigfillset(&block); + ksigdelset(&block, SIGTRAP); + if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { + pr_perror("Can't block signals for %d", pid); + return -1; + } ret = ptrace(PTRACE_CONT, pid, NULL, NULL); if (ret) { pr_perror("Unable to restart the stopped tracee process %d", pid); @@ -672,3 +749,59 @@ unsigned long compel_task_size(void) { return TASK_SIZE; } + +bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs) +{ + if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) + return false; + + if (ext_regs->cet.cet & ARCH_SHSTK_SHSTK) + return true; + + return false; +} + +int parasite_setup_shstk(struct parasite_ctl *ctl, __maybe_unused user_fpregs_struct_t *ext_regs) +{ + pid_t pid = ctl->rpid; + unsigned long sa_restorer = ctl->parasite_ip; + unsigned long long ssp; + unsigned long token; + struct iovec iov; + + if (!compel_shstk_enabled(ext_regs)) + return 0; + + iov.iov_base = &ssp; + iov.iov_len = sizeof(ssp); + if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { + /* ENODEV means CET is not supported by the CPU */ + if (errno != ENODEV) { + pr_perror("shstk: %d: cannot get SSP", pid); + return -1; + } + } + + /* The token is for 64-bit */ + token = ALIGN_DOWN(ssp, 8); + token |= (1UL << 63); + ssp = ALIGN_DOWN(ssp, 8) - 8; + if (ptrace(PTRACE_POKEDATA, pid, (void *)ssp, token)) { + pr_perror("shstk: %d: failed to inject shadow stack token", pid); + return -1; + } + + ssp = ssp - sizeof(uint64_t); + if (ptrace(PTRACE_POKEDATA, pid, (void *)ssp, sa_restorer)) { + pr_perror("shstk: %d: failed to inject restorer address", pid); + return -1; + } + + ssp = ssp + sizeof(uint64_t); + if (ptrace(PTRACE_SETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { + pr_perror("shstk: %d: cannot write SSP", pid); + return -1; + } + + return 0; +} diff --git a/compel/include/infect-priv.h b/compel/include/infect-priv.h index 9d3442839..8e78a7f6c 100644 --- a/compel/include/infect-priv.h +++ b/compel/include/infect-priv.h @@ -72,6 +72,7 @@ extern bool arch_can_dump_task(struct parasite_ctl *ctl); extern int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, unsigned long flags); extern int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs); +extern int compel_set_task_gcs_regs(pid_t pid, user_fpregs_struct_t *ext_regs); extern int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s); extern int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs); diff --git a/compel/include/log.h b/compel/include/log.h index 0e33976b1..5250622c8 100644 --- a/compel/include/log.h +++ b/compel/include/log.h @@ -1,6 +1,9 @@ #ifndef COMPEL_LOG_H__ #define COMPEL_LOG_H__ +#include +#include + #include "uapi/compel/log.h" #ifndef LOG_PREFIX @@ -45,6 +48,6 @@ extern void compel_print_on_level(unsigned int loglevel, const char *format, ... #define pr_debug(fmt, ...) compel_print_on_level(COMPEL_LOG_DEBUG, LOG_PREFIX fmt, ##__VA_ARGS__) -#define pr_perror(fmt, ...) pr_err(fmt ": %m\n", ##__VA_ARGS__) +#define pr_perror(fmt, ...) pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) #endif /* COMPEL_LOG_H__ */ diff --git a/compel/include/ptrace.h b/compel/include/ptrace.h index bf2701e63..00013f937 100644 --- a/compel/include/ptrace.h +++ b/compel/include/ptrace.h @@ -5,6 +5,8 @@ #include #include +#define PTRACE_SYSCALL_TRAP 0x80 + #define PTRACE_SI_EVENT(_si_code) (((_si_code)&0xFFFF) >> 8) extern int ptrace_get_regs(pid_t pid, user_regs_struct_t *regs); diff --git a/compel/include/uapi/infect-util.h b/compel/include/uapi/infect-util.h index ace6f6b6b..658df9393 100644 --- a/compel/include/uapi/infect-util.h +++ b/compel/include/uapi/infect-util.h @@ -3,11 +3,20 @@ #include "common/compiler.h" +/** + * The length of the hash is based on what libuuid provides. + * According to the manpage this is: + * + * The uuid_unparse() function converts the supplied UUID uu from the binary + * representation into a 36-byte string (plus trailing '\0') + */ +#define RUN_ID_HASH_LENGTH 37 + /* * compel_run_id is a unique value of the current run. It can be used to * generate resource ID-s to avoid conflicts with other processes. */ -extern uint64_t compel_run_id; +extern char compel_run_id[RUN_ID_HASH_LENGTH]; struct parasite_ctl; extern int __must_check compel_util_send_fd(struct parasite_ctl *ctl, int fd); diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 3040a67a7..d21c261b7 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -13,11 +13,21 @@ #define PARASITE_START_AREA_MIN (4096) +#define PARASITE_STACK_SIZE (16 << 10) +/* + * A stack redzone is a small, protected region of memory located immediately + * after a parasite stack. It is intended to remain unchanged. While it can be + * implemented as a guard page, we want to avoid the overhead of additional + * remote system calls. + */ +#define PARASITE_STACK_REDZONE 128 + extern int __must_check compel_interrupt_task(int pid); struct seize_task_status { unsigned long long sigpnd; unsigned long long shdpnd; + unsigned long long sigblk; char state; int vpid; int ppid; @@ -30,7 +40,9 @@ extern int __must_check compel_wait_task(int pid, int ppid, struct seize_task_status *st, void *data); extern int __must_check compel_stop_task(int pid); +extern int __must_check compel_parse_stop_signo(int pid); extern int compel_resume_task(pid_t pid, int orig_state, int state); +extern int compel_resume_task_sig(pid_t pid, int orig_state, int state, int stop_signo); struct parasite_ctl; struct parasite_thread_ctl; @@ -38,9 +50,12 @@ struct parasite_thread_ctl; extern struct parasite_ctl __must_check *compel_prepare(int pid); extern struct parasite_ctl __must_check *compel_prepare_noctx(int pid); extern int __must_check compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size); +extern int __must_check compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, + unsigned long args_size); extern struct parasite_thread_ctl __must_check *compel_prepare_thread(struct parasite_ctl *ctl, int pid); extern void compel_release_thread(struct parasite_thread_ctl *); +extern int __must_check compel_start_daemon(struct parasite_ctl *ctl); extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl); extern int __must_check compel_cure_remote(struct parasite_ctl *ctl); extern int __must_check compel_cure_local(struct parasite_ctl *ctl); @@ -77,9 +92,9 @@ enum trace_flags { TRACE_EXIT, }; -extern int __must_check compel_stop_on_syscall(int tasks, int sys_nr, int sys_nr_compat, enum trace_flags trace); +extern int __must_check compel_stop_on_syscall(int tasks, int sys_nr, int sys_nr_compat); -extern int __must_check compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp); +extern int __must_check compel_stop_pie(pid_t pid, void *addr, bool no_bp); extern int __must_check compel_unmap(struct parasite_ctl *ctl, unsigned long addr); @@ -91,7 +106,7 @@ extern k_rtsigset_t *compel_thread_sigmask(struct parasite_thread_ctl *tctl); struct rt_sigframe; typedef int (*open_proc_fn)(int pid, int mode, const char *fmt, ...) __attribute__((__format__(__printf__, 3, 4))); -typedef int (*save_regs_t)(void *, user_regs_struct_t *, user_fpregs_struct_t *); +typedef int (*save_regs_t)(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); typedef int (*make_sigframe_t)(void *, struct rt_sigframe *, struct rt_sigframe *, k_rtsigset_t *); struct infect_ctx { @@ -114,6 +129,7 @@ struct infect_ctx { open_proc_fn open_proc; int log_fd; /* fd for parasite code to send messages to */ + unsigned long remote_map_addr; /* User-specified address where to mmap parasitic code, default not set */ }; extern struct infect_ctx *compel_infect_ctx(struct parasite_ctl *); @@ -174,4 +190,31 @@ extern uint64_t compel_get_thread_ip(struct parasite_thread_ctl *tctl); void compel_set_leader_ip(struct parasite_ctl *ctl, uint64_t v); void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v); +extern void compel_get_stack(struct parasite_ctl *ctl, void **rstack, void **r_thread_stack); + +#ifndef compel_host_supports_gcs +static inline bool compel_host_supports_gcs(void) +{ + return false; +} +#define compel_host_supports_gcs +#endif + +#ifndef compel_shstk_enabled +static inline bool compel_shstk_enabled(user_fpregs_struct_t *ext_regs) +{ + return false; +} +#define compel_shstk_enabled +#endif + +#ifndef parasite_setup_shstk +static inline int parasite_setup_shstk(struct parasite_ctl *ctl, + user_fpregs_struct_t *ext_regs) +{ + return 0; +} +#define parasite_setup_shstk parasite_setup_shstk +#endif + #endif diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h index 63dfee97f..558124fbd 100644 --- a/compel/include/uapi/ptrace.h +++ b/compel/include/uapi/ptrace.h @@ -86,6 +86,19 @@ struct __ptrace_rseq_configuration { #define PTRACE_EVENT_STOP 128 #endif +/* + * Amazon Linux 2 uses glibc 2.26. PTRACE_ARCH_PRCTL was added in glibc 2.27. + * This allows CRIU to build on Amazon Linux 2. + * + * Note that in sys/ptrace.h, PTRACE_ARCH_PRCTL is an enum value so the + * preprocessor doesn't know about it. PT_ARCH_PRCTL is the preprocessor symbol + * that matches the value of PTRACE_ARCH_PRCTL. So look for PT_ARCH_PRCTL to + * decide if PTRACE_ARCH_PRCTL is available or not. + */ +#if defined(__x86_64__) && !defined(PT_ARCH_PRCTL) +#define PTRACE_ARCH_PRCTL 30 /* From asm/ptrace-abi.h. */ +#endif + extern int ptrace_suspend_seccomp(pid_t pid); extern int __must_check ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); diff --git a/compel/plugins/include/uapi/std/infect.h b/compel/plugins/include/uapi/std/infect.h index 08a5a7a80..a729abbd2 100644 --- a/compel/plugins/include/uapi/std/infect.h +++ b/compel/plugins/include/uapi/std/infect.h @@ -7,7 +7,7 @@ extern int parasite_get_rpc_sock(void); extern unsigned int __export_parasite_service_cmd; extern void *__export_parasite_service_args_ptr; -extern int __must_check parasite_service(void); +extern unsigned long __must_check parasite_service(void); /* * Must be supplied by user plugins. diff --git a/compel/plugins/std/infect.c b/compel/plugins/std/infect.c index abecc140f..034201320 100644 --- a/compel/plugins/std/infect.c +++ b/compel/plugins/std/infect.c @@ -16,6 +16,10 @@ #include "rpc-pie-priv.h" +#ifndef ARCH_RT_SIGRETURN_DUMP +#define ARCH_RT_SIGRETURN_DUMP ARCH_RT_SIGRETURN +#endif + static int tsock = -1; static struct rt_sigframe *sigframe; @@ -27,7 +31,7 @@ static struct rt_sigframe *sigframe; */ static unsigned __page_size; -unsigned __attribute((weak)) page_size(void) +unsigned long __attribute((weak)) page_size(void) { return __page_size; } @@ -79,12 +83,13 @@ static int __parasite_daemon_wait_msg(struct ctl_msg *m) /* Core infect code */ -static noinline void fini_sigreturn(unsigned long new_sp) +static noinline unsigned long fini_sigreturn(unsigned long new_sp) { - ARCH_RT_SIGRETURN(new_sp, sigframe); + ARCH_RT_SIGRETURN_DUMP(new_sp, sigframe); + return new_sp; } -static int fini(void) +static unsigned long fini(void) { unsigned long new_sp; @@ -96,14 +101,14 @@ static int fini(void) sys_close(tsock); std_log_set_fd(-1); - fini_sigreturn(new_sp); + return fini_sigreturn(new_sp); BUG(); return -1; } -static noinline __used int noinline parasite_daemon(void *args) +static noinline __used unsigned long parasite_daemon(void *args) { struct ctl_msg m; int ret = -1; @@ -140,12 +145,10 @@ static noinline __used int noinline parasite_daemon(void *args) } out: - fini(); - - return 0; + return fini(); } -static noinline __used int parasite_init_daemon(void *data) +static noinline __used unsigned long parasite_init_daemon(void *data) { struct parasite_init_args *args = data; int ret; @@ -178,14 +181,11 @@ static noinline __used int parasite_init_daemon(void *data) } else goto err; - parasite_daemon(data); + return parasite_daemon(data); err: futex_set_and_wake(&args->daemon_connected, ret); - fini(); - BUG(); - - return -1; + return fini(); } #ifndef __parasite_entry @@ -203,7 +203,7 @@ err: unsigned int __export_parasite_service_cmd = 0; void *__export_parasite_service_args_ptr = NULL; -int __used __parasite_entry parasite_service(void) +unsigned long __used __parasite_entry parasite_service(void) { unsigned int cmd = __export_parasite_service_cmd; void *args = __export_parasite_service_args_ptr; diff --git a/compel/src/lib/infect-util.c b/compel/src/lib/infect-util.c index 00a7c83f7..dc57e28f7 100644 --- a/compel/src/lib/infect-util.c +++ b/compel/src/lib/infect-util.c @@ -7,7 +7,7 @@ #include "infect-rpc.h" #include "infect-util.h" -uint64_t compel_run_id; +char compel_run_id[RUN_ID_HASH_LENGTH]; int compel_util_send_fd(struct parasite_ctl *ctl, int fd) { diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index c78c02a6a..22fcf24fa 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -38,8 +38,6 @@ #define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - (size_t)((struct sockaddr_un *)0)->sun_path) #endif -#define PARASITE_STACK_SIZE (16 << 10) - #ifndef SECCOMP_MODE_DISABLED #define SECCOMP_MODE_DISABLED 0 #endif @@ -92,6 +90,12 @@ static int parse_pid_status(int pid, struct seize_task_status *ss, void *data) continue; } + if (!strncmp(aux, "SigBlk:", 7)) { + if (sscanf(aux + 7, "%llx", &ss->sigblk) != 1) + goto err_parse; + + continue; + } } fclose(f); @@ -186,6 +190,29 @@ static int skip_sigstop(int pid, int nr_signals) return 0; } +#define SIG_MASK(sig) (1ULL << ((sig)-1)) + +#define SIG_IN_MASK(sig, mask) ((sig) > 0 && (sig) <= SIGMAX && (SIG_MASK(sig) & (mask))) + +#define SUPPORTED_STOP_MASK ((1ULL << (SIGSTOP - 1)) | (1ULL << (SIGTSTP - 1))) + +static inline int sig_stop(int sig) +{ + return SIG_IN_MASK(sig, SUPPORTED_STOP_MASK); +} + +int compel_parse_stop_signo(int pid) +{ + siginfo_t si; + + if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &si) < 0) { + pr_perror("SEIZE %d: can't parse stopped siginfo", pid); + return -1; + } + + return si.si_signo; +} + /* * This routine seizes task putting it into a special * state where we can manipulate the task via ptrace @@ -198,7 +225,7 @@ int compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_ void *data) { siginfo_t si; - int status, nr_sigstop; + int status, nr_stopsig; int ret = 0, ret2, wait_errno = 0; /* @@ -275,6 +302,11 @@ try_again: goto try_again; } + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACESYSGOOD)) { + pr_perror("Unable to set PTRACE_O_TRACESYSGOOD for %d", pid); + return -1; + } + if (ss->seccomp_mode != SECCOMP_MODE_DISABLED && ptrace_suspend_seccomp(pid) < 0) goto err; @@ -291,17 +323,32 @@ try_again: goto err; } - nr_sigstop = 0; - if (ss->sigpnd & (1 << (SIGSTOP - 1))) - nr_sigstop++; - if (ss->shdpnd & (1 << (SIGSTOP - 1))) - nr_sigstop++; - if (si.si_signo == SIGSTOP) - nr_sigstop++; + nr_stopsig = 0; + if (SIG_IN_MASK(SIGSTOP, ss->sigpnd)) + nr_stopsig++; + if (SIG_IN_MASK(SIGSTOP, ss->shdpnd)) + nr_stopsig++; - if (nr_sigstop) { - if (skip_sigstop(pid, nr_sigstop)) - goto err_stop; + if (SIG_IN_MASK(SIGTSTP, ss->sigpnd) && !SIG_IN_MASK(SIGTSTP, ss->sigblk)) + nr_stopsig++; + if (SIG_IN_MASK(SIGTSTP, ss->shdpnd) && !SIG_IN_MASK(SIGTSTP, ss->sigblk)) + nr_stopsig++; + + if (sig_stop(si.si_signo)) + nr_stopsig++; + + if (nr_stopsig) { + if (skip_sigstop(pid, nr_stopsig)) { + /* + * Make sure that the task is stopped by a supported stop signal and + * send it again to restore task state before criu intervention. + */ + if (sig_stop(si.si_signo)) + kill(pid, si.si_signo); + else + kill(pid, SIGSTOP); + goto err; + } return COMPEL_TASK_STOPPED; } @@ -313,8 +360,6 @@ try_again: goto err; } -err_stop: - kill(pid, SIGSTOP); err: if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) pr_perror("Unable to detach from %d", pid); @@ -322,6 +367,11 @@ err: } int compel_resume_task(pid_t pid, int orig_st, int st) +{ + return compel_resume_task_sig(pid, orig_st, st, SIGSTOP); +} + +int compel_resume_task_sig(pid_t pid, int orig_st, int st, int stop_signo) { int ret = 0; @@ -345,8 +395,18 @@ int compel_resume_task(pid_t pid, int orig_st, int st) * task with STOP in queue that would get lost after * detach, so stop it again. */ - if (orig_st == COMPEL_TASK_STOPPED) - kill(pid, SIGSTOP); + if (orig_st == COMPEL_TASK_STOPPED) { + /* + * Check that stop_signo contain supported stop signal. + * If it isn't, then send SIGSTOP. It makes sense in the case + * when we get COMPEL_TASK_STOPPED from old image, + * where stop_signo was not yet supported. + */ + if (sig_stop(stop_signo)) + kill(pid, stop_signo); + else + kill(pid, SIGSTOP); + } } else { pr_err("Unknown final state %d\n", st); ret = -1; @@ -365,7 +425,7 @@ static int gen_parasite_saddr(struct sockaddr_un *saddr, int key) int sun_len; saddr->sun_family = AF_UNIX; - snprintf(saddr->sun_path, UNIX_PATH_MAX, "X/crtools-pr-%d-%" PRIx64, key, compel_run_id); + snprintf(saddr->sun_path, UNIX_PATH_MAX, "X/crtools-pr-%d-%s", key, compel_run_id); sun_len = SUN_LEN(saddr); *saddr->sun_path = '\0'; @@ -527,7 +587,7 @@ static int parasite_trap(struct parasite_ctl *ctl, pid_t pid, user_regs_struct_t } if (!WIFSTOPPED(status)) { - pr_err("Task is still running (pid: %d)\n", pid); + pr_err("Task is still running (pid: %d, status: 0x%x)\n", pid, status); goto err; } @@ -677,6 +737,7 @@ static int parasite_start_daemon(struct parasite_ctl *ctl) { pid_t pid = ctl->rpid; struct infect_ctx *ictx = &ctl->ictx; + user_fpregs_struct_t ext_regs; /* * Get task registers before going daemon, since the @@ -684,7 +745,7 @@ static int parasite_start_daemon(struct parasite_ctl *ctl) * while in daemon it is not such. */ - if (compel_get_task_regs(pid, &ctl->orig.regs, NULL, ictx->save_regs, ictx->regs_arg, ictx->flags)) { + if (compel_get_task_regs(pid, &ctl->orig.regs, &ext_regs, ictx->save_regs, ictx->regs_arg, ictx->flags)) { pr_err("Can't obtain regs for thread %d\n", pid); return -1; } @@ -697,6 +758,9 @@ static int parasite_start_daemon(struct parasite_ctl *ctl) if (ictx->make_sigframe(ictx->regs_arg, ctl->sigframe, ctl->rsigframe, &ctl->orig.sigmask)) return -1; + if (parasite_setup_shstk(ctl, &ext_regs)) + return -1; + if (parasite_init_daemon(ctl)) return -1; @@ -750,7 +814,7 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size, uint8_t orig_code[MEMFD_FNAME_SZ] = MEMFD_FNAME; pid_t pid = ctl->rpid; long sret = -ENOSYS; - int ret, fd, lfd; + int ret, fd, lfd, remote_flags; if (ctl->ictx.flags & INFECT_NO_MEMFD) return 1; @@ -794,7 +858,11 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size, goto err_cure; } - ctl->remote_map = remote_mmap(ctl, NULL, size, remote_prot, MAP_FILE | MAP_SHARED, fd, 0); + remote_flags = MAP_FILE | MAP_SHARED; + if (ctl->ictx.remote_map_addr){ + remote_flags |= MAP_FIXED_NOREPLACE; + } + ctl->remote_map = remote_mmap(ctl, (void *)ctl->ictx.remote_map_addr, size, remote_prot, remote_flags, fd, 0); if (!ctl->remote_map) { pr_err("Can't rmap memfd for parasite blob\n"); goto err_curef; @@ -905,7 +973,7 @@ static int compel_map_exchange(struct parasite_ctl *ctl, unsigned long size) return ret; } -int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) +int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) { int ret; unsigned long p, map_exchange_size, parasite_size = 0; @@ -986,6 +1054,16 @@ int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned l memcpy(ctl->local_map, ctl->pblob.hdr.mem, ctl->pblob.hdr.bsize); compel_relocs_apply(ctl->local_map, ctl->remote_map, &ctl->pblob); + /* + * Ensure the infected thread sees the updated code. + * + * On architectures like ARM64, the Data Cache (D-cache) and + * Instruction Cache (I-cache) are not automatically coherent. + * Modifications land in the D-cache, so we must flush (clean) the + * D-cache to push changes to RAM to ensure the CPU fetches the updated + * instructions. + */ + __builtin___clear_cache(ctl->local_map, ctl->local_map + ctl->pblob.hdr.bsize); p = parasite_size; @@ -994,7 +1072,7 @@ int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned l p += RESTORE_STACK_SIGFRAME; p += PARASITE_STACK_SIZE; - ctl->rstack = ctl->remote_map + p; + ctl->rstack = ctl->remote_map + p - PARASITE_STACK_REDZONE; /* * x86-64 ABI requires a 16 bytes aligned stack. @@ -1008,7 +1086,7 @@ int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned l if (nr_threads > 1) { p += PARASITE_STACK_SIZE; - ctl->r_thread_stack = ctl->remote_map + p; + ctl->r_thread_stack = ctl->remote_map + p - PARASITE_STACK_REDZONE; } ret = arch_fetch_sas(ctl, ctl->rsigframe); @@ -1017,15 +1095,23 @@ int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned l goto err; } - if (parasite_start_daemon(ctl)) - goto err; - return 0; err: return -1; } +int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) +{ + if (compel_infect_no_daemon(ctl, nr_threads, args_size)) + return -1; + + if (parasite_start_daemon(ctl)) + return -1; + + return 0; +} + struct parasite_thread_ctl *compel_prepare_thread(struct parasite_ctl *ctl, int pid) { struct parasite_thread_ctl *tctl; @@ -1222,7 +1308,7 @@ struct plain_regs_struct { user_fpregs_struct_t fpregs; }; -static int save_regs_plain(void *to, user_regs_struct_t *r, user_fpregs_struct_t *f) +static int save_regs_plain(pid_t pid, void *to, user_regs_struct_t *r, user_fpregs_struct_t *f) { struct plain_regs_struct *prs = to; @@ -1309,7 +1395,6 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) pid_t pid = ctl->rpid; user_regs_struct_t regs; int status, ret = 0; - enum trace_flags flag; /* stop getting chld from parasite -- we're about to step-by-step it */ if (restore_child_handler(ctl)) @@ -1329,7 +1414,7 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) pr_debug("Daemon %d exited trapping\n", pid); if (!WIFSTOPPED(status)) { - pr_err("Task is still running (pid: %d)\n", pid); + pr_err("Task is still running (pid: %d, status: 0x%x)\n", pid, status); return -1; } @@ -1350,14 +1435,11 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) return -1; /* Go to sigreturn as closer as we can */ - ret = compel_stop_pie(pid, ctl->sigreturn_addr, &flag, ctl->ictx.flags & INFECT_NO_BREAKPOINTS); + ret = compel_stop_pie(pid, ctl->sigreturn_addr, ctl->ictx.flags & INFECT_NO_BREAKPOINTS); if (ret < 0) return ret; - if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag)) - return -1; - - if (ptrace_flush_breakpoints(pid)) + if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1))) return -1; /* @@ -1369,6 +1451,11 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) return 0; } +int compel_start_daemon(struct parasite_ctl *ctl) +{ + return parasite_start_daemon(ctl); +} + int compel_stop_daemon(struct parasite_ctl *ctl) { if (ctl->daemonized) { @@ -1489,7 +1576,7 @@ int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) if (ret) goto err; - ret = compel_stop_on_syscall(1, __NR(munmap, 0), __NR(munmap, 1), TRACE_ENTER); + ret = compel_stop_on_syscall(1, __NR(munmap, 0), __NR(munmap, 1)); /* * Don't touch extended registers here: they were restored @@ -1501,12 +1588,12 @@ err: return ret; } -int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) +int compel_stop_pie(pid_t pid, void *addr, bool no_bp) { int ret; if (no_bp) { - pr_debug("Force no-breakpoints restore\n"); + pr_debug("Force no-breakpoints restore of %d\n", pid); ret = 0; } else ret = ptrace_set_breakpoint(pid, addr); @@ -1518,7 +1605,6 @@ int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) * PIE will stop on a breakpoint, next * stop after that will be syscall enter. */ - *tf = TRACE_EXIT; return 0; } @@ -1531,14 +1617,12 @@ int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) pr_perror("Unable to restart the %d process", pid); return -1; } - - *tf = TRACE_ENTER; return 0; } static bool task_is_trapped(int status, pid_t pid) { - if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) + if (WIFSTOPPED(status) && (WSTOPSIG(status) & ~PTRACE_SYSCALL_TRAP) == SIGTRAP) return true; pr_err("Task %d is in unexpected state: %x\n", pid, status); @@ -1572,15 +1656,13 @@ static inline int is_required_syscall(user_regs_struct_t *regs, pid_t pid, const * sys_nr - the required syscall number * sys_nr_compat - the required compatible syscall number */ -int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat, enum trace_flags trace) +int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat) { + enum trace_flags trace = tasks > 1 ? TRACE_ALL : TRACE_ENTER; user_regs_struct_t regs; int status, ret; pid_t pid; - if (tasks > 1) - trace = TRACE_ALL; - /* Stop all threads on the enter point in sys_rt_sigreturn */ while (tasks) { pid = wait4(-1, &status, __WALL, NULL); @@ -1594,6 +1676,18 @@ int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat, pr_debug("%d was trapped\n", pid); + if ((WSTOPSIG(status) & PTRACE_SYSCALL_TRAP) == 0) { + /* + * On some platforms such as ARM64, it is impossible to + * pass through a breakpoint, so let's clear it right + * after it has been triggered. + */ + if (ptrace_flush_breakpoints(pid)) { + pr_err("Unable to clear breakpoints\n"); + return -1; + } + goto goon; + } if (trace == TRACE_EXIT) { trace = TRACE_ENTER; pr_debug("`- Expecting exit\n"); @@ -1707,3 +1801,11 @@ void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v) { SET_REG_IP(tctl->th.regs, v); } + +void compel_get_stack(struct parasite_ctl *ctl, void **rstack, void **r_thread_stack) +{ + if (rstack) + *rstack = ctl->rstack; + if (r_thread_stack) + *r_thread_stack = ctl->r_thread_stack; +} diff --git a/compel/src/lib/ptrace.c b/compel/src/lib/ptrace.c index 49b685d70..717ee2839 100644 --- a/compel/src/lib/ptrace.c +++ b/compel/src/lib/ptrace.c @@ -23,7 +23,7 @@ int ptrace_suspend_seccomp(pid_t pid) { - if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) { + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD) < 0) { pr_perror("suspending seccomp failed"); return -1; } diff --git a/compel/src/main.c b/compel/src/main.c index 632354582..21e06d7dd 100644 --- a/compel/src/main.c +++ b/compel/src/main.c @@ -56,6 +56,13 @@ static const flags_t flags = { .cflags = COMPEL_CFLAGS_PIE, #elif defined CONFIG_MIPS .arch = "mips", + .cflags = COMPEL_CFLAGS_PIE, +#elif defined CONFIG_LOONGARCH64 + .arch = "loongarch64", + .cflags = COMPEL_CFLAGS_PIE, +#elif defined CONFIG_RISCV64 + .arch = "riscv64", + .cflags = COMPEL_CFLAGS_PIE, #else #error "CONFIG_ not defined, or unsupported ARCH" #endif diff --git a/compel/test/Makefile b/compel/test/Makefile index 63fb76f80..f46a821ee 100644 --- a/compel/test/Makefile +++ b/compel/test/Makefile @@ -1,4 +1,4 @@ -all: fdspy infect rsys +all: fdspy infect rsys stack fdspy: $(Q) $(MAKE) -C fdspy @@ -10,8 +10,12 @@ infect: $(Q) $(MAKE) -C infect run .PHONY: infect - rsys: $(Q) $(MAKE) -C rsys $(Q) $(MAKE) -C rsys run .PHONY: rsys + +stack: + $(Q) $(MAKE) -C stack + $(Q) $(MAKE) -C stack run +.PHONY: stack diff --git a/compel/test/fdspy/spy.c b/compel/test/fdspy/spy.c index 7f20ea2a7..41de99e20 100644 --- a/compel/test/fdspy/spy.c +++ b/compel/test/fdspy/spy.c @@ -110,11 +110,11 @@ static int check_pipe_ends(int wfd, int rfd) printf("Check pipe ends are connected\n"); if (write(wfd, "1", 2) != 2) { fprintf(stderr, "write to pipe failed\n"); - return -1; + return 0; } if (read(rfd, aux, sizeof(aux)) != sizeof(aux)) { fprintf(stderr, "read from pipe failed\n"); - return -1; + return 0; } if (aux[0] != '1' || aux[1] != '\0') { fprintf(stderr, "Pipe connectivity lost\n"); diff --git a/compel/test/infect/Makefile b/compel/test/infect/Makefile index bacfad962..85efa5fd9 100644 --- a/compel/test/infect/Makefile +++ b/compel/test/infect/Makefile @@ -3,6 +3,11 @@ CFLAGS ?= -O2 -g -Wall -Werror COMPEL := ../../../compel/compel-host +ifeq ($(GCS_ENABLE),1) +CFLAGS += -mbranch-protection=standard -DGCS_TEST_ENABLE=1 +LDFLAGS += -z experimental-gcs=check +endif + all: victim spy run: @@ -17,7 +22,7 @@ clean: rm -f parasite.o victim: victim.c - $(CC) $(CFLAGS) -o $@ $^ + $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) spy: spy.c parasite.h $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) diff --git a/compel/test/infect/spy.c b/compel/test/infect/spy.c index e7273b446..143946941 100644 --- a/compel/test/infect/spy.c +++ b/compel/test/infect/spy.c @@ -94,15 +94,15 @@ static inline int chk(int fd, int val) int v = 0; if (read(fd, &v, sizeof(v)) != sizeof(v)) - return 0; + return 1; printf("%d, want %d\n", v, val); - return v == val; + return v != val; } int main(int argc, char **argv) { - int p_in[2], p_out[2], p_err[2], pid, i, pass = 1; + int p_in[2], p_out[2], p_err[2], pid, i, err = 0; /* * Prepare IO-s and fork the victim binary @@ -112,6 +112,9 @@ int main(int argc, char **argv) return -1; } +#ifdef GCS_TEST_ENABLE + setenv("GLIBC_TUNABLES", "glibc.cpu.aarch64_gcs=1:glibc.cpu.aarch64_gcs_policy=2", 1); +#endif pid = vfork(); if (pid == 0) { close(p_in[1]); @@ -142,9 +145,11 @@ int main(int argc, char **argv) return 1; printf("Checking the victim alive\n"); - pass = chk(p_out[0], 1); - pass = chk(p_out[0], 42); - if (!pass) + err = chk(p_out[0], 1); + if (err) + return 1; + err = chk(p_out[0], 42); + if (err) return 1; /* @@ -176,14 +181,14 @@ int main(int argc, char **argv) printf("Checking the result\n"); /* These two came from parasite */ - pass = chk(p_out[0], 138); - pass = chk(p_out[0], 403); + err = chk(p_out[0], 138); + err |= chk(p_out[0], 403); /* These two came from post-infect */ - pass = chk(p_out[0], 1234); - pass = chk(p_out[0], 4096); + err |= chk(p_out[0], 1234); + err |= chk(p_out[0], 4096); - if (pass) + if (!err) printf("All OK\n"); else printf("Something went WRONG\n"); diff --git a/compel/test/stack/.gitignore b/compel/test/stack/.gitignore new file mode 100644 index 000000000..0a554758d --- /dev/null +++ b/compel/test/stack/.gitignore @@ -0,0 +1,4 @@ +parasite.h +parasite.po +spy +victim diff --git a/compel/test/stack/Makefile b/compel/test/stack/Makefile new file mode 100644 index 000000000..bacfad962 --- /dev/null +++ b/compel/test/stack/Makefile @@ -0,0 +1,32 @@ +CC := gcc +CFLAGS ?= -O2 -g -Wall -Werror + +COMPEL := ../../../compel/compel-host + +all: victim spy + +run: + ./spy +.PHONY: run + +clean: + rm -f victim + rm -f spy + rm -f parasite.h + rm -f parasite.po + rm -f parasite.o + +victim: victim.c + $(CC) $(CFLAGS) -o $@ $^ + +spy: spy.c parasite.h + $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) + +parasite.h: parasite.po + $(COMPEL) hgen -o $@ -f $< + +parasite.po: parasite.o + ld $(shell $(COMPEL) ldflags) -o $@ $^ $(shell $(COMPEL) plugins) + +parasite.o: parasite.c + $(CC) $(CFLAGS) -c $(shell $(COMPEL) cflags) -o $@ $^ diff --git a/compel/test/stack/parasite.c b/compel/test/stack/parasite.c new file mode 100644 index 000000000..ad13bd25d --- /dev/null +++ b/compel/test/stack/parasite.c @@ -0,0 +1,38 @@ +#include + +#include +#include + +/* + * Stubs for std compel plugin. + */ +int parasite_trap_cmd(int cmd, void *args) +{ + return 0; +} +void parasite_cleanup(void) +{ +} + +#define PARASITE_CMD_INC PARASITE_USER_CMDS +#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 + +int parasite_daemon_cmd(int cmd, void *args) +{ + int v; + + switch (cmd) { + case PARASITE_CMD_INC: + v = (*(int *)args) + 1; + break; + case PARASITE_CMD_DEC: + v = (*(int *)args) - 1; + break; + default: + v = -1; + break; + } + + sys_write(1, &v, sizeof(int)); + return 0; +} diff --git a/compel/test/stack/spy.c b/compel/test/stack/spy.c new file mode 100644 index 000000000..184c8ab31 --- /dev/null +++ b/compel/test/stack/spy.c @@ -0,0 +1,294 @@ +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "parasite.h" + +#define PARASITE_CMD_INC PARASITE_USER_CMDS +#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 + +#define err_and_ret(msg) \ + do { \ + fprintf(stderr, msg); \ + return -1; \ + } while (0) + +void *saved_data = NULL; + +#define SAVED_DATA_MAX page_size() + +void cleanup_saved_data(void) +{ + free(saved_data); +} + +static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) +{ + printf("\tLC%u: ", lvl); + vprintf(fmt, parms); +} + +static void *get_parasite_rstack_start(struct parasite_ctl *ctl) +{ + void *rstack, *r_thread_stack, *rstack_start; + + compel_get_stack(ctl, &rstack, &r_thread_stack); + + rstack_start = rstack; + if (r_thread_stack != NULL && r_thread_stack < rstack_start) + rstack_start = r_thread_stack; + + return rstack_start; +} + +static void *read_proc_mem(int pid, void *offset, size_t len) +{ + char victim_mem_path[6 + 11 + 4 + 1]; + int written; + int fd; + void *data; + ssize_t mem_read; + + written = snprintf(victim_mem_path, sizeof(victim_mem_path), "/proc/%d/mem", pid); + if (written < 0 || written >= sizeof(victim_mem_path)) { + fprintf(stderr, "Failed to create path string to victim's /proc/%d/mem file\n", pid); + return NULL; + } + + fd = open(victim_mem_path, O_RDONLY); + if (fd < 0) { + perror("Failed to open victim's /proc/$pid/mem file"); + return NULL; + } + + data = malloc(len); + if (data == NULL) { + perror("Can't allocate memory to read victim's /proc/$pid/mem file"); + return NULL; + } + + mem_read = pread(fd, data, len, (off_t)offset); + if (mem_read == -1) { + perror("Failed to read victim's /proc/$pid/mem file"); + goto freebuf; + } + + return data; + +freebuf: + free(data); + return NULL; +} + +static int check_saved_data(struct parasite_ctl *ctl, int pid, void *stack, void *saved_data, size_t saved_data_size) +{ + if (saved_data != NULL) { + void *current_data; + + current_data = read_proc_mem(pid, stack, saved_data_size); + if (current_data == NULL) + return -1; + + if (memcmp(saved_data, current_data, saved_data_size) != 0) + return 1; + } + + return 0; +} + +static int do_infection(int pid) +{ + int state; + struct parasite_ctl *ctl; + struct infect_ctx *ictx; + int *arg; + void *stack; + size_t saved_data_size = PARASITE_STACK_REDZONE; + int saved_data_check; + + compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); + + printf("Stopping task\n"); + state = compel_stop_task(pid); + if (state < 0) + err_and_ret("Can't stop task\n"); + + printf("Preparing parasite ctl\n"); + ctl = compel_prepare(pid); + if (!ctl) + err_and_ret("Can't prepare for infection\n"); + + printf("Configuring contexts\n"); + + /* + * First -- the infection context. Most of the stuff + * is already filled by compel_prepare(), just set the + * log descriptor for parasite side, library cannot + * live w/o it. + */ + ictx = compel_infect_ctx(ctl); + ictx->log_fd = STDERR_FILENO; + + parasite_setup_c_header(ctl); + + printf("Infecting\n"); + if (compel_infect_no_daemon(ctl, 1, sizeof(int))) + err_and_ret("Can't infect victim\n"); + + if (atexit(cleanup_saved_data)) + err_and_ret("Can't register cleanup function with atexit\n"); + + stack = get_parasite_rstack_start(ctl); + + if (compel_start_daemon(ctl)) + err_and_ret("Can't start daemon in victim\n"); + + /* + * Now get the area with arguments and run two + * commands one by one. + */ + arg = compel_parasite_args(ctl, int); + + printf("Running cmd 1\n"); + *arg = 137; + if (compel_rpc_call_sync(PARASITE_CMD_INC, ctl)) + err_and_ret("Can't run parasite command 1\n"); + + printf("Running cmd 2\n"); + *arg = 404; + if (compel_rpc_call_sync(PARASITE_CMD_DEC, ctl)) + err_and_ret("Can't run parasite command 2\n"); + + saved_data_check = check_saved_data(ctl, pid, stack, saved_data, saved_data_size); + if (saved_data_check == -1) + err_and_ret("Could not check saved data\n"); + if (saved_data_check != 0) + err_and_ret("Saved data unexpectedly modified\n"); + + /* + * Done. Cure and resume the task. + */ + printf("Curing\n"); + if (compel_cure(ctl)) + err_and_ret("Can't cure victim\n"); + + if (compel_resume_task(pid, state, state)) + err_and_ret("Can't unseize task\n"); + + printf("Done\n"); + + return 0; +} + +static inline int chk(int fd, int val) +{ + int v = 0; + + if (read(fd, &v, sizeof(v)) != sizeof(v)) + return 1; + + printf("%d, want %d\n", v, val); + return v != val; +} + +int main(int argc, char **argv) +{ + int p_in[2], p_out[2], p_err[2], pid, i, err = 0; + + /* + * Prepare IO-s and fork the victim binary + */ + if (pipe(p_in) || pipe(p_out) || pipe(p_err)) { + perror("Can't make pipe"); + return -1; + } + + pid = vfork(); + if (pid == 0) { + close(p_in[1]); + dup2(p_in[0], 0); + close(p_in[0]); + close(p_out[0]); + dup2(p_out[1], 1); + close(p_out[1]); + close(p_err[0]); + dup2(p_err[1], 2); + close(p_err[1]); + execl("./victim", "victim", NULL); + exit(1); + } + + close(p_in[0]); + close(p_out[1]); + close(p_err[1]); + + /* + * Tell the little guy some numbers + */ + i = 1; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + i = 42; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + + printf("Checking the victim alive\n"); + err = chk(p_out[0], 1); + if (err) + return 1; + err = chk(p_out[0], 42); + if (err) + return 1; + + /* + * Now do the infection with parasite.c + */ + + printf("Infecting the victim\n"); + if (do_infection(pid)) + return 1; + + /* + * Tell the victim some more stuff to check it's alive + */ + i = 1234; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + i = 4096; + if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) + return 1; + + /* + * Stop the victim and check the infection went well + */ + printf("Closing victim stdin\n"); + close(p_in[1]); + printf("Waiting for victim to die\n"); + wait(NULL); + + printf("Checking the result\n"); + + /* These two came from parasite */ + err = chk(p_out[0], 138); + err |= chk(p_out[0], 403); + + /* These two came from post-infect */ + err |= chk(p_out[0], 1234); + err |= chk(p_out[0], 4096); + + if (!err) + printf("All OK\n"); + else + printf("Something went WRONG\n"); + + return 0; +} diff --git a/compel/test/stack/victim.c b/compel/test/stack/victim.c new file mode 100644 index 000000000..f94613fa1 --- /dev/null +++ b/compel/test/stack/victim.c @@ -0,0 +1,16 @@ +#include + +int main(int argc, char **argv) +{ + int i; + + while (1) { + if (read(0, &i, sizeof(i)) != sizeof(i)) + break; + + if (write(1, &i, sizeof(i)) != sizeof(i)) + break; + } + + return 0; +} diff --git a/scripts/ci/apt-install b/contrib/apt-install similarity index 80% rename from scripts/ci/apt-install rename to contrib/apt-install index 5a790901a..676e0f794 100755 --- a/scripts/ci/apt-install +++ b/contrib/apt-install @@ -15,8 +15,7 @@ while true; do if [ "${install_retry_counter}" -gt "${max_apt_retries}" ]; then exit 1 fi - # shellcheck disable=SC2068 - apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-install-recommends $@ && break + apt-get update -y && apt-get install -y --no-install-recommends "$@" && break # In case it is a network error let's wait a bit. echo "Retrying attempt ${install_retry_counter}" diff --git a/contrib/debian/dev-packages.lst b/contrib/debian/dev-packages.lst deleted file mode 100644 index c2d1509fa..000000000 --- a/contrib/debian/dev-packages.lst +++ /dev/null @@ -1,20 +0,0 @@ -# Required packages for development in Debian -build-essential -libprotobuf-dev -libprotobuf-c-dev -protobuf-c-compiler -protobuf-compiler -python3-protobuf -libnet-dev - -# Extra packages, required for testing and building other tools -pkg-config -libnl-3-dev -libbsd0 -libbsd-dev -iproute2 -libcap-dev -libaio-dev -python3-yaml -libnl-route-3-dev -python-future diff --git a/contrib/dependencies/apk-packages.sh b/contrib/dependencies/apk-packages.sh new file mode 100755 index 000000000..c47fb9fe0 --- /dev/null +++ b/contrib/dependencies/apk-packages.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env sh + +apk add --no-cache \ + asciidoctor \ + bash \ + build-base \ + coreutils \ + e2fsprogs \ + elfutils-dev \ + git \ + gnutls-dev \ + go \ + ip6tables \ + iproute2 \ + iptables \ + iptables-legacy \ + libaio-dev \ + libbsd-dev \ + libcap-dev \ + libcap-utils \ + libdrm-dev \ + libnet-dev \ + libnl3-dev \ + libtraceevent-dev \ + libtracefs-dev \ + nftables \ + nftables-dev \ + perl \ + pkgconfig \ + procps \ + protobuf-c-compiler \ + protobuf-c-dev \ + protobuf-dev \ + py3-importlib-metadata \ + py3-pip \ + py3-protobuf \ + py3-yaml \ + python3 \ + sudo \ + tar \ + util-linux \ + util-linux-dev diff --git a/contrib/dependencies/apt-cross-packages.sh b/contrib/dependencies/apt-cross-packages.sh new file mode 100755 index 000000000..30ce6874c --- /dev/null +++ b/contrib/dependencies/apt-cross-packages.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env sh + +APT_INSTALL="$(cd "$(dirname "$0")/.." >/dev/null 2>&1 && pwd)/apt-install" +if [ ! -x "$APT_INSTALL" ]; then + echo "Error: apt-install not found or not executable" + exit 1 +fi + +"$APT_INSTALL" \ + crossbuild-essential-"${DEBIAN_ARCH}" \ + iproute2:"${DEBIAN_ARCH}" \ + libaio-dev:"${DEBIAN_ARCH}" \ + libbz2-dev:"${DEBIAN_ARCH}" \ + libc6-"${DEBIAN_ARCH}"-cross \ + libc6-dev-"${DEBIAN_ARCH}"-cross \ + libcap-dev:"${DEBIAN_ARCH}" \ + libdrm-dev:"${DEBIAN_ARCH}" \ + libelf-dev:"${DEBIAN_ARCH}" \ + libexpat1-dev:"${DEBIAN_ARCH}" \ + libgnutls28-dev:"${DEBIAN_ARCH}" \ + libnet-dev:"${DEBIAN_ARCH}" \ + libnftables-dev:"${DEBIAN_ARCH}" \ + libnl-3-dev:"${DEBIAN_ARCH}" \ + libnl-route-3-dev:"${DEBIAN_ARCH}" \ + libprotobuf-c-dev:"${DEBIAN_ARCH}" \ + libprotobuf-dev:"${DEBIAN_ARCH}" \ + libssl-dev:"${DEBIAN_ARCH}" \ + libtraceevent-dev:"${DEBIAN_ARCH}" \ + libtracefs-dev:"${DEBIAN_ARCH}" \ + ncurses-dev:"${DEBIAN_ARCH}" \ + uuid-dev:"${DEBIAN_ARCH}" \ + build-essential \ + pkg-config \ + git \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-protobuf diff --git a/contrib/dependencies/apt-packages.sh b/contrib/dependencies/apt-packages.sh new file mode 100755 index 000000000..7963be7b4 --- /dev/null +++ b/contrib/dependencies/apt-packages.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env sh + +APT_INSTALL="$(cd "$(dirname "$0")/.." >/dev/null 2>&1 && pwd)/apt-install" +if [ ! -x "$APT_INSTALL" ]; then + echo "Error: apt-install not found or not executable" + exit 1 +fi + +"$APT_INSTALL" \ + asciidoctor \ + bash \ + bsdmainutils \ + build-essential \ + gdb \ + git-core \ + iproute2 \ + iptables \ + kmod \ + libaio-dev \ + libbsd-dev \ + libcap-dev \ + libdrm-dev \ + libelf-dev \ + libgnutls28-dev \ + libgnutls30 \ + libnet-dev \ + libnl-3-dev \ + libnl-route-3-dev \ + libperl-dev \ + libprotobuf-c-dev \ + libprotobuf-dev \ + libselinux-dev \ + libtraceevent-dev \ + libtracefs-dev \ + pkg-config \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-importlib-metadata \ + python3-pip \ + python3-protobuf \ + python3-yaml \ + time \ + util-linux \ + uuid-dev diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh new file mode 100755 index 000000000..793f267a5 --- /dev/null +++ b/contrib/dependencies/dnf-packages.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env sh + +dnf install -y \ + asciidoc \ + binutils \ + elfutils-libelf-devel \ + gcc \ + git \ + glibc-devel \ + gnutls-devel \ + iproute \ + iptables \ + libaio-devel \ + libasan \ + libbpf-devel \ + libbsd-devel \ + libcap-devel \ + libdrm-devel \ + libnet-devel \ + libnl3-devel \ + libselinux-devel \ + libtraceevent-devel \ + libtracefs-devel \ + libuuid-devel \ + make \ + nftables \ + pkg-config \ + protobuf \ + protobuf-c \ + protobuf-c-devel \ + protobuf-compiler \ + protobuf-devel \ + python-devel \ + python3-importlib-metadata \ + python3-protobuf \ + python3-pyyaml \ + python3-setuptools \ + python3-wheel \ + rubygem-asciidoctor \ + xmlto diff --git a/contrib/dependencies/pacman-packages.sh b/contrib/dependencies/pacman-packages.sh new file mode 100755 index 000000000..260797606 --- /dev/null +++ b/contrib/dependencies/pacman-packages.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env sh + +pacman -Syu --noconfirm \ + asciidoctor \ + base-devel \ + bash \ + coreutils \ + diffutils \ + git \ + gnutls \ + go \ + iproute2 \ + iptables \ + libaio \ + libbsd \ + libcap \ + libdrm \ + libelf \ + libnet \ + libnl \ + libtraceevent \ + libtracefs \ + nftables \ + pkg-config \ + protobuf \ + protobuf-c \ + python-importlib-metadata \ + python-pip \ + python-protobuf \ + python-yaml \ + sudo \ + tar \ + util-linux \ + util-linux-libs diff --git a/contrib/docker_cr.sh b/contrib/docker_cr.sh index 9b43d8ba1..04ef676cd 100755 --- a/contrib/docker_cr.sh +++ b/contrib/docker_cr.sh @@ -418,7 +418,7 @@ resolve_path() { local p p="${2}" - if which realpath > /dev/null; then + if command -v realpath > /dev/null; then p=$(realpath "${p}") fi ${ECHO} "${1}: ${p}" @@ -427,7 +427,7 @@ resolve_path() { resolve_cmd() { local cpath - cpath=$(which "${2}") + cpath=$(command -v "${2}") resolve_path "${1}" "${cpath}" } diff --git a/coredump/coredump.py b/coredump/coredump old mode 100644 new mode 100755 similarity index 74% rename from coredump/coredump.py rename to coredump/coredump index 5e63d2138..5b3e6f366 --- a/coredump/coredump.py +++ b/coredump/coredump @@ -1,8 +1,13 @@ +#!/usr/bin/env python3 +import platform import argparse import os +import sys import criu_coredump +PLATFORMS = ["aarch64", "armv7l", "x86_64"] + def coredump(opts): generator = criu_coredump.coredump_generator() @@ -34,7 +39,16 @@ def main(): opts = vars(parser.parse_args()) - coredump(opts) + if platform.machine() not in PLATFORMS: + print("ERROR: %s is only supported on: %s" % (sys.argv[0], ', '.join(PLATFORMS))) + sys.exit(1) + + try: + coredump(opts) + except SystemExit as error: + print('ERROR: %s' % error) + print('Exiting') + sys.exit(1) if __name__ == '__main__': diff --git a/coredump/coredump-python2 b/coredump/coredump-python2 deleted file mode 100755 index 564c05ce9..000000000 --- a/coredump/coredump-python2 +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python2 - -import coredump - -if __name__ == '__main__': - coredump.main() diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index 53b143ec0..acb806ace 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -31,16 +31,11 @@ import io import sys import ctypes +import platform from pycriu import images from . import elf - -try: - from itertools import ifilter as filter -except ImportError: - pass - # Some memory-related constants PAGESIZE = 4096 status = { @@ -59,6 +54,8 @@ status = { "VMA_AREA_SOCKET": 1 << 11, "VMA_AREA_VVAR": 1 << 12, "VMA_AREA_AIORING": 1 << 13, + "VMA_AREA_MEMFD": 1 << 14, + "VMA_AREA_UPROBES": 1 << 17, "VMA_AREA_UNSUPP": 1 << 31 } @@ -99,8 +96,13 @@ class coredump: buf.write(b"\0" * (8 - len(note.owner))) buf.write(note.data) - offset = ctypes.sizeof(elf.Elf64_Ehdr()) - offset += (len(self.vmas) + 1) * ctypes.sizeof(elf.Elf64_Phdr()) + bits = platform.architecture()[0] # 32 or 64 bits + + ehdr = {"32bit": elf.Elf32_Ehdr, "64bit": elf.Elf64_Ehdr} + phdr = {"32bit": elf.Elf32_Phdr, "64bit": elf.Elf64_Phdr} + + offset = ctypes.sizeof(ehdr[bits]()) + offset += (len(self.vmas) + 1) * ctypes.sizeof(phdr[bits]()) filesz = 0 for note in self.notes: @@ -135,6 +137,20 @@ class coredump_generator: reg_files = None # reg-files; pagemaps = {} # pagemap by pid; + # thread info key based on the current arch + thread_info_key = { + "aarch64": "ti_aarch64", + "armv7l": "ti_arm", + "x86_64": "thread_info", + } + + machine = platform.machine() # current arch + bits = platform.architecture()[0] # 32 or 64 bits + + ehdr = {"32bit": elf.Elf32_Ehdr, "64bit": elf.Elf64_Ehdr} # 32 or 64 bits Ehdr + nhdr = {"32bit": elf.Elf32_Nhdr, "64bit": elf.Elf64_Nhdr} # 32 or 64 bits Nhdr + phdr = {"32bit": elf.Elf32_Phdr, "64bit": elf.Elf64_Phdr} # 32 or 64 bits Phdr + def _img_open_and_strip(self, name, single=False, pid=None): """ Load criu image and strip it from magic and redundant list. @@ -206,44 +222,62 @@ class coredump_generator: """ Generate elf header for process pid with program headers phdrs. """ - ehdr = elf.Elf64_Ehdr() + ei_class = {"32bit": elf.ELFCLASS32, "64bit": elf.ELFCLASS64} + + ehdr = self.ehdr[self.bits]() ctypes.memset(ctypes.addressof(ehdr), 0, ctypes.sizeof(ehdr)) ehdr.e_ident[elf.EI_MAG0] = elf.ELFMAG0 ehdr.e_ident[elf.EI_MAG1] = elf.ELFMAG1 ehdr.e_ident[elf.EI_MAG2] = elf.ELFMAG2 ehdr.e_ident[elf.EI_MAG3] = elf.ELFMAG3 - ehdr.e_ident[elf.EI_CLASS] = elf.ELFCLASS64 + ehdr.e_ident[elf.EI_CLASS] = ei_class[self.bits] ehdr.e_ident[elf.EI_DATA] = elf.ELFDATA2LSB ehdr.e_ident[elf.EI_VERSION] = elf.EV_CURRENT + if self.machine == "armv7l": + ehdr.e_ident[elf.EI_OSABI] = elf.ELFOSABI_ARM + else: + ehdr.e_ident[elf.EI_OSABI] = elf.ELFOSABI_NONE + ehdr.e_type = elf.ET_CORE - ehdr.e_machine = elf.EM_X86_64 + ehdr.e_machine = self._get_e_machine() ehdr.e_version = elf.EV_CURRENT - ehdr.e_phoff = ctypes.sizeof(elf.Elf64_Ehdr()) - ehdr.e_ehsize = ctypes.sizeof(elf.Elf64_Ehdr()) - ehdr.e_phentsize = ctypes.sizeof(elf.Elf64_Phdr()) + ehdr.e_phoff = ctypes.sizeof(self.ehdr[self.bits]()) + ehdr.e_ehsize = ctypes.sizeof(self.ehdr[self.bits]()) + ehdr.e_phentsize = ctypes.sizeof(self.phdr[self.bits]()) # FIXME Case len(phdrs) > PN_XNUM should be handled properly. # See fs/binfmt_elf.c from linux kernel. ehdr.e_phnum = len(phdrs) return ehdr + def _get_e_machine(self): + """ + Get the e_machine field based on the current architecture. + """ + e_machine_dict = { + "aarch64": elf.EM_AARCH64, + "armv7l": elf.EM_ARM, + "x86_64": elf.EM_X86_64, + } + return e_machine_dict[self.machine] + def _gen_phdrs(self, pid, notes, vmas): """ Generate program headers for process pid. """ phdrs = [] - offset = ctypes.sizeof(elf.Elf64_Ehdr()) - offset += (len(vmas) + 1) * ctypes.sizeof(elf.Elf64_Phdr()) + offset = ctypes.sizeof(self.ehdr[self.bits]()) + offset += (len(vmas) + 1) * ctypes.sizeof(self.phdr[self.bits]()) filesz = 0 for note in notes: filesz += ctypes.sizeof(note.nhdr) + ctypes.sizeof(note.data) + 8 # PT_NOTE - phdr = elf.Elf64_Phdr() + phdr = self.phdr[self.bits]() ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr)) phdr.p_type = elf.PT_NOTE phdr.p_offset = offset @@ -263,7 +297,7 @@ class coredump_generator: for vma in vmas: offset += filesz filesz = vma.filesz - phdr = elf.Elf64_Phdr() + phdr = self.phdr[self.bits]() ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr)) phdr.p_type = elf.PT_LOAD phdr.p_align = PAGESIZE @@ -315,13 +349,12 @@ class coredump_generator: prpsinfo.pr_ppid = pstree["ppid"] prpsinfo.pr_pgrp = pstree["pgid"] prpsinfo.pr_sid = pstree["sid"] - prpsinfo.pr_psargs = self._gen_cmdline(pid) - if (sys.version_info > (3, 0)): - prpsinfo.pr_fname = core["tc"]["comm"].encode() - else: - prpsinfo.pr_fname = core["tc"]["comm"] + # prpsinfo.pr_psargs has a limit of 80 characters which means it will + # fail here if the cmdline is longer than 80 + prpsinfo.pr_psargs = self._gen_cmdline(pid)[:80] + prpsinfo.pr_fname = core["tc"]["comm"].encode() - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_prpsinfo()) nhdr.n_type = elf.NT_PRPSINFO @@ -338,7 +371,7 @@ class coredump_generator: Generate NT_PRSTATUS note for thread tid of process pid. """ core = self.cores[tid] - regs = core["thread_info"]["gpregs"] + regs = self._get_gpregs(core) pstree = self.pstree[pid] prstatus = elf.elf_prstatus() @@ -351,35 +384,9 @@ class coredump_generator: prstatus.pr_pgrp = pstree["pgid"] prstatus.pr_sid = pstree["sid"] - prstatus.pr_reg.r15 = regs["r15"] - prstatus.pr_reg.r14 = regs["r14"] - prstatus.pr_reg.r13 = regs["r13"] - prstatus.pr_reg.r12 = regs["r12"] - prstatus.pr_reg.rbp = regs["bp"] - prstatus.pr_reg.rbx = regs["bx"] - prstatus.pr_reg.r11 = regs["r11"] - prstatus.pr_reg.r10 = regs["r10"] - prstatus.pr_reg.r9 = regs["r9"] - prstatus.pr_reg.r8 = regs["r8"] - prstatus.pr_reg.rax = regs["ax"] - prstatus.pr_reg.rcx = regs["cx"] - prstatus.pr_reg.rdx = regs["dx"] - prstatus.pr_reg.rsi = regs["si"] - prstatus.pr_reg.rdi = regs["di"] - prstatus.pr_reg.orig_rax = regs["orig_ax"] - prstatus.pr_reg.rip = regs["ip"] - prstatus.pr_reg.cs = regs["cs"] - prstatus.pr_reg.eflags = regs["flags"] - prstatus.pr_reg.rsp = regs["sp"] - prstatus.pr_reg.ss = regs["ss"] - prstatus.pr_reg.fs_base = regs["fs_base"] - prstatus.pr_reg.gs_base = regs["gs_base"] - prstatus.pr_reg.ds = regs["ds"] - prstatus.pr_reg.es = regs["es"] - prstatus.pr_reg.fs = regs["fs"] - prstatus.pr_reg.gs = regs["gs"] + self._set_pr_regset(prstatus.pr_reg, regs) - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_prstatus()) nhdr.n_type = elf.NT_PRSTATUS @@ -391,28 +398,83 @@ class coredump_generator: return note + def _get_gpregs(self, core): + """ + Get the general purpose registers based on the current architecture. + """ + thread_info_key = self.thread_info_key[self.machine] + thread_info = core[thread_info_key] + + return thread_info["gpregs"] + + def _set_pr_regset(self, pr_reg, regs): + """ + Set the pr_reg struct based on the current architecture. + """ + if self.machine == "aarch64": + pr_reg.regs = (ctypes.c_ulonglong * len(regs["regs"]))(*regs["regs"]) + pr_reg.sp = regs["sp"] + pr_reg.pc = regs["pc"] + pr_reg.pstate = regs["pstate"] + elif self.machine == "armv7l": + pr_reg.r0 = regs["r0"] + pr_reg.r1 = regs["r1"] + pr_reg.r2 = regs["r2"] + pr_reg.r3 = regs["r3"] + pr_reg.r4 = regs["r4"] + pr_reg.r5 = regs["r5"] + pr_reg.r6 = regs["r6"] + pr_reg.r7 = regs["r7"] + pr_reg.r8 = regs["r8"] + pr_reg.r9 = regs["r9"] + pr_reg.r10 = regs["r10"] + pr_reg.fp = regs["fp"] + pr_reg.ip = regs["ip"] + pr_reg.sp = regs["sp"] + pr_reg.lr = regs["lr"] + pr_reg.pc = regs["pc"] + pr_reg.cpsr = regs["cpsr"] + pr_reg.orig_r0 = regs["orig_r0"] + elif self.machine == "x86_64": + pr_reg.r15 = regs["r15"] + pr_reg.r14 = regs["r14"] + pr_reg.r13 = regs["r13"] + pr_reg.r12 = regs["r12"] + pr_reg.rbp = regs["bp"] + pr_reg.rbx = regs["bx"] + pr_reg.r11 = regs["r11"] + pr_reg.r10 = regs["r10"] + pr_reg.r9 = regs["r9"] + pr_reg.r8 = regs["r8"] + pr_reg.rax = regs["ax"] + pr_reg.rcx = regs["cx"] + pr_reg.rdx = regs["dx"] + pr_reg.rsi = regs["si"] + pr_reg.rdi = regs["di"] + pr_reg.orig_rax = regs["orig_ax"] + pr_reg.rip = regs["ip"] + pr_reg.cs = regs["cs"] + pr_reg.eflags = regs["flags"] + pr_reg.rsp = regs["sp"] + pr_reg.ss = regs["ss"] + pr_reg.fs_base = regs["fs_base"] + pr_reg.gs_base = regs["gs_base"] + pr_reg.ds = regs["ds"] + pr_reg.es = regs["es"] + pr_reg.fs = regs["fs"] + pr_reg.gs = regs["gs"] + def _gen_fpregset(self, pid, tid): """ Generate NT_FPREGSET note for thread tid of process pid. """ core = self.cores[tid] - regs = core["thread_info"]["fpregs"] + regs = self._get_fpregs(core) fpregset = elf.elf_fpregset_t() ctypes.memset(ctypes.addressof(fpregset), 0, ctypes.sizeof(fpregset)) - fpregset.cwd = regs["cwd"] - fpregset.swd = regs["swd"] - fpregset.ftw = regs["twd"] - fpregset.fop = regs["fop"] - fpregset.rip = regs["rip"] - fpregset.rdp = regs["rdp"] - fpregset.mxcsr = regs["mxcsr"] - fpregset.mxcr_mask = regs["mxcsr_mask"] - fpregset.st_space = (ctypes.c_uint * len(regs["st_space"]))( - *regs["st_space"]) - fpregset.xmm_space = (ctypes.c_uint * len(regs["xmm_space"]))( - *regs["xmm_space"]) + self._set_fpregset(fpregset, regs) nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 @@ -426,6 +488,86 @@ class coredump_generator: return note + def _get_fpregs(self, core): + """ + Get the floating point register dictionary based on the current architecture. + """ + fpregs_key_dict = {"aarch64": "fpsimd", "x86_64": "fpregs"} + fpregs_key = fpregs_key_dict[self.machine] + + thread_info_key = self.thread_info_key[self.machine] + + return core[thread_info_key][fpregs_key] + + def _set_fpregset(self, fpregset, regs): + """ + Set the fpregset struct based on the current architecture. + """ + if self.machine == "aarch64": + fpregset.vregs = (ctypes.c_ulonglong * len(regs["vregs"]))(*regs["vregs"]) + fpregset.fpsr = regs["fpsr"] + fpregset.fpcr = regs["fpcr"] + elif self.machine == "x86_64": + fpregset.cwd = regs["cwd"] + fpregset.swd = regs["swd"] + fpregset.ftw = regs["twd"] + fpregset.fop = regs["fop"] + fpregset.rip = regs["rip"] + fpregset.rdp = regs["rdp"] + fpregset.mxcsr = regs["mxcsr"] + fpregset.mxcr_mask = regs["mxcsr_mask"] + fpregset.st_space = (ctypes.c_uint * len(regs["st_space"]))( + *regs["st_space"]) + fpregset.xmm_space = (ctypes.c_uint * len(regs["xmm_space"]))( + *regs["xmm_space"]) + + def _gen_arm_tls(self, tid): + """ + Generate NT_ARM_TLS note for thread tid of process pid. + """ + core = self.cores[tid] + tls = ctypes.c_ulonglong(core["ti_aarch64"]["tls"]) + + nhdr = elf.Elf64_Nhdr() + nhdr.n_namesz = 6 + nhdr.n_descsz = ctypes.sizeof(ctypes.c_ulonglong) + nhdr.n_type = elf.NT_ARM_TLS + + note = elf_note() + note.data = tls + note.owner = b"LINUX" + note.nhdr = nhdr + + return note + + def _gen_arm_vfp(self, tid): + """ + Generate NT_ARM_VFP note for thread tid of process pid. + """ + core = self.cores[tid] + fpstate = core["ti_arm"]["fpstate"] + + data = elf.vfp_hard_struct() + ctypes.memset(ctypes.addressof(data), 0, ctypes.sizeof(data)) + + data.vfp_regs = (ctypes.c_uint64 * len(fpstate["vfp_regs"]))(*fpstate["vfp_regs"]) + data.fpexc = fpstate["fpexc"] + data.fpscr = fpstate["fpscr"] + data.fpinst = fpstate["fpinst"] + data.fpinst2 = fpstate["fpinst2"] + + nhdr = elf.Elf32_Nhdr() + nhdr.n_namesz = 6 + nhdr.n_descsz = ctypes.sizeof(data) + nhdr.n_type = elf.NT_ARM_VFP + + note = elf_note() + note.data = data + note.owner = b"LINUX" + note.nhdr = nhdr + + return note + def _gen_x86_xstate(self, pid, tid): """ Generate NT_X86_XSTATE note for thread tid of process pid. @@ -475,7 +617,7 @@ class coredump_generator: # FIXME zeroify everything for now ctypes.memset(ctypes.addressof(siginfo), 0, ctypes.sizeof(siginfo)) - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.siginfo_t()) nhdr.n_type = elf.NT_SIGINFO @@ -494,17 +636,22 @@ class coredump_generator: mm = self.mms[pid] num_auxv = len(mm["mm_saved_auxv"]) // 2 - class elf_auxv(ctypes.Structure): + class elf32_auxv(ctypes.Structure): + _fields_ = [("auxv", elf.Elf32_auxv_t * num_auxv)] + + class elf64_auxv(ctypes.Structure): _fields_ = [("auxv", elf.Elf64_auxv_t * num_auxv)] - auxv = elf_auxv() + elf_auxv = {"32bit": elf32_auxv(), "64bit": elf64_auxv()} + + auxv = elf_auxv[self.bits] for i in range(num_auxv): auxv.auxv[i].a_type = mm["mm_saved_auxv"][i] auxv.auxv[i].a_val = mm["mm_saved_auxv"][i + 1] - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 - nhdr.n_descsz = ctypes.sizeof(elf_auxv()) + nhdr.n_descsz = ctypes.sizeof(elf_auxv[self.bits]) nhdr.n_type = elf.NT_AUXV note = elf_note() @@ -579,12 +726,9 @@ class coredump_generator: setattr(data, "start" + str(i), info.start) setattr(data, "end" + str(i), info.end) setattr(data, "file_ofs" + str(i), info.file_ofs) - if (sys.version_info > (3, 0)): - setattr(data, "name" + str(i), info.name.encode()) - else: - setattr(data, "name" + str(i), info.name) + setattr(data, "name" + str(i), info.name.encode()) - nhdr = elf.Elf64_Nhdr() + nhdr = self.nhdr[self.bits]() nhdr.n_namesz = 5 # strlen + 1 nhdr.n_descsz = ctypes.sizeof(elf_files()) @@ -601,9 +745,15 @@ class coredump_generator: notes = [] notes.append(self._gen_prstatus(pid, tid)) - notes.append(self._gen_fpregset(pid, tid)) - notes.append(self._gen_x86_xstate(pid, tid)) + if self.machine != "armv7l": + notes.append(self._gen_fpregset(pid, tid)) notes.append(self._gen_siginfo(pid, tid)) + if self.machine == "aarch64": + notes.append(self._gen_arm_tls(tid)) + elif self.machine == "armv7l": + notes.append(self._gen_arm_vfp(tid)) + elif self.machine == "x86_64": + notes.append(self._gen_x86_xstate(pid, tid)) return notes @@ -644,7 +794,9 @@ class coredump_generator: off = 0 # in pages for m in pagemap[1:]: found = False - for i in range(m["nr_pages"]): + num_pages = m.get("nr_pages", m["compat_nr_pages"]) + + for i in range(num_pages): if m["vaddr"] + i * PAGESIZE == page_no * PAGESIZE: found = True break @@ -690,7 +842,11 @@ class coredump_generator: files = self.reg_files fname = next(filter(lambda x: x["id"] == shmid, files))["name"] - f = open(fname, 'rb') + try: + f = open(fname, 'rb') + except FileNotFoundError: + sys.exit('Required file %s not found.' % fname) + f.seek(off) start = vma["start"] diff --git a/coredump/criu_coredump/elf.py b/coredump/criu_coredump/elf.py index 092b47857..2911f491e 100644 --- a/coredump/criu_coredump/elf.py +++ b/coredump/criu_coredump/elf.py @@ -1,5 +1,14 @@ # Define structures and constants for generating elf file. import ctypes +import platform + +MACHINE = platform.machine() + +Elf32_Half = ctypes.c_uint16 # typedef uint16_t Elf32_Half; +Elf32_Word = ctypes.c_uint32 # typedef uint32_t Elf32_Word; +Elf32_Addr = ctypes.c_uint32 # typedef uint32_t Elf32_Addr; +Elf32_Off = ctypes.c_uint32 # typedef uint32_t Elf32_Off; +Elf32_Xword = ctypes.c_uint64 # typedef uint64_t Elf32_Xword; Elf64_Half = ctypes.c_uint16 # typedef uint16_t Elf64_Half; Elf64_Word = ctypes.c_uint32 # typedef uint32_t Elf64_Word; @@ -7,7 +16,7 @@ Elf64_Addr = ctypes.c_uint64 # typedef uint64_t Elf64_Addr; Elf64_Off = ctypes.c_uint64 # typedef uint64_t Elf64_Off; Elf64_Xword = ctypes.c_uint64 # typedef uint64_t Elf64_Xword; -# Elf64_Ehdr related constants. +# Elf_Ehdr related constants. # e_ident size. EI_NIDENT = 16 # #define EI_NIDENT (16) @@ -28,21 +37,50 @@ EI_CLASS = 4 # #define EI_CLASS 4 /* File class byte index EI_DATA = 5 # #define EI_DATA 5 /* Data encoding byte index */ +EI_OSABI = 7 # #define EI_OSABI 7 /* OS ABI identification */ + EI_VERSION = 6 # #define EI_VERSION 6 /* File version byte index */ ELFDATA2LSB = 1 # #define ELFDATA2LSB 1 /* 2's complement, little endian */ +ELFCLASS32 = 1 # #define ELFCLASS32 1 /* 32-bit objects */ ELFCLASS64 = 2 # #define ELFCLASS64 2 /* 64-bit objects */ # Legal values for e_type (object file type). ET_CORE = 4 # #define ET_CORE 4 /* Core file */ # Legal values for e_machine (architecture). +EM_ARM = 40 # #define EM_ARM 40 /* ARM */ EM_X86_64 = 62 # #define EM_X86_64 62 /* AMD x86-64 architecture */ +EM_AARCH64 = 183 # #define EM_AARCH64 183 /* ARM AARCH64 */ # Legal values for e_version (version). EV_CURRENT = 1 # #define EV_CURRENT 1 /* Current version */ +# Legal values for e_osabi +ELFOSABI_NONE = 0 # #define ELFOSABI_NONE 0 /* UNIX System V ABI */ +ELFOSABI_ARM = 97 # #define ELFOSABI_ARM 97 /* ARM */ + + +class Elf32_Ehdr(ctypes.Structure): # typedef struct + _fields_ = [ + ("e_ident", + ctypes.c_ubyte * EI_NIDENT), # unsigned char e_ident[EI_NIDENT]; + ("e_type", Elf32_Half), # Elf32_Half e_type; + ("e_machine", Elf32_Half), # Elf32_Half e_machine; + ("e_version", Elf32_Word), # Elf32_Word e_version; + ("e_entry", Elf32_Addr), # Elf32_Addr e_entry; + ("e_phoff", Elf32_Off), # Elf32_Off e_phoff; + ("e_shoff", Elf32_Off), # Elf32_Off e_shoff; + ("e_flags", Elf32_Word), # Elf32_Word e_flags; + ("e_ehsize", Elf32_Half), # Elf32_Half e_ehsize; + ("e_phentsize", Elf32_Half), # Elf32_Half e_phentsize; + ("e_phnum", Elf32_Half), # Elf32_Half e_phnum; + ("e_shentsize", Elf32_Half), # Elf32_Half e_shentsize; + ("e_shnum", Elf32_Half), # Elf32_Half e_shnum; + ("e_shstrndx", Elf32_Half) # Elf32_Half e_shstrndx; + ] # } Elf32_Ehdr; + class Elf64_Ehdr(ctypes.Structure): # typedef struct _fields_ = [ @@ -64,7 +102,7 @@ class Elf64_Ehdr(ctypes.Structure): # typedef struct ] # } Elf64_Ehdr; -# Elf64_Phdr related constants. +# Elf_Phdr related constants. # Legal values for p_type (segment type). PT_LOAD = 1 # #define PT_LOAD 1 /* Loadable program segment */ @@ -76,6 +114,19 @@ PF_W = 1 << 1 # #define PF_W (1 << 1) /* Segment is writable PF_R = 1 << 2 # #define PF_R (1 << 2) /* Segment is readable */ +class Elf32_Phdr(ctypes.Structure): # typedef struct + _fields_ = [ + ("p_type", Elf32_Word), # Elf32_Word p_type; + ("p_offset", Elf32_Off), # Elf32_Off p_offset; + ("p_vaddr", Elf32_Addr), # Elf32_Addr p_vaddr; + ("p_paddr", Elf32_Addr), # Elf32_Addr p_paddr; + ("p_filesz", Elf32_Word), # Elf32_Word p_filesz; + ("p_memsz", Elf32_Word), # Elf32_Word p_memsz; + ("p_flags", Elf32_Word), # Elf32_Word p_flags; + ("p_align", Elf32_Word), # Elf32_Word p_align; + ] # } Elf32_Phdr; + + class Elf64_Phdr(ctypes.Structure): # typedef struct _fields_ = [ ("p_type", Elf64_Word), # Elf64_Word p_type; @@ -89,7 +140,25 @@ class Elf64_Phdr(ctypes.Structure): # typedef struct ] # } Elf64_Phdr; -# Elf64_auxv_t related constants. +# Elf_auxv_t related constants. + + +class _Elf32_auxv_t_U(ctypes.Union): + _fields_ = [("a_val", ctypes.c_uint32)] + + +class Elf32_auxv_t(ctypes.Structure): # typedef struct + _fields_ = [ + ("a_type", + ctypes.c_uint32), # uint32_t a_type; /* Entry type */ + ("a_un", _Elf32_auxv_t_U) # union + + # uint32_t a_val; /* Integer value */ + # /* We use to have pointer elements added here. We cannot do that, + # though, since it does not work when using 32-bit definitions + # on 64-bit platforms and vice versa. */ + # } a_un; + ] # } Elf32_auxv_t; class _Elf64_auxv_t_U(ctypes.Union): @@ -110,7 +179,7 @@ class Elf64_auxv_t(ctypes.Structure): # typedef struct ] # } Elf64_auxv_t; -# Elf64_Nhdr related constants. +# Elf_Nhdr related constants. NT_PRSTATUS = 1 # #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ NT_FPREGSET = 2 # #define NT_FPREGSET 2 /* Contains copy of fpregset struct */ @@ -119,6 +188,22 @@ NT_AUXV = 6 # #define NT_AUXV 6 /* Contains copy of auxv array */ NT_SIGINFO = 0x53494749 # #define NT_SIGINFO 0x53494749 /* Contains copy of siginfo_t, size might increase */ NT_FILE = 0x46494c45 # #define NT_FILE 0x46494c45 /* Contains information about mapped files */ NT_X86_XSTATE = 0x202 # #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ +NT_ARM_VFP = 0x400 # #define NT_ARM_VFP 0x400 /* ARM VFP/NEON registers */ +NT_ARM_TLS = 0x401 # #define NT_ARM_TLS 0x401 /* ARM TLS register */ + + +class Elf32_Nhdr(ctypes.Structure): # typedef struct + _fields_ = [ + ( + "n_namesz", Elf32_Word + ), # Elf32_Word n_namesz; /* Length of the note's name. */ + ( + "n_descsz", Elf32_Word + ), # Elf32_Word n_descsz; /* Length of the note's descriptor. */ + ( + "n_type", Elf32_Word + ), # Elf32_Word n_type; /* Type of the note. */ + ] # } Elf32_Nhdr; class Elf64_Nhdr(ctypes.Structure): # typedef struct @@ -134,7 +219,52 @@ class Elf64_Nhdr(ctypes.Structure): # typedef struct ] # } Elf64_Nhdr; -# Elf64_Shdr related constants. +# Elf_Shdr related constants. + + +class Elf32_Shdr(ctypes.Structure): + _fields_ = [ + ( + # Section name (string tbl index) + "sh_name", Elf32_Word + ), + ( + # Section type + "sh_type", Elf32_Word + ), + ( + # Section flags + "sh_flags", Elf32_Word + ), + ( + # Section virtual addr at execution + "sh_addr", Elf32_Addr + ), + ( + # Section file offset + "sh_offset", Elf32_Off + ), + ( + # Section size in bytes + "sh_size", Elf32_Word + ), + ( + # Link to another section + "sh_link", Elf32_Word + ), + ( + # Additional section information + "sh_info", Elf32_Word + ), + ( + # Section alignment + "sh_addralign", Elf32_Word + ), + ( + # Entry size if section holds table + "sh_entsize", Elf32_Word + ) + ] class Elf64_Shdr(ctypes.Structure): @@ -218,7 +348,7 @@ class timeval(ctypes.Structure): # struct timeval ] -class user_regs_struct(ctypes.Structure): # struct user_regs_struct +class x86_64_user_regs_struct(ctypes.Structure): # struct x86_64_user_regs_struct _fields_ = [ ("r15", ctypes.c_ulonglong), # __extension__ unsigned long long int r15; @@ -277,10 +407,73 @@ class user_regs_struct(ctypes.Structure): # struct user_regs_struct ] +class aarch64_user_regs_struct(ctypes.Structure): # struct aarch64_user_regs_struct + _fields_ = [ + ("regs", + ctypes.c_ulonglong * 31), # unsigned long long int regs[31]; + ("sp", + ctypes.c_ulonglong), # unsigned long long int sp; + ("pc", + ctypes.c_ulonglong), # unsigned long long int pc; + ("pstate", + ctypes.c_ulonglong), # unsigned long long int pstate; + ] + + +class arm_user_regs_struct(ctypes.Structure): # struct arm_user_regs_struct + _fields_ = [ + ("r0", + ctypes.c_ulong), # unsigned ulong int r0; + ("r1", + ctypes.c_ulong), # unsigned ulong int r1; + ("r2", + ctypes.c_ulong), # unsigned ulong int r2; + ("r3", + ctypes.c_ulong), # unsigned ulong int r3; + ("r4", + ctypes.c_ulong), # unsigned ulong int r4; + ("r5", + ctypes.c_ulong), # unsigned ulong int r5; + ("r6", + ctypes.c_ulong), # unsigned ulong int r6; + ("r7", + ctypes.c_ulong), # unsigned ulong int r7; + ("r8", + ctypes.c_ulong), # unsigned ulong int r8; + ("r9", + ctypes.c_ulong), # unsigned ulong int r9; + ("r10", + ctypes.c_ulong), # unsigned ulong int r10; + ("fp", + ctypes.c_ulong), # unsigned ulong int fp; + ("ip", + ctypes.c_ulong), # unsigned ulong int ip; + ("sp", + ctypes.c_ulong), # unsigned ulong int sp; + ("lr", + ctypes.c_ulong), # unsigned ulong int lr; + ("pc", + ctypes.c_ulong), # unsigned ulong int pc; + ("cpsr", + ctypes.c_ulong), # unsigned ulong int cpsr; + ("orig_r0", + ctypes.c_ulong), # unsigned ulong int orig_r0; + ] + + # elf_greg_t = ctypes.c_ulonglong # ELF_NGREG = ctypes.sizeof(user_regs_struct)/ctypes.sizeof(elf_greg_t) # elf_gregset_t = elf_greg_t*ELF_NGREG -elf_gregset_t = user_regs_struct +user_regs_dict = { + "aarch64": aarch64_user_regs_struct, + "armv7l": arm_user_regs_struct, + "x86_64": x86_64_user_regs_struct, +} + +try: + elf_gregset_t = user_regs_dict[MACHINE] +except KeyError: + raise ValueError("Current architecture %s is not supported." % MACHINE) class elf_prstatus(ctypes.Structure): # struct elf_prstatus @@ -420,7 +613,7 @@ class elf_prpsinfo(ctypes.Structure): # struct elf_prpsinfo ] -class user_fpregs_struct(ctypes.Structure): # struct user_fpregs_struct +class x86_64_user_fpregs_struct(ctypes.Structure): # struct x86_64_user_fpregs_struct _fields_ = [ # unsigned short int cwd; ("cwd", ctypes.c_ushort), @@ -447,7 +640,29 @@ class user_fpregs_struct(ctypes.Structure): # struct user_fpregs_struct ] -elf_fpregset_t = user_fpregs_struct +class aarch64_user_fpregs_struct(ctypes.Structure): # struct aarch64_user_fpregs_struct + _fields_ = [ + # unsigned long long int vregs[64]; + ("vregs", ctypes.c_ulonglong * 64), + # unsigned int fpsr; + ("fpsr", ctypes.c_uint), + # unsigned int fpcr; + ("fpcr", ctypes.c_uint), + # unsigned int padding[2]; + ("padding", ctypes.c_uint * 2), + ] + + +user_fpregs_dict = { + "aarch64": aarch64_user_fpregs_struct, + "armv7l": None, + "x86_64": x86_64_user_fpregs_struct, +} + +try: + elf_fpregset_t = user_fpregs_dict[MACHINE] +except KeyError: + raise ValueError("Current architecture %s is not supported." % MACHINE) # siginfo_t related constants. @@ -842,3 +1057,13 @@ class elf_xsave_struct(ctypes.Structure): # struct xsave_struct { # struct ymmh_struct ymmh; ("ymmh", ymmh_struct) ] # } __aligned(FP_MIN_ALIGN_BYTES) __packed; + + +class vfp_hard_struct(ctypes.Structure): # struct vfp_hard_struct { + _fields_ = [ + ("vfp_regs", ctypes.c_ulonglong * 32), # __u64 fpregs[32]; + ("fpexc", ctypes.c_ulong), # __u32 fpexc; + ("fpscr", ctypes.c_ulong), # __u32 fpscr; + ("fpinst", ctypes.c_ulong), # __u32 fpinst; + ("fpinst2", ctypes.c_ulong), # __u32 fpinst2; + ] # }; diff --git a/coredump/pycriu b/coredump/pycriu index d13a8790a..d1b6ed5c4 120000 --- a/coredump/pycriu +++ b/coredump/pycriu @@ -1 +1 @@ -../lib/py/ \ No newline at end of file +../lib/pycriu \ No newline at end of file diff --git a/crit/.gitignore b/crit/.gitignore new file mode 100644 index 000000000..10c8ab186 --- /dev/null +++ b/crit/.gitignore @@ -0,0 +1,4 @@ +crit.egg-info/ +build/ +dist/ +version.py diff --git a/crit/Makefile b/crit/Makefile index 988b481b6..33bd68eed 100644 --- a/crit/Makefile +++ b/crit/Makefile @@ -1,13 +1,25 @@ +VERSION_FILE := $(if $(obj),$(addprefix $(obj)/,crit/version.py),crit/version.py) -all-y += crit +all-y += ${VERSION_FILE} +cleanup-y += ${VERSION_FILE} -crit/crit: crit/crit-$(PYTHON) - $(Q) cp $^ $@ -crit: crit/crit -.PHONY: crit +${VERSION_FILE}: + $(Q) echo "__version__ = '${CRIU_VERSION}'" > $@ -clean-crit: - $(Q) $(RM) crit/crit -.PHONY: clean-crit -clean: clean-crit -mrproper: clean +install: ${VERSION_FILE} +ifeq ($(SKIP_PIP_INSTALL),0) + $(E) " INSTALL " crit + $(Q) $(PYTHON) -m pip install $(PIPFLAGS) --prefix=$(DESTDIR)$(PREFIX) ./crit +else + $(E) " SKIP INSTALL crit" +endif +.PHONY: install + +uninstall: +ifeq ($(SKIP_PIP_INSTALL),0) + $(E) " UNINSTALL" crit + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit +else + $(E) " SKIP UNINSTALL crit" +endif +.PHONY: uninstall diff --git a/crit/crit-python2 b/crit/crit-python2 deleted file mode 100755 index b0b7d3c3a..000000000 --- a/crit/crit-python2 +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python2 - -from pycriu import cli - -if __name__ == '__main__': - cli.main() diff --git a/crit/crit/__init__.py b/crit/crit/__init__.py new file mode 100644 index 000000000..58f3ace6c --- /dev/null +++ b/crit/crit/__init__.py @@ -0,0 +1 @@ +from .version import __version__ diff --git a/lib/py/cli.py b/crit/crit/__main__.py similarity index 94% rename from lib/py/cli.py rename to crit/crit/__main__.py index 5419384c3..bce523445 100755 --- a/lib/py/cli.py +++ b/crit/crit/__main__.py @@ -1,18 +1,17 @@ -from __future__ import print_function +#!/usr/bin/env python3 import argparse import sys import json import os import pycriu +from . import __version__ def inf(opts): if opts['in']: return open(opts['in'], 'rb') else: - if (sys.version_info < (3, 0)): - return sys.stdin if sys.stdin.isatty(): # If we are reading from a terminal (not a pipe) we want text input and not binary return sys.stdin @@ -28,8 +27,6 @@ def outf(opts, decode): mode = 'w+' return open(opts['out'], mode) else: - if (sys.version_info < (3, 0)): - return sys.stdout if decode: return sys.stdout return sys.stdout.buffer @@ -45,9 +42,9 @@ def decode(opts): try: img = pycriu.images.load(inf(opts), opts['pretty'], opts['nopl']) except pycriu.images.MagicException as exc: - print("Unknown magic %#x.\n"\ - "Maybe you are feeding me an image with "\ - "raw data(i.e. pages.img)?" % exc.magic, file=sys.stderr) + print("Unknown magic %#x.\n" + "Maybe you are feeding me an image with " + "raw data(i.e. pages.img)?" % exc.magic, file=sys.stderr) sys.exit(1) if opts['pretty']: @@ -63,9 +60,9 @@ def encode(opts): try: img = json.load(inf(opts)) except UnicodeDecodeError: - print("Cannot read JSON.\n"\ - "Maybe you are feeding me an image with protobuf data? "\ - "Encode expects JSON input.", file=sys.stderr) + print("Cannot read JSON.\n" + "Maybe you are feeding me an image with protobuf data? " + "Encode expects JSON input.", file=sys.stderr) sys.exit(1) pycriu.images.dump(img, outf(opts, False)) @@ -135,7 +132,7 @@ def ftype_find_in_files(opts, ft, fid): if files_img is None: try: files_img = pycriu.images.load(dinf(opts, "files.img"))['entries'] - except: + except Exception: files_img = [] if len(files_img) == 0: @@ -326,12 +323,12 @@ def explore_rss(opts): pvmi = -1 for pm in pms[1:]: pstr = '\t%lx / %-8d' % (pm['vaddr'], pm['nr_pages']) - while vmas[vmi]['end'] <= pm['vaddr']: + while vmi < len(vmas) and vmas[vmi]['end'] <= pm['vaddr']: vmi += 1 pme = pm['vaddr'] + (pm['nr_pages'] << 12) vstr = '' - while vmas[vmi]['start'] < pme: + while vmi < len(vmas) and vmas[vmi]['start'] < pme: vma = vmas[vmi] if vmi == pvmi: vstr += ' ~' @@ -368,6 +365,7 @@ def main(): desc = 'CRiu Image Tool' parser = argparse.ArgumentParser( description=desc, formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--version', action='version', version=__version__) subparsers = parser.add_subparsers( help='Use crit CMD --help for command-specific help') @@ -377,8 +375,7 @@ def main(): 'decode', help='convert criu image from binary type to json') decode_parser.add_argument( '--pretty', - help= - 'Multiline with indents and some numerical fields in field-specific format', + help='Multiline with indents and some numerical fields in field-specific format', action='store_true') decode_parser.add_argument( '-i', diff --git a/crit/pycriu b/crit/pycriu deleted file mode 120000 index d13a8790a..000000000 --- a/crit/pycriu +++ /dev/null @@ -1 +0,0 @@ -../lib/py/ \ No newline at end of file diff --git a/crit/pyproject.toml b/crit/pyproject.toml new file mode 100644 index 000000000..f0b185eb7 --- /dev/null +++ b/crit/pyproject.toml @@ -0,0 +1,22 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "crit" +description = "CRiu Image Tool" +authors = [ + {name = "CRIU team", email = "criu@lists.linux.dev"}, +] +license = {text = "GPLv2"} +dynamic = ["version"] +requires-python = ">=3.6" + +[project.scripts] +crit = "crit.__main__:main" + +[tool.setuptools] +packages = ["crit"] + +[tool.setuptools.dynamic] +version = {attr = "crit.__version__"} diff --git a/crit/setup.cfg b/crit/setup.cfg new file mode 100644 index 000000000..37895923f --- /dev/null +++ b/crit/setup.cfg @@ -0,0 +1,20 @@ +# Configuring setuptools using pyproject.toml files was introduced in setuptools 61.0.0 +# https://setuptools.pypa.io/en/latest/history.html#v61-0-0 +# For older versions of setuptools, we need to use the setup.cfg file +# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html#declarative-config + +[metadata] +name = crit +description = CRiu Image Tool +author = CRIU team +author_email = criu@lists.linux.dev +license = GPLv2 +version = attr: crit.__version__ + +[options] +packages = crit +python_requires = >=3.6 + +[options.entry_points] +console_scripts = + crit = crit.__main__:main diff --git a/coredump/coredump-python3 b/crit/setup.py old mode 100755 new mode 100644 similarity index 55% rename from coredump/coredump-python3 rename to crit/setup.py index 3032dbadf..618ac1de4 --- a/coredump/coredump-python3 +++ b/crit/setup.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 +import setuptools -import coredump if __name__ == '__main__': - coredump.main() + setuptools.setup() diff --git a/criu/Makefile b/criu/Makefile index 55bdb1b7a..bafdd980b 100644 --- a/criu/Makefile +++ b/criu/Makefile @@ -85,7 +85,7 @@ $(obj)/%: pie $(obj)/criu: $(PROGRAM-BUILTINS) $(call msg-link, $@) - $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@ + $(Q) $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LIBS) $(WRAPFLAGS) $(GMONLDOPT) -rdynamic -o $@ UNIT-BUILTINS += $(obj)/util.o UNIT-BUILTINS += $(obj)/config.o @@ -102,7 +102,7 @@ $(obj)/unittest/built-in.o: .FORCE $(obj)/unittest/unittest: $(UNIT-BUILTINS) $(call msg-link, $@) - $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) -rdynamic -o $@ + $(Q) $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LIBS) $(WRAPFLAGS) -rdynamic -o $@ unittest: $(obj)/unittest/unittest $(Q) $(obj)/unittest/$@ @@ -145,10 +145,8 @@ install: $(obj)/criu $(Q) install -m 644 $(UAPI_HEADERS) $(DESTDIR)$(INCLUDEDIR)/criu/ $(Q) mkdir -p $(DESTDIR)$(LIBEXECDIR)/criu/scripts $(Q) install -m 755 scripts/systemd-autofs-restart.sh $(DESTDIR)$(LIBEXECDIR)/criu/scripts -ifeq ($(PYTHON),python3) $(E) " INSTALL " scripts/criu-ns $(Q) install -m 755 scripts/criu-ns $(DESTDIR)$(SBINDIR) -endif .PHONY: install uninstall: diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index 22108cce0..ba6132d2f 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -74,6 +74,7 @@ obj-y += sk-unix.o obj-y += sockets.o obj-y += stats.o obj-y += string.o +obj-y += setproctitle.o obj-y += sysctl.o obj-y += sysfs_parse.o obj-y += timerfd.o @@ -91,6 +92,8 @@ obj-y += servicefd.o obj-y += pie-util-vdso.o obj-y += vdso.o obj-y += timens.o +obj-y += timer.o +obj-y += sigact.o obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 @@ -98,6 +101,7 @@ obj-$(CONFIG_COMPAT) += vdso-compat.o CFLAGS_REMOVE_vdso-compat.o += $(CFLAGS-ASAN) $(CFLAGS-GCOV) obj-y += pidfd-store.o obj-y += hugetlb.o +obj-y += pidfd.o PROTOBUF_GEN := scripts/protobuf-gen.sh diff --git a/criu/Makefile.packages b/criu/Makefile.packages index 13c346f44..3e2e6efd1 100644 --- a/criu/Makefile.packages +++ b/criu/Makefile.packages @@ -6,7 +6,7 @@ REQ-RPM-PKG-NAMES += protobuf-devel REQ-RPM-PKG-NAMES += protobuf-python REQ-RPM-PKG-NAMES += libnl3-devel REQ-RPM-PKG-NAMES += libcap-devel -REQ-RPM-PKG-NAMES += $(PYTHON)-future +REQ-RPM-PKG-NAMES += libuuid-devel REQ-RPM-PKG-TEST-NAMES += libaio-devel @@ -15,22 +15,19 @@ REQ-DEB-PKG-NAMES += libprotobuf-c-dev REQ-DEB-PKG-NAMES += protobuf-c-compiler REQ-DEB-PKG-NAMES += protobuf-compiler REQ-DEB-PKG-NAMES += $(PYTHON)-protobuf -REQ-DEB-PKG-NAMES += $(PYTHON)-future REQ-DEB-PKG-NAMES += libnl-3-dev REQ-DEB-PKG-NAMES += libcap-dev +REQ-DEB-PKG-NAMES += uuid-dev REQ-DEB-PKG-TEST-NAMES += $(PYTHON)-yaml REQ-DEB-PKG-TEST-NAMES += libaio-dev -ifeq ($(PYTHON),python3) REQ-DEB-PKG-TEST-NAMES += libaio-dev REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-PyYAML -else -REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-pyyaml -endif -export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet + +export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet -luuid check-packages-failed: $(warning Can not find some of the required libraries) diff --git a/criu/action-scripts.c b/criu/action-scripts.c index 1ce6d9c10..6f7900186 100644 --- a/criu/action-scripts.c +++ b/criu/action-scripts.c @@ -18,6 +18,7 @@ #include "common/scm.h" static const char *action_names[ACT_MAX] = { + [ACT_PRE_STREAM] = "pre-stream", [ACT_PRE_DUMP] = "pre-dump", [ACT_POST_DUMP] = "post-dump", [ACT_PRE_RESTORE] = "pre-restore", @@ -30,6 +31,7 @@ static const char *action_names[ACT_MAX] = { [ACT_POST_RESUME] = "post-resume", [ACT_ORPHAN_PTS_MASTER] = "orphan-pts-master", [ACT_STATUS_READY] = "status-ready", + [ACT_QUERY_EXT_FILES] = "query-ext-files", }; struct script { @@ -51,6 +53,9 @@ static int run_shell_scripts(const char *action) #define ENV_IMGDIR 0x1 #define ENV_ROOTPID 0x2 + if (list_empty(&scripts)) + return 0; + if (setenv("CRTOOLS_SCRIPT_ACTION", action, 1)) { pr_perror("Can't set CRTOOLS_SCRIPT_ACTION=%s", action); return -1; @@ -111,6 +116,20 @@ int rpc_send_fd(enum script_actions act, int fd) return send_criu_rpc_script(act, (char *)action, rpc_sk, fd); } +int rpc_query_external_files(void) +{ + int rpc_sk; + + if (scripts_mode != SCRIPTS_RPC) + return 0; + + rpc_sk = get_service_fd(RPC_SK_OFF); + if (rpc_sk < 0) + return -1; + + return exec_rpc_query_external_files((char *)action_names[ACT_QUERY_EXT_FILES], rpc_sk); +} + int run_scripts(enum script_actions act) { int ret = 0; @@ -118,23 +137,24 @@ int run_scripts(enum script_actions act) pr_debug("Running %s scripts\n", action); - if (scripts_mode == SCRIPTS_NONE) + switch (scripts_mode) { + case SCRIPTS_NONE: return 0; - - if (scripts_mode == SCRIPTS_RPC) { + case SCRIPTS_RPC: ret = rpc_send_fd(act, -1); - goto out; - } - - if (scripts_mode == SCRIPTS_SHELL) { + if (ret) + break; + /* Enable scripts from config file in RPC mode (fallthrough) */ + case SCRIPTS_SHELL: ret = run_shell_scripts(action); - goto out; + break; + default: + BUG(); } - BUG(); -out: if (ret) pr_err("One of more action scripts failed\n"); + return ret; } @@ -142,8 +162,9 @@ int add_script(char *path) { struct script *script; - BUG_ON(scripts_mode == SCRIPTS_RPC); - scripts_mode = SCRIPTS_SHELL; + /* Set shell mode when a script is added but don't overwrite RPC mode */ + if (scripts_mode == SCRIPTS_NONE) + scripts_mode = SCRIPTS_SHELL; script = xmalloc(sizeof(struct script)); if (script == NULL) @@ -169,7 +190,6 @@ int add_rpc_notify(int sk) return -1; } - BUG_ON(scripts_mode == SCRIPTS_SHELL); scripts_mode = SCRIPTS_RPC; if (install_service_fd(RPC_SK_OFF, fd) < 0) diff --git a/criu/apparmor.c b/criu/apparmor.c index 67553c8f1..48b639216 100644 --- a/criu/apparmor.c +++ b/criu/apparmor.c @@ -108,7 +108,7 @@ static int collect_profile(char *path, int offset, char *dir, AaNamespace *ns) return -1; aa_policy__init(cur); - strlcat(path + my_offset, "name", PATH_MAX - my_offset); + __strlcat(path + my_offset, "name", PATH_MAX - my_offset); f = fopen(path, "r"); if (!f) { xfree(cur); @@ -124,7 +124,7 @@ static int collect_profile(char *path, int offset, char *dir, AaNamespace *ns) return -1; } - strlcpy(path + my_offset, "raw_data", PATH_MAX - my_offset); + __strlcpy(path + my_offset, "raw_data", PATH_MAX - my_offset); fd = open(path, O_RDONLY); if (fd < 0) { pr_perror("failed to open aa policy %s", path); @@ -207,8 +207,6 @@ static int by_time(const struct dirent **de1, const struct dirent **de2) } else { if (sb1.st_mtim.tv_sec < sb2.st_mtim.tv_sec) return -1; - if (sb1.st_mtim.tv_sec == sb2.st_mtim.tv_sec) - return 0; return 1; } } @@ -471,6 +469,7 @@ static void *get_suspend_policy(char *name, off_t *len) ret = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (ret == MAP_FAILED) { pr_perror("mmap of %s failed", file); + ret = NULL; goto out; } @@ -520,13 +519,13 @@ static int write_aa_policy(AaNamespace *ns, char *path, int offset, char *rewrit tmp = *end; *end = 0; - strlcpy(namespace, rewrite_pos + 1, sizeof(namespace)); + __strlcpy(namespace, rewrite_pos + 1, sizeof(namespace)); *end = tmp; break; } default: - strlcpy(namespace, ns->name, sizeof(namespace)); + __strlcpy(namespace, ns->name, sizeof(namespace)); for (i = 0; i < ns->n_policies; i++) { if (strcmp(ns->policies[i]->name, rewrite_pos)) pr_warn("binary rewriting of apparmor policies not supported right now, not renaming %s to %s\n", @@ -551,8 +550,8 @@ static int write_aa_policy(AaNamespace *ns, char *path, int offset, char *rewrit goto fail; } - ret = snprintf(path + offset + my_offset, sizeof(path) - offset - my_offset, "/.replace"); - if (ret < 0 || ret >= sizeof(path) - offset - my_offset) { + ret = snprintf(path + offset + my_offset, PATH_MAX - offset - my_offset, "/.replace"); + if (ret < 0 || ret >= PATH_MAX - offset - my_offset) { pr_err("snprintf failed\n"); goto fail; } diff --git a/criu/arch/aarch64/Makefile b/criu/arch/aarch64/Makefile index b26487367..b87fcaa5b 100644 --- a/criu/arch/aarch64/Makefile +++ b/criu/arch/aarch64/Makefile @@ -6,3 +6,4 @@ obj-y += cpu.o obj-y += crtools.o obj-y += sigframe.o obj-y += bitops.o +obj-y += gcs.o \ No newline at end of file diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index e87b8629a..2e89f9ce3 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -1,5 +1,6 @@ #include #include +#include #include @@ -11,6 +12,7 @@ #include "common/compiler.h" #include #include "asm/dump.h" +#include "asm/gcs-types.h" #include "protobuf.h" #include "images/core.pb-c.h" #include "images/creds.pb-c.h" @@ -20,12 +22,137 @@ #include "cpu.h" #include "restorer.h" #include "compel/infect.h" +#include "pstree.h" +#include + +/* + * cr_user_pac_* are a copy of the corresponding uapi structs + * in arch/arm64/include/uapi/asm/ptrace.h + */ +struct cr_user_pac_address_keys { + __uint128_t apiakey; + __uint128_t apibkey; + __uint128_t apdakey; + __uint128_t apdbkey; +}; + +struct cr_user_pac_generic_keys { + __uint128_t apgakey; +}; + +/* + * The following HWCAP constants are copied from + * arch/arm64/include/uapi/asm/hwcap.h + */ +#ifndef HWCAP_PACA +#define HWCAP_PACA (1 << 30) +#endif + +#ifndef HWCAP_PACG +#define HWCAP_PACG (1UL << 31) +#endif + +/* + * The following NT_ARM_PAC constants are copied from + * include/uapi/linux/elf.h + */ +#ifndef NT_ARM_PACA_KEYS +#define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ +#endif + +#ifndef NT_ARM_PACG_KEYS +#define NT_ARM_PACG_KEYS 0x408 +#endif + +#ifndef NT_ARM_PAC_ENABLED_KEYS +#define NT_ARM_PAC_ENABLED_KEYS 0x40a /* AArch64 pointer authentication enabled keys. */ +#endif + +extern unsigned long getauxval(unsigned long type); #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) +static int save_pac_keys(int pid, CoreEntry *core) +{ + struct cr_user_pac_address_keys paca; + struct cr_user_pac_generic_keys pacg; + PacKeys *pac_entry; + long pac_enabled_key; + struct iovec iov; + int ret; + + unsigned long hwcaps = getauxval(AT_HWCAP); + + pac_entry = xmalloc(sizeof(PacKeys)); + if (!pac_entry) + return -1; + core->ti_aarch64->pac_keys = pac_entry; + pac_keys__init(pac_entry); + + if (hwcaps & HWCAP_PACA) { + PacAddressKeys *pac_address_keys; + + pr_debug("%d: Dumping address authentication keys\n", pid); + iov.iov_base = &paca; + iov.iov_len = sizeof(paca); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { + pr_perror("Failed to get address authentication key for %d", pid); + return -1; + } + pac_address_keys = xmalloc(sizeof(PacAddressKeys)); + if (!pac_address_keys) + return -1; + pac_address_keys__init(pac_address_keys); + pac_entry->pac_address_keys = pac_address_keys; + pac_address_keys->apiakey_lo = paca.apiakey; + pac_address_keys->apiakey_hi = paca.apiakey >> 64; + pac_address_keys->apibkey_lo = paca.apibkey; + pac_address_keys->apibkey_hi = paca.apibkey >> 64; + pac_address_keys->apdakey_lo = paca.apdakey; + pac_address_keys->apdakey_hi = paca.apdakey >> 64; + pac_address_keys->apdbkey_lo = paca.apdbkey; + pac_address_keys->apdbkey_hi = paca.apdbkey >> 64; + + iov.iov_base = &pac_enabled_key; + iov.iov_len = sizeof(pac_enabled_key); + ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov); + if (ret) { + pr_perror("Failed to get authentication key mask for %d", pid); + return -1; + } + + pac_address_keys->pac_enabled_key = pac_enabled_key; + + } + if (hwcaps & HWCAP_PACG) { + PacGenericKeys *pac_generic_keys; + + pr_debug("%d: Dumping generic authentication keys\n", pid); + iov.iov_base = &pacg; + iov.iov_len = sizeof(pacg); + if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { + pr_perror("Failed to get a generic authantication key for %d", pid); + return -1; + } + pac_generic_keys = xmalloc(sizeof(PacGenericKeys)); + if (!pac_generic_keys) + return -1; + pac_generic_keys__init(pac_generic_keys); + pac_entry->pac_generic_keys = pac_generic_keys; + pac_generic_keys->apgakey_lo = pacg.apgakey; + pac_generic_keys->apgakey_hi = pacg.apgakey >> 64; + } + return 0; +} + +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) { int i; + struct cr_user_gcs gcs_live; + struct iovec gcs_iov = { + .iov_base = &gcs_live, + .iov_len = sizeof(gcs_live), + }; CoreEntry *core = x; // Save the Aarch64 CPU state @@ -37,11 +164,24 @@ int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsi // Save the FP/SIMD state for (i = 0; i < 32; ++i) { - core->ti_aarch64->fpsimd->vregs[2 * i] = fpsimd->vregs[i]; - core->ti_aarch64->fpsimd->vregs[2 * i + 1] = fpsimd->vregs[i] >> 64; + core->ti_aarch64->fpsimd->vregs[2 * i] = fpsimd->fpstate.vregs[i]; + core->ti_aarch64->fpsimd->vregs[2 * i + 1] = fpsimd->fpstate.vregs[i] >> 64; + } + assign_reg(core->ti_aarch64->fpsimd, &fpsimd->fpstate, fpsr); + assign_reg(core->ti_aarch64->fpsimd, &fpsimd->fpstate, fpcr); + + if (save_pac_keys(pid, core)) + return -1; + + /* Save the GCS state */ + if (compel_host_supports_gcs()) { + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &gcs_iov) < 0) { + pr_perror("Failed to get GCS for %d", pid); + return -1; + } + core->ti_aarch64->gcs->gcspr_el0 = gcs_live.gcspr_el0; + core->ti_aarch64->gcs->features_enabled = gcs_live.features_enabled; } - assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpsr); - assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpcr); return 0; } @@ -51,6 +191,7 @@ int arch_alloc_thread_info(CoreEntry *core) ThreadInfoAarch64 *ti_aarch64; UserAarch64RegsEntry *gpregs; UserAarch64FpsimdContextEntry *fpsimd; + UserAarch64GcsEntry *gcs; ti_aarch64 = xmalloc(sizeof(*ti_aarch64)); if (!ti_aarch64) @@ -80,6 +221,15 @@ int arch_alloc_thread_info(CoreEntry *core) if (!fpsimd->vregs) goto err; + /* Allocate & init GCS */ + if (compel_host_supports_gcs()) { + gcs = xmalloc(sizeof(*gcs)); + if (!gcs) + goto err; + user_aarch64_gcs_entry__init(gcs); + ti_aarch64->gcs = gcs; + } + return 0; err: return -1; @@ -92,6 +242,12 @@ void arch_free_thread_info(CoreEntry *core) xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd->vregs); xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd); } + if (CORE_THREAD_ARCH_INFO(core)->pac_keys) { + PacKeys *pac_entry = CORE_THREAD_ARCH_INFO(core)->pac_keys; + xfree(pac_entry->pac_address_keys); + xfree(pac_entry->pac_generic_keys); + xfree(pac_entry); + } xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs); xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); xfree(CORE_THREAD_ARCH_INFO(core)); @@ -103,6 +259,7 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { int i; struct fpsimd_context *fpsimd = RT_SIGFRAME_FPU(sigframe); + struct gcs_context *gcs; if (core->ti_aarch64->fpsimd->n_vregs != 64) return 1; @@ -116,6 +273,18 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) fpsimd->head.magic = FPSIMD_MAGIC; fpsimd->head.size = sizeof(*fpsimd); + if (compel_host_supports_gcs()) { + gcs = RT_SIGFRAME_GCS(sigframe); + + pr_debug("sigframe gcspr %llx enabled %llx\n", gcs->gcspr, gcs->features_enabled); + + gcs->head.magic = GCS_MAGIC; + gcs->head.size = sizeof(*gcs); + gcs->reserved = 0; + gcs->gcspr = core->ti_aarch64->gcs->gcspr_el0 - 8; + gcs->features_enabled = core->ti_aarch64->gcs->features_enabled; + } + return 0; } @@ -135,3 +304,83 @@ int restore_gpregs(struct rt_sigframe *f, UserRegsEntry *r) return 0; } + +int arch_ptrace_restore(int pid, struct pstree_item *item) +{ + unsigned long hwcaps = getauxval(AT_HWCAP); + struct cr_user_pac_address_keys upaca; + struct cr_user_pac_generic_keys upacg; + PacAddressKeys *paca; + PacGenericKeys *pacg; + long pac_enabled_keys; + struct iovec iov; + int ret; + + + pr_debug("%d: Restoring PAC keys\n", pid); + + paca = &rsti(item)->arch_info.pac_address_keys; + pacg = &rsti(item)->arch_info.pac_generic_keys; + if (rsti(item)->arch_info.has_paca) { + if (!(hwcaps & HWCAP_PACA)) { + pr_err("PACG support is required from the source system.\n"); + return 1; + } + pac_enabled_keys = rsti(item)->arch_info.pac_address_keys.pac_enabled_key; + + upaca.apiakey = paca->apiakey_lo + ((__uint128_t)paca->apiakey_hi << 64); + upaca.apibkey = paca->apibkey_lo + ((__uint128_t)paca->apibkey_hi << 64); + upaca.apdakey = paca->apdakey_lo + ((__uint128_t)paca->apdakey_hi << 64); + upaca.apdbkey = paca->apdbkey_lo + ((__uint128_t)paca->apdbkey_hi << 64); + + iov.iov_base = &upaca; + iov.iov_len = sizeof(upaca); + + if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { + pr_perror("Failed to set address authentication keys for %d", pid); + return 1; + } + iov.iov_base = &pac_enabled_keys; + iov.iov_len = sizeof(pac_enabled_keys); + if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov))) { + pr_perror("Failed to set enabled key mask for %d", pid); + return 1; + } + } + + if (rsti(item)->arch_info.has_pacg) { + if (!(hwcaps & HWCAP_PACG)) { + pr_err("PACG support is required from the source system.\n"); + return 1; + } + upacg.apgakey = pacg->apgakey_lo + ((__uint128_t)pacg->apgakey_hi << 64); + iov.iov_base = &upacg; + iov.iov_len = sizeof(upacg); + if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { + pr_perror("Failed to set the generic authentication key for %d", pid); + return 1; + } + } + + return 0; +} + +void arch_rsti_init(struct pstree_item *p) +{ + PacKeys *pac_keys = p->core[0]->ti_aarch64->pac_keys; + + rsti(p)->arch_info.has_paca = false; + rsti(p)->arch_info.has_pacg = false; + + if (!pac_keys) + return; + + if (pac_keys->pac_address_keys) { + rsti(p)->arch_info.has_paca = true; + rsti(p)->arch_info.pac_address_keys = *pac_keys->pac_address_keys; + } + if (pac_keys->pac_generic_keys) { + rsti(p)->arch_info.has_pacg = true; + rsti(p)->arch_info.pac_generic_keys = *pac_keys->pac_generic_keys; + } +} diff --git a/criu/arch/aarch64/gcs.c b/criu/arch/aarch64/gcs.c new file mode 100644 index 000000000..4bdb9d2e4 --- /dev/null +++ b/criu/arch/aarch64/gcs.c @@ -0,0 +1,157 @@ +#include +#include + +#include +#include + +#include "asm/gcs-types.h" +#include "pstree.h" +#include "restorer.h" +#include "rst-malloc.h" +#include "vma.h" + +#include +#include + +static bool task_has_gcs_enabled(UserAarch64GcsEntry *gcs) +{ + return gcs && (gcs->features_enabled & PR_SHADOW_STACK_ENABLE) != 0; +} + +static bool host_supports_gcs(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + return (hwcap & HWCAP_GCS) != 0; +} + +static bool task_needs_gcs(struct pstree_item *item, CoreEntry *core) +{ + UserAarch64GcsEntry *gcs; + + if (!task_alive(item)) + return false; + + gcs = core->ti_aarch64->gcs; + + if (task_has_gcs_enabled(gcs)) { + if (!host_supports_gcs()) { + pr_warn_once("Restoring task with GCS on non-GCS host\n"); + return false; + } + + pr_info("Restoring task with GCS\n"); + return true; + } + + pr_info("Restoring a task without GCS\n"); + return false; +} + +static int gcs_prepare_task(struct vm_area_list *vmas, + struct rst_shstk_info *gcs) +{ + struct vma_area *vma; + + list_for_each_entry(vma, &vmas->h, list) { + if (vma_area_is(vma, VMA_AREA_SHSTK) && + in_vma_area(vma, gcs->gcspr_el0)) { + unsigned long premapped_addr = vma->premmaped_addr; + unsigned long size = vma_area_len(vma); + + gcs->vma_start = vma->e->start; + gcs->vma_size = size; + gcs->premapped_addr = premapped_addr; + + return 0; + } + } + + pr_err("Unable to find a shadow stack vma: %lx\n", gcs->gcspr_el0); + return -1; +} + +int arch_gcs_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta) +{ + int i; + struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]); + struct vm_area_list *vmas = &rsti(item)->vmas; + struct rst_shstk_info *gcs = &ta->shstk; + + if (!task_needs_gcs(item, core)) + return 0; + + gcs->gcspr_el0 = core->ti_aarch64->gcs->gcspr_el0; + gcs->features_enabled = core->ti_aarch64->gcs->features_enabled; + + if (gcs_prepare_task(vmas, gcs)) { + pr_err("gcs: failed to prepare shadow stack memory\n"); + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + struct thread_restore_args *thread_args = &args_array[i]; + + core = item->core[i]; + gcs = &thread_args->shstk; + + gcs->gcspr_el0 = core->ti_aarch64->gcs->gcspr_el0; + gcs->features_enabled = core->ti_aarch64->gcs->features_enabled; + + if (gcs_prepare_task(vmas, gcs)) { + pr_err("gcs: failed to prepare GCS memory\n"); + return -1; + } + } + + return 0; +} + +int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg) +{ + int fret; + unsigned long flags = PR_SHADOW_STACK_ENABLE | + PR_SHADOW_STACK_PUSH | + PR_SHADOW_STACK_WRITE; + + long ret, x1_after, x8_after; + + /* If task doesn't need GCS, just call func */ + if (!task_needs_gcs(item, core)) { + return func(arg); + } + + pr_debug("gcs: GCS enable SVC about to fire: x8=%d x0=%d x1=0x%lx\n", + __NR_prctl, PR_SET_SHADOW_STACK_STATUS, flags); + + asm volatile( + "mov x0, %3\n" // x0 = PR_SET_SHADOW_STACK_STATUS (75) + "mov x1, %4\n" // x1 = flags + "mov x2, xzr\n" // x2 = 0 + "mov x3, xzr\n" // x3 = 0 + "mov x4, xzr\n" // x4 = 0 + "mov x8, %5\n" // x8 = __NR_prctl (167) + "svc #0\n" // Invoke syscall + "mov %0, x0\n" // Capture return value + "mov %1, x1\n" // Capture x1 after + "mov %2, x8\n" // Capture x8 after + : "=r"(ret), "=r"(x1_after), "=r"(x8_after) + : "i"(PR_SET_SHADOW_STACK_STATUS), // x0 - %3rd + "r"(flags), // x1 - %4th + "i"(__NR_prctl) // x8 - %5th + : "x0", "x1", "x2", "x3", "x4", "x8", "memory", "cc"); + + pr_info("gcs: after SVC: ret=%ld x1=%ld x8=%ld\n", ret, x1_after, x8_after); + + if (ret != 0) { + int err = errno; + pr_err("gcs: failed to enable GCS: ret=%ld errno=%d (%s)\n", ret, err, strerror(err)); + return -1; + } + + fret = func(arg); + exit(fret); + + return -1; +} diff --git a/criu/arch/aarch64/include/asm/dump.h b/criu/arch/aarch64/include/asm/dump.h index 90cd8bca8..ecab061c3 100644 --- a/criu/arch/aarch64/include/asm/dump.h +++ b/criu/arch/aarch64/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/aarch64/include/asm/gcs.h b/criu/arch/aarch64/include/asm/gcs.h new file mode 100644 index 000000000..28faa23b7 --- /dev/null +++ b/criu/arch/aarch64/include/asm/gcs.h @@ -0,0 +1,196 @@ +#ifndef __CR_ASM_GCS_H__ +#define __CR_ASM_GCS_H__ + +#include + +struct rst_shstk_info { + unsigned long vma_start; /* start of GCS VMA */ + unsigned long vma_size; /* size of GCS VMA */ + unsigned long premapped_addr; /* premapped buffer */ + unsigned long tmp_gcs; /* temp area for GCS if needed */ + u64 gcspr_el0; /* GCS pointer */ + u64 features_enabled; /* GCS flags */ +}; + +#define rst_shstk_info rst_shstk_info + +struct task_restore_args; +struct pstree_item; + +int arch_gcs_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta); +#define arch_shstk_prepare arch_gcs_prepare + +int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg); +#define arch_shstk_trampoline arch_shstk_trampoline + +static always_inline void shstk_set_restorer_stack(struct rst_shstk_info *gcs, void *ptr) +{ + gcs->tmp_gcs = (long unsigned)ptr; +} +#define shstk_set_restorer_stack shstk_set_restorer_stack + +static always_inline long shstk_restorer_stack_size(void) +{ + return PAGE_SIZE; +} +#define shstk_restorer_stack_size shstk_restorer_stack_size + +#ifdef CR_NOGLIBC +#include +#include +#include "vma.h" + +static inline unsigned long gcs_map(unsigned long addr, unsigned long size, unsigned int flags) +{ + long gcspr = sys_map_shadow_stack(addr, size, flags); + pr_info("gcs: syscall: map_shadow_stack at=%lx size=%ld\n", addr, size); + + if (gcspr < 0) { + pr_err("gcs: failed to map GCS at %lx: %ld\n", addr, gcspr); + return -1; + } + + if (addr && gcspr != addr) { + pr_err("gcs: address mismatch: need %lx, got %lx\n", addr, gcspr); + return -1; + } + + pr_info("gcs: mmapped GCS at %lx\n", gcspr); + + return gcspr; +} + +/* clang-format off */ +static always_inline void gcsss1(unsigned long *Xt) +{ + asm volatile ( + "sys #3, C7, C7, #2, %0\n" + : + : "rZ" (Xt) + : "memory"); +} + +static always_inline unsigned long *gcsss2(void) +{ + unsigned long *Xt; + + asm volatile ( + "SYSL %0, #3, C7, C7, #3\n" + : "=r" (Xt) + : + : "memory"); + + return Xt; +} + +static inline void gcsstr(unsigned long addr, unsigned long val) +{ + asm volatile( + "mov x0, %0\n" + "mov x1, %1\n" + ".inst 0xd91f1c01\n" // GCSSTR x1, [x0] + "mov x0, #0\n" + : + : "r"(addr), "r"(val) + : "x0", "x1", "memory"); +} +/* clang-format on */ + +static always_inline int gcs_restore(struct rst_shstk_info *gcs) +{ + unsigned long gcspr, val; + + if (!(gcs && gcs->features_enabled & PR_SHADOW_STACK_ENABLE)) { + return 0; + } + + gcspr = gcs->gcspr_el0 - 8; + + val = ALIGN_DOWN(GCS_SIGNAL_CAP(gcspr), 8); + pr_debug("gcs: [0] GCSSTR VAL=%lx write at GCSPR=%lx\n", val, gcspr); + gcsstr(gcspr, val); + + val = ALIGN_DOWN(GCS_SIGNAL_CAP(gcspr), 8) | GCS_CAP_VALID_TOKEN; + gcspr -= 8; + pr_debug("gcs: [1] GCSSTR VAL=%lx write at GCSPR=%lx\n", val, gcspr); + gcsstr(gcspr, val); + + pr_debug("gcs: about to switch stacks via GCSSS1 to: %lx\n", gcspr); + gcsss1((unsigned long *)gcspr); + return 0; +} +#define arch_shstk_restore gcs_restore + +static always_inline int gcs_vma_restore(VmaEntry *vma_entry) +{ + unsigned long shstk, i, ret; + unsigned long *gcs_data = (void *)vma_premmaped_start(vma_entry); + unsigned long vma_size = vma_entry_len(vma_entry); + + shstk = gcs_map(0, vma_size, SHADOW_STACK_SET_TOKEN); + if (shstk < 0) { + pr_err("Failed to map shadow stack at %lx: %ld\n", shstk, shstk); + } + + /* restore shadow stack contents */ + for (i = 0; i < vma_size / 8; i++) + gcsstr(shstk + i * 8, gcs_data[i]); + + pr_debug("unmap %lx %ld\n", (unsigned long)gcs_data, vma_size); + ret = sys_munmap(gcs_data, vma_size); + if (ret < 0) { + pr_err("Failed to unmap premmaped shadow stack\n"); + return ret; + } + + vma_premmaped_start(vma_entry) = shstk; + + return 0; +} +#define shstk_vma_restore gcs_vma_restore + +static always_inline int gcs_switch_to_restorer(struct rst_shstk_info *gcs) +{ + int ret; + unsigned long *ssp; + unsigned long addr; + unsigned long gcspr; + + if (!(gcs && gcs->features_enabled & PR_SHADOW_STACK_ENABLE)) { + return 0; + } + + pr_debug("gcs->premapped_addr + gcs->vma_size = %lx\n", gcs->premapped_addr + gcs->vma_size); + pr_debug("gcs->tmp_gcs = %lx\n", gcs->tmp_gcs); + addr = gcs->tmp_gcs; + + if (addr % PAGE_SIZE != 0) { + pr_err("gcs: 0x%lx not page-aligned to size 0x%lx\n", addr, PAGE_SIZE); + return -1; + } + + ret = sys_munmap((void *)addr, PAGE_SIZE); + if (ret < 0) { + pr_err("gcs: Failed to unmap aarea for dumpee GCS VMAs\n"); + return -1; + } + + gcspr = gcs_map(addr, PAGE_SIZE, SHADOW_STACK_SET_TOKEN); + + if (gcspr == -1) { + pr_err("gcs: failed to gcs_map(%lx, %lx)\n", (unsigned long)addr, PAGE_SIZE); + return -1; + } + + ssp = (unsigned long *)(addr + PAGE_SIZE - 8); + gcsss1(ssp); + + return 0; +} +#define arch_shstk_switch_to_restorer gcs_switch_to_restorer + +#endif /* CR_NOGLIBC */ + +#endif /* __CR_ASM_GCS_H__ */ diff --git a/criu/arch/aarch64/include/asm/restore.h b/criu/arch/aarch64/include/asm/restore.h index 75e87996a..c79605c40 100644 --- a/criu/arch/aarch64/include/asm/restore.h +++ b/criu/arch/aarch64/include/asm/restore.h @@ -26,4 +26,14 @@ static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); +#define ARCH_RST_INFO y +struct rst_arch_info { + bool has_paca, has_pacg; + PacAddressKeys pac_address_keys; + PacGenericKeys pac_generic_keys; +}; + +int arch_ptrace_restore(int pid, struct pstree_item *item); +void arch_rsti_init(struct pstree_item *current); + #endif diff --git a/criu/arch/aarch64/include/asm/restorer.h b/criu/arch/aarch64/include/asm/restorer.h index 64a9c24eb..8f3edc257 100644 --- a/criu/arch/aarch64/include/asm/restorer.h +++ b/criu/arch/aarch64/include/asm/restorer.h @@ -1,10 +1,11 @@ #ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ -#include +#include #include #include "asm/types.h" +#include "asm/gcs.h" #include "images/core.pb-c.h" #include diff --git a/criu/arch/aarch64/include/asm/types.h b/criu/arch/aarch64/include/asm/types.h index 363c1cae2..db118cafd 100644 --- a/criu/arch/aarch64/include/asm/types.h +++ b/criu/arch/aarch64/include/asm/types.h @@ -33,7 +33,16 @@ static inline uint64_t encode_pointer(void *p) return (uint64_t)p; } -#define AT_VECTOR_SIZE 40 +/** + * See also: + * * arch/arm64/include/uapi/asm/auxvec.h + * * include/linux/auxvec.h + * * include/linux/mm_types.h + */ +#define AT_VECTOR_SIZE_BASE 22 +#define AT_VECTOR_SIZE_ARCH 2 +#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) + typedef uint64_t auxv_t; typedef uint64_t tls_t; diff --git a/criu/arch/arm/crtools.c b/criu/arch/arm/crtools.c index 26b94e157..6a5e4c89a 100644 --- a/criu/arch/arm/crtools.c +++ b/criu/arch/arm/crtools.c @@ -22,7 +22,7 @@ #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))((src)->ARM_##e) -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; diff --git a/criu/arch/arm/include/asm/dump.h b/criu/arch/arm/include/asm/dump.h index 485986065..b0ac5715d 100644 --- a/criu/arch/arm/include/asm/dump.h +++ b/criu/arch/arm/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/loongarch64/Makefile b/criu/arch/loongarch64/Makefile new file mode 100644 index 000000000..4bd99eb7e --- /dev/null +++ b/criu/arch/loongarch64/Makefile @@ -0,0 +1,14 @@ +builtin-name := crtools.built-in.o + +ccflags-y += -iquote $(obj)/include +ccflags-y += -iquote criu/include -iquote include +ccflags-y += $(COMPEL_UAPI_INCLUDES) + +asflags-y += -Wstrict-prototypes +asflags-y += -D__ASSEMBLY__ -nostdlib -fomit-frame-pointer +asflags-y += -iquote $(obj)/include +ldflags-y += -r -z noexecstack + +obj-y += cpu.o +obj-y += crtools.o +obj-y += sigframe.o diff --git a/criu/arch/loongarch64/cpu.c b/criu/arch/loongarch64/cpu.c new file mode 100644 index 000000000..5559c4288 --- /dev/null +++ b/criu/arch/loongarch64/cpu.c @@ -0,0 +1,31 @@ +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +int cpu_init(void) +{ + return 0; +} + +int cpu_dump_cpuinfo(void) +{ + return 0; +} + +int cpu_validate_cpuinfo(void) +{ + return 0; +} + +int cpuinfo_dump(void) +{ + if (cpu_init()) + return -1; + if (cpu_dump_cpuinfo()) + return -1; + return 0; +} + +int cpuinfo_check(void) +{ + return 0; +} diff --git a/criu/arch/loongarch64/crtools.c b/criu/arch/loongarch64/crtools.c new file mode 100644 index 000000000..783951b5b --- /dev/null +++ b/criu/arch/loongarch64/crtools.c @@ -0,0 +1,115 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" +#include "log.h" +#include "asm/restorer.h" +#include "asm/parasite-syscall.h" +#include +#include "asm/dump.h" +#include "cr_options.h" +#include "common/compiler.h" +#include "restorer.h" +#include "parasite-syscall.h" +#include "util.h" +#include "cpu.h" +#include +#include "kerndat.h" + +#include "protobuf.h" +#include "images/core.pb-c.h" +#include "images/creds.pb-c.h" + +#define assign_reg(dst, src, e) (dst)->e = (__typeof__(dst->e))(src)->e + +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +{ + int i; + CoreEntry *core = x; + UserLoongarch64GpregsEntry *gprs = core->ti_loongarch64->gpregs; + UserLoongarch64FpregsEntry *fprs = core->ti_loongarch64->fpregs; + for (i = 0; i < GPR_NUM; i++) + assign_reg(gprs, regs, regs[i]); + assign_reg(gprs, regs, pc); + + for (i = 0; i < FPR_NUM; i++) + assign_reg(fpregs, fpregs, regs[i]); + assign_reg(fprs, fpregs, fcc); + assign_reg(fprs, fpregs, fcsr); + return 0; +} + +int arch_alloc_thread_info(CoreEntry *core) +{ + ThreadInfoLoongarch64 *ti_loongarch64; + UserLoongarch64GpregsEntry *gpregs; + UserLoongarch64FpregsEntry *fpregs; + + ti_loongarch64 = xmalloc(sizeof(*ti_loongarch64)); + thread_info_loongarch64__init(ti_loongarch64); + core->ti_loongarch64 = ti_loongarch64; + + gpregs = xmalloc(sizeof(*gpregs)); + if (!gpregs) + goto err; + user_loongarch64_gpregs_entry__init(gpregs); + gpregs->n_regs = GPR_NUM; + gpregs->regs = xmalloc(GPR_NUM * sizeof(uint64_t)); + if (!gpregs->regs) + goto err; + ti_loongarch64->gpregs = gpregs; + + fpregs = xmalloc(sizeof(*fpregs)); + if (!fpregs) + goto err; + user_loongarch64_fpregs_entry__init(fpregs); + fpregs->n_regs = FPR_NUM; + fpregs->regs = xmalloc(FPR_NUM * sizeof(uint64_t)); + if (!fpregs->regs) + goto err; + ti_loongarch64->fpregs = fpregs; + + return 0; +err: + return -1; +} + +void arch_free_thread_info(CoreEntry *core) +{ + if (CORE_THREAD_ARCH_INFO(core)) { + if (CORE_THREAD_ARCH_INFO(core)->fpregs) { + xfree(CORE_THREAD_ARCH_INFO(core)->fpregs->regs); + xfree(CORE_THREAD_ARCH_INFO(core)->fpregs); + } + xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs); + xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); + xfree(CORE_THREAD_ARCH_INFO(core)); + CORE_THREAD_ARCH_INFO(core) = NULL; + } +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) +{ + fpu_context_t *fpu = RT_SIGFRAME_FPU(sigframe); + UserLoongarch64FpregsEntry *fpregs = core->ti_loongarch64->fpregs; + + memcpy(fpu->regs, fpregs->regs, sizeof(fpu->regs)); + fpu->fcc = fpregs->fcc; + fpu->fcsr = fpregs->fcsr; + return 0; +} + +int restore_gpregs(struct rt_sigframe *sigframe, UserRegsEntry *r) +{ + sigcontext_t *sc = RT_SIGFRAME_SIGCTX(sigframe); + memcpy(sc->regs, r->regs, sizeof(sc->regs)); + sc->pc = r->pc; + return 0; +} diff --git a/criu/arch/loongarch64/include/asm/dump.h b/criu/arch/loongarch64/include/asm/dump.h new file mode 100644 index 000000000..a1c0c4c58 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/dump.h @@ -0,0 +1,15 @@ +#ifndef __CR_ASM_DUMP_H__ +#define __CR_ASM_DUMP_H__ + +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int arch_alloc_thread_info(CoreEntry *core); +extern void arch_free_thread_info(CoreEntry *core); + +static inline void core_put_tls(CoreEntry *core, tls_t tls) +{ + core->ti_loongarch64->tls = tls; +} + +#define get_task_futex_robust_list_compat(pid, info) -1 + +#endif diff --git a/criu/arch/loongarch64/include/asm/int.h b/criu/arch/loongarch64/include/asm/int.h new file mode 100644 index 000000000..642804e9b --- /dev/null +++ b/criu/arch/loongarch64/include/asm/int.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_INT_H__ +#define __CR_ASM_INT_H__ + +#include "asm-generic/int.h" + +#endif /* __CR_ASM_INT_H__ */ diff --git a/criu/arch/loongarch64/include/asm/kerndat.h b/criu/arch/loongarch64/include/asm/kerndat.h new file mode 100644 index 000000000..bb70cf6cf --- /dev/null +++ b/criu/arch/loongarch64/include/asm/kerndat.h @@ -0,0 +1,7 @@ +#ifndef __CR_ASM_KERNDAT_H__ +#define __CR_ASM_KERNDAT_H__ + +#define kdat_compatible_cr() 0 +#define kdat_can_map_vdso() 0 + +#endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/loongarch64/include/asm/parasite-syscall.h b/criu/arch/loongarch64/include/asm/parasite-syscall.h new file mode 100644 index 000000000..6008c3792 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/parasite-syscall.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_PARASITE_SYSCALL_H__ +#define __CR_ASM_PARASITE_SYSCALL_H__ + +struct parasite_ctl; + +#endif diff --git a/criu/arch/loongarch64/include/asm/parasite.h b/criu/arch/loongarch64/include/asm/parasite.h new file mode 100644 index 000000000..b64cb3185 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/parasite.h @@ -0,0 +1,11 @@ +#ifndef __ASM_PARASITE_H__ +#define __ASM_PARASITE_H__ + +static inline void arch_get_tls(tls_t *ptls) +{ + tls_t tls; + asm volatile("or %0, $zero, $tp" : "=r"(tls)); + *ptls = tls; +} + +#endif diff --git a/criu/arch/loongarch64/include/asm/restore.h b/criu/arch/loongarch64/include/asm/restore.h new file mode 100644 index 000000000..d956231c8 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/restore.h @@ -0,0 +1,33 @@ +#ifndef __CR_ASM_RESTORE_H__ +#define __CR_ASM_RESTORE_H__ + +#include "asm/restorer.h" +#include "images/core.pb-c.h" + +/* clang-format off */ +#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, task_args) \ +({ \ + uint64_t save_sp; \ + asm volatile("or %0, $zero, $sp" : "=r"(save_sp) : :"memory"); \ + asm volatile( \ + "or $a0, $zero, %2 \n" \ + "or $sp, $zero, %0 \n" \ + "jirl $ra, %1, 0 \n" \ + : \ + : "r"(new_sp & ~15), \ + "r"(restore_task_exec_start), \ + "r"(task_args) \ + : "$a0", "memory"); \ + asm volatile("or $sp, $zero, %0" : : "r"(save_sp) : "memory"); \ +}) + +/* clang-format on */ + +static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) +{ + *ptls = pcore->ti_loongarch64->tls; +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); + +#endif diff --git a/criu/arch/loongarch64/include/asm/restorer.h b/criu/arch/loongarch64/include/asm/restorer.h new file mode 100644 index 000000000..7a0d35c5b --- /dev/null +++ b/criu/arch/loongarch64/include/asm/restorer.h @@ -0,0 +1,97 @@ +#ifndef __CR_ASM_RESTORER_H__ +#define __CR_ASM_RESTORER_H__ + +#include "asm/types.h" +#include +#include "images/core.pb-c.h" +#include +#include + +/* clang-format off */ +#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ + thread_args, clone_restore_fn) \ + asm volatile( \ + "clone_emul: \n" \ + "ld.d $a1, %2 \n" \ + "addi.d $a1, $a1, -16 \n" \ + "st.d %5, $a1, 0 \n" \ + "st.d %6, $a1, 8 \n" \ + "or $a0, $zero, %1 \n" \ + "or $a2, $zero, %3 \n" \ + "or $a3, $zero, %4 \n" \ + "ori $a7, $zero, "__stringify(__NR_clone)" \n" \ + "syscall 0 \n" \ + \ + "beqz $a0, thread_run \n" \ + \ + "or %0, $zero, $a0 \n" \ + "b clone_end \n" \ + \ + "thread_run: \n" \ + "ld.d $a1, $sp, 0 \n" \ + "ld.d $a0, $sp, 8 \n" \ + "jirl $ra, $a1, 0 \n" \ + \ + "clone_end: \n" \ + : "=r"(ret) \ + : "r"(clone_flags), \ + "ZB"(new_sp), \ + "r"(&parent_tid), \ + "r"(&thread_args[i].pid), \ + "r"(&clone_restore_fn), \ + "r"(&thread_args[i]) \ + : "$a0", "$a1", "$a2", "$a3", "$a7", "memory") + +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + "clone3_emul: \n" \ + "or $a0, $zero, %1 \n" \ + "or $a1, $zero, %2 \n" \ + "or $a2, $zero, %3 \n" \ + "or $a3, $zero, %4 \n" \ + "ori $a7, $zero, "__stringify(__NR_clone3)" \n" \ + "syscall 0 \n" \ + \ + "beqz $a0, clone3_thread_run \n" \ + \ + "or %0, $zero, $a0 \n" \ + "b clone3_end \n" \ + \ + "clone3_thread_run: \n" \ + "or $a0, $zero, $a3 \n" \ + "jirl $ra, $a2, 0 \n" \ + "clone3_end: \n" \ + : "=r"(ret) \ + : "r"(&clone_args), \ + "r"(size), \ + "r"(clone_restore_fn), \ + "r"(args) \ + : "$a0", "$a1", "$a2", "$a3", "$a7", "memory") +/* clang-format on */ + +static inline void restore_tls(tls_t *ptls) +{ + asm volatile("or $tp, $zero, %0" : : "r"(*ptls)); +} +static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) +{ + return -1; +} +static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) +{ + return -1; +} +static inline void *alloc_compat_syscall_stack(void) +{ + return NULL; +} +static inline void free_compat_syscall_stack(void *stack32) +{ +} +int restore_gpregs(struct rt_sigframe *f, UserLoongarch64GpregsEntry *r); +int restore_nonsigframe_gpregs(UserLoongarch64GpregsEntry *r); + +#define arch_map_vdso(map, compat) -1 + +#endif diff --git a/criu/arch/loongarch64/include/asm/thread_pointer.h b/criu/arch/loongarch64/include/asm/thread_pointer.h new file mode 100644 index 000000000..f7e07066a --- /dev/null +++ b/criu/arch/loongarch64/include/asm/thread_pointer.h @@ -0,0 +1,27 @@ +/* __thread_pointer definition. Generic version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#ifndef _SYS_THREAD_POINTER_H +#define _SYS_THREAD_POINTER_H + +static inline void *__criu_thread_pointer(void) +{ + return __builtin_thread_pointer(); +} + +#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/loongarch64/include/asm/types.h b/criu/arch/loongarch64/include/asm/types.h new file mode 100644 index 000000000..72bca2022 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/types.h @@ -0,0 +1,39 @@ +#ifndef __CR_ASM_TYPES_H__ +#define __CR_ASM_TYPES_H__ + +#include +#include + +#include "page.h" +#include "bitops.h" +#include "asm/int.h" +#include "images/core.pb-c.h" + +#include + +#define core_is_compat(core) false + +#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__LOONGARCH64 + +#define CORE_THREAD_ARCH_INFO(core) core->ti_loongarch64 + +#define TI_SP(core) ((core)->ti_loongarch64->gpregs->regs[4]) + +#define TI_IP(core) ((core)->ti_loongarch64->gpregs->pc) + +typedef UserLoongarch64GpregsEntry UserRegsEntry; + +static inline uint64_t encode_pointer(void *p) +{ + return (uint64_t)p; +} +static inline void *decode_pointer(uint64_t v) +{ + return (void *)v; +} + +#define AT_VECTOR_SIZE 44 +typedef uint64_t auxv_t; +typedef uint64_t tls_t; + +#endif /* __CR_ASM_TYPES_H__ */ diff --git a/criu/arch/loongarch64/include/asm/vdso.h b/criu/arch/loongarch64/include/asm/vdso.h new file mode 100644 index 000000000..64631dee0 --- /dev/null +++ b/criu/arch/loongarch64/include/asm/vdso.h @@ -0,0 +1,27 @@ +#ifndef __CR_ASM_VDSO_H__ +#define __CR_ASM_VDSO_H__ + +#include "asm/int.h" +#include "asm-generic/vdso.h" + +/* This definition is used in pie/util-vdso.c to initialize the vdso symbol + * name string table 'vdso_symbols' + */ + +/* + * This is a minimal amount of symbols + * we should support at the moment. + */ +#define VDSO_SYMBOL_MAX 5 +#define VDSO_SYMBOL_GTOD 3 + +#define ARCH_VDSO_SYMBOLS_LIST \ + const char *aarch_vdso_symbol1 = "__vdso_getcpu"; \ + const char *aarch_vdso_symbol2 = "__vdso_clock_getres"; \ + const char *aarch_vdso_symbol3 = "__vdso_clock_gettime"; \ + const char *aarch_vdso_symbol4 = "__vdso_gettimeofday"; \ + const char *aarch_vdso_symbol5 = "__vdso_rt_sigreturn"; + +#define ARCH_VDSO_SYMBOLS \ + aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4, aarch_vdso_symbol5 +#endif diff --git a/criu/arch/loongarch64/restorer.c b/criu/arch/loongarch64/restorer.c new file mode 100644 index 000000000..730318ac1 --- /dev/null +++ b/criu/arch/loongarch64/restorer.c @@ -0,0 +1,14 @@ +#include + +#include "restorer.h" +#include "asm/restorer.h" +#include + +#include +#include "log.h" +#include "cpu.h" + +int restore_nonsigframe_gpregs(UserLoongarch64GpregsEntry *r) +{ + return 0; +} diff --git a/criu/arch/loongarch64/sigframe.c b/criu/arch/loongarch64/sigframe.c new file mode 100644 index 000000000..18983ff13 --- /dev/null +++ b/criu/arch/loongarch64/sigframe.c @@ -0,0 +1,12 @@ +#include +#include + +#include "asm/sigframe.h" +#include "asm/types.h" + +#include "log.h" +#include +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} diff --git a/criu/arch/loongarch64/vdso-pie.c b/criu/arch/loongarch64/vdso-pie.c new file mode 100644 index 000000000..7a75d2741 --- /dev/null +++ b/criu/arch/loongarch64/vdso-pie.c @@ -0,0 +1,48 @@ +#include +#include "asm/types.h" + +#include +#include +#include "parasite-vdso.h" +#include "log.h" +#include "common/bug.h" + +#ifdef LOG_PREFIX +#undef LOG_PREFIX +#endif +#define LOG_PREFIX "vdso: " +static void insert_trampoline(uintptr_t from, uintptr_t to) +{ + struct { + uint32_t pcaddi; + uint32_t ldptr; + uint32_t jirl; + uint32_t guards; + uint64_t imm64; + } __packed jmp = { + .pcaddi = 0x18000095, /* pcaddi $x, 4 */ + .ldptr = 0x260002b5, /* ldptr.d $x, $x, 0 */ + .jirl = 0x4c0002a0, /* jirl $zero, $x, 0 */ + .guards = 0x002a0000, /* break 0 */ + .imm64 = to, + }; + memcpy((void *)from, &jmp, sizeof(jmp)); +} + +int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *sto, + struct vdso_symtable *sfrom, bool compat_vdso) +{ + unsigned int i; + unsigned long from, to; + for (i = 0; i < ARRAY_SIZE(sto->symbols); i++) { + if (vdso_symbol_empty(&sfrom->symbols[i])) + continue; + pr_debug("br: %lx/%lx -> %lx/%lx (index %d)\n", base_from, sfrom->symbols[i].offset, base_to, + sto->symbols[i].offset, i); + + from = base_from + sfrom->symbols[i].offset; + to = base_to + sto->symbols[i].offset; + insert_trampoline(from, to); + } + return 0; +} diff --git a/criu/arch/mips/crtools.c b/criu/arch/mips/crtools.c index ed4da9b7e..eabbd85f4 100644 --- a/criu/arch/mips/crtools.c +++ b/criu/arch/mips/crtools.c @@ -27,7 +27,7 @@ #include "images/core.pb-c.h" #include "images/creds.pb-c.h" -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; diff --git a/criu/arch/mips/include/asm/dump.h b/criu/arch/mips/include/asm/dump.h index 58015833d..ec59b051b 100644 --- a/criu/arch/mips/include/asm/dump.h +++ b/criu/arch/mips/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); extern int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info); diff --git a/criu/arch/ppc64/cpu.c b/criu/arch/ppc64/cpu.c index bb5b7256e..b87230f40 100644 --- a/criu/arch/ppc64/cpu.c +++ b/criu/arch/ppc64/cpu.c @@ -64,6 +64,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return -1; + } + if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; diff --git a/criu/arch/ppc64/crtools.c b/criu/arch/ppc64/crtools.c index a08a2ca5b..d57040008 100644 --- a/criu/arch/ppc64/crtools.c +++ b/criu/arch/ppc64/crtools.c @@ -404,7 +404,7 @@ static int __copy_task_regs(user_regs_struct_t *regs, user_fpregs_struct_t *fpre return 0; } -int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) +int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) { return __copy_task_regs(u, f, (CoreEntry *)arg); } diff --git a/criu/arch/ppc64/include/asm/dump.h b/criu/arch/ppc64/include/asm/dump.h index eb488900a..7393654fa 100644 --- a/criu/arch/ppc64/include/asm/dump.h +++ b/criu/arch/ppc64/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/riscv64/Makefile b/criu/arch/riscv64/Makefile new file mode 100644 index 000000000..d19895471 --- /dev/null +++ b/criu/arch/riscv64/Makefile @@ -0,0 +1,8 @@ +builtin-name := crtools.built-in.o + +ldflags-y += -r + +obj-y += cpu.o +obj-y += crtools.o +obj-y += sigframe.o +obj-y += vdso-lookup.o \ No newline at end of file diff --git a/criu/arch/riscv64/cpu.c b/criu/arch/riscv64/cpu.c new file mode 100644 index 000000000..97a883b8c --- /dev/null +++ b/criu/arch/riscv64/cpu.c @@ -0,0 +1,40 @@ +#undef LOG_PREFIX +#define LOG_PREFIX "cpu: " + +#include +#include "cpu.h" + +int cpu_init(void) +{ + return 0; +} + +int cpu_dump_cpuinfo(void) +{ + return 0; +} + +int cpu_validate_cpuinfo(void) +{ + return 0; +} + +int cpu_dump_cpuinfo_single(void) +{ + return -ENOTSUP; +} + +int cpu_validate_image_cpuinfo_single(void) +{ + return -ENOTSUP; +} + +int cpuinfo_dump(void) +{ + return -ENOTSUP; +} + +int cpuinfo_check(void) +{ + return -ENOTSUP; +} diff --git a/criu/arch/riscv64/crtools.c b/criu/arch/riscv64/crtools.c new file mode 100644 index 000000000..eea98d6de --- /dev/null +++ b/criu/arch/riscv64/crtools.c @@ -0,0 +1,171 @@ +#include +#include + +#include + +#include "types.h" +#include + +#include +#include "asm/restorer.h" +#include "common/compiler.h" +#include +#include "asm/dump.h" +#include "protobuf.h" +#include "images/core.pb-c.h" +#include "images/creds.pb-c.h" +#include "parasite-syscall.h" +#include "log.h" +#include "util.h" +#include "cpu.h" +#include "restorer.h" +#include "compel/infect.h" + +#define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e + +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) +{ + int i; + CoreEntry *core = x; + + // Save riscv64 gprs + assign_reg(core->ti_riscv64->gpregs, regs, pc); + assign_reg(core->ti_riscv64->gpregs, regs, ra); + assign_reg(core->ti_riscv64->gpregs, regs, sp); + assign_reg(core->ti_riscv64->gpregs, regs, gp); + assign_reg(core->ti_riscv64->gpregs, regs, tp); + assign_reg(core->ti_riscv64->gpregs, regs, t0); + assign_reg(core->ti_riscv64->gpregs, regs, t1); + assign_reg(core->ti_riscv64->gpregs, regs, t2); + assign_reg(core->ti_riscv64->gpregs, regs, s0); + assign_reg(core->ti_riscv64->gpregs, regs, s1); + assign_reg(core->ti_riscv64->gpregs, regs, a0); + assign_reg(core->ti_riscv64->gpregs, regs, a1); + assign_reg(core->ti_riscv64->gpregs, regs, a2); + assign_reg(core->ti_riscv64->gpregs, regs, a3); + assign_reg(core->ti_riscv64->gpregs, regs, a4); + assign_reg(core->ti_riscv64->gpregs, regs, a5); + assign_reg(core->ti_riscv64->gpregs, regs, a6); + assign_reg(core->ti_riscv64->gpregs, regs, a7); + assign_reg(core->ti_riscv64->gpregs, regs, s2); + assign_reg(core->ti_riscv64->gpregs, regs, s3); + assign_reg(core->ti_riscv64->gpregs, regs, s4); + assign_reg(core->ti_riscv64->gpregs, regs, s5); + assign_reg(core->ti_riscv64->gpregs, regs, s6); + assign_reg(core->ti_riscv64->gpregs, regs, s7); + assign_reg(core->ti_riscv64->gpregs, regs, s8); + assign_reg(core->ti_riscv64->gpregs, regs, s9); + assign_reg(core->ti_riscv64->gpregs, regs, s10); + assign_reg(core->ti_riscv64->gpregs, regs, s11); + assign_reg(core->ti_riscv64->gpregs, regs, t3); + assign_reg(core->ti_riscv64->gpregs, regs, t4); + assign_reg(core->ti_riscv64->gpregs, regs, t5); + assign_reg(core->ti_riscv64->gpregs, regs, t6); + + // Save riscv64 fprs + for (i = 0; i < 32; ++i) + assign_reg(core->ti_riscv64->fpsimd, fpsimd, f[i]); + assign_reg(core->ti_riscv64->fpsimd, fpsimd, fcsr); + + return 0; +} + +int arch_alloc_thread_info(CoreEntry *core) +{ + ThreadInfoRiscv64 *ti_riscv64; + UserRiscv64RegsEntry *gpregs; + UserRiscv64DExtEntry *fpsimd; + + ti_riscv64 = xmalloc(sizeof(*ti_riscv64)); + if (!ti_riscv64) + goto err; + thread_info_riscv64__init(ti_riscv64); + core->ti_riscv64 = ti_riscv64; + + gpregs = xmalloc(sizeof(*gpregs)); + if (!gpregs) + goto err; + user_riscv64_regs_entry__init(gpregs); + + ti_riscv64->gpregs = gpregs; + + fpsimd = xmalloc(sizeof(*fpsimd)); + if (!fpsimd) + goto err; + user_riscv64_d_ext_entry__init(fpsimd); + ti_riscv64->fpsimd = fpsimd; + fpsimd->f = xmalloc(32 * sizeof(fpsimd->f[0])); + fpsimd->n_f = 32; + if (!fpsimd->f) + goto err; + + return 0; +err: + return -1; +} + +void arch_free_thread_info(CoreEntry *core) +{ + if (core->ti_riscv64) { + if (core->ti_riscv64->fpsimd) { + xfree(core->ti_riscv64->fpsimd->f); + xfree(core->ti_riscv64->fpsimd); + } + xfree(core->ti_riscv64->gpregs); + xfree(core->ti_riscv64); + core->ti_riscv64 = NULL; + } +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) +{ + int i; + UserRiscv64DExtEntry *fpsimd = core->ti_riscv64->fpsimd; + + if (fpsimd->n_f != 32) + return 1; + + for (i = 0; i < 32; ++i) + sigframe->uc.uc_mcontext.__fpregs.__d.__f[i] = fpsimd->f[i]; + sigframe->uc.uc_mcontext.__fpregs.__d.__fcsr = fpsimd->fcsr; + + return 0; +} + +int restore_gpregs(struct rt_sigframe *f, UserRiscv64RegsEntry *r) +{ + f->uc.uc_mcontext.__gregs[0] = r->pc; + f->uc.uc_mcontext.__gregs[1] = r->ra; + f->uc.uc_mcontext.__gregs[2] = r->sp; + f->uc.uc_mcontext.__gregs[3] = r->gp; + f->uc.uc_mcontext.__gregs[4] = r->tp; + f->uc.uc_mcontext.__gregs[5] = r->t0; + f->uc.uc_mcontext.__gregs[6] = r->t1; + f->uc.uc_mcontext.__gregs[7] = r->t2; + f->uc.uc_mcontext.__gregs[8] = r->s0; + f->uc.uc_mcontext.__gregs[9] = r->s1; + f->uc.uc_mcontext.__gregs[10] = r->a0; + f->uc.uc_mcontext.__gregs[11] = r->a1; + f->uc.uc_mcontext.__gregs[12] = r->a2; + f->uc.uc_mcontext.__gregs[13] = r->a3; + f->uc.uc_mcontext.__gregs[14] = r->a4; + f->uc.uc_mcontext.__gregs[15] = r->a5; + f->uc.uc_mcontext.__gregs[16] = r->a6; + f->uc.uc_mcontext.__gregs[17] = r->a7; + f->uc.uc_mcontext.__gregs[18] = r->s2; + f->uc.uc_mcontext.__gregs[19] = r->s3; + f->uc.uc_mcontext.__gregs[20] = r->s4; + f->uc.uc_mcontext.__gregs[21] = r->s5; + f->uc.uc_mcontext.__gregs[22] = r->s6; + f->uc.uc_mcontext.__gregs[23] = r->s7; + f->uc.uc_mcontext.__gregs[24] = r->s8; + f->uc.uc_mcontext.__gregs[25] = r->s9; + f->uc.uc_mcontext.__gregs[26] = r->s10; + f->uc.uc_mcontext.__gregs[27] = r->s11; + f->uc.uc_mcontext.__gregs[28] = r->t3; + f->uc.uc_mcontext.__gregs[29] = r->t4; + f->uc.uc_mcontext.__gregs[30] = r->t5; + f->uc.uc_mcontext.__gregs[31] = r->t6; + + return 0; +} diff --git a/criu/arch/riscv64/include/asm/dump.h b/criu/arch/riscv64/include/asm/dump.h new file mode 100644 index 000000000..4f0a2d209 --- /dev/null +++ b/criu/arch/riscv64/include/asm/dump.h @@ -0,0 +1,15 @@ +#ifndef __CR_ASM_DUMP_H__ +#define __CR_ASM_DUMP_H__ + +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int arch_alloc_thread_info(CoreEntry *core); +extern void arch_free_thread_info(CoreEntry *core); + +static inline void core_put_tls(CoreEntry *core, tls_t tls) +{ + core->ti_riscv64->tls = tls; +} + +#define get_task_futex_robust_list_compat(pid, info) -1 + +#endif diff --git a/criu/arch/riscv64/include/asm/int.h b/criu/arch/riscv64/include/asm/int.h new file mode 100644 index 000000000..642804e9b --- /dev/null +++ b/criu/arch/riscv64/include/asm/int.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_INT_H__ +#define __CR_ASM_INT_H__ + +#include "asm-generic/int.h" + +#endif /* __CR_ASM_INT_H__ */ diff --git a/criu/arch/riscv64/include/asm/kerndat.h b/criu/arch/riscv64/include/asm/kerndat.h new file mode 100644 index 000000000..bb70cf6cf --- /dev/null +++ b/criu/arch/riscv64/include/asm/kerndat.h @@ -0,0 +1,7 @@ +#ifndef __CR_ASM_KERNDAT_H__ +#define __CR_ASM_KERNDAT_H__ + +#define kdat_compatible_cr() 0 +#define kdat_can_map_vdso() 0 + +#endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/riscv64/include/asm/parasite-syscall.h b/criu/arch/riscv64/include/asm/parasite-syscall.h new file mode 100644 index 000000000..6008c3792 --- /dev/null +++ b/criu/arch/riscv64/include/asm/parasite-syscall.h @@ -0,0 +1,6 @@ +#ifndef __CR_ASM_PARASITE_SYSCALL_H__ +#define __CR_ASM_PARASITE_SYSCALL_H__ + +struct parasite_ctl; + +#endif diff --git a/criu/arch/riscv64/include/asm/parasite.h b/criu/arch/riscv64/include/asm/parasite.h new file mode 100644 index 000000000..4798cfd8a --- /dev/null +++ b/criu/arch/riscv64/include/asm/parasite.h @@ -0,0 +1,16 @@ +#ifndef __ASM_PARASITE_H__ +#define __ASM_PARASITE_H__ + +/* + * This function is used to retrieve the value of the thread pointer (tp) + * in RISC-V architecture, which is typically used for thread-local storage (TLS). + * The value is then stored in the provided tls_t pointer. + */ +static inline void arch_get_tls(tls_t *ptls) +{ + tls_t tls; + asm("mv %0, tp" : "=r"(tls)); + *ptls = tls; +} + +#endif diff --git a/criu/arch/riscv64/include/asm/restore.h b/criu/arch/riscv64/include/asm/restore.h new file mode 100644 index 000000000..e4f25a57b --- /dev/null +++ b/criu/arch/riscv64/include/asm/restore.h @@ -0,0 +1,29 @@ +#ifndef __CR_ASM_RESTORE_H__ +#define __CR_ASM_RESTORE_H__ + +#include "asm/restorer.h" + +#include "images/core.pb-c.h" + +/* clang-format off */ +#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ + task_args) \ + asm volatile( \ + "and sp, %0, ~15 \n" \ + "mv a0, %2 \n" \ + "jr %1 \n" \ + : \ + : "r"(new_sp), \ + "r"(restore_task_exec_start), \ + "r"(task_args) \ + : "a0", "memory") +/* clang-format on */ + +static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) +{ + *ptls = pcore->ti_riscv64->tls; +} + +int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); + +#endif diff --git a/criu/arch/riscv64/include/asm/restorer.h b/criu/arch/riscv64/include/asm/restorer.h new file mode 100644 index 000000000..45fe847a9 --- /dev/null +++ b/criu/arch/riscv64/include/asm/restorer.h @@ -0,0 +1,150 @@ +#ifndef __CR_ASM_RESTORER_H__ +#define __CR_ASM_RESTORER_H__ + +#include + +#include "asm/types.h" +#include "images/core.pb-c.h" + +#include + +// kernel arg order for clone +// unsigned long clone_flags, +// unsigned long newsp, +// int __user * parent_tidptr, +// unsigned long tls, +// int __user * child_tidptr +/* clang-format off */ +#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ + thread_args, clone_restore_fn) \ + asm volatile( \ + "clone_emul: \n" \ + "ld a1, %2 \n" \ + "andi a1, a1, ~15 \n" \ + "addi a1, a1, -16 \n" \ + "sd %5, 0(a1) \n" \ + "sd %6, 8(a1) \n" \ + "mv a0, %1 \n" \ + "mv a2, %3 \n" \ + "mv a3, %4 \n" \ + "li a7, "__stringify(__NR_clone)" \n" \ + "ecall \n" \ + \ + "beqz a0, thread_run \n" \ + \ + "mv %0, a0 \n" \ + "j clone_end \n" \ + \ + "thread_run: \n" \ + "ld a1, 0(sp) \n" \ + "ld a0, 8(sp) \n" \ + "jr a1 \n" \ + \ + "clone_end: \n" \ + : "=r"(ret) \ + : "r"(clone_flags), \ + "m"(new_sp), \ + "r"(&parent_tid), \ + "r"(&thread_args[i].pid), \ + "r"(clone_restore_fn), \ + "r"(&thread_args[i]) \ + : "a0", "a1", "a2", "a3", "a7", "memory") + +/* + * Based on sysdeps/unix/sysv/linux/riscv/clone.S + * + * int clone(int (*fn)(void *arg), x0 + * void *child_stack, x1 + * int flags, x2 + * void *arg, x3 + * pid_t *ptid, x4 + * struct user_desc *tls, x5 + * pid_t *ctid); x6 + * + * int clone3(struct clone_args *args, x0 + * size_t size); x1 + * + * Always consult the CLONE3 wrappers for other architectures + * for additional details. + * + */ +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + /* In contrast to the clone() wrapper above this does not put + * the thread function and its arguments on the child stack, + * but uses registers to pass these parameters to the child process. + * Based on the glibc clone() wrapper at + * sysdeps/unix/sysv/linux/riscv/clone.S. + */ \ + "clone3_emul: \n" \ + /* + * Based on the glibc clone() wrapper, which uses x10 and x11 + * to save the arguments for the child process, this does the same. + * x10 for the thread function and x11 for the thread arguments. + */ \ + "mv t0, %3 /* clone_restore_fn */ \n" \ + "mv t1, %4 /* args */ \n" \ + "mv a0, %1 /* &clone_args */ \n" \ + "mv a1, %2 /* size */ \n" \ + /* Load syscall number */ \ + "li a7, "__stringify(__NR_clone3)" \n" \ + /* Do the syscall */ \ + "ecall \n" \ + \ + "beqz a0, clone3_thread_run \n" \ + \ + "mv %0, a0 \n" \ + "j clone3_end \n" \ + \ + "clone3_thread_run: \n" \ + /* Move args to a0 */ \ + "mv a0, t1 \n" \ + /* Jump to clone_restore_fn */ \ + "jr t0 \n" \ + \ + "clone3_end: \n" \ + : "=r"(ret) \ + : "r"(&clone_args), \ + "r"(size), \ + "r"(clone_restore_fn), \ + "r"(args) \ + : "a0", "a1", "a7", "t0", "t1", "memory") + +#define ARCH_FAIL_CORE_RESTORE \ + asm volatile( \ + "mv sp, %0 \n" \ + "li a0, 0 \n" \ + "jr x0 \n" \ + : \ + : "r"(ret) \ + : "sp", "a0", "memory") +/* clang-format on */ + +#define arch_map_vdso(map, compat) -1 + +int restore_gpregs(struct rt_sigframe *f, UserRiscv64RegsEntry *r); +int restore_nonsigframe_gpregs(UserRiscv64RegsEntry *r); + +static inline void restore_tls(tls_t *ptls) +{ + asm("mv tp, %0" : : "r"(*ptls)); +} + +static inline void *alloc_compat_syscall_stack(void) +{ + return NULL; +} +static inline void free_compat_syscall_stack(void *stack32) +{ +} +static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) +{ + return -1; +} +static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) +{ + return -1; +} + +#endif \ No newline at end of file diff --git a/criu/arch/riscv64/include/asm/thread_pointer.h b/criu/arch/riscv64/include/asm/thread_pointer.h new file mode 100644 index 000000000..f7e07066a --- /dev/null +++ b/criu/arch/riscv64/include/asm/thread_pointer.h @@ -0,0 +1,27 @@ +/* __thread_pointer definition. Generic version. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#ifndef _SYS_THREAD_POINTER_H +#define _SYS_THREAD_POINTER_H + +static inline void *__criu_thread_pointer(void) +{ + return __builtin_thread_pointer(); +} + +#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/riscv64/include/asm/types.h b/criu/arch/riscv64/include/asm/types.h new file mode 100644 index 000000000..83bb5f65f --- /dev/null +++ b/criu/arch/riscv64/include/asm/types.h @@ -0,0 +1,40 @@ +#ifndef __CR_ASM_TYPES_H__ +#define __CR_ASM_TYPES_H__ + +#include +#include +#include +#include "images/core.pb-c.h" + +#include "page.h" +#include "bitops.h" +#include "asm/int.h" + +#include + +#define core_is_compat(core) false + +typedef UserRiscv64RegsEntry UserRegsEntry; + +#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__RISCV64 + +#define CORE_THREAD_ARCH_INFO(core) core->ti_riscv64 + +#define TI_SP(core) ((core)->ti_riscv64->gpregs->sp) + +#define TI_IP(core) ((core)->ti_riscv64->gpregs->pc) + +static inline void *decode_pointer(uint64_t v) +{ + return (void *)v; +} +static inline uint64_t encode_pointer(void *p) +{ + return (uint64_t)p; +} + +#define AT_VECTOR_SIZE 64 +typedef uint64_t auxv_t; +typedef uint64_t tls_t; + +#endif /* __CR_ASM_TYPES_H__ */ diff --git a/criu/arch/riscv64/include/asm/vdso.h b/criu/arch/riscv64/include/asm/vdso.h new file mode 100644 index 000000000..322149c6e --- /dev/null +++ b/criu/arch/riscv64/include/asm/vdso.h @@ -0,0 +1,28 @@ +#ifndef __CR_ASM_VDSO_H__ +#define __CR_ASM_VDSO_H__ + +#include "asm/int.h" +#include "common/compiler.h" +#include "asm-generic/vdso.h" + +/* + * This is a minimal amount of symbols + * we should support at the moment. + */ +#define VDSO_SYMBOL_MAX 6 +#define VDSO_SYMBOL_GTOD 2 + +#define ARCH_VDSO_SYMBOLS_LIST \ + const char *rv64_vdso_symbol1 = "__vdso_clock_getres"; \ + const char *rv64_vdso_symbol2 = "__vdso_clock_gettime"; \ + const char *rv64_vdso_symbol3 = "__vdso_gettimeofday"; \ + const char *rv64_vdso_symbol4 = "__vdso_getcpu"; \ + const char *rv64_vdso_symbol5 = "__vdso_flush_icache"; \ + const char *rv64_vdso_symbol6 = "__vdso_rt_sigreturn"; + +#define ARCH_VDSO_SYMBOLS \ + rv64_vdso_symbol1, rv64_vdso_symbol2, rv64_vdso_symbol3, rv64_vdso_symbol4, rv64_vdso_symbol5, rv64_vdso_symbol6 + +extern void write_intraprocedure_branch(unsigned long to, unsigned long from); + +#endif /* __CR_ASM_VDSO_H__ */ \ No newline at end of file diff --git a/criu/arch/riscv64/restorer.c b/criu/arch/riscv64/restorer.c new file mode 100644 index 000000000..d605f048d --- /dev/null +++ b/criu/arch/riscv64/restorer.c @@ -0,0 +1,14 @@ +#include + +#include "restorer.h" +#include "asm/restorer.h" + +#include +#include "log.h" +#include +#include "cpu.h" + +int restore_nonsigframe_gpregs(UserRiscv64RegsEntry *r) +{ + return 0; +} diff --git a/criu/arch/riscv64/sigframe.c b/criu/arch/riscv64/sigframe.c new file mode 100644 index 000000000..8096fab66 --- /dev/null +++ b/criu/arch/riscv64/sigframe.c @@ -0,0 +1,8 @@ +#include "asm/types.h" +#include +#include "asm/sigframe.h" + +int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) +{ + return 0; +} diff --git a/criu/arch/riscv64/vdso-lookup.S b/criu/arch/riscv64/vdso-lookup.S new file mode 100644 index 000000000..50d4ecf08 --- /dev/null +++ b/criu/arch/riscv64/vdso-lookup.S @@ -0,0 +1,15 @@ +#include "common/asm/linkage.h" + +.section .text + +/* Expects t0 to hold the index into the lookup table. */ +GLOBAL(riscv_vdso_lookup) + /* Get the beginning of the lookup table */ + la t1, riscv_vdso_lookup_end + /* Scale the index */ + slli t0, t0, 3 + add t1, t0, t1 + ld t2, 0(t1) + jr t2 + +GLOBAL(riscv_vdso_lookup_end) \ No newline at end of file diff --git a/criu/arch/riscv64/vdso-pie.c b/criu/arch/riscv64/vdso-pie.c new file mode 100644 index 000000000..aa9272fb5 --- /dev/null +++ b/criu/arch/riscv64/vdso-pie.c @@ -0,0 +1,159 @@ +#include + +#include "asm/types.h" + +#include +#include +#include +#include +#include "atomic.h" +#include "parasite-vdso.h" +#include "log.h" +#include "common/bug.h" + +#ifdef LOG_PREFIX +#undef LOG_PREFIX +#endif +#define LOG_PREFIX "vdso: " + +/* These symbols are defined in vdso-lookup.S */ +extern char *riscv_vdso_lookup, *riscv_vdso_lookup_end; + +/* + * li t0, INDEX + * jal x0, riscv_vdso_lookup + */ +#define TRAMP_CALL_SIZE (2 * sizeof(uint32_t)) + +static inline void invalidate_caches(void) +{ + // We're supposed to use the VDSO as the officially sanctioned ABI. But oh well. + int ret; + __smp_mb(); + asm volatile("li a0, 0\n" + "li a1, 0\n" + "li a2, 1\n" /* SYS_RISCV_FLUSH_ICACHE_ALL */ + "li a7, 259\n" /* __NR_arch_specific_syscall */ + "ecall\n" + : "=r"(ret) + : + : "a7"); +} + +static inline size_t vdso_trampoline_size(void) +{ + return (size_t)&riscv_vdso_lookup_end - (size_t)&riscv_vdso_lookup; +} + +static uint64_t put_trampoline(uint64_t at, struct vdso_symtable *sym) +{ + int i, j; + uint64_t total_size, trampoline_size; + uint64_t trampoline = 0; + + /* First of all we have to find a place where to put the trampoline + * code. + */ + trampoline_size = vdso_trampoline_size(); + total_size = trampoline_size + VDSO_SYMBOL_MAX * sizeof(uint64_t); + + for (i = 0; i < ARRAY_SIZE(sym->symbols); i++) { + if (vdso_symbol_empty(&sym->symbols[i])) + continue; + + pr_debug("Checking '%s' at %lx\n", sym->symbols[i].name, sym->symbols[i].offset); + + /* find the nearest following symbol we are interested in */ + for (j = 0; j < ARRAY_SIZE(sym->symbols); j++) { + if (i == j || vdso_symbol_empty(&sym->symbols[j])) + continue; + + if (sym->symbols[j].offset <= sym->symbols[i].offset) + /* this symbol is above the current one */ + continue; + + if ((sym->symbols[i].offset + TRAMP_CALL_SIZE) > sym->symbols[j].offset) { + /* we have a major issue here since we cannot + * even put the trampoline call for this symbol + */ + pr_err("Can't handle small vDSO symbol %s\n", sym->symbols[i].name); + return 0; + } + + if (trampoline) + /* no need to put it twice */ + continue; + + if ((sym->symbols[j].offset - (sym->symbols[i].offset + TRAMP_CALL_SIZE)) <= total_size) + /* not enough place */ + continue; + + /* We can put the trampoline there */ + trampoline = at + sym->symbols[i].offset; + trampoline += TRAMP_CALL_SIZE; + + pr_debug("Putting vDSO trampoline in %s at %lx\n", sym->symbols[i].name, trampoline); + memcpy((void *)trampoline, &riscv_vdso_lookup, trampoline_size); + invalidate_caches(); + return trampoline; + } + } + + return 0; +} + +static inline void put_trampoline_call(uint64_t from, uint64_t to, uint64_t trampoline, unsigned int idx) +{ + size_t trampoline_size = vdso_trampoline_size(); + uint64_t *lookup_table = NULL; + /* + * li t0, INDEX + * addi t0, x0 INDEX + * jal x0, riscv_vdso_lookup + */ + uint32_t trampoline_call[2] = { + 0x00000293, + 0x0000006f, + }; + const size_t insts_len = ARRAY_SIZE(trampoline_call); + uint32_t *call_addr = (uint32_t *)from; + // Offset from the jal instruction to the lookup trampoline. + ssize_t trampoline_offset = trampoline - (from + sizeof(uint32_t)); + + trampoline_call[0] = trampoline_call[0] | (idx << 24); + trampoline_call[1] = trampoline_call[1] | riscv_j_imm(trampoline_offset); + + for (unsigned int i = 0; i < insts_len; i++) { + call_addr[i] = trampoline_call[i]; + } + + // Set the lookup table pointer for this vdso symbol. + lookup_table = (uint64_t *)(trampoline + trampoline_size); + lookup_table[idx] = to; +} + +int vdso_redirect_calls(uint64_t base_to, uint64_t base_from, struct vdso_symtable *to, struct vdso_symtable *from, + bool __always_unused compat_vdso) +{ + unsigned int i, valid_idx = 0; + + uint64_t trampoline = (uint64_t)put_trampoline(base_from, from); + if (!trampoline) + return 1; + + for (i = 0; i < ARRAY_SIZE(to->symbols); i++) { + if (vdso_symbol_empty(&from->symbols[i])) + continue; + + pr_debug("br: %lx/%lx -> %lx/%lx (index %d) '%s'\n", base_from, from->symbols[i].offset, base_to, + to->symbols[i].offset, i, from->symbols[i].name); + + put_trampoline_call(base_from + from->symbols[i].offset, base_to + to->symbols[i].offset, trampoline, + valid_idx); + valid_idx++; + } + + invalidate_caches(); + + return 0; +} \ No newline at end of file diff --git a/criu/arch/s390/cpu.c b/criu/arch/s390/cpu.c index 3f430f455..e227fad5e 100644 --- a/criu/arch/s390/cpu.c +++ b/criu/arch/s390/cpu.c @@ -87,6 +87,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return -1; + } + ret = 0; if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; diff --git a/criu/arch/s390/crtools.c b/criu/arch/s390/crtools.c index 5cf160d82..e08c83878 100644 --- a/criu/arch/s390/crtools.c +++ b/criu/arch/s390/crtools.c @@ -142,6 +142,29 @@ static void print_core_fp_regs(const char *msg, CoreEntry *core) print_core_ri_cb(core); } +/* + * Allocate floating point registers + */ +static UserS390FpregsEntry *allocate_fp_regs(void) +{ + UserS390FpregsEntry *fpregs; + + fpregs = xmalloc(sizeof(*fpregs)); + if (!fpregs) + return NULL; + user_s390_fpregs_entry__init(fpregs); + + fpregs->n_fprs = 16; + fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); + if (!fpregs->fprs) + goto fail_free_fpregs; + return fpregs; + +fail_free_fpregs: + xfree(fpregs); + return NULL; +} + /* * Allocate VxrsLow registers */ @@ -282,7 +305,7 @@ static void free_ri_cb(UserS390RiEntry *ri_cb) /* * Copy internal structures into Google Protocol Buffers */ -int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) +int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) { UserS390VxrsHighEntry *vxrs_high = NULL; UserS390VxrsLowEntry *vxrs_low = NULL; @@ -294,7 +317,13 @@ int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) CoreEntry *core = arg; gpregs = CORE_THREAD_ARCH_INFO(core)->gpregs; - fpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; + /* + * We delay allocating this until now because checkpointing can fail earlier. + * When it fails we need to know if we reached here or not so that the cleanup + * code doesn't restore FPRs that were never saved in the first place. + */ + fpregs = allocate_fp_regs(); + CORE_THREAD_ARCH_INFO(core)->fpregs = fpregs; /* Vector registers */ if (f->flags & USER_FPREGS_VXRS) { @@ -399,36 +428,15 @@ int restore_fpu(struct rt_sigframe *f, CoreEntry *core) return 0; } -/* - * Allocate floating point registers - */ -static UserS390FpregsEntry *allocate_fp_regs(void) -{ - UserS390FpregsEntry *fpregs; - - fpregs = xmalloc(sizeof(*fpregs)); - if (!fpregs) - return NULL; - user_s390_fpregs_entry__init(fpregs); - - fpregs->n_fprs = 16; - fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); - if (!fpregs->fprs) - goto fail_free_fpregs; - return fpregs; - -fail_free_fpregs: - xfree(fpregs); - return NULL; -} - /* * Free floating point registers */ static void free_fp_regs(UserS390FpregsEntry *fpregs) { - xfree(fpregs->fprs); - xfree(fpregs); + if (fpregs) { + xfree(fpregs->fprs); + xfree(fpregs); + } } /* @@ -487,15 +495,17 @@ int arch_alloc_thread_info(CoreEntry *core) ti_s390->gpregs = allocate_gp_regs(); if (!ti_s390->gpregs) goto fail_free_ti_s390; - ti_s390->fpregs = allocate_fp_regs(); - if (!ti_s390->fpregs) - goto fail_free_gp_regs; + + /* + * Delay allocating space until needed. Checkpointing can fail before that + * and the cleanup code needs to be able to tell if FPRs were saved or not + * before trying to restore the register state. + */ + ti_s390->fpregs = NULL; CORE_THREAD_ARCH_INFO(core) = ti_s390; return 0; -fail_free_gp_regs: - free_gp_regs(ti_s390->gpregs); fail_free_ti_s390: xfree(ti_s390); return -1; @@ -678,14 +688,18 @@ static int set_task_regs(pid_t pid, CoreEntry *core) user_fpregs_struct_t fpregs; memset(&fpregs, 0, sizeof(fpregs)); - /* Floating point registers */ + /* + * Floating point registers + * Optional on checkpoint; checkpoint may have failed and we may reach here as part of cleanup + * so there's no guarantee that we saved FPRs for this thread. + */ cfpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; - if (!cfpregs) - return -1; - fpregs.prfpreg.fpc = cfpregs->fpc; - memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); - if (set_fp_regs(pid, &fpregs) < 0) - return -1; + if (cfpregs) { + fpregs.prfpreg.fpc = cfpregs->fpc; + memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); + if (set_fp_regs(pid, &fpregs) < 0) + return -1; + } /* Vector registers (optional) */ cvxrs_low = CORE_THREAD_ARCH_INFO(core)->vxrs_low; if (cvxrs_low != NULL) { diff --git a/criu/arch/s390/include/asm/dump.h b/criu/arch/s390/include/asm/dump.h index c200724d7..5a24c5b3d 100644 --- a/criu/arch/s390/include/asm/dump.h +++ b/criu/arch/s390/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f); +int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f); int arch_alloc_thread_info(CoreEntry *core); void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/x86/Makefile b/criu/arch/x86/Makefile index 618e85bb3..46f00e9e9 100644 --- a/criu/arch/x86/Makefile +++ b/criu/arch/x86/Makefile @@ -9,6 +9,7 @@ obj-y += cpu.o obj-y += crtools.o obj-y += kerndat.o obj-y += sigframe.o +obj-y += shstk.o ifeq ($(CONFIG_COMPAT),y) obj-y += sigaction_compat.o endif diff --git a/criu/arch/x86/cpu.c b/criu/arch/x86/cpu.c index dfa31569f..2e1f2de9a 100644 --- a/criu/arch/x86/cpu.c +++ b/criu/arch/x86/cpu.c @@ -407,6 +407,12 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; + if (empty_image(img)) { + pr_err("No cpuinfo image\n"); + close_image(img); + return -1; + } + if (pb_read_one(img, &img_cpu_info, PB_CPUINFO) < 0) goto err; diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c index d10e51e48..1f4d0736b 100644 --- a/criu/arch/x86/crtools.c +++ b/criu/arch/x86/crtools.c @@ -15,7 +15,7 @@ #define XSAVE_PB_NELEMS(__s, __obj, __member) (sizeof(__s) / sizeof(*(__obj)->__member)) -int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; UserX86RegsEntry *gpregs = core->thread_info->gpregs; @@ -133,6 +133,14 @@ int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpre #undef assign_array #undef assign_xsave + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + UserX86CetEntry *cet = core->thread_info->fpregs->xsave->cet; + struct cet_user_state *regs = &fpregs->cet; + + cet->cet = regs->cet; + cet->ssp = regs->ssp; + } + return 0; } @@ -199,6 +207,13 @@ static int alloc_xsave_extends(UserX86XsaveEntry *xsave) goto err; } + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + xsave->cet = xzalloc(sizeof(UserX86CetEntry)); + if (!xsave->cet) + goto err; + user_x86_cet_entry__init(xsave->cet); + } + return 0; err: return -1; @@ -220,6 +235,8 @@ int arch_alloc_thread_info(CoreEntry *core) with_xsave = compel_cpu_has_feature(X86_FEATURE_OSXSAVE); if (with_xsave) sz += sizeof(UserX86XsaveEntry); + if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) + sz += sizeof(UserX86CetEntry); } m = xmalloc(sz); @@ -433,7 +450,7 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) #define assign_array(dst, src, e) memcpy(dst.e, (src)->e, sizeof(dst.e)) #define assign_xsave(feature, xsave, member, area) \ do { \ - if (compel_fpu_has_feature(feature)) { \ + if (compel_fpu_has_feature(feature) && (xsave->xstate_bv & (1UL << feature))) { \ uint32_t off = compel_fpu_feature_offset(feature); \ void *to = &area[off]; \ void *from = xsave->member; \ diff --git a/criu/arch/x86/include/asm/compat.h b/criu/arch/x86/include/asm/compat.h index 867357fa2..4ca704fd7 100644 --- a/criu/arch/x86/include/asm/compat.h +++ b/criu/arch/x86/include/asm/compat.h @@ -11,6 +11,8 @@ #include +#include "log.h" + static inline void *alloc_compat_syscall_stack(void) { void *mem = (void *)sys_mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, diff --git a/criu/arch/x86/include/asm/dump.h b/criu/arch/x86/include/asm/dump.h index 192f6bd02..925ea91ff 100644 --- a/criu/arch/x86/include/asm/dump.h +++ b/criu/arch/x86/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); extern int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info); diff --git a/criu/arch/x86/include/asm/kerndat.h b/criu/arch/x86/include/asm/kerndat.h index 903bc80f7..5c3717230 100644 --- a/criu/arch/x86/include/asm/kerndat.h +++ b/criu/arch/x86/include/asm/kerndat.h @@ -4,5 +4,6 @@ extern int kdat_compatible_cr(void); extern int kdat_can_map_vdso(void); extern int kdat_x86_has_ptrace_fpu_xsave_bug(void); +extern int kdat_has_shstk(void); #endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/x86/include/asm/restorer.h b/criu/arch/x86/include/asm/restorer.h index f7a6d5058..3a673958d 100644 --- a/criu/arch/x86/include/asm/restorer.h +++ b/criu/arch/x86/include/asm/restorer.h @@ -8,6 +8,7 @@ #include #include #include "asm/compat.h" +#include "asm/shstk.h" #ifdef CONFIG_COMPAT extern void restore_tls(tls_t *ptls); diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h new file mode 100644 index 000000000..d113fd8ab --- /dev/null +++ b/criu/arch/x86/include/asm/shstk.h @@ -0,0 +1,304 @@ +#ifndef __CR_ASM_SHSTK_H__ +#define __CR_ASM_SHSTK_H__ + +/* + * Shadow stack constants from Linux + */ +/* arch/x86/include/uapi/asm/mman.h */ +#ifndef SHADOW_STACK_SET_TOKEN +#define SHADOW_STACK_SET_TOKEN 0x1 /* Set up a restore token in the shadow stack */ +#endif + +/* arch/x86/include/uapi/asm/prctl.h */ +#define ARCH_SHSTK_ENABLE 0x5001 +#define ARCH_SHSTK_DISABLE 0x5002 +#define ARCH_SHSTK_LOCK 0x5003 +#define ARCH_SHSTK_UNLOCK 0x5004 +#define ARCH_SHSTK_STATUS 0x5005 + +#define ARCH_SHSTK_SHSTK (1ULL << 0) +#define ARCH_SHSTK_WRSS (1ULL << 1) + +#define ARCH_HAS_SHSTK + +/* from arch/x86/kernel/shstk.c */ +#define SHSTK_DATA_BIT (1UL << 63) /* BIT(63) */ + +/* + * Shadow stack memory cannot be restored with memcpy/pread but only using + * a special instruction that can write to shadow stack. + * That instruction is only available when shadow stack is enabled, + * otherwise it causes #UD. + * + * Also, shadow stack VMAs cannot be mmap()ed or mrepmap()ed, they must be + * created using map_shadow_stack() system call. This pushes creation of + * shadow stack VMAs to the restorer blob after CRIU mappings are freed. + * + * And there is an additional jungling with shadow stacks to ensure that we + * don't unmap an active shadow stack + * + * The overall sequence of restoring shadow stack is + * - Enable shadow stack early after clone()ing the task + * - Unlock shadow stack features using ptrace + * - In the restorer blob: + * - switch to a temporary shadow stack to be able to unmap shadow stack + * with the CRIU mappings + * - after memory mappigns are restored, recreate shadow stack VMAs, + * populate them using wrss instruction and switch to the task shadow + * stack + * - lock shadow stack features + */ +struct rst_shstk_info { + unsigned long vma_start; /* start of shadow stack VMA */ + unsigned long vma_size; /* size of shadow stack VMA */ + unsigned long premmaped_addr; /* address of shadow stack copy in + the premmaped area */ + unsigned long tmp_shstk; /* address of temporary shadow stack */ + u64 ssp; /* shadow stack pointer */ + u64 cet; /* CET conrtol state */ +}; +#define rst_shstk_info rst_shstk_info + +struct task_restore_args; +struct pstree_item; + +int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta); +#define arch_shstk_prepare arch_shstk_prepare + +int arch_shstk_unlock(struct pstree_item *item, CoreEntry *core, pid_t pid); +#define arch_shstk_unlock arch_shstk_unlock + +int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg); +#define arch_shstk_trampoline arch_shstk_trampoline + +static always_inline long shstk_restorer_stack_size(void) +{ + return PAGE_SIZE; +} +#define shstk_restorer_stack_size shstk_restorer_stack_size +static always_inline void shstk_set_restorer_stack(struct rst_shstk_info *info, void *ptr) +{ + info->tmp_shstk = (unsigned long)ptr; +} +#define shstk_set_restorer_stack shstk_set_restorer_stack + +static always_inline long shstk_min_mmap_addr(struct rst_shstk_info *info, unsigned long __maybe_unused def) +{ + return !(info->cet & ARCH_SHSTK_SHSTK) ? def : (4UL << 30); +} +#define shstk_min_mmap_addr shstk_min_mmap_addr + +#ifdef CR_NOGLIBC + +#include +#include +#include "vma.h" + +#define SHSTK_BUSY_BIT (1UL << 0) /* BIT(0) */ + +static inline int shstk_map(unsigned long addr, unsigned long size) +{ + long shstk = sys_map_shadow_stack(addr, size, SHADOW_STACK_SET_TOKEN); + + if (shstk < 0) { + pr_err("Failed to map shadow stack at %lx: %ld\n", addr, shstk); + return -1; + } + + if (shstk != addr) { + pr_err("Shadow stack address mismatch: need %lx, got %lx\n", addr, shstk); + return -1; + } + + pr_info("Created shadow stack at %lx\n", shstk); + + return 0; +} + +/* clang-format off */ +static inline unsigned long get_ssp(void) +{ + unsigned long ssp; + + asm volatile("rdsspq %0" : "=r"(ssp) :: ); + + return ssp; +} + +static inline void wrssq(unsigned long addr, unsigned long val) +{ + asm volatile("wrssq %1, (%0)" :: "r"(addr), "r"(val) : "memory"); +} +/* clang-format off */ + +static always_inline void shstk_switch_ssp(unsigned long new_ssp) +{ + unsigned long old_ssp = get_ssp(); + + asm volatile("rstorssp (%0)\n" :: "r"(new_ssp)); + asm volatile("saveprevssp"); + + pr_debug("changed ssp from %lx to %lx\n", old_ssp, new_ssp); +} + +/* + * Disable writes to the shadow stack and lock it's disable/enable control + */ +static inline int shstk_finalize(void) +{ + int ret = 0; + + ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_WRSS); + if (ret) { + pr_err("Failed to disable writes to shadow stack\n"); + return ret; + } + + ret = sys_arch_prctl(ARCH_SHSTK_LOCK, ARCH_SHSTK_SHSTK); + if (ret) + pr_err("Failed to lock shadow stack controls\n"); + + return ret; +} + +/* + * Create shadow stack vma and restore its content from premmapped anonymous (non-shstk) vma + */ +static always_inline int shstk_vma_restore(VmaEntry *vma_entry) +{ + long shstk, i; + unsigned long *shstk_data = (void *)vma_premmaped_start(vma_entry); + unsigned long vma_size = vma_entry_len(vma_entry); + long ret; + + shstk = sys_map_shadow_stack(0, vma_size, SHADOW_STACK_SET_TOKEN); + if (shstk < 0) { + pr_err("Failed to map shadow stack: %ld\n", shstk); + return -1; + } + + /* restore shadow stack contents */ + for (i = 0; i < vma_size / 8; i++) + wrssq(shstk + i * 8, shstk_data[i]); + + ret = sys_munmap(shstk_data, vma_size); + if (ret < 0) { + pr_err("Failed to unmap premmaped shadow stack\n"); + return ret; + } + + /* + * From that point premapped vma is (shstk) and we need + * to mremap() it to the final location. Originally premapped + * (shstk_data) has been unmapped already. + */ + vma_premmaped_start(vma_entry) = shstk; + + return 0; +} +#define shstk_vma_restore shstk_vma_restore + +/* + * Restore contents of the shadow stack and set shadow stack pointer + */ +static always_inline int shstk_restore(struct rst_shstk_info *cet) +{ + unsigned long ssp, val; + + if (!(cet->cet & ARCH_SHSTK_SHSTK)) + return 0; + + /* + * Add tokens for sigreturn frame and for switch of the shadow stack. + * The sigreturn token will be checked by the kernel during + * processing of sigreturn + * The token for stack switch is required by rstorssp and + * saveprevssp semantics + */ + + /* token for sigreturn frame */ + ssp = cet->ssp - 8; + val = ALIGN_DOWN(cet->ssp, 8) | SHSTK_DATA_BIT; + wrssq(ssp, val); + + /* shadow stack switch token */ + val = ssp | SHSTK_BUSY_BIT; + ssp -= 8; + wrssq(ssp, val); + + /* reset shadow stack pointer to the proper location */ + shstk_switch_ssp(ssp); + + return shstk_finalize(); +} +#define arch_shstk_restore shstk_restore + +/* + * Disable shadow stack + */ +static inline int shstk_disable(void) +{ + int ret; + + ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_WRSS); + if (ret) { + pr_err("Failed to disable writes to shadow stack\n"); + return ret; + } + + ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); + if (ret) { + pr_err("Failed to disable shadow stack\n"); + return ret; + } + + ret = sys_arch_prctl(ARCH_SHSTK_LOCK, ARCH_SHSTK_SHSTK); + if (ret) + pr_err("Failed to lock shadow stack controls\n"); + + return 0; +} + +/* + * Switch to temporary shadow stack + */ +static always_inline int shstk_switch_to_restorer(struct rst_shstk_info *cet) +{ + unsigned long ssp; + long ret; + + if (!(cet->cet & ARCH_SHSTK_SHSTK)) + return 0; + + ret = sys_munmap((void *)cet->tmp_shstk, PAGE_SIZE); + if (ret < 0) { + pr_err("Failed to unmap area for temporary shadow stack\n"); + return -1; + } + + ret = shstk_map(cet->tmp_shstk, PAGE_SIZE); + if (ret < 0) + return -1; + + /* + * Switch shadow stack from the default created by the kernel to a + * temporary shadow stack allocated in the premmaped area + */ + ssp = cet->tmp_shstk + PAGE_SIZE - 8; + shstk_switch_ssp(ssp); + + ret = sys_arch_prctl(ARCH_SHSTK_ENABLE, ARCH_SHSTK_WRSS); + if (ret) { + pr_err("Failed to enable writes to shadow stack\n"); + return ret; + } + + return 0; +} +#define arch_shstk_switch_to_restorer shstk_switch_to_restorer + +#endif /* CR_NOGLIBC */ + +#endif /* __CR_ASM_SHSTK_H__ */ diff --git a/criu/arch/x86/include/asm/vdso.h b/criu/arch/x86/include/asm/vdso.h index 3b3f292bd..ca46374a5 100644 --- a/criu/arch/x86/include/asm/vdso.h +++ b/criu/arch/x86/include/asm/vdso.h @@ -12,7 +12,7 @@ * This is a minimal amount of symbols * we should support at the moment. */ -#define VDSO_SYMBOL_MAX 6 +#define VDSO_SYMBOL_MAX 7 #define VDSO_SYMBOL_GTOD 2 /* @@ -42,11 +42,12 @@ const char *aarch_vdso_symbol3 = "__vdso_gettimeofday"; \ const char *aarch_vdso_symbol4 = "__vdso_time"; \ const char *aarch_vdso_symbol5 = "__kernel_sigreturn"; \ - const char *aarch_vdso_symbol6 = "__kernel_rt_sigreturn"; + const char *aarch_vdso_symbol6 = "__kernel_rt_sigreturn"; \ + const char *aarch_vdso_symbol7 = "__vdso_clock_gettime64"; \ #define ARCH_VDSO_SYMBOLS \ aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4, aarch_vdso_symbol5, \ - aarch_vdso_symbol6 + aarch_vdso_symbol6, aarch_vdso_symbol7 /* "__kernel_vsyscall", */ diff --git a/criu/arch/x86/kerndat.c b/criu/arch/x86/kerndat.c index a98797d39..3a58bbea7 100644 --- a/criu/arch/x86/kerndat.c +++ b/criu/arch/x86/kerndat.c @@ -17,6 +17,7 @@ #include "asm/compat.h" #include "asm/dump.h" +#include "asm/shstk.h" int kdat_can_map_vdso(void) { @@ -251,3 +252,29 @@ out_kill: return ret; } + +/* + * Unlike most kerndat knobs, this does not check for availability of the + * shadow stack in the kernel, but rather checks if criu runs with shadow + * stack enabled. + * + * This depends on hardware availability, kernel and glibc support, compiler + * options and glibc tunables. + */ +int kdat_has_shstk(void) +{ + unsigned long features; + + if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) + return 0; + + if (syscall(__NR_arch_prctl, ARCH_SHSTK_STATUS, &features)) { + /* kernels that don't support shadow stack return -EINVAL */ + if (errno == EINVAL) + return 0; + pr_perror("Cannot get shadow stack status"); + return 1; + } + + return !!(features & ARCH_SHSTK_SHSTK); +} diff --git a/criu/arch/x86/shstk.c b/criu/arch/x86/shstk.c new file mode 100644 index 000000000..0810efac5 --- /dev/null +++ b/criu/arch/x86/shstk.c @@ -0,0 +1,222 @@ +#include +#include + +#include + +#include + +#include "pstree.h" +#include "restorer.h" +#include "rst-malloc.h" +#include "vma.h" + +static bool task_needs_shstk(struct pstree_item *item, CoreEntry *core) +{ + UserX86FpregsEntry *fpregs; + + if (!task_alive(item)) + return false; + + fpregs = core->thread_info->fpregs; + if (fpregs->xsave && fpregs->xsave->cet) { + if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + pr_warn_once("Restoring task with shadow stack on non-CET machine\n"); + return false; + } + + if (fpregs->xsave->cet->cet & ARCH_SHSTK_SHSTK) + return true; + } + + return false; +} + +static int shstk_prepare_task(struct vm_area_list *vmas, + struct rst_shstk_info *shstk) +{ + struct vma_area *vma; + + list_for_each_entry(vma, &vmas->h, list) { + if (vma_area_is(vma, VMA_AREA_SHSTK) && + in_vma_area(vma, shstk->ssp)) { + unsigned long premmaped_addr = vma->premmaped_addr; + unsigned long size = vma_area_len(vma); + + shstk->vma_start = vma->e->start; + shstk->vma_size = size; + shstk->premmaped_addr = premmaped_addr; + + break; + } + } + + return 0; +} + +int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta) +{ + struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]); + UserX86FpregsEntry *fpregs = core->thread_info->fpregs; + struct vm_area_list *vmas = &rsti(item)->vmas; + struct rst_shstk_info *shstk = &ta->shstk; + int i; + + if (!task_needs_shstk(item, core)) + return 0; + + shstk->cet = fpregs->xsave->cet->cet; + shstk->ssp = fpregs->xsave->cet->ssp; + + if (shstk_prepare_task(vmas, shstk)) { + pr_err("Failed to prepare shadow stack memory\n"); + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + struct thread_restore_args *thread_args = &args_array[i]; + + core = item->core[i]; + fpregs = core->thread_info->fpregs; + shstk = &thread_args->shstk; + + shstk->cet = fpregs->xsave->cet->cet; + shstk->ssp = fpregs->xsave->cet->ssp; + if (shstk_prepare_task(vmas, shstk)) { + pr_err("Failed to prepare shadow stack memory\n"); + return -1; + } + } + + return 0; +} + +int arch_shstk_unlock(struct pstree_item *item, CoreEntry *core, pid_t pid) +{ + unsigned long features; + int status; + int ret = -1; + + /* + * CRIU runs with no shadow stack and the task does not need one, + * nothing to do. + */ + if (!kdat.has_shstk && !task_needs_shstk(item, core)) + return 0; + + futex_wait_until(&rsti(item)->shstk_enable, 1); + + if (ptrace(PTRACE_SEIZE, pid, 0, 0)) { + pr_perror("Cannot attach to %d", pid); + goto futex_wake; + } + + if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { + pr_perror("Cannot interrupt the %d task", pid); + goto detach; + } + + if (wait4(pid, &status, __WALL, NULL) != pid) { + pr_perror("waitpid(%d) failed", pid); + goto detach; + } + + features = ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS; + if (ptrace(PTRACE_ARCH_PRCTL, pid, features, ARCH_SHSTK_UNLOCK)) { + pr_perror("Cannot unlock CET for %d task", pid); + goto detach; + } + +detach: + if (ptrace(PTRACE_DETACH, pid, NULL, 0)) { + pr_perror("Unable to detach %d", pid); + goto futex_wake; + } + + ret = 0; + +futex_wake: + futex_set_and_wake(&rsti(item)->shstk_unlock, 1); + + return ret; +} + +static void shstk_sync_unlock(struct pstree_item *item) +{ + /* notify parent that shadow stack is enabled ... */ + futex_set_and_wake(&rsti(item)->shstk_enable, 1); + + /* ... and wait until it unlocks its features with ptrace */ + futex_wait_until(&rsti(item)->shstk_unlock, 1); +} + +static void __arch_shstk_enable(struct pstree_item *item, + int (*func)(void *arg), void *arg) +{ + int ret; + + shstk_sync_unlock(item); + + /* return here would cause #CP, use exit() instead */ + ret = func(arg); + exit(ret); +} + +static int shstk_disable(struct pstree_item *item) +{ + shstk_sync_unlock(item); + + /* disable shadow stack, implicitly clears ARCH_SHSTK_WRSS */ + if (syscall(__NR_arch_prctl, ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK)) { + pr_perror("Failed to disable shadow stack"); + return -1; + } + + if (syscall(__NR_arch_prctl, ARCH_SHSTK_LOCK, + ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS)) { + pr_perror("Failed to lock shadow stack controls"); + return -1; + } + + return 0; +} + +int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg) +{ + unsigned long features = ARCH_SHSTK_SHSTK; + int code = ARCH_SHSTK_ENABLE; + + /* + * If task does not need shadow stack but CRIU runs with shadow + * stack enabled, we should disable it before continuing with + * restore + */ + if (!task_needs_shstk(item, core)) { + if (kdat.has_shstk && shstk_disable(item)) + return -1; + return func(arg); + } + + /* + * Calling sys_arch_prctl() means there will be use of retq + * instruction after shadow stack is enabled and this will cause + * Control Protectiond fault. Open code sys_arch_prctl() in + * assembly. + * + * code and addr should be in %rdi and %rsi and will be passed to + * the system call as is. + */ + asm volatile("movq $"__stringify(__NR_arch_prctl)", %%rax \n" + "syscall \n" + "cmpq $0, %%rax \n" + "je 1f \n" + "retq \n" + "1: \n" + :: "D"(code), "S"(features)); + + __arch_shstk_enable(item, func, arg); + + /* never reached */ + return -1; +} diff --git a/criu/arch/x86/sigframe.c b/criu/arch/x86/sigframe.c index 4fa7eb3dc..46612e70d 100644 --- a/criu/arch/x86/sigframe.c +++ b/criu/arch/x86/sigframe.c @@ -23,7 +23,7 @@ int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *r } sigframe->native.uc.uc_mcontext.fpstate = (uint64_t)addr; - } else if (!sigframe->is_native) { + } else { unsigned long addr = (unsigned long)(void *)&fpu_state->fpu_state_ia32.xsave; sigframe->compat.uc.uc_mcontext.fpstate = (uint32_t)(unsigned long)(void *)&fpu_state->fpu_state_ia32; if ((addr % 64ul)) { diff --git a/criu/autofs.c b/criu/autofs.c index c662bea60..a1775cbc9 100644 --- a/criu/autofs.c +++ b/criu/autofs.c @@ -431,8 +431,7 @@ static int access_autofs_mount(struct mount_info *pm) pr_err("failed to fork\n"); goto close_autofs_mnt; case 0: - /* We don't care about results. - * All we need is to "touch" */ + /* We don't care about results, all we need is to "touch" */ /* coverity[check_return] */ openat(autofs_mnt, mnt_path, O_RDONLY | O_NONBLOCK | O_DIRECTORY); _exit(0); @@ -659,7 +658,7 @@ static int autofs_mnt_make_catatonic(const char *mnt_path, int mnt_fd) static int autofs_mnt_set_timeout(time_t timeout, const char *mnt_path, int mnt_fd) { - pr_info("%s: set timeout %ld for %s\n", __func__, timeout, mnt_path); + pr_info("%s: set timeout %" PRId64 " for %s\n", __func__, (int64_t)timeout, mnt_path); return autofs_ioctl(mnt_path, mnt_fd, AUTOFS_IOC_SETTIMEOUT, &timeout); } @@ -771,7 +770,7 @@ static int autofs_post_mount(const char *mnt_path, dev_t mnt_dev, time_t timeout } if (autofs_mnt_set_timeout(timeout, mnt_path, mnt_fd)) { - pr_err("Failed to set timeout %ld for %s\n", timeout, mnt_path); + pr_err("Failed to set timeout %" PRId64 " for %s\n", (int64_t)timeout, mnt_path); return -1; } diff --git a/criu/cgroup-props.c b/criu/cgroup-props.c index 5bed7dd9d..1b85c5b5a 100644 --- a/criu/cgroup-props.c +++ b/criu/cgroup-props.c @@ -35,12 +35,29 @@ static const char *____criu_global_props____[] = { "tasks", }; +/* cgroup2 global properties */ +// clang-format off +static const char *____criu_global_props_v2____[] = { + "cgroup.subtree_control", + "cgroup.max.descendants", + "cgroup.max.depth", + "cgroup.freeze", + "cgroup.type", +}; +// clang-format on + cgp_t cgp_global = { .name = "____criu_global_props____", .nr_props = ARRAY_SIZE(____criu_global_props____), .props = ____criu_global_props____, }; +cgp_t cgp_global_v2 = { + .name = "____criu_global_props_v2____", + .nr_props = ARRAY_SIZE(____criu_global_props_v2____), + .props = ____criu_global_props_v2____, +}; + typedef struct { struct list_head list; cgp_t cgp; diff --git a/criu/cgroup.c b/criu/cgroup.c index e05b0832e..9246be639 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "common/list.h" #include "xmalloc.h" @@ -54,6 +55,7 @@ static u32 cg_set_ids = 1; static LIST_HEAD(cgroups); static unsigned int n_cgroups; +static pid_t cgroupd_pid; static CgSetEntry *find_rst_set_by_id(u32 id) { @@ -173,6 +175,7 @@ struct cg_controller *new_controller(const char *name) nc->n_controllers = 1; nc->n_heads = 0; + nc->is_threaded = false; INIT_LIST_HEAD(&nc->heads); return nc; @@ -245,7 +248,7 @@ static int find_dir(const char *path, struct list_head *dirs, struct cgroup_dir return EXACT_MATCH; } - if (strstartswith(path, d->path)) { + if (issubpath(path, d->path)) { int ret = find_dir(path, &d->children, rdir); if (ret == NO_MATCH) { *rdir = d; @@ -370,7 +373,8 @@ static void free_all_cgroup_props(struct cgroup_dir *ncd) ncd->n_properties = 0; } -static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const cgp_t *cgp) +static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const cgp_t *cgp, + struct cg_controller *controller) { int j; char buf[PATH_MAX]; @@ -421,6 +425,14 @@ static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const prop->value = new; } + /* + * Set the is_threaded flag if cgroup.type's value is threaded + * or it is a cgroup v1 (it has a 'tasks' property). + * Ignore all other values. + */ + if ((!strcmp("cgroup.type", prop->name) && !strcmp("threaded", prop->value)) || !strcmp("tasks", prop->name)) + controller->is_threaded = true; + pr_info("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name); list_add_tail(&prop->list, &ncd->properties); ncd->n_properties++; @@ -436,12 +448,20 @@ static int add_cgroup_properties(const char *fpath, struct cgroup_dir *ncd, stru for (i = 0; i < controller->n_controllers; ++i) { const cgp_t *cgp = cgp_get_props(controller->controllers[i]); - if (dump_cg_props_array(fpath, ncd, cgp) < 0) { + if (dump_cg_props_array(fpath, ncd, cgp, controller) < 0) { pr_err("dumping known properties failed\n"); return -1; } + } - if (dump_cg_props_array(fpath, ncd, &cgp_global) < 0) { + /* cgroup v2 */ + if (controller->controllers[0][0] == 0) { + if (dump_cg_props_array(fpath, ncd, &cgp_global_v2, controller) < 0) { + pr_err("dumping global properties v2 failed\n"); + return -1; + } + } else { + if (dump_cg_props_array(fpath, ncd, &cgp_global, controller) < 0) { pr_err("dumping global properties failed\n"); return -1; } @@ -560,14 +580,15 @@ static int __new_open_cgroupfs(struct cg_ctl *cc) int fsfd, fd; char *name; - fsfd = sys_fsopen(fstype, 0); + fsfd = cr_fsopen(fstype, 0); if (fsfd < 0) { pr_perror("Unable to open the cgroup file system"); return -1; } if (strstartswith(cc->name, namestr)) { - if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "name", cc->name + strlen(namestr), 0)) { + if (cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "name", cc->name + strlen(namestr), 0)) { + fsfd_dump_messages(fsfd); pr_perror("Unable to configure the cgroup (%s) file system", cc->name); goto err; } @@ -575,7 +596,8 @@ static int __new_open_cgroupfs(struct cg_ctl *cc) char *saveptr = NULL, *buf = strdupa(cc->name); name = strtok_r(buf, ",", &saveptr); while (name) { - if (sys_fsconfig(fsfd, FSCONFIG_SET_FLAG, name, NULL, 0)) { + if (cr_fsconfig(fsfd, FSCONFIG_SET_FLAG, name, NULL, 0)) { + fsfd_dump_messages(fsfd); pr_perror("Unable to configure the cgroup (%s) file system", name); goto err; } @@ -583,14 +605,17 @@ static int __new_open_cgroupfs(struct cg_ctl *cc) } } - if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) { + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) { + fsfd_dump_messages(fsfd); pr_perror("Unable to create the cgroup (%s) file system", cc->name); goto err; } - fd = sys_fsmount(fsfd, 0, 0); - if (fd < 0) + fd = cr_fsmount(fsfd, 0, 0); + if (fd < 0) { + fsfd_dump_messages(fsfd); pr_perror("Unable to mount the cgroup (%s) file system", cc->name); + } close(fsfd); return fd; @@ -619,8 +644,8 @@ static int open_cgroupfs(struct cg_ctl *cc) return -1; } - if (mount("none", prefix, fstype, 0, mopts) < 0) { - pr_perror("Unable to mount %s", mopts); + if (mount("none", prefix, fstype, 0, mopts[0] ? mopts : NULL) < 0) { + pr_perror("Unable to mount %s %s", fstype, mopts); rmdir(prefix); return -1; } @@ -694,6 +719,8 @@ static int collect_cgroups(struct list_head *ctls) } } else { fd = open_cgroupfs(cc); + if (fd < 0) + return -1; } path_pref_len = snprintf(path, PATH_MAX, "/proc/self/fd/%d", fd); @@ -726,20 +753,28 @@ static int collect_cgroups(struct list_head *ctls) return 0; } -int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_cgroup_args *args) +int dump_thread_cgroup(const struct pstree_item *item, u32 *cg_id, struct parasite_dump_cgroup_args *args, int id) { - int pid; + int pid, tid; LIST_HEAD(ctls); unsigned int n_ctls = 0; struct cg_set *cs; + if (opts.unprivileged) + return 0; + if (item) pid = item->pid->real; else pid = getpid(); - pr_info("Dumping cgroups for %d\n", pid); - if (parse_task_cgroup(pid, args, &ctls, &n_ctls)) + if (id < 0) + tid = pid; + else + tid = item->threads[id].real; + + pr_info("Dumping cgroups for thread %d\n", tid); + if (parse_thread_cgroup(pid, tid, args, &ctls, &n_ctls)) return -1; cs = get_cg_set(&ctls, n_ctls, item); @@ -752,9 +787,10 @@ int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_ pr_info("Set %d is criu one\n", cs->id); } else { if (item == root_item) { - BUG_ON(root_cgset); - root_cgset = cs; - pr_info("Set %d is root one\n", cs->id); + if (!root_cgset) { + root_cgset = cs; + pr_info("Set %d is root one\n", cs->id); + } } else { struct cg_ctl *root, *stray; @@ -901,6 +937,8 @@ static int dump_controllers(CgroupEntry *cg) list_for_each_entry(cur, &cgroups, l) { cg_controller_entry__init(ce); + ce->has_is_threaded = true; + ce->is_threaded = cur->is_threaded; ce->cnames = cur->controllers; ce->n_cnames = cur->n_controllers; ce->n_dirs = cur->n_heads; @@ -988,6 +1026,9 @@ int dump_cgroups(void) CgroupEntry cg = CGROUP_ENTRY__INIT; int ret = -1; + if (opts.unprivileged) + return 0; + BUG_ON(!criu_cgset || !root_cgset); /* @@ -1054,8 +1095,15 @@ static int ctrl_dir_and_opt(CgControllerEntry *ctl, char *dir, int ds, char *opt * it. We restore these properties as soon as the cgroup is created. */ static const char *special_props[] = { - "cpuset.cpus", "cpuset.mems", "devices.list", "memory.kmem.limit_in_bytes", - "memory.swappiness", "memory.oom_control", "memory.use_hierarchy", NULL, + "cpuset.cpus", + "cpuset.mems", + "devices.list", + "memory.kmem.limit_in_bytes", + "memory.swappiness", + "memory.oom_control", + "memory.use_hierarchy", + "cgroup.type", + NULL, }; bool is_special_property(const char *prop) @@ -1161,17 +1209,12 @@ static int prepare_cgns(CgSetEntry *se) return 0; } -static int move_in_cgroup(CgSetEntry *se, bool setup_cgns) +static int move_in_cgroup(CgSetEntry *se) { int i; pr_info("Move into %d\n", se->id); - if (setup_cgns && prepare_cgns(se) < 0) { - pr_err("failed preparing cgns\n"); - return -1; - } - for (i = 0; i < se->n_ctls; i++) { char aux[PATH_MAX]; int fd = -1, err, j, aux_off; @@ -1211,7 +1254,44 @@ static int move_in_cgroup(CgSetEntry *se, bool setup_cgns) return 0; } -int prepare_task_cgroup(struct pstree_item *me) +int prepare_cgroup_namespace(struct pstree_item *root_task) +{ + CgSetEntry *se; + + if (opts.manage_cgroups == CG_MODE_IGNORE) + return 0; + + if (root_task->parent) { + pr_err("Expecting root_task to restore cgroup namespace\n"); + return -1; + } + + /* + * If on dump all dumped tasks are in same cgset with criu we don't + * dump cgsets and thus cgroup namespaces and rely that on restore + * criu caller would prepare proper cgset/cgns for us. Also in case + * of --unprivileged we don't even have the root cgset here. + */ + if (!rsti(root_task)->cg_set || rsti(root_task)->cg_set == root_cg_set) { + pr_info("Cgroup namespace inherited from parent\n"); + return 0; + } + + se = find_rst_set_by_id(rsti(root_task)->cg_set); + if (!se) { + pr_err("No set %d found\n", rsti(root_task)->cg_set); + return -1; + } + + if (prepare_cgns(se) < 0) { + pr_err("failed preparing cgns\n"); + return -1; + } + + return 0; +} + +int restore_task_cgroup(struct pstree_item *me) { struct pstree_item *parent = me->parent; CgSetEntry *se; @@ -1243,13 +1323,7 @@ int prepare_task_cgroup(struct pstree_item *me) return -1; } - /* Since don't support nesting of cgroup namespaces, let's only set up - * the cgns (if it exists) in the init task. In the future, we should - * just check that the cgns prefix string matches for all the entries - * in the cgset, and only unshare if that's true. - */ - - return move_in_cgroup(se, !me->parent); + return move_in_cgroup(se); } void fini_cgroup(void) @@ -1268,39 +1342,75 @@ void fini_cgroup(void) cg_yard = NULL; } -static int restore_perms(int fd, const char *path, CgroupPerms *perms) +static int add_subtree_control_prop_prefix(char *input, char *output, char prefix) { - struct stat sb; + char *current, *next; + size_t len, off = 0; - if (perms) { - if (fstat(fd, &sb) < 0) { - pr_perror("stat of property %s failed", path); - return -1; - } + current = input; + do { + next = strchrnul(current, ' '); + len = next - current; - /* only chmod/chown if the perms are actually different: we aren't - * allowed to chmod some cgroup props (e.g. the read only ones), so we - * don't want to try if the perms already match. - */ - if (sb.st_mode != (mode_t)perms->mode && fchmod(fd, perms->mode) < 0) { - pr_perror("chmod of %s failed", path); - return -1; - } + output[off] = prefix; + off++; + memcpy(output + off, current, len); + off += len; + output[off] = ' '; + off++; - if ((sb.st_uid != perms->uid || sb.st_gid != perms->gid) && fchown(fd, perms->uid, perms->gid)) { - pr_perror("chown of %s failed", path); - return -1; - } + current = next + 1; + } while (*next != '\0'); + + return off; +} + +static int restore_cgroup_subtree_control(const CgroupPropEntry *cg_prop_entry_p, int fd) +{ + char buf[1024]; + char line[1024]; + int ret, off = 0; + + ret = read(fd, buf, sizeof(buf) - 1); + if (ret < 0) { + pr_perror("read from cgroup.subtree_control"); + return ret; + } + /* Remove the trailing newline */ + buf[ret] = '\0'; + + /* Remove all current subsys in subtree_control */ + if (buf[0] != '\0') + off = add_subtree_control_prop_prefix(buf, line, '-'); + + /* Add subsys need to be restored in subtree_control */ + if (cg_prop_entry_p->value[0] != '\0') + off += add_subtree_control_prop_prefix(cg_prop_entry_p->value, line + off, '+'); + + /* Remove the trailing space */ + if (off != 0) { + off--; + line[off] = '\0'; + } + + if (write(fd, line, off) != off) { + pr_perror("write to cgroup.subtree_control"); + return -1; } return 0; } +/* + * Note: The path string can be modified in this function, + * the length of path string should be at least PATH_MAX. + */ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *path, int off, bool split_lines, bool skip_fails) { - int cg, fd, ret = -1; + int cg, fd, exit_code = -1, flag; CgroupPerms *perms = cg_prop_entry_p->perms; + int is_subtree_control = !strcmp(cg_prop_entry_p->name, "cgroup.subtree_control"); if (opts.manage_cgroups == CG_MODE_IGNORE) return 0; @@ -1317,19 +1427,35 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *pat pr_info("Restoring cgroup property value [%s] to [%s]\n", cg_prop_entry_p->value, path); + if (is_subtree_control) + flag = O_RDWR; + else + flag = O_WRONLY; + cg = get_service_fd(CGROUP_YARD); - fd = openat(cg, path, O_WRONLY); + fd = openat(cg, path, flag); if (fd < 0) { pr_perror("bad cgroup path: %s", path); return -1; } - if (restore_perms(fd, path, perms) < 0) + if (perms && cr_fchperm(fd, perms->uid, perms->gid, perms->mode) < 0) goto out; /* skip these two since restoring their values doesn't make sense */ if (!strcmp(cg_prop_entry_p->name, "cgroup.procs") || !strcmp(cg_prop_entry_p->name, "tasks")) { - ret = 0; + exit_code = 0; + goto out; + } + + if (is_subtree_control) { + exit_code = restore_cgroup_subtree_control(cg_prop_entry_p, fd); + goto out; + } + + /* skip restoring cgroup.type if its value is not "threaded" */ + if (!strcmp(cg_prop_entry_p->name, "cgroup.type") && strcmp(cg_prop_entry_p->value, "threaded")) { + exit_code = 0; goto out; } @@ -1351,21 +1477,28 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *pat } while (*next_line != '\0'); } else { size_t len = strlen(cg_prop_entry_p->value); + int ret; - if (write(fd, cg_prop_entry_p->value, len) != len) { + ret = write(fd, cg_prop_entry_p->value, len); + /* memory.kmem.limit_in_bytes has been deprecated. Look at + * 58056f77502f3 ("memcg, kmem: further deprecate + * kmem.limit_in_bytes") for more details. */ + if (ret == -1 && errno == EOPNOTSUPP && + !strcmp(cg_prop_entry_p->name, "memory.kmem.limit_in_bytes")) + ret = len; + if (ret != len) { pr_perror("Failed writing %s to %s", cg_prop_entry_p->value, path); if (!skip_fails) goto out; } } - ret = 0; - + exit_code = 0; out: if (close(fd) != 0) pr_perror("Failed closing %s", path); - return ret; + return exit_code; } static CgroupPropEntry *freezer_state_entry; @@ -1630,7 +1763,7 @@ static int restore_special_props(char *paux, size_t off, CgroupDirEntry *e) static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms) { - int fd, ret; + int fd, ret = 0; fd = openat(cg, path, O_DIRECTORY); if (fd < 0) { @@ -1638,7 +1771,8 @@ static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms) return -1; } - ret = restore_perms(fd, path, perms); + if (perms) + ret = cr_fchperm(fd, perms->uid, perms->gid, perms->mode); close(fd); return ret; } @@ -1677,12 +1811,9 @@ static int prepare_cgroup_dirs(char **controllers, int n_controllers, char *paux return -1; for (j = 0; j < n_controllers; j++) { - if (!strcmp(controllers[j], "cpuset") || !strcmp(controllers[j], "memory") || - !strcmp(controllers[j], "devices")) { - if (restore_special_props(paux, off2, e) < 0) { - pr_err("Restoring special cpuset props failed!\n"); - return -1; - } + if (restore_special_props(paux, off2, e) < 0) { + pr_err("Restoring special cpuset props failed!\n"); + return -1; } } } else { @@ -1796,7 +1927,7 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) if (ctrl->cnames[0][0] == 0) fstype = "cgroup2"; - pr_debug("\tMaking controller dir %s (%s)\n", paux, opt); + pr_debug("\tMaking controller dir %s (%s), type %s\n", paux, opt, fstype); if (mkdir(paux, 0700)) { pr_perror("\tCan't make controller dir %s", paux); return -1; @@ -1820,6 +1951,161 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) return 0; } +static int cgroupd_unblock_sigterm(void) +{ + sigset_t unblockmask; + + sigemptyset(&unblockmask); + sigaddset(&unblockmask, SIGTERM); + + if (sigprocmask(SIG_UNBLOCK, &unblockmask, NULL)) { + pr_perror("cgroupd: can't unblock SIGTERM"); + return -1; + } + + return 0; +} + +/* + * If a thread is a different cgroup set than the main thread in process, + * it means it is in a threaded controller. This daemon receives the cg_set + * number from the restored thread and move this thread to the correct + * cgroup controllers + */ +static int cgroupd(int sk) +{ + /* + * This pairs with SIGTERM in stop_cgroupd(), and ensures that cgroupd + * will receive termination signal, regardless of which signal block + * mask was inherited. + */ + if (cgroupd_unblock_sigterm()) + return -1; + + pr_info("cgroud: Daemon started\n"); + + while (1) { + struct unsc_msg um; + uns_call_t call; + pid_t tid; + int fd, cg_set, i; + CgSetEntry *cg_set_entry; + int ret; + + unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, NULL); + ret = recvmsg(sk, &um.h, 0); + if (ret <= 0) { + pr_perror("cgroupd: recv req error"); + return -1; + } + + unsc_msg_pid_fd(&um, &tid, &fd); + pr_debug("cgroupd: move process %d into cg_set %d\n", tid, cg_set); + + cg_set_entry = find_rst_set_by_id(cg_set); + if (!cg_set_entry) { + pr_err("cgroupd: No set found %d\n", cg_set); + return -1; + } + + for (i = 0; i < cg_set_entry->n_ctls; i++) { + int j, aux_off; + CgMemberEntry *ce = cg_set_entry->ctls[i]; + char aux[PATH_MAX]; + CgControllerEntry *ctrl = NULL; + const char *format; + + for (j = 0; j < n_controllers; j++) { + CgControllerEntry *cur = controllers[j]; + if (cgroup_contains(cur->cnames, cur->n_cnames, ce->name, NULL)) { + ctrl = cur; + break; + } + } + + if (!ctrl) { + pr_err("cgroupd: No cg_controller_entry found for %s/%s\n", ce->name, ce->path); + return -1; + } + + /* + * This is not a threaded controller, all threads in this + * process must be in this controller. Main thread has been + * restored, so this thread is in this controller already. + */ + if (!ctrl->has_is_threaded || !ctrl->is_threaded) + continue; + + aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0); + format = ctrl->cnames[0][0] ? "/%s/tasks" : "/%s/cgroup.threads"; + snprintf(aux + aux_off, sizeof(aux) - aux_off, format, ce->path); + + /* + * Cgroupd runs outside of the namespaces so we don't + * need to use userns_call here + */ + if (userns_move(aux, 0, tid)) { + pr_err("cgroupd: Can't move thread %d into %s/%s\n", tid, ce->name, ce->path); + return -1; + } + } + + /* + * We only want to send the cred which contains thread id back. + * The restored thread recvmsg(MSG_PEEK) until it gets its own + * thread id. + */ + unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, &tid); + if (sendmsg(sk, &um.h, 0) <= 0) { + pr_perror("cgroupd: send req error"); + return -1; + } + } + + return 0; +} + +int stop_cgroupd(void) +{ + if (cgroupd_pid) { + sigset_t blockmask, oldmask; + + /* + * Block the SIGCHLD signal to avoid triggering + * sigchld_handler() + */ + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + sigprocmask(SIG_BLOCK, &blockmask, &oldmask); + + kill(cgroupd_pid, SIGTERM); + waitpid(cgroupd_pid, NULL, 0); + + sigprocmask(SIG_SETMASK, &oldmask, NULL); + } + + return 0; +} + +static int prepare_cgroup_thread_sfd(void) +{ + int sk; + + sk = start_unix_cred_daemon(&cgroupd_pid, cgroupd); + if (sk < 0) { + pr_err("failed to start cgroupd\n"); + return -1; + } + + if (install_service_fd(CGROUPD_SK, sk) < 0) { + kill(cgroupd_pid, SIGKILL); + waitpid(cgroupd_pid, NULL, 0); + return -1; + } + + return 0; +} + static int rewrite_cgsets(CgroupEntry *cge, char **controllers, int n_controllers, char **dir_name, char *newroot) { size_t dirlen = strlen(*dir_name); @@ -1974,15 +2260,19 @@ int prepare_cgroup(void) n_controllers = ce->n_controllers; controllers = ce->controllers; - if (n_sets) + if (n_sets) { /* * We rely on the fact that all sets contain the same * set of controllers. This is checked during dump * with cg_set_compare(CGCMP_ISSUB) call. */ ret = prepare_cgroup_sfd(ce); - else + if (ret < 0) + return ret; + ret = prepare_cgroup_thread_sfd(); + } else { ret = 0; + } return ret; } diff --git a/criu/config.c b/criu/config.c index 14a11f9c3..d7ef3f8e8 100644 --- a/criu/config.c +++ b/criu/config.c @@ -18,6 +18,7 @@ #include "cr_options.h" #include "filesystems.h" #include "file-lock.h" +#include "image.h" #include "irmap.h" #include "mount.h" #include "mount-v2.h" @@ -430,6 +431,7 @@ void init_opts(void) opts.pre_dump_mode = PRE_DUMP_SPLICE; opts.file_validation_method = FILE_VALIDATION_DEFAULT; opts.network_lock_method = NETWORK_LOCK_DEFAULT; + opts.ghost_fiemap = FIEMAP_DEFAULT; } bool deprecated_ok(char *what) @@ -696,14 +698,21 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "cgroup-yard", required_argument, 0, 1096 }, { "pre-dump-mode", required_argument, 0, 1097 }, { "file-validation", required_argument, 0, 1098 }, + BOOL_OPT("skip-file-rwx-check", &opts.skip_file_rwx_check), { "lsm-mount-context", required_argument, 0, 1099 }, { "network-lock", required_argument, 0, 1100 }, BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode), + BOOL_OPT("unprivileged", &opts.unprivileged), + BOOL_OPT("ghost-fiemap", &opts.ghost_fiemap), + BOOL_OPT(OPT_ALLOW_UPROBES, &opts.allow_uprobes), {}, }; #undef BOOL_OPT + if (argv && argv[0]) + SET_CHAR_OPTS(argv_0, argv[0]); + ret = pre_parse(argc, argv, usage_error, &no_default_config, &cfg_file); if (ret) @@ -1029,6 +1038,8 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, opts.network_lock_method = NETWORK_LOCK_IPTABLES; } else if (!strcmp("nftables", optarg)) { opts.network_lock_method = NETWORK_LOCK_NFTABLES; + } else if (!strcmp("skip", optarg) || !strcmp("none", optarg)) { + opts.network_lock_method = NETWORK_LOCK_SKIP; } else { pr_err("Invalid value for --network-lock: %s\n", optarg); return 1; @@ -1115,6 +1126,11 @@ int check_options(void) } } + if (opts.track_mem && !kdat.has_dirty_track) { + pr_err("Tracking memory is not available. Consider omitting --track-mem option.\n"); + return 1; + } + if (check_namespace_opts()) { pr_err("Error: namespace flags conflict\n"); return 1; diff --git a/criu/cr-check.c b/criu/cr-check.c index f589a91da..7c3dc76dd 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -21,7 +21,8 @@ #include #include #include -#include +#include +#include #include "../soccr/soccr.h" @@ -30,7 +31,7 @@ #include "sockets.h" #include "crtools.h" #include "log.h" -#include "util-pie.h" +#include "util-caps.h" #include "prctl.h" #include "files.h" #include "sk-inet.h" @@ -52,6 +53,8 @@ #include "net.h" #include "restorer.h" #include "uffd.h" +#include "linux/aio_abi.h" +#include "mount-v2.h" #include "images/inventory.pb-c.h" @@ -104,7 +107,7 @@ out: static int check_apparmor_stacking(void) { - if (!check_aa_ns_dumping()) + if (!kdat.apparmor_ns_dumping_enabled) return -1; return 0; @@ -515,6 +518,14 @@ static int check_ipc(void) { int ret; + /* + * Since kernel 5.16 sem_next_id can be accessed via CAP_CHECKPOINT_RESTORE, however + * for non-root users access() runs with an empty set of caps and will therefore always + * fail. + */ + if (opts.uid) + return 0; + ret = access("/proc/sys/kernel/sem_next_id", R_OK | W_OK); if (!ret) return 0; @@ -1039,10 +1050,14 @@ static int check_tcp(void) } val = 1; - ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val)); - if (ret < 0) { - pr_perror("Can't turn TCP repair mode ON"); - goto out; + if (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) { + ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val)); + if (ret < 0) { + pr_perror("Can't turn TCP repair mode ON"); + goto out; + } + } else { + pr_info("Not checking for TCP repair mode. Please set CAP_NET_ADMIN\n"); } optlen = sizeof(val); @@ -1073,6 +1088,8 @@ static int kerndat_tcp_repair_window(void) int sk, val = 1; sk = socket(AF_INET, SOCK_STREAM, 0); + if (sk < 0 && errno == EAFNOSUPPORT) + sk = socket(AF_INET6, SOCK_STREAM, 0); if (sk < 0) { pr_perror("Unable to create inet socket"); goto errn; @@ -1180,7 +1197,7 @@ static int check_ipt_legacy(void) char *ipt_legacy_bin; char *ip6t_legacy_bin; - ipt_legacy_bin = get_legacy_iptables_bin(false); + ipt_legacy_bin = get_legacy_iptables_bin(false, false); if (!ipt_legacy_bin) { pr_warn("Couldn't find iptables version which is using iptables legacy API\n"); return -1; @@ -1191,7 +1208,7 @@ static int check_ipt_legacy(void) if (!kdat.ipv6) return 0; - ip6t_legacy_bin = get_legacy_iptables_bin(true); + ip6t_legacy_bin = get_legacy_iptables_bin(true, false); if (!ip6t_legacy_bin) { pr_warn("Couldn't find ip6tables version which is using iptables legacy API\n"); return -1; @@ -1311,9 +1328,6 @@ static int check_pidfd_store(void) static int check_ns_pid(void) { - if (kerndat_has_nspid() < 0) - return -1; - if (!kdat.has_nspid) return -1; @@ -1362,6 +1376,236 @@ static int check_openat2(void) return 0; } +static int check_ipv6_freebind(void) +{ + if (!kdat.has_ipv6_freebind) + return -1; + + return 0; +} + +static int check_pagemap_scan(void) +{ + if (!kdat.has_pagemap_scan) + return -1; + + return 0; +} + +static int check_timer_cr_ids(void) +{ + if (!kdat.has_timer_cr_ids) + return -1; + + return 0; +} + +/* musl doesn't have a statx wrapper... */ +struct staty { + __u32 stx_dev_major; + __u32 stx_dev_minor; + __u64 stx_ino; +}; + +static long get_file_dev_and_inode(void *addr, struct staty *stx) +{ + char buf[4096]; + FILE *mapf; + + mapf = fopen("/proc/self/maps", "r"); + if (mapf == NULL) { + pr_perror("fopen(/proc/self/maps)"); + return -1; + } + + while (fgets(buf, sizeof(buf), mapf)) { + unsigned long start, end; + uint32_t maj, min; + __u64 ino; + + if (sscanf(buf, "%lx-%lx %*s %*s %x:%x %llu", + &start, &end, &maj, &min, &ino) != 5) { + pr_perror("Unable to parse: %s", buf); + return -1; + } + if (start == (unsigned long)addr) { + stx->stx_dev_major = maj; + stx->stx_dev_minor = min; + stx->stx_ino = ino; + return 0; + } + } + + pr_err("Unable to find the mapping\n"); + return -1; +} + +static int ovl_mount(void) +{ + int tmpfs, fsfd, ovl; + + fsfd = cr_fsopen("tmpfs", 0); + if (fsfd == -1) { + pr_perror("Unable to fsopen tmpfs"); + return -1; + } + + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { + pr_perror("Unable to create tmpfs mount"); + return -1; + } + + tmpfs = cr_fsmount(fsfd, 0, 0); + if (tmpfs == -1) { + pr_perror("Unable to mount tmpfs"); + return -1; + } + + close(fsfd); + + /* overlayfs can't be constructed on top of a detached mount. */ + if (sys_move_mount(tmpfs, "", AT_FDCWD, "/tmp", MOVE_MOUNT_F_EMPTY_PATH)) { + pr_perror("Unable to attach tmpfs mount"); + return -1; + } + close(tmpfs); + + if (chdir("/tmp")) { + pr_perror("Unable to change working directory"); + return -1; + } + + if (mkdir("/tmp/w", 0755) == -1 || + mkdir("/tmp/u", 0755) == -1 || + mkdir("/tmp/l", 0755) == -1) { + pr_perror("mkdir"); + return -1; + } + + fsfd = cr_fsopen("overlay", 0); + if (fsfd == -1) { + pr_perror("Unable to fsopen overlayfs"); + return -1; + } + if (cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "test", 0) == -1 || + cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir", "/tmp/l", 0) == -1 || + cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "upperdir", "/tmp/u", 0) == -1 || + cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "workdir", "/tmp/w", 0) == -1) { + pr_perror("Unable to configure overlayfs"); + return -1; + } + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { + pr_perror("Unable to create overlayfs"); + return -1; + } + ovl = cr_fsmount(fsfd, 0, 0); + if (ovl == -1) { + pr_perror("Unable to mount overlayfs"); + return -1; + } + + return ovl; +} + +/* + * Check that the file device and inode shown in /proc/pid/maps match values + * returned by stat(2). + */ +static int do_check_overlayfs_maps(void) +{ + struct staty stx, mstx; + struct stat st; + int ovl, fd; + void *addr; + + /* Create a new mount namespace to not care about cleaning test mounts. */ + if (unshare(CLONE_NEWNS) == -1) { + pr_warn("Unable to create a new mount namespace\n"); + return 0; + } + + if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { + pr_perror("Unable to remount / with MS_SLAVE"); + return -1; + } + + ovl = ovl_mount(); + if (ovl == -1) + return -1; + + fd = openat(ovl, "test", O_RDWR | O_CREAT, 0644); + if (fd == -1) { + pr_perror("Unable to open a test file"); + return -1; + } + + addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + pr_perror("Unable to map the test file"); + return -1; + } + + if (get_file_dev_and_inode(addr, &mstx)) + return -1; + if (fstat(fd, &st)) { + pr_perror("stat"); + return -1; + } + stx.stx_dev_major = major(st.st_dev); + stx.stx_dev_minor = minor(st.st_dev); + stx.stx_ino = st.st_ino; + + if (stx.stx_dev_major != mstx.stx_dev_major || + stx.stx_dev_minor != mstx.stx_dev_minor || + stx.stx_ino != mstx.stx_ino) { + pr_err("unmatched dev:ino %x:%x:%llx (expected %x:%x:%llx)\n", + mstx.stx_dev_major, mstx.stx_dev_minor, mstx.stx_ino, + stx.stx_dev_major, stx.stx_dev_minor, stx.stx_ino); + return -1; + } + + return 0; +} + +static int check_overlayfs_maps(void) +{ + pid_t pid; + int status; + + pid = fork(); + if (pid == -1) { + pr_perror("Unable to fork a child"); + return -1; + } + if (pid == 0) { + if (do_check_overlayfs_maps()) + exit(1); + exit(0); + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid"); + return -1; + } + return status == 0 ? 0 : -1; +} + +static int check_breakpoints(void) +{ + if (!kdat.has_breakpoints) { + pr_warn("Hardware breakpoints don't seem to work\n"); + return -1; + } + + return 0; +} + +static int check_pagemap_scan_guard_pages(void) +{ + kerndat_warn_about_madv_guards(); + + return kdat.has_pagemap_scan_guard_pages ? 0 : -1; +} + static int (*chk_feature)(void); /* @@ -1389,14 +1633,12 @@ static int (*chk_feature)(void); return ret; \ } \ } while (0) + int cr_check(void) { struct ns_id *ns; int ret = 0; - if (!is_root_user()) - return -1; - root_item = alloc_pstree_item(); if (root_item == NULL) return -1; @@ -1478,13 +1720,20 @@ int cr_check(void) ret |= check_newifindex(); ret |= check_pidfd_store(); ret |= check_ns_pid(); - ret |= check_apparmor_stacking(); ret |= check_network_lock_nftables(); ret |= check_sockopt_buf_lock(); ret |= check_memfd_hugetlb(); ret |= check_move_mount_set_group(); ret |= check_openat2(); ret |= check_ptrace_get_rseq_conf(); + ret |= check_ipv6_freebind(); + ret |= check_pagemap_scan(); + ret |= check_overlayfs_maps(); + ret |= check_timer_cr_ids(); + ret |= check_pagemap_scan_guard_pages(); + + if (kdat.lsm == LSMTYPE__APPARMOR) + ret |= check_apparmor_stacking(); } /* @@ -1494,6 +1743,10 @@ int cr_check(void) ret |= check_autofs(); ret |= check_compat_cr(); } + /* + * Category 4 - optional. + */ + check_breakpoints(); pr_msg("%s\n", ret ? CHECK_MAYBE : CHECK_GOOD); return ret; @@ -1602,6 +1855,12 @@ static struct feature_list feature_list[] = { { "move_mount_set_group", check_move_mount_set_group }, { "openat2", check_openat2 }, { "get_rseq_conf", check_ptrace_get_rseq_conf }, + { "ipv6_freebind", check_ipv6_freebind }, + { "pagemap_scan", check_pagemap_scan }, + { "timer_cr_ids", check_timer_cr_ids }, + { "overlayfs_maps", check_overlayfs_maps }, + { "breakpoints", check_breakpoints }, + { "pagemap_scan_guard_pages", check_pagemap_scan_guard_pages }, { NULL, NULL }, }; @@ -1653,3 +1912,54 @@ static char *feature_name(int (*func)(void)) } return NULL; } + +static int pr_set_dumpable(int value) +{ + int ret = prctl(PR_SET_DUMPABLE, value, 0, 0, 0); + if (ret < 0) + pr_perror("Unable to set PR_SET_DUMPABLE"); + return ret; +} + +int check_caps(void) +{ + /* Read out effective capabilities and store in opts.cap_eff. */ + if (set_opts_cap_eff()) + goto out; + + /* + * No matter if running as root or not. CRIU always needs + * at least these capabilities. + */ + if (!has_cap_checkpoint_restore(opts.cap_eff)) + goto out; + + /* For some things we need to know if we are running as root. */ + opts.uid = geteuid(); + + if (!opts.uid) { + /* CRIU is running as root. No further checks are necessary. */ + return 0; + } + + if (!opts.unprivileged) { + pr_msg("Running as non-root requires '--unprivileged'\n"); + pr_msg("Please consult the documentation for limitations when running as non-root\n"); + return -1; + } + + /* + * At his point we know we are running as non-root with the necessary + * capabilities available. Now we have to make the process dumpable + * so that /proc/self is not owned by root. + */ + if (pr_set_dumpable(1)) + return -1; + + return 0; +out: + pr_msg("CRIU needs to have the CAP_SYS_ADMIN or the CAP_CHECKPOINT_RESTORE capability: \n"); + pr_msg("setcap cap_checkpoint_restore+eip %s\n", opts.argv_0); + + return -1; +} diff --git a/criu/cr-dedup.c b/criu/cr-dedup.c index c0c21f53e..feeb9ebb0 100644 --- a/criu/cr-dedup.c +++ b/criu/cr-dedup.c @@ -87,7 +87,8 @@ static int cr_dedup_one_pagemap(unsigned long img_id, int flags) if (ret <= 0) goto exit; - pr_debug("dedup iovec base=%" PRIx64 ", len=%lu\n", pr.pe->vaddr, pagemap_len(pr.pe)); + pr_debug("dedup iovec %" PRIx64 " - %" PRIx64 "\n", + pr.pe->vaddr, pr.pe->vaddr + pagemap_len(pr.pe)); if (!pagemap_in_parent(pr.pe)) { ret = dedup_one_iovec(prp, pr.pe->vaddr, pagemap_len(pr.pe)); if (ret) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index f58701e5c..a58aaf34a 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -86,6 +86,8 @@ #include "pidfd-store.h" #include "apparmor.h" #include "asm/dump.h" +#include "timer.h" +#include "sigact.h" /* * Architectures can overwrite this function to restore register sets that @@ -128,6 +130,23 @@ int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap if (ret < 0) goto err; + /* + * In addition to real process VMAs we should keep an info about + * madvise(MADV_GUARD_INSTALL) pages. While these are not represented + * as a struct vm_area_struct in the kernel, it is convenient to treat + * them as mappings in CRIU and reuse the same VMA images but with only + * VMA_AREA_GUARD flag set. + * + * Also, we don't need to dump them during pre-dump. + */ + if (dump_file) { + ret = collect_madv_guards(pid, vma_area_list); + if (ret < 0) { + pr_err("Collect MADV_GUARD_INSTALL pages (pid: %d) failed with %d\n", pid, ret); + goto err; + } + } + pr_info("Collected, longest area occupies %lu pages\n", vma_area_list->nr_priv_pages_longest); pr_info_vma_list(&vma_area_list->h); @@ -157,6 +176,11 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) tc->has_sched_policy = true; tc->sched_policy = ret; + /* The reset-on-fork flag might be used in combination + * with SCHED_FIFO or SCHED_RR to reset the scheduling + * policy/priority in child processes. + */ + ret &= ~SCHED_RESET_ON_FORK; if ((ret == SCHED_RR) || (ret == SCHED_FIFO)) { ret = syscall(__NR_sched_getparam, pid, &sp); if (ret < 0) { @@ -429,7 +453,7 @@ static int dump_filemap(struct vma_area *vma_area, int fd) if (vma_area->aufs_rpath) { struct fd_link aufs_link; - strlcpy(aufs_link.name, vma_area->aufs_rpath, sizeof(aufs_link.name)); + __strlcpy(aufs_link.name, vma_area->aufs_rpath, sizeof(aufs_link.name)); aufs_link.len = strlen(aufs_link.name); p.link = &aufs_link; } @@ -759,6 +783,7 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item pid_t pid = item->pid->real; int ret = -1; struct parasite_dump_cgroup_args cgroup_args, *info = NULL; + u32 *cg_set; BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN); @@ -769,11 +794,16 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item core->tc->child_subreaper = misc->child_subreaper; core->tc->has_child_subreaper = true; + if (misc->membarrier_registration_mask) { + core->tc->membarrier_registration_mask = misc->membarrier_registration_mask; + core->tc->has_membarrier_registration_mask = true; + } + ret = get_task_personality(pid, &core->tc->personality); if (ret < 0) goto err; - strlcpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN); + __strlcpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN); core->tc->flags = stat->flags; core->tc->task_state = item->pid->state; core->tc->exit_code = 0; @@ -781,6 +811,11 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item core->thread_core->creds->lsm_profile = dmpi(item)->thread_lsms[0]->profile; core->thread_core->creds->lsm_sockcreate = dmpi(item)->thread_lsms[0]->sockcreate; + if (core->tc->task_state == TASK_STOPPED) { + core->tc->has_stop_signo = true; + core->tc->stop_signo = item->pid->stop_signo; + } + ret = parasite_dump_thread_leader_seized(ctl, pid, core); if (ret) goto err; @@ -799,13 +834,15 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item */ if (item->ids->has_cgroup_ns_id && !item->parent) { info = &cgroup_args; + strcpy(cgroup_args.thread_cgrp, "self/cgroup"); ret = parasite_dump_cgroup(ctl, &cgroup_args); if (ret) goto err; } - core->tc->has_cg_set = true; - ret = dump_task_cgroup(item, &core->tc->cg_set, info); + core->thread_core->has_cg_set = true; + cg_set = &core->thread_core->cg_set; + ret = dump_thread_cgroup(item, cg_set, info, -1); if (ret) goto err; @@ -867,6 +904,72 @@ static int collect_file_locks(void) return parse_file_locks(); } +static bool task_in_rseq(struct criu_rseq_cs *rseq_cs, uint64_t addr) +{ + return addr >= rseq_cs->start_ip && addr < rseq_cs->start_ip + rseq_cs->post_commit_offset; +} + +static int fixup_thread_rseq(const struct pstree_item *item, int i) +{ + CoreEntry *core = item->core[i]; + struct criu_rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; + pid_t tid = item->threads[i].real; + + if (!kdat.has_ptrace_get_rseq_conf) + return 0; + + /* equivalent to (struct rseq)->rseq_cs is NULL */ + if (!rseq_cs->start_ip) + return 0; + + pr_debug( + "fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n", + tid, rseq_cs->start_ip, rseq_cs->abort_ip, rseq_cs->post_commit_offset, rseq_cs->flags, + rseq_cs->version, (unsigned long)TI_IP(core)); + + if (rseq_cs->version != 0) { + pr_err("unsupported RSEQ ABI version = %d\n", rseq_cs->version); + return -1; + } + + if (task_in_rseq(rseq_cs, TI_IP(core))) { + struct pid *tid = &item->threads[i]; + + /* + * We need to fixup task instruction pointer from + * the original one (which lays inside rseq critical section) + * to rseq abort handler address. But we need to look on rseq_cs->flags + * (please refer to struct rseq -> flags field description). + * Naive idea of flags support may be like... let's change instruction pointer (IP) + * to rseq_cs->abort_ip if !(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL). + * But unfortunately, it doesn't work properly, because the kernel does + * clean up of rseq_cs field in the struct rseq (modifies userspace memory). + * So, we need to preserve original value of (struct rseq)->rseq_cs field in the + * image and restore it's value before releasing threads (see restore_rseq_cs()). + * + * It's worth to mention that we need to fixup IP in CoreEntry + * (used when full dump/restore is performed) and also in + * the parasite regs storage (used if --leave-running option is used, + * or if dump error occurred and process execution is resumed). + */ + + if (!(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL)) { + pr_warn("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n", + tid->real); + + TI_IP(core) = rseq_cs->abort_ip; + + if (item->pid->real == tid->real) { + compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip); + } else { + compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip); + } + } + } + + return 0; +} + static int dump_task_thread(struct parasite_ctl *parasite_ctl, const struct pstree_item *item, int id) { struct parasite_thread_ctl *tctl = dmpi(item)->thread_ctls[id]; @@ -890,6 +993,12 @@ static int dump_task_thread(struct parasite_ctl *parasite_ctl, const struct pstr core->thread_core->creds->lsm_profile = dmpi(item)->thread_lsms[id]->profile; core->thread_core->creds->lsm_sockcreate = dmpi(item)->thread_lsms[0]->sockcreate; + ret = fixup_thread_rseq(item, id); + if (ret) { + pr_err("Can't fixup rseq for pid %d\n", pid); + goto err; + } + img = open_image(CR_FD_CORE, O_DUMP, tid->ns[0].virt); if (!img) goto err; @@ -898,6 +1007,7 @@ static int dump_task_thread(struct parasite_ctl *parasite_ctl, const struct pstr close_image(img); err: + compel_release_thread(tctl); pr_info("----------------------------------------\n"); return ret; } @@ -912,7 +1022,7 @@ static int dump_one_zombie(const struct pstree_item *item, const struct proc_pid if (!core) return -1; - strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN); + __strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN); core->tc->task_state = TASK_DEAD; core->tc->exit_code = pps->exit_code; @@ -1034,7 +1144,7 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item) return 0; } -static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, struct rseq_cs *rseq_cs, +static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, struct criu_rseq_cs *rseq_cs, struct criu_rseq *rseq) { int ret; @@ -1065,10 +1175,11 @@ static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, st if (!rseq->rseq_cs) return 0; - ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(rseq->rseq_cs), sizeof(struct rseq_cs)); + ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(rseq->rseq_cs), sizeof(struct criu_rseq_cs)); if (ret) { pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs struct\n", tid, - (unsigned long)rseq_cs, (unsigned long)rseq->rseq_cs, (unsigned long)sizeof(struct rseq_cs)); + (unsigned long)rseq_cs, (unsigned long)rseq->rseq_cs, + (unsigned long)sizeof(struct criu_rseq_cs)); return -1; } @@ -1083,7 +1194,7 @@ static int dump_thread_rseq(struct pstree_item *item, int i) CoreEntry *core = item->core[i]; RseqEntry **rseqep = &core->thread_core->rseq_entry; struct criu_rseq rseq = {}; - struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; + struct criu_rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; pid_t tid = item->threads[i].real; /* @@ -1149,7 +1260,7 @@ err: static int dump_task_rseq(pid_t pid, struct pstree_item *item) { int i; - struct rseq_cs *thread_rseq_cs; + struct criu_rseq_cs *thread_rseq_cs; /* if rseq() syscall isn't supported then nothing to dump */ if (!kdat.has_rseq) @@ -1174,95 +1285,11 @@ free_rseq: return -1; } -static bool task_in_rseq(struct rseq_cs *rseq_cs, uint64_t addr) -{ - return addr >= rseq_cs->start_ip && addr < rseq_cs->start_ip + rseq_cs->post_commit_offset; -} - -static int fixup_thread_rseq(struct pstree_item *item, int i) -{ - CoreEntry *core = item->core[i]; - struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; - pid_t tid = item->threads[i].real; - - /* equivalent to (struct rseq)->rseq_cs is NULL */ - if (!rseq_cs->start_ip) - return 0; - - pr_debug( - "fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n", - tid, rseq_cs->start_ip, rseq_cs->abort_ip, rseq_cs->post_commit_offset, rseq_cs->flags, - rseq_cs->version, (unsigned long)TI_IP(core)); - - if (rseq_cs->version != 0) { - pr_err("unsupported RSEQ ABI version = %d\n", rseq_cs->version); - return -1; - } - - if (task_in_rseq(rseq_cs, TI_IP(core))) { - struct pid *tid = &item->threads[i]; - - /* - * We need to fixup task instruction pointer from - * the original one (which lays inside rseq critical section) - * to rseq abort handler address. But we need to look on rseq_cs->flags - * (please refer to struct rseq -> flags field description). - * Naive idea of flags support may be like... let's change instruction pointer (IP) - * to rseq_cs->abort_ip if !(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL). - * But unfortunately, it doesn't work properly, because the kernel does - * clean up of rseq_cs field in the struct rseq (modifies userspace memory). - * So, we need to preserve original value of (struct rseq)->rseq_cs field in the - * image and restore it's value before releasing threads (see restore_rseq_cs()). - * - * It's worth to mention that we need to fixup IP in CoreEntry - * (used when full dump/restore is performed) and also in - * the parasite regs storage (used if --leave-running option is used, - * or if dump error occurred and process execution is resumed). - */ - - if (!(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL)) { - pr_warn("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n", - tid->real); - - TI_IP(core) = rseq_cs->abort_ip; - - if (item->pid->real == tid->real) { - compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip); - } else { - compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip); - } - } - } - - return 0; -} - -static int fixup_task_rseq(pid_t pid, struct pstree_item *item) -{ - int ret = 0; - int i; - - if (!kdat.has_ptrace_get_rseq_conf) - return 0; - - for (i = 0; i < item->nr_threads; i++) { - if (fixup_thread_rseq(item, i)) { - ret = -1; - goto exit; - } - } - -exit: - xfree(dmpi(item)->thread_rseq_cs); - dmpi(item)->thread_rseq_cs = NULL; - return ret; -} - static struct proc_pid_stat pps_buf; static int dump_task_threads(struct parasite_ctl *parasite_ctl, const struct pstree_item *item) { - int i; + int i, ret = 0; for (i = 0; i < item->nr_threads; i++) { /* Leader is already dumped */ @@ -1270,11 +1297,14 @@ static int dump_task_threads(struct parasite_ctl *parasite_ctl, const struct pst item->threads[i].ns[0].virt = vpid(item); continue; } - if (dump_task_thread(parasite_ctl, item, i)) - return -1; + ret = dump_task_thread(parasite_ctl, item, i); + if (ret) + break; } - return 0; + xfree(dmpi(item)->thread_rseq_cs); + dmpi(item)->thread_rseq_cs = NULL; + return ret; } /* @@ -1383,7 +1413,7 @@ static int dump_zombies(void) item->sid = pps_buf.sid; item->pgid = pps_buf.pgid; - BUG_ON(!list_empty(&item->children)); + BUG_ON(has_children(item)); if (!item->sid) { pr_err("A session leader of zombie process %d(%d) is outside of its pid namespace\n", @@ -1403,6 +1433,39 @@ err: return ret; } +static int dump_task_cgroup(struct parasite_ctl *parasite_ctl, const struct pstree_item *item) +{ + struct parasite_dump_cgroup_args cgroup_args, *info; + int i; + + BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN); + for (i = 0; i < item->nr_threads; i++) { + CoreEntry *core = item->core[i]; + + /* Leader is already dumped */ + if (item->pid->real == item->threads[i].real) + continue; + + /* For now, we only need to dump the root task's cgroup ns, because we + * know all the tasks are in the same cgroup namespace because we don't + * allow nesting. + */ + info = NULL; + if (item->ids->has_cgroup_ns_id && !item->parent) { + info = &cgroup_args; + sprintf(cgroup_args.thread_cgrp, "self/task/%d/cgroup", item->threads[i].ns[0].virt); + if (parasite_dump_cgroup(parasite_ctl, &cgroup_args)) + return -1; + } + + core->thread_core->has_cg_set = true; + if (dump_thread_cgroup(item, &core->thread_core->cg_set, info, i)) + return -1; + } + + return 0; +} + static int pre_dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) { pid_t pid = item->pid->real; @@ -1415,7 +1478,7 @@ static int pre_dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie vm_area_list_init(&vmas); pr_info("========================================\n"); - pr_info("Pre-dumping task (pid: %d)\n", pid); + pr_info("Pre-dumping task (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); pr_info("========================================\n"); /* @@ -1505,7 +1568,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) vm_area_list_init(&vmas); pr_info("========================================\n"); - pr_info("Dumping task (pid: %d)\n", pid); + pr_info("Dumping task (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); pr_info("========================================\n"); if (item->pid->state == TASK_DEAD) @@ -1565,7 +1628,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err; } - ret = fixup_task_rseq(pid, item); + ret = fixup_thread_rseq(item, 0); if (ret) { pr_err("Fixup rseq for %d failed %d\n", pid, ret); goto err; @@ -1675,6 +1738,12 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err_cure; } + ret = dump_task_cgroup(parasite_ctl, item); + if (ret) { + pr_err("Dump cgroup of threads in process (pid: %d) failed with %d\n", pid, ret); + goto err_cure; + } + ret = compel_stop_daemon(parasite_ctl); if (ret) { pr_err("Can't stop daemon in parasite (pid: %d)\n", pid); @@ -1983,7 +2052,6 @@ static int cr_dump_finish(int ret) if (bfd_flush_images()) ret = -1; - cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); cgp_fini(); if (!ret) { @@ -2037,6 +2105,9 @@ static int cr_dump_finish(int ret) if (arch_set_thread_regs(root_item, true) < 0) return -1; + + cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); + pstree_switch_state(root_item, (ret || post_dump_ret) ? TASK_ALIVE : opts.final_state); timing_stop(TIME_FROZEN); free_pstree(root_item); @@ -2049,7 +2120,11 @@ static int cr_dump_finish(int ret) close_service_fd(CR_PROC_FD_OFF); close_image_dir(); - if (ret) { + if (ret || post_dump_ret) { + if (fault_injected(FI_DUMP_CRASH)) { + pr_info("fault: CRIU dump crashed!\n"); + abort(); + } pr_err("Dumping FAILED.\n"); } else { write_stats(DUMP_STATS); @@ -2063,11 +2138,13 @@ int cr_dump_tasks(pid_t pid) InventoryEntry he = INVENTORY_ENTRY__INIT; InventoryEntry *parent_ie = NULL; struct pstree_item *item; - int pre_dump_ret = 0; - int ret = -1; + int ret; + int exit_code = -1; + + kerndat_warn_about_madv_guards(); pr_info("========================================\n"); - pr_info("Dumping processes (pid: %d)\n", pid); + pr_info("Dumping processes (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); pr_info("========================================\n"); /* @@ -2082,9 +2159,9 @@ int cr_dump_tasks(pid_t pid) goto err; root_item->pid->real = pid; - pre_dump_ret = run_scripts(ACT_PRE_DUMP); - if (pre_dump_ret != 0) { - pr_err("Pre dump script failed with %d!\n", pre_dump_ret); + ret = run_scripts(ACT_PRE_DUMP); + if (ret != 0) { + pr_err("Pre dump script failed with %d!\n", ret); goto err; } if (init_stats(DUMP_STATS)) @@ -2134,12 +2211,18 @@ int cr_dump_tasks(pid_t pid) if (collect_pstree()) goto err; + if (checkpoint_devices()) + goto err; + if (collect_pstree_ids()) goto err; if (network_lock()) goto err; + if (rpc_query_external_files()) + goto err; + if (collect_file_locks()) goto err; @@ -2164,6 +2247,10 @@ int cr_dump_tasks(pid_t pid) goto err; } + ret = run_plugins(DUMP_DEVICES_LATE, pid); + if (ret && ret != -ENOTSUP) + goto err; + if (parent_ie) { inventory_entry__free_unpacked(parent_ie, NULL); parent_ie = NULL; @@ -2200,49 +2287,44 @@ int cr_dump_tasks(pid_t pid) * ipc shared memory, but an ipc namespace is dumped in a child * process. */ - ret = cr_dump_shmem(); - if (ret) + if (cr_dump_shmem()) goto err; if (root_ns_mask) { - ret = dump_namespaces(root_item, root_ns_mask); - if (ret) + if (dump_namespaces(root_item, root_ns_mask)) goto err; } if ((root_ns_mask & CLONE_NEWTIME) == 0) { - ret = dump_time_ns(0); - if (ret) + if (dump_time_ns(0)) goto err; } if (dump_aa_namespaces() < 0) goto err; - ret = dump_cgroups(); - if (ret) + if (dump_cgroups()) goto err; - ret = fix_external_unix_sockets(); - if (ret) + if (fix_external_unix_sockets()) goto err; - ret = tty_post_actions(); - if (ret) + if (tty_post_actions()) goto err; - ret = inventory_save_uptime(&he); - if (ret) + if (inventory_save_uptime(&he)) goto err; he.has_pre_dump_mode = false; + if (found_uprobes_vma()) { + he.has_allow_uprobes = true; + he.allow_uprobes = true; + } - ret = write_img_inventory(&he); - if (ret) - goto err; + exit_code = write_img_inventory(&he); err: if (parent_ie) inventory_entry__free_unpacked(parent_ie, NULL); - return cr_dump_finish(ret); + return cr_dump_finish(exit_code); } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9853c0585..b92b92715 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -17,12 +17,12 @@ #include #include #include +#include #include "types.h" #include #include "common/compiler.h" -#include "linux/mount.h" #include "linux/rseq.h" #include "clone-noasan.h" @@ -80,12 +80,15 @@ #include "timens.h" #include "bpfmap.h" #include "apparmor.h" +#include "pidfd.h" #include "parasite-syscall.h" #include "files-reg.h" #include #include "compel/include/asm/syscall.h" +#include "linux/mount.h" + #include "protobuf.h" #include "images/sa.pb-c.h" #include "images/timer.pb-c.h" @@ -97,6 +100,8 @@ #include "restore.h" #include "cr-errno.h" +#include "timer.h" +#include "sigact.h" #ifndef arch_export_restore_thread #define arch_export_restore_thread __export_restore_thread @@ -117,7 +122,6 @@ static int restore_task_with_children(void *); static int sigreturn_restore(pid_t pid, struct task_restore_args *ta, unsigned long alen, CoreEntry *core); static int prepare_restorer_blob(void); static int prepare_rlimits(int pid, struct task_restore_args *, CoreEntry *core); -static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core); static int prepare_signals(int pid, struct task_restore_args *, CoreEntry *core); /* @@ -278,7 +282,7 @@ static struct collect_image_info *cinfos_files[] = { &unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo, &netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo, &tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo, - &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, + &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, &pidfd_cinfo }; /* These images are required to restore namespaces */ @@ -351,6 +355,10 @@ static int root_prepare_shared(void) if (ret) goto err; + ret = add_fake_unix_queuers(); + if (ret) + goto err; + /* * This should be called with all packets collected AND all * fdescs and fles prepared BUT post-prep-s not run. @@ -367,10 +375,6 @@ static int root_prepare_shared(void) if (ret) goto err; - ret = add_fake_unix_queuers(); - if (ret) - goto err; - show_saved_files(); err: return ret; @@ -406,268 +410,6 @@ static int populate_pid_proc(void) return 0; } -static rt_sigaction_t sigchld_act; -/* - * If parent's sigaction has blocked SIGKILL (which is non-sense), - * this parent action is non-valid and shouldn't be inherited. - * Used to mark parent_act* no more valid. - */ -static rt_sigaction_t parent_act[SIGMAX]; -#ifdef CONFIG_COMPAT -static rt_sigaction_t_compat parent_act_compat[SIGMAX]; -#endif - -static bool sa_inherited(int sig, rt_sigaction_t *sa) -{ - rt_sigaction_t *pa; - int i; - - if (current == root_item) - return false; /* XXX -- inherit from CRIU? */ - - pa = &parent_act[sig]; - - /* Omitting non-valid sigaction */ - if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) - return false; - - for (i = 0; i < _KNSIG_WORDS; i++) - if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) - return false; - - return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && - pa->rt_sa_restorer == sa->rt_sa_restorer; -} - -static int restore_native_sigaction(int sig, SaEntry *e) -{ - rt_sigaction_t act; - int ret; - - ASSIGN_TYPED(act.rt_sa_handler, decode_pointer(e->sigaction)); - ASSIGN_TYPED(act.rt_sa_flags, e->flags); - ASSIGN_TYPED(act.rt_sa_restorer, decode_pointer(e->restorer)); -#ifdef CONFIG_MIPS - e->has_mask_extended = 1; - BUILD_BUG_ON(sizeof(e->mask) * 2 != sizeof(act.rt_sa_mask.sig)); - - memcpy(&(act.rt_sa_mask.sig[0]), &e->mask, sizeof(act.rt_sa_mask.sig[0])); - memcpy(&(act.rt_sa_mask.sig[1]), &e->mask_extended, sizeof(act.rt_sa_mask.sig[1])); -#else - BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); - memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); -#endif - if (sig == SIGCHLD) { - sigchld_act = act; - return 0; - } - - if (sa_inherited(sig - 1, &act)) - return 1; - - /* - * A pure syscall is used, because glibc - * sigaction overwrites se_restorer. - */ - ret = syscall(SYS_rt_sigaction, sig, &act, NULL, sizeof(k_rtsigset_t)); - if (ret < 0) { - pr_perror("Can't restore sigaction"); - return ret; - } - - parent_act[sig - 1] = act; - /* Mark SIGKILL blocked which makes compat sigaction non-valid */ -#ifdef CONFIG_COMPAT - parent_act_compat[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; -#endif - - return 1; -} - -static void *stack32; - -#ifdef CONFIG_COMPAT -static bool sa_compat_inherited(int sig, rt_sigaction_t_compat *sa) -{ - rt_sigaction_t_compat *pa; - int i; - - if (current == root_item) - return false; - - pa = &parent_act_compat[sig]; - - /* Omitting non-valid sigaction */ - if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) - return false; - - for (i = 0; i < _KNSIG_WORDS; i++) - if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) - return false; - - return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && - pa->rt_sa_restorer == sa->rt_sa_restorer; -} - -static int restore_compat_sigaction(int sig, SaEntry *e) -{ - rt_sigaction_t_compat act; - int ret; - - ASSIGN_TYPED(act.rt_sa_handler, (u32)e->sigaction); - ASSIGN_TYPED(act.rt_sa_flags, e->flags); - ASSIGN_TYPED(act.rt_sa_restorer, (u32)e->restorer); - BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); - memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); - - if (sig == SIGCHLD) { - memcpy(&sigchld_act, &act, sizeof(rt_sigaction_t_compat)); - return 0; - } - - if (sa_compat_inherited(sig - 1, &act)) - return 1; - - if (!stack32) { - stack32 = alloc_compat_syscall_stack(); - if (!stack32) - return -1; - } - - ret = arch_compat_rt_sigaction(stack32, sig, &act); - if (ret < 0) { - pr_err("Can't restore compat sigaction: %d\n", ret); - return ret; - } - - parent_act_compat[sig - 1] = act; - /* Mark SIGKILL blocked which makes native sigaction non-valid */ - parent_act[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; - - return 1; -} -#else -static int restore_compat_sigaction(int sig, SaEntry *e) -{ - return -1; -} -#endif - -static int prepare_sigactions_from_core(TaskCoreEntry *tc) -{ - int sig, i; - - if (tc->n_sigactions != SIGMAX - 2) { - pr_err("Bad number of sigactions in the image (%d, want %d)\n", (int)tc->n_sigactions, SIGMAX - 2); - return -1; - } - - pr_info("Restore on-core sigactions for %d\n", vpid(current)); - - for (sig = 1, i = 0; sig <= SIGMAX; sig++) { - int ret; - SaEntry *e; - bool sigaction_is_compat; - - if (sig == SIGKILL || sig == SIGSTOP) - continue; - - e = tc->sigactions[i++]; - sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; - if (sigaction_is_compat) - ret = restore_compat_sigaction(sig, e); - else - ret = restore_native_sigaction(sig, e); - - if (ret < 0) - return ret; - } - - return 0; -} - -/* Returns number of restored signals, -1 or negative errno on fail */ -static int restore_one_sigaction(int sig, struct cr_img *img, int pid) -{ - bool sigaction_is_compat; - SaEntry *e; - int ret = 0; - - BUG_ON(sig == SIGKILL || sig == SIGSTOP); - - ret = pb_read_one_eof(img, &e, PB_SIGACT); - if (ret == 0) { - if (sig != SIGMAX_OLD + 1) { /* backward compatibility */ - pr_err("Unexpected EOF %d\n", sig); - return -1; - } - pr_warn("This format of sigacts-%d.img is deprecated\n", pid); - return -1; - } - if (ret < 0) - return ret; - - sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; - if (sigaction_is_compat) - ret = restore_compat_sigaction(sig, e); - else - ret = restore_native_sigaction(sig, e); - - sa_entry__free_unpacked(e, NULL); - - return ret; -} - -static int prepare_sigactions_from_image(void) -{ - int pid = vpid(current); - struct cr_img *img; - int sig, rst = 0; - int ret = 0; - - pr_info("Restore sigacts for %d\n", pid); - - img = open_image(CR_FD_SIGACT, O_RSTR, pid); - if (!img) - return -1; - - for (sig = 1; sig <= SIGMAX; sig++) { - if (sig == SIGKILL || sig == SIGSTOP) - continue; - - ret = restore_one_sigaction(sig, img, pid); - if (ret < 0) - break; - if (ret) - rst++; - } - - pr_info("Restored %d/%d sigacts\n", rst, SIGMAX - 3 /* KILL, STOP and CHLD */); - - close_image(img); - return ret; -} - -static int prepare_sigactions(CoreEntry *core) -{ - int ret; - - if (!task_alive(current)) - return 0; - - if (core->tc->n_sigactions != 0) - ret = prepare_sigactions_from_core(core->tc); - else - ret = prepare_sigactions_from_image(); - - if (stack32) { - free_compat_syscall_stack(stack32); - stack32 = NULL; - } - - return ret; -} - static int __collect_child_pids(struct pstree_item *p, int state, unsigned int *n) { struct pstree_item *pi; @@ -862,6 +604,9 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc, struct task_restore_a if (tc->has_child_subreaper) args->child_subreaper = tc->child_subreaper; + if (tc->has_membarrier_registration_mask) + args->membarrier_registration_mask = tc->membarrier_registration_mask; + /* loginuid value is critical to restore */ if (kdat.luid == LUID_FULL && tc->has_loginuid && tc->loginuid != INVALID_UID) { ret = prepare_loginuid(tc->loginuid); @@ -878,7 +623,6 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc, struct task_restore_a return 0; } -static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core); static int prepare_mm(pid_t pid, struct task_restore_args *args); static int restore_one_alive_task(int pid, CoreEntry *core) @@ -971,6 +715,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core) if (setup_uffd(pid, ta)) return -1; + if (arch_shstk_prepare(current, core, ta)) + return -1; + return sigreturn_restore(pid, ta, args_len, core); } @@ -1348,7 +1095,22 @@ static inline int fork_with_pid(struct pstree_item *item) return -1; item->pid->state = ca.core->tc->task_state; - rsti(item)->cg_set = ca.core->tc->cg_set; + + /* + * Zombie tasks' cgroup is not dumped/restored. + * cg_set == 0 is skipped in prepare_task_cgroup() + */ + if (item->pid->state == TASK_DEAD) { + rsti(item)->cg_set = 0; + } else { + if (ca.core->thread_core->has_cg_set) + rsti(item)->cg_set = ca.core->thread_core->cg_set; + else + rsti(item)->cg_set = ca.core->tc->cg_set; + } + + if (ca.core->tc->has_stop_signo) + item->pid->stop_signo = ca.core->tc->stop_signo; if (item->pid->state != TASK_DEAD && !task_alive(item)) { pr_err("Unknown task state %d\n", item->pid->state); @@ -1476,6 +1238,8 @@ static inline int fork_with_pid(struct pstree_item *item) pr_debug("PID: real %d virt %d\n", item->pid->real, vpid(item)); } + arch_shstk_unlock(item, ca.core, ret); + err_unlock: if (!(ca.clone_flags & CLONE_NEWPID)) unlock_last_pid(); @@ -1742,7 +1506,7 @@ static int create_children_and_session(void) return 0; } -static int restore_task_with_children(void *_arg) +static int __restore_task_with_children(void *_arg) { struct cr_clone_arg *ca = _arg; pid_t pid; @@ -1778,7 +1542,7 @@ static int restore_task_with_children(void *_arg) } if (log_init_by_pid(vpid(current))) - return -1; + goto err; if (current->parent == NULL) { /* @@ -1805,9 +1569,19 @@ static int restore_task_with_children(void *_arg) goto err; } + if (set_opts_cap_eff()) + goto err; + /* Wait prepare_userns */ if (restore_finish_ns_stage(CR_STATE_ROOT_TASK, CR_STATE_PREPARE_NAMESPACES) < 0) goto err; + + /* + * Since we don't support nesting of cgroup namespaces, let's + * only set up the cgns (if it exists) in the init task. + */ + if (prepare_cgroup_namespace(current) < 0) + goto err; } if (needs_prep_creds(current) && (prepare_userns_creds())) @@ -1819,7 +1593,7 @@ static int restore_task_with_children(void *_arg) * we will only move the root one there, others will * just have it inherited. */ - if (prepare_task_cgroup(current) < 0) + if (restore_task_cgroup(current) < 0) goto err; /* Restore root task */ @@ -1924,6 +1698,19 @@ err: exit(1); } +static int restore_task_with_children(void *_arg) +{ + struct cr_clone_arg *arg = _arg; + struct pstree_item *item = arg->item; + CoreEntry *core = arg->core; + + return arch_shstk_trampoline(item, core, __restore_task_with_children, + arg); +} + +int __attribute((weak)) arch_ptrace_restore(int pid, struct pstree_item *item); +int arch_ptrace_restore(int pid, struct pstree_item *item) { return 0; } + static int attach_to_tasks(bool root_seized) { struct pstree_item *item; @@ -1960,6 +1747,12 @@ static int attach_to_tasks(bool root_seized) return -1; } + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACESYSGOOD)) { + pr_perror("Unable to set PTRACE_O_TRACESYSGOOD for %d", pid); + return -1; + } + if (arch_ptrace_restore(pid, item)) + return -1; /* * Suspend seccomp if necessary. We need to do this because * although seccomp is restored at the very end of the @@ -2024,9 +1817,10 @@ static int restore_rseq_cs(void) return 0; } -static int catch_tasks(bool root_seized, enum trace_flags *flag) +static int catch_tasks(bool root_seized) { struct pstree_item *item; + bool nobp = fault_injected(FI_NO_BREAKPOINTS) || !kdat.has_breakpoints; for_each_pstree_item(item) { int status, i, ret; @@ -2054,7 +1848,7 @@ static int catch_tasks(bool root_seized, enum trace_flags *flag) return -1; } - ret = compel_stop_pie(pid, rsti(item)->breakpoint, flag, fault_injected(FI_NO_BREAKPOINTS)); + ret = compel_stop_pie(pid, rsti(item)->breakpoint, nobp); if (ret < 0) return -1; } @@ -2063,24 +1857,6 @@ static int catch_tasks(bool root_seized, enum trace_flags *flag) return 0; } -static int clear_breakpoints(void) -{ - struct pstree_item *item; - int ret = 0, i; - - if (fault_injected(FI_NO_BREAKPOINTS)) - return 0; - - for_each_pstree_item(item) { - if (!task_alive(item)) - continue; - for (i = 0; i < item->nr_threads; i++) - ret |= ptrace_flush_breakpoints(item->threads[i].real); - } - - return ret; -} - static void finalize_restore(void) { struct pstree_item *item; @@ -2104,8 +1880,14 @@ static void finalize_restore(void) xfree(ctl); - if ((item->pid->state == TASK_STOPPED) || (opts.final_state == TASK_STOPPED)) + if (opts.final_state == TASK_STOPPED) kill(item->pid->real, SIGSTOP); + else if (item->pid->state == TASK_STOPPED) { + if (item->pid->stop_signo > 0) + kill(item->pid->real, item->pid->stop_signo); + else + kill(item->pid->real, SIGSTOP); + } } } @@ -2215,7 +1997,6 @@ static void reap_zombies(void) static int restore_root_task(struct pstree_item *init) { - enum trace_flags flag = TRACE_ALL; int ret, fd, mnt_ns_fd = -1; int root_seized = 0; struct pstree_item *item; @@ -2339,7 +2120,7 @@ static int restore_root_task(struct pstree_item *init) * the '--empty-ns net' mode no iptables C/R is done and we * need to return these rules by hands. */ - ret = network_lock_internal(); + ret = network_lock_internal(/* restore = */ true); if (ret) goto out_kill; } @@ -2351,6 +2132,9 @@ static int restore_root_task(struct pstree_item *init) __restore_switch_stage(CR_STATE_FORKING); skip_ns_bouncing: + ret = run_plugins(POST_FORKING); + if (ret < 0 && ret != -ENOTSUP) + goto out_kill; ret = restore_wait_inprogress_tasks(); if (ret < 0) @@ -2378,6 +2162,10 @@ skip_ns_bouncing: if (ret < 0) goto out_kill; + ret = stop_cgroupd(); + if (ret < 0) + goto out_kill; + ret = move_veth_to_bridge(); if (ret < 0) goto out_kill; @@ -2430,7 +2218,7 @@ skip_ns_bouncing: timing_stop(TIME_RESTORE); - if (catch_tasks(root_seized, &flag)) { + if (catch_tasks(root_seized)) { pr_err("Can't catch all tasks\n"); goto out_kill_network_unlocked; } @@ -2440,16 +2228,18 @@ skip_ns_bouncing: __restore_switch_stage(CR_STATE_COMPLETE); - ret = compel_stop_on_syscall(task_entries->nr_threads, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); + ret = compel_stop_on_syscall(task_entries->nr_threads, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1)); if (ret) { pr_err("Can't stop all tasks on rt_sigreturn\n"); goto out_kill_network_unlocked; } - if (clear_breakpoints()) - pr_err("Unable to flush breakpoints\n"); - finalize_restore(); + + /* just before releasing threads we have to restore rseq_cs */ + if (restore_rseq_cs()) + pr_err("Unable to restore rseq_cs state\n"); + /* * Some external devices such as GPUs might need a very late * trigger to kick-off some events, memory notifiers and for @@ -2459,8 +2249,10 @@ skip_ns_bouncing: * mapped memory) could be done sanely once the pie code hands * over the control to master process. */ + pr_info("Run late stage hook from criu master for external devices\n"); for_each_pstree_item(item) { - pr_info("Run late stage hook from criu master for external devices\n"); + if (!task_alive(item)) + continue; ret = run_plugins(RESUME_DEVICES_LATE, item->pid->real); /* * This may not really be an error. Only certain plugin hooks @@ -2470,7 +2262,7 @@ skip_ns_bouncing: * might actually be a true error code but that would be also * captured in the plugin so no need to print the error here. */ - if (ret < 0) + if (ret < 0 && ret != -ENOTSUP) pr_debug("restore late stage hook for external plugin failed\n"); } @@ -2481,10 +2273,6 @@ skip_ns_bouncing: if (restore_freezer_state()) pr_err("Unable to restore freezer state\n"); - /* just before releasing threads we have to restore rseq_cs */ - if (restore_rseq_cs()) - pr_err("Unable to restore rseq_cs state\n"); - /* Detaches from processes and they continue run through sigreturn. */ if (finalize_restore_detach()) goto out_kill_network_unlocked; @@ -2551,6 +2339,7 @@ int prepare_task_entries(void) task_entries->nr_helpers = 0; futex_set(&task_entries->start, CR_STATE_FAIL); mutex_init(&task_entries->userns_sync_lock); + mutex_init(&task_entries->cgroupd_sync_lock); mutex_init(&task_entries->last_pid_mutex); return 0; @@ -2576,42 +2365,48 @@ int cr_restore_tasks(void) if (init_service_fd()) return 1; - if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) + if (check_img_inventory(/* restore = */ true) < 0) return -1; - if (check_img_inventory(/* restore = */ true) < 0) - goto err; - if (init_stats(RESTORE_STATS)) - goto err; + return -1; if (lsm_check_opts()) - goto err; + return -1; timing_start(TIME_RESTORE); if (cpu_init() < 0) - goto err; + return -1; if (vdso_init_restore()) - goto err; + return -1; if (tty_init_restore()) - goto err; + return -1; if (opts.cpu_cap & CPU_CAP_IMAGE) { if (cpu_validate_cpuinfo()) - goto err; + return -1; } if (prepare_task_entries() < 0) - goto err; + return -1; if (prepare_pstree() < 0) - goto err; + return -1; if (fdstore_init()) - goto err; + return -1; + + /* + * For the AMDGPU plugin, its parallel restore feature needs to use fdstore to store + * its socket file descriptor. This allows the main process and the target process to + * communicate with each other through this file descriptor. Therefore, cr_plugin_init + * must be initialized after fdstore_init. + */ + if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) + return -1; if (inherit_fd_move_to_fdstore()) goto err; @@ -2636,23 +2431,24 @@ err: return ret; } -static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_head *self_vma_list, long vma_len) +static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_head *self_vma_list, long min_addr, long vma_len) { struct vma_area *t_vma, *s_vma; - long prev_vma_end = 0; + long prev_vma_end = min_addr; struct vma_area end_vma; VmaEntry end_e; end_vma.e = &end_e; end_e.start = end_e.end = kdat.task_size; - prev_vma_end = kdat.mmap_min_addr; + INIT_LIST_HEAD(&end_vma.list); s_vma = list_first_entry(self_vma_list, struct vma_area, list); t_vma = list_first_entry(tgt_vma_list, struct vma_area, list); while (1) { if (prev_vma_end + vma_len > s_vma->e->start) { - if (s_vma->list.next == self_vma_list) { + if ((s_vma->list.next == self_vma_list) || + vma_area_is(vma_next(s_vma), VMA_AREA_GUARD)) { s_vma = &end_vma; continue; } @@ -2665,7 +2461,8 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he } if (prev_vma_end + vma_len > t_vma->e->start) { - if (t_vma->list.next == tgt_vma_list) { + if ((t_vma->list.next == tgt_vma_list) || + vma_area_is(vma_next(t_vma), VMA_AREA_GUARD)) { t_vma = &end_vma; continue; } @@ -2683,251 +2480,6 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he return -1; } -static inline int timeval_valid(struct timeval *tv) -{ - return (tv->tv_sec >= 0) && ((unsigned long)tv->tv_usec < USEC_PER_SEC); -} - -static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) -{ - if (ie->isec == 0 && ie->iusec == 0) { - memzero_p(val); - return 0; - } - - val->it_interval.tv_sec = ie->isec; - val->it_interval.tv_usec = ie->iusec; - - if (!timeval_valid(&val->it_interval)) { - pr_err("Invalid timer interval\n"); - return -1; - } - - if (ie->vsec == 0 && ie->vusec == 0) { - /* - * Remaining time was too short. Set it to - * interval to make the timer armed and work. - */ - val->it_value.tv_sec = ie->isec; - val->it_value.tv_usec = ie->iusec; - } else { - val->it_value.tv_sec = ie->vsec; - val->it_value.tv_usec = ie->vusec; - } - - if (!timeval_valid(&val->it_value)) { - pr_err("Invalid timer value\n"); - return -1; - } - - pr_info("Restored %s timer to %ld.%ld -> %ld.%ld\n", n, val->it_value.tv_sec, val->it_value.tv_usec, - val->it_interval.tv_sec, val->it_interval.tv_usec); - - return 0; -} - -/* - * Legacy itimers restore from CR_FD_ITIMERS - */ - -static int prepare_itimers_from_fd(int pid, struct task_restore_args *args) -{ - int ret = -1; - struct cr_img *img; - ItimerEntry *ie; - - if (!deprecated_ok("Itimers")) - return -1; - - img = open_image(CR_FD_ITIMERS, O_RSTR, pid); - if (!img) - return -1; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("real", ie, &args->itimers[0]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - goto out; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("virt", ie, &args->itimers[1]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - goto out; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("prof", ie, &args->itimers[2]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - goto out; -out: - close_image(img); - return ret; -} - -static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core) -{ - int ret = 0; - TaskTimersEntry *tte = core->tc->timers; - - if (!tte) - return prepare_itimers_from_fd(pid, args); - - ret |= decode_itimer("real", tte->real, &args->itimers[0]); - ret |= decode_itimer("virt", tte->virt, &args->itimers[1]); - ret |= decode_itimer("prof", tte->prof, &args->itimers[2]); - - return ret; -} - -static inline int timespec_valid(struct timespec *ts) -{ - return (ts->tv_sec >= 0) && ((unsigned long)ts->tv_nsec < NSEC_PER_SEC); -} - -static inline int decode_posix_timer(PosixTimerEntry *pte, struct restore_posix_timer *pt) -{ - pt->val.it_interval.tv_sec = pte->isec; - pt->val.it_interval.tv_nsec = pte->insec; - - if (!timespec_valid(&pt->val.it_interval)) { - pr_err("Invalid timer interval(posix)\n"); - return -1; - } - - if (pte->vsec == 0 && pte->vnsec == 0) { - /* - * Remaining time was too short. Set it to - * interval to make the timer armed and work. - */ - pt->val.it_value.tv_sec = pte->isec; - pt->val.it_value.tv_nsec = pte->insec; - } else { - pt->val.it_value.tv_sec = pte->vsec; - pt->val.it_value.tv_nsec = pte->vnsec; - } - - if (!timespec_valid(&pt->val.it_value)) { - pr_err("Invalid timer value(posix)\n"); - return -1; - } - - pt->spt.it_id = pte->it_id; - pt->spt.clock_id = pte->clock_id; - pt->spt.si_signo = pte->si_signo; - pt->spt.it_sigev_notify = pte->it_sigev_notify; - pt->spt.sival_ptr = decode_pointer(pte->sival_ptr); - pt->spt.notify_thread_id = pte->notify_thread_id; - pt->overrun = pte->overrun; - - return 0; -} - -static int cmp_posix_timer_proc_id(const void *p1, const void *p2) -{ - return ((struct restore_posix_timer *)p1)->spt.it_id - ((struct restore_posix_timer *)p2)->spt.it_id; -} - -static void sort_posix_timers(struct task_restore_args *ta) -{ - void *tmem; - - /* - * This is required for restorer's create_posix_timers(), - * it will probe them one-by-one for the desired ID, since - * kernel doesn't provide another API for timer creation - * with given ID. - */ - - if (ta->posix_timers_n > 0) { - tmem = rst_mem_remap_ptr((unsigned long)ta->posix_timers, RM_PRIVATE); - qsort(tmem, ta->posix_timers_n, sizeof(struct restore_posix_timer), cmp_posix_timer_proc_id); - } -} - -/* - * Legacy posix timers restoration from CR_FD_POSIX_TIMERS - */ - -static int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) -{ - struct cr_img *img; - int ret = -1; - struct restore_posix_timer *t; - - if (!deprecated_ok("Posix timers")) - return -1; - - img = open_image(CR_FD_POSIX_TIMERS, O_RSTR, pid); - if (!img) - return -1; - - ta->posix_timers_n = 0; - while (1) { - PosixTimerEntry *pte; - - ret = pb_read_one_eof(img, &pte, PB_POSIX_TIMER); - if (ret <= 0) - break; - - t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); - if (!t) - break; - - ret = decode_posix_timer(pte, t); - if (ret < 0) - break; - - posix_timer_entry__free_unpacked(pte, NULL); - ta->posix_timers_n++; - } - - close_image(img); - if (!ret) - sort_posix_timers(ta); - - return ret; -} - -static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) -{ - int i, ret = -1; - TaskTimersEntry *tte = core->tc->timers; - struct restore_posix_timer *t; - - ta->posix_timers = (struct restore_posix_timer *)rst_mem_align_cpos(RM_PRIVATE); - - if (!tte) - return prepare_posix_timers_from_fd(pid, ta); - - ta->posix_timers_n = tte->n_posix; - for (i = 0; i < ta->posix_timers_n; i++) { - t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); - if (!t) - goto out; - - if (decode_posix_timer(tte->posix[i], t)) - goto out; - } - - ret = 0; - sort_posix_timers(ta); -out: - return ret; -} - -static inline int verify_cap_size(CredsEntry *ce) -{ - return ((ce->n_cap_inh == CR_CAP_SIZE) && (ce->n_cap_eff == CR_CAP_SIZE) && (ce->n_cap_prm == CR_CAP_SIZE) && - (ce->n_cap_bnd == CR_CAP_SIZE)); -} - static int prepare_mm(pid_t pid, struct task_restore_args *args) { int exe_fd, i, ret = -1; @@ -2953,7 +2505,7 @@ static int prepare_mm(pid_t pid, struct task_restore_args *args) args->fd_exe_link = exe_fd; - args->has_thp_enabled = rsti(current)->has_thp_enabled; + args->thp_disabled = mm->has_thp_disabled && mm->thp_disabled; ret = 0; out: @@ -3019,6 +2571,17 @@ static int remap_restorer_blob(void *addr) restorer_setup_c_header_desc(&pbd, true); compel_relocs_apply(addr, addr, &pbd); + /* + * Ensure the infected thread sees the updated code. + * + * On architectures like ARM64, the Data Cache (D-cache) and + * Instruction Cache (I-cache) are not automatically coherent. + * Modifications land in the D-cache, so we must flush (clean) the + * D-cache to push changes to RAM to ensure the CPU fetches the updated + * instructions. + */ + __builtin___clear_cache(addr, addr + pbd.hdr.bsize); + return 0; } @@ -3027,7 +2590,7 @@ static int validate_sched_parm(struct rst_sched_param *sp) if ((sp->nice < -20) || (sp->nice > 19)) return 0; - switch (sp->policy) { + switch (sp->policy & ~SCHED_RESET_ON_FORK) { case SCHED_RR: case SCHED_FIFO: return ((sp->prio > 0) && (sp->prio < 100)); @@ -3078,7 +2641,6 @@ static int prep_rseq(struct rst_rseq_param *rseq, ThreadCoreEntry *tc) return 0; } -#if defined(__GLIBC__) && defined(RSEQ_SIG) static void prep_libc_rseq_info(struct rst_rseq_param *rseq) { if (!kdat.has_rseq) { @@ -3086,23 +2648,29 @@ static void prep_libc_rseq_info(struct rst_rseq_param *rseq) return; } - rseq->rseq_abi_pointer = encode_pointer(__criu_thread_pointer() + __rseq_offset); - rseq->rseq_abi_size = __rseq_size; - rseq->signature = RSEQ_SIG; -} + if (!kdat.has_ptrace_get_rseq_conf) { +#if defined(__GLIBC__) && defined(RSEQ_SIG) + rseq->rseq_abi_pointer = encode_pointer(__criu_thread_pointer() + __rseq_offset); + /* + * Current glibc reports the feature/active size in + * __rseq_size, not the size passed to the kernel. + * This could be 20, but older kernels expect 32 for + * the size argument even if only 20 bytes are used. + */ + rseq->rseq_abi_size = __rseq_size; + if (rseq->rseq_abi_size < 32) + rseq->rseq_abi_size = 32; + rseq->signature = RSEQ_SIG; #else -static void prep_libc_rseq_info(struct rst_rseq_param *rseq) -{ - /* - * TODO: handle built-in rseq on other libc'ies like musl - * We can do that using get_rseq_conf kernel feature. - * - * For now we just assume that other libc libraries are - * not registering rseq by default. - */ - rseq->rseq_abi_pointer = 0; -} + rseq->rseq_abi_pointer = 0; #endif + return; + } + + rseq->rseq_abi_pointer = kdat.libc_rseq_conf.rseq_abi_pointer; + rseq->rseq_abi_size = kdat.libc_rseq_conf.rseq_abi_size; + rseq->signature = kdat.libc_rseq_conf.signature; +} static rlim_t decode_rlim(rlim_t ival) { @@ -3345,17 +2913,31 @@ static bool groups_match(gid_t *groups, int n_groups) return ret; } +static void copy_caps(u32 *out_caps, u32 *in_caps, int n_words) +{ + int i, cap_end; + + for (i = kdat.last_cap + 1; i < 32 * n_words; ++i) { + if (~in_caps[i / 32] & (1 << (i % 32))) + continue; + + pr_warn("Dropping unsupported capability %d > %d)\n", i, kdat.last_cap); + /* extra caps will be cleared below */ + } + + n_words = min(n_words, (kdat.last_cap + 31) / 32); + cap_end = (kdat.last_cap & 31) + 1; + memcpy(out_caps, in_caps, sizeof(*out_caps) * n_words); + if ((cap_end & 31) && n_words) + out_caps[n_words - 1] &= (1 << cap_end) - 1; + memset(out_caps + n_words, 0, sizeof(*out_caps) * (CR_CAP_SIZE - n_words)); +} + static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos) { unsigned long this_pos; struct thread_creds_args *args; - if (!verify_cap_size(ce)) { - pr_err("Caps size mismatch %d %d %d %d\n", (int)ce->n_cap_inh, (int)ce->n_cap_eff, (int)ce->n_cap_prm, - (int)ce->n_cap_bnd); - return ERR_PTR(-EINVAL); - } - this_pos = rst_mem_align_cpos(RM_PRIVATE); args = rst_mem_alloc(sizeof(*args), RM_PRIVATE); @@ -3391,7 +2973,7 @@ static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo args = rst_mem_remap_ptr(this_pos, RM_PRIVATE); args->lsm_profile = lsm_profile; - strlcpy(args->lsm_profile, rendered, lsm_profile_len + 1); + __strlcpy(args->lsm_profile, rendered, lsm_profile_len + 1); xfree(rendered); } } else { @@ -3425,7 +3007,7 @@ static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo args = rst_mem_remap_ptr(this_pos, RM_PRIVATE); args->lsm_sockcreate = lsm_sockcreate; - strlcpy(args->lsm_sockcreate, rendered, lsm_sockcreate_len + 1); + __strlcpy(args->lsm_sockcreate, rendered, lsm_sockcreate_len + 1); xfree(rendered); } } else { @@ -3440,13 +3022,15 @@ static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo args->creds.cap_eff = NULL; args->creds.cap_prm = NULL; args->creds.cap_bnd = NULL; + args->creds.cap_amb = NULL; args->creds.groups = NULL; args->creds.lsm_profile = NULL; - memcpy(args->cap_inh, ce->cap_inh, sizeof(args->cap_inh)); - memcpy(args->cap_eff, ce->cap_eff, sizeof(args->cap_eff)); - memcpy(args->cap_prm, ce->cap_prm, sizeof(args->cap_prm)); - memcpy(args->cap_bnd, ce->cap_bnd, sizeof(args->cap_bnd)); + copy_caps(args->cap_inh, ce->cap_inh, ce->n_cap_inh); + copy_caps(args->cap_eff, ce->cap_eff, ce->n_cap_eff); + copy_caps(args->cap_prm, ce->cap_prm, ce->n_cap_prm); + copy_caps(args->cap_bnd, ce->cap_bnd, ce->n_cap_bnd); + copy_caps(args->cap_amb, ce->cap_amb, ce->n_cap_amb); if (ce->n_groups && !groups_match(ce->groups, ce->n_groups)) { unsigned int *groups; @@ -3549,6 +3133,9 @@ static void *restorer_munmap_addr(CoreEntry *core, void *restorer_blob) return restorer_sym(restorer_blob, arch_export_unmap); } +void arch_rsti_init(struct pstree_item *p) __attribute__((weak)); +void arch_rsti_init(struct pstree_item *p) {} + static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, unsigned long alen, CoreEntry *core) { void *mem = MAP_FAILED; @@ -3609,7 +3196,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns rst_mem_size = rst_mem_lock(); memzone_size = round_up(sizeof(struct restore_mem_zone) * current->nr_threads, page_size()); - task_args->bootstrap_len = restorer_len + memzone_size + alen + rst_mem_size; + task_args->bootstrap_len = restorer_len + memzone_size + alen + rst_mem_size + shstk_restorer_stack_size(); BUG_ON(task_args->bootstrap_len & (PAGE_SIZE - 1)); pr_info("%d threads require %ldK of memory\n", current->nr_threads, KBYTES(task_args->bootstrap_len)); @@ -3639,7 +3226,9 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns * or inited from scratch). */ - mem = (void *)restorer_get_vma_hint(&vmas->h, &self_vmas.h, task_args->bootstrap_len); + mem = (void *)restorer_get_vma_hint(&vmas->h, &self_vmas.h, + shstk_min_mmap_addr(&task_args->shstk, kdat.mmap_min_addr), + task_args->bootstrap_len); if (mem == (void *)-1) { pr_err("No suitable area for task_restore bootstrap (%ldK)\n", task_args->bootstrap_len); goto err; @@ -3759,11 +3348,16 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns prep_libc_rseq_info(&task_args->libc_rseq); + task_args->uid = opts.uid; + for (i = 0; i < CR_CAP_SIZE; i++) + task_args->cap_eff[i] = opts.cap_eff[i]; + /* * Fill up per-thread data. */ creds_pos_next = creds_pos; siginfo_n = task_args->siginfo_n; + arch_rsti_init(current); for (i = 0; i < current->nr_threads; i++) { CoreEntry *tcore; struct rt_sigframe *sigframe; @@ -3816,6 +3410,13 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr; core_get_tls(tcore, &thread_args[i].tls); + if (tcore->thread_core->has_cg_set && rsti(current)->cg_set != tcore->thread_core->cg_set) { + thread_args[i].cg_set = tcore->thread_core->cg_set; + thread_args[i].cgroupd_sk = dup(get_service_fd(CGROUPD_SK)); + } else { + thread_args[i].cg_set = -1; + } + ret = prep_rseq(&thread_args[i].rseq, tcore->thread_core); if (ret) goto err; @@ -3866,6 +3467,10 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns * self-vmas are unmaped. */ mem += rst_mem_size; + + shstk_set_restorer_stack(&task_args->shstk, mem); + mem += shstk_restorer_stack_size(); + task_args->vdso_rt_parked_at = (unsigned long)mem; task_args->vdso_maps_rt = vdso_maps_rt; task_args->vdso_rt_size = vdso_rt_size; @@ -3910,6 +3515,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns close_service_fd(USERNSD_SK); close_service_fd(FDSTORE_SK_OFF); close_service_fd(RPC_SK_OFF); + close_service_fd(CGROUPD_SK); __gcov_flush(); diff --git a/criu/cr-service.c b/criu/cr-service.c index a6eb9ebd3..dccf4ef38 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "version.h" #include "crtools.h" @@ -239,15 +240,165 @@ int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd) return 0; } -static char images_dir[PATH_MAX]; +int exec_rpc_query_external_files(char *name, int sk) +{ + int i, ret; + CriuNotify cn = CRIU_NOTIFY__INIT; + CriuResp msg = CRIU_RESP__INIT; + CriuReq *req; + + cn.script = name; + + msg.type = CRIU_REQ_TYPE__NOTIFY; + msg.success = true; + msg.notify = &cn; + + ret = send_criu_msg_with_fd(sk, &msg, -1); + if (ret < 0) + return ret; + + ret = recv_criu_msg(sk, &req); + if (ret < 0) + return ret; + + if (req->type != CRIU_REQ_TYPE__NOTIFY || !req->notify_success) { + pr_err("RPC client reported script error\n"); + return -1; + } + + ret = 0; + if (req->opts) + for (i = 0; i < req->opts->n_external; i++) { + char *key = req->opts->external[i]; + pr_info("Adding external object: %s\n", key); + if (add_external(key)) { + pr_err("Failed to add external object: %s\n", key); + ret = -1; + } + } + else + pr_info("RPC NOTIFY %s: no `opts` returned.\n", name); + + criu_req__free_unpacked(req, NULL); + return ret; +} + +static int resolve_images_dir_path(char *images_dir_path, + bool imgs_changed_by_rpc_conf, + const CriuOpts *req, + pid_t peer_pid) +{ + /* + * images_dir_fd is a required RPC parameter with -1 as default value. + * + * This assumes that if opts.imgs_dir is set, we have a value + * from the configuration file parser. The test to see that + * imgs_changed_by_rpc_conf is true is used to make sure the value + * is from the RPC configuration file. The idea is that only the + * RPC configuration file is able to overwrite RPC settings: + * * apply_config(global_conf) + * * apply_config(user_conf) + * * apply_config(environment variable) + * * apply_rpc_options() + * * apply_config(rpc_conf) + */ + if (imgs_changed_by_rpc_conf) { + strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); + images_dir_path[PATH_MAX - 1] = '\0'; + } else if (req->images_dir_fd != -1) { + snprintf(images_dir_path, PATH_MAX, "/proc/%d/fd/%d", peer_pid, req->images_dir_fd); + } else if (req->images_dir) { + strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); + images_dir_path[PATH_MAX - 1] = '\0'; + } else { + /* + * Since images dir is not required in CHECK mode, we need to + * check for work_dir_fd in setup_images_and_workdir() + */ + if (opts.mode == CR_CHECK) + return 0; + pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); + return -1; + } + + return 0; +} + +static int setup_images_and_workdir(const char *images_dir_path, + bool work_changed_by_rpc_conf, + CriuOpts *req, + pid_t peer_pid) +{ + char work_dir_path[PATH_MAX] = ""; + + /* We don't need to open images dir in CHECK mode. */ + if (opts.mode != CR_CHECK) { + /* + * Image streaming is not supported with CRIU's service feature as + * the streamer must be started for each dump/restore operation. + * It is unclear how to do that with RPC, so we punt for now. + * This explains why we provide the argument mode=-1 instead of + * O_RSTR or O_DUMP. + */ + if (open_image_dir(images_dir_path, -1) < 0) { + pr_perror("Can't open images directory"); + return -1; + } + } + + if (work_changed_by_rpc_conf) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else if (req->has_work_dir_fd) + sprintf(work_dir_path, "/proc/%d/fd/%d", peer_pid, req->work_dir_fd); + else if (opts.work_dir) + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else if (images_dir_path[0] != '\0') + strcpy(work_dir_path, images_dir_path); + + if (work_dir_path[0] == '\0') { + pr_err("images-dir or work-dir is required when using log file\n"); + return -1; + } + + if (chdir(work_dir_path)) { + pr_perror("Can't chdir to work_dir"); + return -1; + } + + return 0; +} + +static int setup_logging_from_req(CriuOpts *req, bool output_changed_by_rpc_conf) +{ + if (req->log_file && !output_changed_by_rpc_conf) { + if (strchr(req->log_file, '/')) { + pr_perror("No subdirs are allowed in log_file name"); + return -1; + } + SET_CHAR_OPTS(output, req->log_file); + } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { + xfree(opts.output); + opts.output = NULL; /* log_init(NULL) writes to stderr */ + } else if (!opts.output) { + SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); + } + + opts.log_level = req->log_level; + log_set_loglevel(opts.log_level); + if (log_init(opts.output)) { + pr_perror("Can't initiate log"); + return -1; + } + + return 0; +} static int setup_opts_from_req(int sk, CriuOpts *req) { struct ucred ids; struct stat st; socklen_t ids_len = sizeof(struct ucred); - char images_dir_path[PATH_MAX]; - char work_dir_path[PATH_MAX]; + char images_dir_path[PATH_MAX] = ""; char status_fd[PATH_MAX]; bool output_changed_by_rpc_conf = false; bool work_changed_by_rpc_conf = false; @@ -260,6 +411,23 @@ static int setup_opts_from_req(int sk, CriuOpts *req) goto err; } + /* + * The options relevant in CHECK mode are: log_file, log_to_stderr, and log_level. + * When logging to a file, we also need to resolve images_dir and work_dir. + */ + if (opts.mode == CR_CHECK) { + if (!req) + return 0; /* nothing to do */ + + /* + * A log file is needed only if: + * - log_file is explicitly set, or + * - log_to_stderr is NOT requested (i.e., using DEFAULT_LOG_FILENAME) + */ + if (!req->log_file || (req->has_log_to_stderr && req->log_to_stderr)) + return 0; /* no log file, don't require images_dir or work_dir */ + } + if (fstat(sk, &st)) { pr_perror("Can't get socket stat"); goto err; @@ -268,149 +436,8 @@ static int setup_opts_from_req(int sk, CriuOpts *req) BUG_ON(st.st_ino == -1); service_sk_ino = st.st_ino; - /* - * Evaluate an additional configuration file if specified. - * This needs to happen twice, because it is needed early to detect - * things like work_dir, imgs_dir and logfile. The second parsing - * of the optional RPC configuration file happens at the end and - * overwrites all options set via RPC. - */ - if (req->config_file) { - char *tmp_output = opts.output; - char *tmp_work = opts.work_dir; - char *tmp_imgs = opts.imgs_dir; - - opts.output = NULL; - opts.work_dir = NULL; - opts.imgs_dir = NULL; - - rpc_cfg_file = req->config_file; - i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); - if (i) { - xfree(tmp_output); - xfree(tmp_work); - xfree(tmp_imgs); - goto err; - } - /* If this is non-NULL, the RPC configuration file had a value, use it.*/ - if (opts.output) - output_changed_by_rpc_conf = true; - /* If this is NULL, use the old value if it was set. */ - if (!opts.output && tmp_output) { - opts.output = tmp_output; - tmp_output = NULL; - } - - if (opts.work_dir) - work_changed_by_rpc_conf = true; - if (!opts.work_dir && tmp_work) { - opts.work_dir = tmp_work; - tmp_work = NULL; - } - - if (opts.imgs_dir) - imgs_changed_by_rpc_conf = true; - /* - * As the images directory is a required RPC setting, it is not - * necessary to use the value from other configuration files. - * Either it is set in the RPC configuration file or it is set - * via RPC. - */ - xfree(tmp_output); - xfree(tmp_work); - xfree(tmp_imgs); - } - - /* - * open images_dir - images_dir_fd is a required RPC parameter - * - * This assumes that if opts.imgs_dir is set we have a value - * from the configuration file parser. The test to see that - * imgs_changed_by_rpc_conf is true is used to make sure the value - * is from the RPC configuration file. - * The idea is that only the RPC configuration file is able to - * overwrite RPC settings: - * * apply_config(global_conf) - * * apply_config(user_conf) - * * apply_config(environment variable) - * * apply_rpc_options() - * * apply_config(rpc_conf) - */ - if (imgs_changed_by_rpc_conf) - strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); - else - sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); - - if (req->parent_img) - SET_CHAR_OPTS(img_parent, req->parent_img); - - /* - * Image streaming is not supported with CRIU's service feature as - * the streamer must be started for each dump/restore operation. - * It is unclear how to do that with RPC, so we punt for now. - * This explains why we provide the argument mode=-1 instead of - * O_RSTR or O_DUMP. - */ - if (open_image_dir(images_dir_path, -1) < 0) { - pr_perror("Can't open images directory"); - goto err; - } - - /* get full path to images_dir to use in process title */ - if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { - pr_perror("Can't readlink %s", images_dir_path); - goto err; - } - - /* chdir to work dir */ - if (work_changed_by_rpc_conf) - /* Use the value from the RPC configuration file first. */ - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else if (req->has_work_dir_fd) - /* Use the value set via RPC. */ - sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); - else if (opts.work_dir) - /* Use the value from one of the other configuration files. */ - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else - /* Use the images directory a work directory. */ - strcpy(work_dir_path, images_dir_path); - - if (chdir(work_dir_path)) { - pr_perror("Can't chdir to work_dir"); - goto err; - } - - /* initiate log file in work dir */ - if (req->log_file && !output_changed_by_rpc_conf) { - /* - * If RPC sets a log file and if there nothing from the - * RPC configuration file, use the RPC value. - */ - if (strchr(req->log_file, '/')) { - pr_perror("No subdirs are allowed in log_file name"); - goto err; - } - - SET_CHAR_OPTS(output, req->log_file); - } else if (!opts.output) { - SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); - } - - /* This is needed later to correctly set the log_level */ - opts.log_level = req->log_level; - log_set_loglevel(req->log_level); - if (log_init(opts.output) == -1) { - pr_perror("Can't initiate log"); - goto err; - } - - if (req->config_file) { - pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); - } - - if (kerndat_init()) - return 1; + if (req->has_unprivileged) + opts.unprivileged = req->unprivileged; if (log_keep_err()) { pr_perror("Can't tune log"); @@ -421,6 +448,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_leave_running && req->leave_running) opts.final_state = TASK_ALIVE; + if (req->has_leave_stopped && req->leave_stopped) + opts.final_state = TASK_STOPPED; + if (!req->has_pid) { req->has_pid = true; req->pid = ids.pid; @@ -464,6 +494,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_shell_job) opts.shell_job = req->shell_job; + if (req->has_skip_file_rwx_check) + opts.skip_file_rwx_check = req->skip_file_rwx_check; + if (req->has_file_locks) opts.handle_file_locks = req->file_locks; @@ -510,6 +543,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) case CRIU_NETWORK_LOCK_METHOD__NFTABLES: opts.network_lock_method = NETWORK_LOCK_NFTABLES; break; + case CRIU_NETWORK_LOCK_METHOD__SKIP: + opts.network_lock_method = NETWORK_LOCK_SKIP; + break; default: goto err; } @@ -685,14 +721,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->empty_ns & ~(CLONE_NEWNET)) goto err; } - - if (req->n_irmap_scan_paths) { - for (i = 0; i < req->n_irmap_scan_paths; i++) { - if (irmap_scan_path_add(req->irmap_scan_paths[i])) - goto err; - } - } - if (req->has_status_fd) { pr_warn("status_fd is obsoleted; use status-ready notification instead\n"); @@ -704,25 +732,95 @@ static int setup_opts_from_req(int sk, CriuOpts *req) } } - if (req->has_pidfd_store_sk && init_pidfd_store_sk(ids.pid, req->pidfd_store_sk)) - goto err; - if (req->orphan_pts_master) opts.orphan_pts_master = true; - /* Evaluate additional configuration file a second time to overwrite - * all RPC settings. */ + if (req->has_display_stats) + opts.display_stats = req->display_stats; + + /* Evaluate additional configuration file (e.g., runc.conf) to overwrite all RPC settings. */ if (req->config_file) { + char *tmp_output = opts.output; + char *tmp_work = opts.work_dir; + + opts.output = NULL; + opts.work_dir = NULL; + + /* + * As the images directory is a required RPC setting, it is not + * necessary to use the value from other configuration files. + * Either it is set in the RPC configuration file or it is set + * via RPC. + */ + xfree(opts.imgs_dir); + opts.imgs_dir = NULL; + + pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); + rpc_cfg_file = req->config_file; i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); - if (i) + if (i) { + xfree(tmp_output); + xfree(tmp_work); goto err; + } + + /* If opts.{output,work_dir} is non-NULL, the RPC configuration file had a value, use it.*/ + /* If opts.{output,work_dir} is NULL, use the old value if it was set. */ + if (opts.output) { + output_changed_by_rpc_conf = true; + } else { + opts.output = tmp_output; + tmp_output = NULL; + } + + if (opts.work_dir) { + work_changed_by_rpc_conf = true; + } else { + opts.work_dir = tmp_work; + tmp_work = NULL; + } + + if (opts.imgs_dir) + imgs_changed_by_rpc_conf = true; + + xfree(tmp_output); + xfree(tmp_work); } + if (resolve_images_dir_path(images_dir_path, imgs_changed_by_rpc_conf, req, ids.pid) < 0) + goto err; + + if (req->parent_img) + SET_CHAR_OPTS(img_parent, req->parent_img); + + if (setup_images_and_workdir(images_dir_path, work_changed_by_rpc_conf, req, ids.pid)) + goto err; + + if (req->n_irmap_scan_paths) { + for (i = 0; i < req->n_irmap_scan_paths; i++) { + if (irmap_scan_path_add(req->irmap_scan_paths[i])) + goto err; + } + } + + /* initiate log file in work dir */ + if (setup_logging_from_req(req, output_changed_by_rpc_conf)) + goto err; + + if (check_caps()) + goto err; + + if (kerndat_init()) + goto err; + + /* init_pidfd_store_sk must be called after kerndat_init. */ + if (req->has_pidfd_store_sk && init_pidfd_store_sk(ids.pid, req->pidfd_store_sk)) + goto err; + if (req->mntns_compat_mode) opts.mntns_compat_mode = true; - log_set_loglevel(opts.log_level); if (check_options()) goto err; @@ -742,7 +840,7 @@ static int dump_using_req(int sk, CriuOpts *req) if (setup_opts_from_req(sk, req)) goto exit; - setproctitle("dump --rpc -t %d -D %s", req->pid, images_dir); + __setproctitle("dump --rpc -t %d", req->pid); if (init_pidfd_store_hash()) goto pidfd_store_err; @@ -785,7 +883,7 @@ static int restore_using_req(int sk, CriuOpts *req) if (setup_opts_from_req(sk, req)) goto exit; - setproctitle("restore --rpc -D %s", images_dir); + __setproctitle("restore --rpc"); if (cr_restore_tasks()) goto exit; @@ -824,6 +922,11 @@ static int check(int sk, CriuOpts *req) resp.type = CRIU_REQ_TYPE__CHECK; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -831,7 +934,7 @@ static int check(int sk, CriuOpts *req) } if (pid == 0) { - setproctitle("check --rpc"); + __setproctitle("check --rpc"); opts.mode = CR_CHECK; if (setup_opts_from_req(sk, req)) @@ -848,6 +951,7 @@ static int check(int sk, CriuOpts *req) resp.success = true; out: + set_resp_err(&resp); return send_criu_msg(sk, &resp); } @@ -856,6 +960,11 @@ static int pre_dump_using_req(int sk, CriuOpts *req, bool single) int pid, status; bool success = false; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -869,7 +978,7 @@ static int pre_dump_using_req(int sk, CriuOpts *req, bool single) if (setup_opts_from_req(sk, req)) goto cout; - setproctitle("pre-dump --rpc -t %d -D %s", req->pid, images_dir); + __setproctitle("pre-dump --rpc -t %d", req->pid); if (init_pidfd_store_hash()) goto pidfd_store_err; @@ -934,6 +1043,11 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) CriuPageServerInfo ps = CRIU_PAGE_SERVER_INFO__INIT; struct ps_info info; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + if (pipe(start_pipe)) { pr_perror("No start pipe"); goto out; @@ -947,7 +1061,7 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) if (setup_opts_from_req(sk, req)) goto out_ch; - setproctitle("page-server --rpc --address %s --port %hu", opts.addr, opts.port); + __setproctitle("page-server --rpc --address %s --port %hu", opts.addr, opts.port); pr_debug("Starting page server\n"); @@ -1007,6 +1121,7 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) out: resp.type = CRIU_REQ_TYPE__PAGE_SERVER; resp.success = success; + set_resp_err(&resp); return send_criu_msg(sk, &resp); } @@ -1107,7 +1222,7 @@ static int handle_feature_check(int sk, CriuReq *msg) if (kerndat_init()) exit(1); - setproctitle("feature-check --rpc"); + __setproctitle("feature-check --rpc"); if ((msg->features->has_mem_track == 1) && (msg->features->mem_track == true)) feat.mem_track = kdat.has_dirty_track; @@ -1181,6 +1296,11 @@ static int handle_cpuinfo(int sk, CriuReq *msg) bool success = false; int pid, status; + if (log_keep_err()) { + pr_perror("Can't tune log"); + goto out; + } + pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -1190,12 +1310,11 @@ static int handle_cpuinfo(int sk, CriuReq *msg) if (pid == 0) { int ret = 1; - opts.mode = CR_CPUINFO; + opts.mode = (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ? CR_CPUINFO_DUMP : CR_CPUINFO_CHECK; if (setup_opts_from_req(sk, msg->opts)) goto cout; - setproctitle("cpuinfo %s --rpc -D %s", msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ? "dump" : "check", - images_dir); + __setproctitle("cpuinfo %s --rpc", msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ? "dump" : "check"); if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ret = cpuinfo_dump(); @@ -1230,7 +1349,7 @@ static int handle_cpuinfo(int sk, CriuReq *msg) out: resp.type = msg->type; resp.success = success; - + set_resp_err(&resp); return send_criu_msg(sk, &resp); } @@ -1239,6 +1358,14 @@ int cr_service_work(int sk) int ret = -1; CriuReq *msg = 0; + /* + * util_init initializes criu_run_id and compel_run_id so that sockets + * are generated with an unique name identifying the specific process + * even in cases where multiple processes with the same pid in + * different pid namespaces are sharing the same network namespace. + */ + util_init(); + more: opts.mode = CR_SWRK; diff --git a/criu/crtools.c b/criu/crtools.c index cc8d9179f..4dc55a065 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -54,19 +54,17 @@ void flush_early_log_to_stderr(void) flush_early_log_buffer(STDERR_FILENO); } -static int image_dir_mode(char *argv[], int optind) +static int image_dir_mode(void) { switch (opts.mode) { case CR_DUMP: /* fallthrough */ + case CR_CPUINFO_DUMP: + /* fallthrough */ case CR_PRE_DUMP: return O_DUMP; case CR_RESTORE: return O_RSTR; - case CR_CPUINFO: - if (!strcmp(argv[optind + 1], "dump")) - return O_DUMP; - /* fallthrough */ default: return -1; } @@ -76,36 +74,55 @@ static int image_dir_mode(char *argv[], int optind) return -1; } -static int parse_criu_mode(char *mode) -{ - if (!strcmp(mode, "dump")) - opts.mode = CR_DUMP; - else if (!strcmp(mode, "pre-dump")) - opts.mode = CR_PRE_DUMP; - else if (!strcmp(mode, "restore")) - opts.mode = CR_RESTORE; - else if (!strcmp(mode, "lazy-pages")) - opts.mode = CR_LAZY_PAGES; - else if (!strcmp(mode, "check")) - opts.mode = CR_CHECK; - else if (!strcmp(mode, "page-server")) - opts.mode = CR_PAGE_SERVER; - else if (!strcmp(mode, "service")) - opts.mode = CR_SERVICE; - else if (!strcmp(mode, "swrk")) - opts.mode = CR_SWRK; - else if (!strcmp(mode, "dedup")) - opts.mode = CR_DEDUP; - else if (!strcmp(mode, "cpuinfo")) - opts.mode = CR_CPUINFO; - else if (!strcmp(mode, "exec")) - opts.mode = CR_EXEC_DEPRECATED; - else if (!strcmp(mode, "show")) - opts.mode = CR_SHOW_DEPRECATED; - else - return -1; +struct { + char *cmd; + int mode; +} commands[] = { + { "dump", CR_DUMP }, + { "pre-dump", CR_PRE_DUMP }, + { "restore", CR_RESTORE }, + { "lazy-pages", CR_LAZY_PAGES }, + { "check", CR_CHECK }, + { "page-server", CR_PAGE_SERVER }, + { "service", CR_SERVICE }, + { "swrk", CR_SWRK }, + { "dedup", CR_DEDUP }, + { "exec", CR_EXEC_DEPRECATED }, + { "show", CR_SHOW_DEPRECATED }, +}; - return 0; +static int parse_criu_mode(int argc, char **argv, int *optind) +{ + char *cmd = argv[*optind]; + bool has_sub_command = (argc - *optind) > 1; + char *subcommand = has_sub_command ? argv[*optind + 1] : NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(commands); i++) { + if (strcmp(cmd, commands[i].cmd)) + continue; + opts.mode = commands[i].mode; + return 0; + } + + if (!strcmp(cmd, "cpuinfo")) { + if (subcommand == NULL) { + pr_err("cpuinfo requires an action: dump or check\n"); + return -1; + } + if (!strcmp(subcommand, "dump")) + opts.mode = CR_CPUINFO_DUMP; + else if (!strcmp(subcommand, "check")) + opts.mode = CR_CPUINFO_CHECK; + else { + pr_err("unknown cpuinfo sub-command: %s\n", subcommand); + return -1; + } + (*optind)++; + return 0; + } + pr_err("unknown command: %s\n", argv[*optind]); + return -1; } int main(int argc, char *argv[], char *envp[]) @@ -115,6 +132,7 @@ int main(int argc, char *argv[], char *envp[]) bool has_exec_cmd = false; bool has_sub_command; int state = PARSING_GLOBAL_CONF; + char *cmd; BUILD_BUG_ON(CTL_32 != SYSCTL_TYPE__CTL_32); BUILD_BUG_ON(__CTL_STR != SYSCTL_TYPE__CTL_STR); @@ -127,7 +145,7 @@ int main(int argc, char *argv[], char *envp[]) } cr_pb_init(); - setproctitle_init(argc, argv, envp); + __setproctitle_init(argc, argv, envp); if (argc < 2) goto usage; @@ -165,11 +183,18 @@ int main(int argc, char *argv[], char *envp[]) return 1; } - if (parse_criu_mode(argv[optind])) { - pr_err("unknown command: %s\n", argv[optind]); + cmd = argv[optind]; + ret = parse_criu_mode(argc, argv, &optind); + if (ret) goto usage; - } + /* + * util_init initializes criu_run_id and compel_run_id so that sockets + * are generated with an unique name identifying the specific process + * even in cases where multiple processes with the same pid in + * different pid namespaces are sharing the same network namespace. + */ + util_init(); if (opts.mode == CR_SWRK) { if (argc != optind + 2) { fprintf(stderr, "Usage: criu swrk \n"); @@ -185,6 +210,9 @@ int main(int argc, char *argv[], char *envp[]) return cr_service_work(atoi(argv[optind + 1])); } + if (check_caps()) + return 1; + if (opts.imgs_dir == NULL) SET_CHAR_OPTS(imgs_dir, "."); @@ -214,25 +242,19 @@ int main(int argc, char *argv[], char *envp[]) return 1; memcpy(opts.exec_cmd, &argv[optind + 1], (argc - optind - 1) * sizeof(char *)); opts.exec_cmd[argc - optind - 1] = NULL; - } else { - /* No subcommands except for cpuinfo and restore --exec-cmd */ - if (opts.mode != CR_CPUINFO && has_sub_command) { - pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? "s" : "", argv[optind]); - goto usage; - } else if (opts.mode == CR_CPUINFO && !has_sub_command) { - pr_err("cpuinfo requires an action: dump or check\n"); - goto usage; - } + } else if (has_sub_command) { + pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? "s" : "", cmd); + goto usage; } - if (opts.stream && image_dir_mode(argv, optind) == -1) { - pr_err("--stream cannot be used with the %s command\n", argv[optind]); + if (opts.stream && image_dir_mode() == -1) { + pr_err("--stream cannot be used with the %s command\n", cmd); goto usage; } /* We must not open imgs dir, if service is called */ if (opts.mode != CR_SERVICE) { - ret = open_image_dir(opts.imgs_dir, image_dir_mode(argv, optind)); + ret = open_image_dir(opts.imgs_dir, image_dir_mode()); if (ret < 0) { pr_err("Couldn't open image dir %s\n", opts.imgs_dir); return 1; @@ -251,8 +273,6 @@ int main(int argc, char *argv[], char *envp[]) return 1; } - util_init(); - if (log_init(opts.output)) return 1; @@ -279,14 +299,13 @@ int main(int argc, char *argv[], char *envp[]) if (opts.img_parent) pr_info("Will do snapshot from %s\n", opts.img_parent); - if (opts.mode == CR_DUMP) { + switch (opts.mode) { + case CR_DUMP: if (!opts.tree_id) goto opt_pid_missing; return cr_dump_tasks(opts.tree_id); - } - - if (opts.mode == CR_PRE_DUMP) { + case CR_PRE_DUMP: if (!opts.tree_id) goto opt_pid_missing; @@ -296,9 +315,7 @@ int main(int argc, char *argv[], char *envp[]) } return cr_pre_dump_tasks(opts.tree_id) != 0; - } - - if (opts.mode == CR_RESTORE) { + case CR_RESTORE: if (opts.tree_id) pr_warn("Using -t with criu restore is obsoleted\n"); @@ -311,46 +328,41 @@ int main(int argc, char *argv[], char *envp[]) } return ret != 0; - } - if (opts.mode == CR_LAZY_PAGES) + case CR_LAZY_PAGES: return cr_lazy_pages(opts.daemon_mode) != 0; - if (opts.mode == CR_CHECK) + case CR_CHECK: return cr_check() != 0; - if (opts.mode == CR_PAGE_SERVER) + case CR_PAGE_SERVER: return cr_page_server(opts.daemon_mode, false, -1) != 0; - if (opts.mode == CR_SERVICE) + case CR_SERVICE: return cr_service(opts.daemon_mode); - if (opts.mode == CR_DEDUP) + case CR_DEDUP: return cr_dedup() != 0; - if (opts.mode == CR_CPUINFO) { - if (!argv[optind + 1]) { - pr_err("cpuinfo requires an action: dump or check\n"); - goto usage; - } - if (!strcmp(argv[optind + 1], "dump")) - return cpuinfo_dump(); - else if (!strcmp(argv[optind + 1], "check")) - return cpuinfo_check(); - } + case CR_CPUINFO_DUMP: + return cpuinfo_dump(); - if (opts.mode == CR_EXEC_DEPRECATED) { + case CR_CPUINFO_CHECK: + return cpuinfo_check(); + + case CR_EXEC_DEPRECATED: pr_err("The \"exec\" action is deprecated by the Compel library.\n"); return -1; - } - if (opts.mode == CR_SHOW_DEPRECATED) { + case CR_SHOW_DEPRECATED: pr_err("The \"show\" action is deprecated by the CRIT utility.\n"); pr_err("To view an image use the \"crit decode -i $name --pretty\" command.\n"); return -1; - } - pr_err("unknown command: %s\n", argv[optind]); + case CR_UNSET: + default: + pr_err("unknown command: %s\n", cmd); + } usage: pr_msg("\n" "Usage:\n" @@ -411,9 +423,10 @@ usage: " --mntns-compat-mode Use mount engine in compatibility mode. By default criu\n" " tries to use mount-v2 mode with more reliable algorithm\n" " based on MOVE_MOUNT_SET_GROUP kernel feature\n" - " --network-lock METHOD\n" - " network locking/unlocking method; argument\n" - " can be 'nftables' or 'iptables' (default).\n" + " --network-lock METHOD network locking/unlocking method; argument\n" + " can be 'nftables' or 'iptables' (default).\n" + " --unprivileged accept limitations when running as non-root\n" + " --allow-uprobes allow dump/restore with uprobes vma\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" @@ -441,6 +454,7 @@ usage: " is inaccessible\n" " --link-remap allow one to link unlinked files back when possible\n" " --ghost-limit size limit max size of deleted file contents inside image\n" + " --ghost-fiemap enable dumping of deleted files using fiemap\n" " --action-script FILE add an external action script\n" " -j|--" OPT_SHELL_JOB " allow one to dump and restore shell jobs\n" " -l|--" OPT_FILE_LOCKS " handle file locks, for safety, only used for container\n" @@ -489,8 +503,8 @@ usage: " Inherit file descriptors, treating fd NUM as being\n" " already opened via an existing RES, which can be:\n" " tty[rdev:dev]\n" - " pipe[inode]\n" - " socket[inode]\n" + " pipe:[inode]\n" + " socket:[inode]\n" " file[mnt_id:inode]\n" " /memfd:name\n" " path/to/file\n" @@ -504,6 +518,9 @@ usage: " --file-validation METHOD\n" " pass the validation method to be used; argument\n" " can be 'filesize' or 'buildid' (default).\n" + " --skip-file-rwx-check\n" + " Skip checking file permissions\n" + " (r/w/x for u/g/o) on restore.\n" "\n" "Check options:\n" " Without options, \"criu check\" checks availability of absolutely required\n" diff --git a/criu/fault-injection.c b/criu/fault-injection.c index 83dc1fc8d..5dd9acf60 100644 --- a/criu/fault-injection.c +++ b/criu/fault-injection.c @@ -1,6 +1,7 @@ #include #include "criu-log.h" #include "fault-injection.h" +#include "seize.h" enum faults fi_strategy; @@ -21,5 +22,13 @@ int fault_injection_init(void) } fi_strategy = start; + + switch (fi_strategy) { + case FI_COMPEL_INTERRUPT_ONLY_MODE: + set_compel_interrupt_only_mode(); + break; + default: + break; + }; return 0; } diff --git a/criu/fdstore.c b/criu/fdstore.c index 6a7f73a59..6ac639c55 100644 --- a/criu/fdstore.c +++ b/criu/fdstore.c @@ -13,6 +13,9 @@ #include "rst-malloc.h" #include "log.h" #include "util.h" +#include "cr_options.h" +#include "util-caps.h" +#include "sockets.h" /* clang-format off */ static struct fdstore_desc { @@ -49,15 +52,13 @@ int fdstore_init(void) return -1; } - if (setsockopt(sk, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0], sizeof(buf[0])) < 0 || - setsockopt(sk, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1], sizeof(buf[1])) < 0) { - pr_perror("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE"); + if (sk_setbufs(sk, buf)) { close(sk); return -1; } addr.sun_family = AF_UNIX; - addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-fdstore-%" PRIx64 "-%" PRIx64, st.st_ino, + addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-fdstore-%" PRIx64 "-%s", st.st_ino, criu_run_id); addrlen += sizeof(addr.sun_family); diff --git a/criu/files-ext.c b/criu/files-ext.c index 95ec8e37c..4cc99d921 100644 --- a/criu/files-ext.c +++ b/criu/files-ext.c @@ -45,10 +45,11 @@ static int open_fd(struct file_desc *d, int *new_fd) { struct ext_file_info *xfi; int fd; + bool retry_needed; xfi = container_of(d, struct ext_file_info, d); - fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id); + fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id, &retry_needed); if (fd < 0) { pr_err("Unable to restore %#x\n", xfi->xfe->id); return -1; @@ -57,8 +58,11 @@ static int open_fd(struct file_desc *d, int *new_fd) if (restore_fown(fd, xfi->xfe->fown)) return -1; - *new_fd = fd; - return 0; + if (!retry_needed) + *new_fd = fd; + else + *new_fd = -1; + return retry_needed; } static struct file_desc_ops ext_desc_ops = { diff --git a/criu/files-reg.c b/criu/files-reg.c index 0249063c2..66c0e6cda 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -11,8 +11,13 @@ #include #include #include -#include +#include #include +#include +#include + +#include "tty.h" +#include "stats.h" #ifndef SEEK_DATA #define SEEK_DATA 3 @@ -29,6 +34,8 @@ * and checked. */ #define BUILD_ID_MAP_SIZE 1048576 +#define ST_UNIT 512 +#define EXTENT_MAX_COUNT 512 #include "cr_options.h" #include "imgset.h" @@ -79,7 +86,7 @@ static LIST_HEAD(ghost_files); /* * When opening remaps we first create a link on the remap * target, then open one, then unlink. In case the remap - * source has more than one instance, these tree steps + * source has more than one instance, these three steps * should be serialized with each other. */ static mutex_t *remap_open_lock; @@ -218,6 +225,92 @@ static int copy_file_to_chunks(int fd, struct cr_img *img, size_t file_size) return 0; } +static int skip_outstanding(struct fiemap_extent *fe, size_t file_size) +{ + /* Skip outstanding extent */ + if (fe->fe_logical > file_size) + return 1; + + /* Skip outstanding part of the extent */ + if (fe->fe_logical + fe->fe_length > file_size) + fe->fe_length = file_size - fe->fe_logical; + return 0; +} + +static int copy_file_to_chunks_fiemap(int fd, struct cr_img *img, size_t file_size) +{ + GhostChunkEntry ce = GHOST_CHUNK_ENTRY__INIT; + struct fiemap *fiemap_buf; + struct fiemap_extent *ext_buf; + int ext_buf_size, fie_buf_size; + off_t pos = 0; + unsigned int i; + int ret = 0; + int exit_code = 0; + + ext_buf_size = EXTENT_MAX_COUNT * sizeof(struct fiemap_extent); + fie_buf_size = sizeof(struct fiemap) + ext_buf_size; + + fiemap_buf = xzalloc(fie_buf_size); + if (!fiemap_buf) { + pr_perror("Out of memory when allocating fiemap"); + return -1; + } + + ext_buf = fiemap_buf->fm_extents; + fiemap_buf->fm_length = FIEMAP_MAX_OFFSET; + fiemap_buf->fm_flags |= FIEMAP_FLAG_SYNC; + fiemap_buf->fm_extent_count = EXTENT_MAX_COUNT; + + do { + fiemap_buf->fm_start = pos; + memzero(ext_buf, ext_buf_size); + ret = ioctl(fd, FS_IOC_FIEMAP, fiemap_buf); + if (ret < 0) { + if (errno == EOPNOTSUPP) { + exit_code = -EOPNOTSUPP; + } else { + exit_code = -1; + pr_perror("fiemap ioctl() failed"); + } + goto out; + } else if (fiemap_buf->fm_mapped_extents == 0) { + goto out; + } + + for (i = 0; i < fiemap_buf->fm_mapped_extents; i++) { + if (skip_outstanding(&fiemap_buf->fm_extents[i], file_size)) + continue; + + ce.len = fiemap_buf->fm_extents[i].fe_length; + ce.off = fiemap_buf->fm_extents[i].fe_logical; + + if (pb_write_one(img, &ce, PB_GHOST_CHUNK)) { + exit_code = -1; + goto out; + } + + if (copy_chunk_from_file(fd, img_raw_fd(img), ce.off, ce.len)) { + exit_code = -1; + goto out; + } + + if (fiemap_buf->fm_extents[i].fe_flags & FIEMAP_EXTENT_LAST) { + /* there are no extents left, break. */ + goto out; + } + } + + /* Record file's logical offset as pos */ + pos = ce.len + ce.off; + + /* Since there are still extents left, continue. */ + } while (fiemap_buf->fm_mapped_extents == EXTENT_MAX_COUNT); +out: + xfree(fiemap_buf); + return exit_code; +} + static int copy_chunk_to_file(int img, int fd, off_t off, size_t len) { int ret; @@ -314,46 +407,24 @@ static int mklnk_ghost(char *path, GhostFileEntry *gfe) static int ghost_apply_metadata(const char *path, GhostFileEntry *gfe) { struct timeval tv[2]; - int ret = -1; - if (S_ISLNK(gfe->mode)) { - if (lchown(path, gfe->uid, gfe->gid) < 0) { - pr_perror("Can't reset user/group on ghost %s", path); - goto err; - } + if (cr_fchpermat(AT_FDCWD, path, gfe->uid, gfe->gid, gfe->mode, AT_SYMLINK_NOFOLLOW) < 0) + return -1; - /* - * We have no lchmod() function, and fchmod() will fail on - * O_PATH | O_NOFOLLOW fd. Yes, we have fchmodat() - * function and flag AT_SYMLINK_NOFOLLOW described in - * man 2 fchmodat, but it is not currently implemented. %) - */ - } else { - if (chown(path, gfe->uid, gfe->gid) < 0) { - pr_perror("Can't reset user/group on ghost %s", path); - goto err; - } + if (!gfe->atim) + return 0; - if (chmod(path, gfe->mode)) { - pr_perror("Can't set perms %o on ghost %s", gfe->mode, path); - goto err; - } + tv[0].tv_sec = gfe->atim->tv_sec; + tv[0].tv_usec = gfe->atim->tv_usec; + tv[1].tv_sec = gfe->mtim->tv_sec; + tv[1].tv_usec = gfe->mtim->tv_usec; + + if (lutimes(path, tv)) { + pr_perror("Can't set access and modification times on ghost %s", path); + return -1; } - if (gfe->atim) { - tv[0].tv_sec = gfe->atim->tv_sec; - tv[0].tv_usec = gfe->atim->tv_usec; - tv[1].tv_sec = gfe->mtim->tv_sec; - tv[1].tv_usec = gfe->mtim->tv_usec; - if (lutimes(path, tv)) { - pr_perror("Can't set access and modification times on ghost %s", path); - goto err; - } - } - - ret = 0; -err: - return ret; + return 0; } static int create_ghost_dentry(char *path, GhostFileEntry *gfe, struct cr_img *img) @@ -414,7 +485,7 @@ static int nomntns_create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, stru if (ghost_apply_metadata(path, gfe)) return -1; - strlcpy(gf->remap.rpath, path + 1, PATH_MAX); + __strlcpy(gf->remap.rpath, path + 1, PATH_MAX); pr_debug("Remap rpath is %s\n", gf->remap.rpath); return 0; } @@ -545,7 +616,7 @@ static int open_remap_ghost(struct reg_file_info *rfi, RemapFilePathEntry *rpe) gf->remap.rmnt_id = rfi->rfe->mnt_id; if (S_ISDIR(gfe->mode)) - strlcpy(gf->remap.rpath, rfi->path, PATH_MAX); + __strlcpy(gf->remap.rpath, rfi->path, PATH_MAX); else ghost_path(gf->remap.rpath, PATH_MAX, rfi, rpe); @@ -910,10 +981,20 @@ static int dump_ghost_file(int _fd, u32 id, const struct stat *st, dev_t phys_de goto err_out; } - if (gfe.chunks) - ret = copy_file_to_chunks(fd, img, st->st_size); - else + if (gfe.chunks) { + if (opts.ghost_fiemap) { + ret = copy_file_to_chunks_fiemap(fd, img, st->st_size); + if (ret == -EOPNOTSUPP) { + pr_debug("file system don't support fiemap\n"); + ret = copy_file_to_chunks(fd, img, st->st_size); + } + } else { + ret = copy_file_to_chunks(fd, img, st->st_size); + } + } else { ret = copy_file(fd, img_raw_fd(img), st->st_size); + } + close(fd); if (ret) goto err_out; @@ -946,8 +1027,8 @@ static int dump_ghost_remap(char *path, const struct stat *st, int lfd, u32 id, pr_info("Dumping ghost file for fd %d id %#x\n", lfd, id); - if (st->st_size > opts.ghost_limit) { - pr_err("Can't dump ghost file %s of %" PRIu64 " size, increase limit\n", path, st->st_size); + if (st->st_blocks * ST_UNIT > opts.ghost_limit) { + pr_err("Can't dump ghost file %s of %" PRIu64 " size, increase limit\n", path, st->st_blocks * ST_UNIT); return -1; } @@ -1035,7 +1116,6 @@ static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_i RegFileEntry rfe = REG_FILE_ENTRY__INIT; FownEntry fwn = FOWN_ENTRY__INIT; int mntns_root; - int ret; const struct stat *ost = &parms->stat; if (!opts.link_remap_ok) { @@ -1070,23 +1150,22 @@ static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_i rfe.name = link_name + 1; /* Any 'unique' name works here actually. Remap works by reg-file ids. */ - snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name - 1), "link_remap.%d", rfe.id); + snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name) - 1, "link_remap.%d", rfe.id); mntns_root = mntns_get_root_fd(nsid); -again: - ret = linkat_hard(lfd, "", mntns_root, link_name, ost->st_uid, ost->st_gid, AT_EMPTY_PATH); - if (ret < 0 && errno == ENOENT) { + while (linkat_hard(lfd, "", mntns_root, link_name, ost->st_uid, ost->st_gid, AT_EMPTY_PATH) < 0) { + if (errno != ENOENT) { + pr_perror("Can't link remap to %s", path); + return -1; + } + /* Use grand parent, if parent directory does not exist. */ if (trim_last_parent(link_name) < 0) { pr_err("trim failed: @%s@\n", link_name); check_overlayfs_fallback(path, parms, fallback); return -1; } - goto again; - } else if (ret < 0) { - pr_perror("Can't link remap to %s", path); - return -1; } if (note_link_remap(link_name, nsid)) @@ -1571,22 +1650,10 @@ static int get_build_id_64(Elf64_Ehdr *file_header, unsigned char **build_id, co */ static int get_build_id(const int fd, const struct stat *fd_status, unsigned char **build_id) { - char buf[SELFMAG + 1]; - void *start_addr; + char *start_addr; size_t mapped_size; int ret = -1; - if (read(fd, buf, SELFMAG + 1) != SELFMAG + 1) - return -1; - - /* - * The first 4 bytes contain a magic number identifying the file as an - * ELF file. They should contain the characters ‘\x7f’, ‘E’, ‘L’, and - * ‘F’, respectively. These characters are together defined as ELFMAG. - */ - if (strncmp(buf, ELFMAG, SELFMAG)) - return -1; - /* * If the build-id exists, then it will most likely be present in the * beginning of the file. Therefore at most only the first 1 MB of the @@ -1594,16 +1661,25 @@ static int get_build_id(const int fd, const struct stat *fd_status, unsigned cha */ mapped_size = min_t(size_t, fd_status->st_size, BUILD_ID_MAP_SIZE); start_addr = mmap(0, mapped_size, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0); - if (start_addr == MAP_FAILED) { + if ((void*)start_addr == MAP_FAILED) { pr_warn("Couldn't mmap file with fd %d\n", fd); return -1; } - if (buf[EI_CLASS] == ELFCLASS32) - ret = get_build_id_32(start_addr, build_id, fd, mapped_size); - if (buf[EI_CLASS] == ELFCLASS64) - ret = get_build_id_64(start_addr, build_id, fd, mapped_size); + /* + * The first 4 bytes contain a magic number identifying the file as an + * ELF file. They should contain the characters ‘\x7f’, ‘E’, ‘L’, and + * ‘F’, respectively. These characters are together defined as ELFMAG. + */ + if (memcmp(start_addr, ELFMAG, SELFMAG)) + goto out; + if (start_addr[EI_CLASS] == ELFCLASS32) + ret = get_build_id_32((Elf32_Ehdr *)start_addr, build_id, fd, mapped_size); + if (start_addr[EI_CLASS] == ELFCLASS64) + ret = get_build_id_64((Elf64_Ehdr *)start_addr, build_id, fd, mapped_size); + +out: munmap(start_addr, mapped_size); return ret; } @@ -1688,6 +1764,7 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) int ret; FileEntry fe = FILE_ENTRY__INIT; RegFileEntry rfe = REG_FILE_ENTRY__INIT; + bool skip_for_shell_job = false; if (!p->link) { if (fill_fdlink(lfd, p, &_link)) @@ -1707,12 +1784,17 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) mi = lookup_mnt_id(p->mnt_id); if (mi == NULL) { - pr_err("Can't lookup mount=%d for fd=%d path=%s\n", p->mnt_id, p->fd, link->name + 1); - return -1; + if (opts.shell_job && is_tty(p->stat.st_rdev, p->stat.st_dev)) { + skip_for_shell_job = true; + } else { + pr_err("Can't lookup mount=%d for fd=%d path=%s\n", p->mnt_id, p->fd, link->name + 1); + return -1; + } } - if (mnt_is_overmounted(mi)) { - pr_err("Open files on overmounted mounts are not supported yet\n"); + if (!skip_for_shell_job && mnt_is_overmounted(mi)) { + pr_err("Open files on overmounted mounts are not supported yet; mount=%d fd=%d path=%s\n", + p->mnt_id, p->fd, link->name + 1); return -1; } @@ -1731,7 +1813,7 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) return -1; } - if (check_path_remap(link, p, lfd, id, mi->nsid)) + if (!skip_for_shell_job && check_path_remap(link, p, lfd, id, mi->nsid)) return -1; rfe.name = &link->name[1]; ext: @@ -2199,9 +2281,21 @@ ext: if (!validate_file(tmp, &st, rfi)) goto err; - if (rfi->rfe->has_mode && (st.st_mode != rfi->rfe->mode)) { - pr_err("File %s has bad mode 0%o (expect 0%o)\n", rfi->path, (int)st.st_mode, rfi->rfe->mode); - goto err; + if (rfi->rfe->has_mode) { + mode_t curr_mode = st.st_mode; + mode_t saved_mode = rfi->rfe->mode; + + if (opts.skip_file_rwx_check) { + curr_mode &= ~(S_IRWXU | S_IRWXG | S_IRWXO); + saved_mode &= ~(S_IRWXU | S_IRWXG | S_IRWXO); + } + + if (curr_mode != saved_mode) { + pr_err("File %s has bad mode 0%o (expect 0%o)\n" + "File r/w/x checks can be skipped with the --skip-file-rwx-check option\n", + rfi->path, (int)curr_mode, saved_mode); + goto err; + } } /* @@ -2409,9 +2503,10 @@ static int open_filemap(int pid, struct vma_area *vma) * using dup because dup returns a reference to the same struct file inside kernel, but we * cannot open a new FD. */ - ret = dup(plugin_fd); + ret = plugin_fd; } else if (vma->e->status & VMA_AREA_MEMFD) { - ret = memfd_open(vma->vmfd, &flags); + if (!inherited_fd(vma->vmfd, &ret)) + ret = memfd_open(vma->vmfd, &flags, true); } else { ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); } diff --git a/criu/files.c b/criu/files.c index 8a2250e19..af4b8aeac 100644 --- a/criu/files.c +++ b/criu/files.c @@ -21,7 +21,7 @@ #include "image.h" #include "common/list.h" #include "rst-malloc.h" -#include "util-pie.h" +#include "util-caps.h" #include "common/lock.h" #include "sockets.h" #include "pstree.h" @@ -49,6 +49,7 @@ #include "kerndat.h" #include "fdstore.h" #include "bpfmap.h" +#include "pidfd.h" #include "protobuf.h" #include "util.h" @@ -302,7 +303,7 @@ static int fixup_overlayfs(struct fd_parms *p, struct fd_link *link) char buf[PATH_MAX]; int n; - strlcpy(buf, link->name, PATH_MAX); + __strlcpy(buf, link->name, PATH_MAX); n = snprintf(link->name, PATH_MAX, "%s/%s", m->ns_mountpoint, buf + 2); if (n >= PATH_MAX) { pr_err("Not enough space to replace %s\n", buf); @@ -544,6 +545,8 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, ops = &signalfd_dump_ops; else if (is_timerfd_link(link)) ops = &timerfd_dump_ops; + else if (is_pidfd_link(link)) + ops = &pidfd_dump_ops; #ifdef CONFIG_HAS_LIBBPF else if (is_bpfmap_link(link)) ops = &bpfmap_dump_ops; @@ -554,6 +557,11 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, return do_dump_gen_file(&p, lfd, ops, e); } + if (p.fs_type == PID_FS_MAGIC) { + ops = &pidfd_dump_ops; + return do_dump_gen_file(&p, lfd, ops, e); + } + if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode) || S_ISLNK(p.stat.st_mode)) { if (fill_fdlink(lfd, &p, &link)) return -1; @@ -970,7 +978,7 @@ static int receive_fd(struct fdinfo_list_entry *fle); static void transport_name_gen(struct sockaddr_un *addr, int *len, int pid) { addr->sun_family = AF_UNIX; - snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-fd-%d-%" PRIx64, pid, criu_run_id); + snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-fd-%d-%s", pid, criu_run_id); *len = SUN_LEN(addr); *addr->sun_path = '\0'; } @@ -1321,7 +1329,6 @@ int prepare_fds(struct pstree_item *me) } } - BUG_ON(current->pid->state == TASK_HELPER); ret = open_fdinfos(me); if (rsti(me)->fdt) @@ -1346,10 +1353,35 @@ static int fchroot(int fd) return chroot("."); } +static int need_chroot(int saved_root) +{ + struct stat saved_root_stat, cur_root_stat; + int psd; + + if (fstat(saved_root, &saved_root_stat) == -1) { + pr_perror("Failed to stat saved root dir"); + return -1; + } + + psd = open_pid_proc(PROC_SELF); + if (psd < 0) { + pr_perror("Failed to open PROC_SELF"); + return -1; + } + + if (fstatat(psd, "root", &cur_root_stat, 0) == -1) { + pr_perror("Failed to stat current root dir"); + return -1; + } + + return saved_root_stat.st_ino != cur_root_stat.st_ino || saved_root_stat.st_dev != cur_root_stat.st_dev; +} + int restore_fs(struct pstree_item *me) { int dd_root = -1, dd_cwd = -1, ret, err = -1; struct rst_info *ri = rsti(me); + bool do_chroot = true; /* * First -- open both descriptors. We will not @@ -1368,15 +1400,24 @@ int restore_fs(struct pstree_item *me) goto out; } + /* + * In unprivileged mode chroot() may fail if we don't have + * sufficient privileges, therefore only do it if the process + * is actually chrooted. + */ + if (opts.unprivileged) + do_chroot = need_chroot(dd_root); + /* * Now do chroot/chdir. Chroot goes first as it calls chdir into * dd_root so we'd need to fix chdir after it anyway. */ - - ret = fchroot(dd_root); - if (ret < 0) { - pr_perror("Can't change root"); - goto out; + if (do_chroot) { + ret = fchroot(dd_root); + if (ret < 0) { + pr_perror("Can't change root"); + goto out; + } } ret = fchdir(dd_cwd); @@ -1744,6 +1785,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) case FD_TYPES__MEMFD: ret = collect_one_file_entry(fe, fe->memfd->id, &fe->memfd->base, &memfd_cinfo); break; + case FD_TYPES__PIDFD: + ret = collect_one_file_entry(fe, fe->pidfd->id, &fe->pidfd->base, &pidfd_cinfo); + break; #ifdef CONFIG_HAS_LIBBPF case FD_TYPES__BPFMAP: ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo); @@ -1766,5 +1810,6 @@ int prepare_files(void) { init_fdesc_hash(); init_sk_info_hash(); + init_dead_pidfd_hash(); return collect_image(&files_cinfo); } diff --git a/criu/filesystems.c b/criu/filesystems.c index 890d5d06d..093e1c492 100644 --- a/criu/filesystems.c +++ b/criu/filesystems.c @@ -547,7 +547,8 @@ static int fusectl_dump(struct mount_info *pm) } for (it = mntinfo; it; it = it->next) { - if (it->fstype->code == FSTYPE__FUSE && id == kdev_minor(it->s_dev) && !it->external) { + if (it->fstype->code == FSTYPE__FUSE && id == kdev_minor(it->s_dev) && + !mnt_is_external_bind(it)) { pr_err("%s is a fuse mount but not external\n", it->ns_mountpoint); goto out; } diff --git a/criu/fsnotify.c b/criu/fsnotify.c index 03711f0b2..8572dc2f3 100644 --- a/criu/fsnotify.c +++ b/criu/fsnotify.c @@ -183,7 +183,7 @@ static char *alloc_openable(unsigned int s_dev, unsigned long i_ino, FhEntry *f_ return path; } } else - pr_debug("\t\t\tnot openable as %s (%m)\n", __path); + pr_debug("\t\t\tnot openable as %s (%s)\n", __path, strerror(errno)); } err: diff --git a/criu/hugetlb.c b/criu/hugetlb.c index aa98662d8..866c4050f 100644 --- a/criu/hugetlb.c +++ b/criu/hugetlb.c @@ -35,6 +35,19 @@ int is_hugetlb_dev(dev_t dev, int *hugetlb_size_flag) return 0; } +int can_dump_with_memfd_hugetlb(dev_t dev, int *hugetlb_size_flag, const char *file_path, struct vma_area *vma) +{ + /* + * Dump the hugetlb backed mapping using memfd_hugetlb when it is not + * anonymous private mapping. + */ + if (kdat.has_memfd_hugetlb && is_hugetlb_dev(dev, hugetlb_size_flag) && + !((vma->e->flags & MAP_PRIVATE) && !strncmp(file_path, ANON_HUGEPAGE_PREFIX, ANON_HUGEPAGE_PREFIX_LEN))) + return 1; + + return 0; +} + unsigned long get_size_from_hugetlb_flag(int flag) { int i; diff --git a/criu/image-desc.c b/criu/image-desc.c index d65d9c098..2d87c7381 100644 --- a/criu/image-desc.c +++ b/criu/image-desc.c @@ -107,6 +107,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY_F(BPFMAP_FILE, "bpfmap-file", O_NOBUF), FD_ENTRY_F(BPFMAP_DATA, "bpfmap-data", O_NOBUF), FD_ENTRY(APPARMOR, "apparmor"), + FD_ENTRY(PIDFD, "pidfd"), [CR_FD_STATS] = { .fmt = "stats-%s", diff --git a/criu/image.c b/criu/image.c index 353de48e8..91101c3eb 100644 --- a/criu/image.c +++ b/criu/image.c @@ -25,6 +25,15 @@ bool img_common_magic = true; TaskKobjIdsEntry *root_ids; u32 root_cg_set; Lsmtype image_lsm; +char dump_criu_run_id[RUN_ID_HASH_LENGTH]; + +struct inventory_plugin { + struct list_head node; + char *name; +}; + +struct list_head inventory_plugins_list = LIST_HEAD_INIT(inventory_plugins_list); +static int n_inventory_plugins; int check_img_inventory(bool restore) { @@ -86,6 +95,11 @@ int check_img_inventory(bool restore) goto out_err; } + if (restore && he->allow_uprobes && !opts.allow_uprobes) { + pr_err("Dumped with --" OPT_ALLOW_UPROBES ". Need to set it on restore as well.\n"); + goto out_err; + } + if (restore) { if (!he->has_network_lock_method) { /* @@ -99,6 +113,37 @@ int check_img_inventory(bool restore) } else { opts.network_lock_method = he->network_lock_method; } + + if (!he->plugins_entry) { + /* backwards compatibility: if the 'plugins_entry' field is missing, + * all plugins should be enabled during restore. + */ + n_inventory_plugins = -1; + } else { + PluginsEntry *pe = he->plugins_entry; + for (int i = 0; i < pe->n_plugins; i++) { + if (add_inventory_plugin(pe->plugins[i])) + goto out_err; + } + } + + /** + * This contains the criu_run_id during dumping of the process. + * For things like removing network locking (nftables) this + * information is needed to identify the name of the network + * locking table. + */ + if (he->dump_criu_run_id) { + strncpy(dump_criu_run_id, he->dump_criu_run_id, sizeof(dump_criu_run_id) - 1); + pr_info("Dump CRIU run id = %s\n", dump_criu_run_id); + } else { + /** + * If restoring from an old image this is a marker + * that no dump_criu_run_id exists. + */ + dump_criu_run_id[0] = NO_DUMP_CRIU_RUN_ID; + } + } ret = 0; @@ -110,8 +155,92 @@ out_close: return ret; } +/** + * Check if the 'plugins' field in the inventory image contains + * the specified plugin name. If found, the plugin is removed + * from the linked list. + */ +bool check_and_remove_inventory_plugin(const char *name, size_t n) +{ + if (n_inventory_plugins == -1) + return true; /* backwards compatibility */ + + if (n_inventory_plugins > 0) { + struct inventory_plugin *p, *tmp; + + list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) { + if (!strncmp(name, p->name, n)) { + xfree(p->name); + list_del(&p->node); + xfree(p); + n_inventory_plugins--; + return true; + } + } + } + + return false; +} + +/** + * We expect during restore all loaded plugins to be removed from + * the inventory_plugins_list. If the list is not empty, show an + * error message for each missing plugin. + */ +int check_inventory_plugins(void) +{ + struct inventory_plugin *p; + + if (n_inventory_plugins <= 0) + return 0; + + list_for_each_entry(p, &inventory_plugins_list, node) { + pr_err("Missing required plugin: %s\n", p->name); + } + + return -1; +} + +/** + * Add plugin name to the inventory image. These values + * can be used to identify required plugins during restore. + */ +int add_inventory_plugin(const char *name) +{ + struct inventory_plugin *p; + + p = xmalloc(sizeof(struct inventory_plugin)); + if (p == NULL) + return -1; + + p->name = xstrdup(name); + if (!p->name) { + xfree(p); + return -1; + } + list_add(&p->node, &inventory_plugins_list); + n_inventory_plugins++; + + return 0; +} + +void free_inventory_plugins_list(void) +{ + struct inventory_plugin *p, *tmp; + + if (!list_empty(&inventory_plugins_list)) { + list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) { + xfree(p->name); + list_del(&p->node); + xfree(p); + } + } + n_inventory_plugins = 0; +} + int write_img_inventory(InventoryEntry *he) { + PluginsEntry pe = PLUGINS_ENTRY__INIT; struct cr_img *img; int ret; @@ -121,8 +250,27 @@ int write_img_inventory(InventoryEntry *he) if (!img) return -1; + if (!list_empty(&inventory_plugins_list)) { + struct inventory_plugin *p; + int i = 0; + + pe.n_plugins = n_inventory_plugins; + pe.plugins = xmalloc(n_inventory_plugins * sizeof(char *)); + if (!pe.plugins) + return -1; + + list_for_each_entry(p, &inventory_plugins_list, node) { + pe.plugins[i] = p->name; + i++; + } + } + he->plugins_entry = &pe; + ret = pb_write_one(img, he, PB_INVENTORY); + free_inventory_plugins_list(); + xfree(pe.plugins); + xfree(he->root_ids); close_image(img); if (ret < 0) @@ -226,8 +374,9 @@ int prepare_inventory(InventoryEntry *he) if (get_task_ids(&crt.i)) return -1; - he->has_root_cg_set = true; - if (dump_task_cgroup(NULL, &he->root_cg_set, NULL)) + if (!opts.unprivileged) + he->has_root_cg_set = true; + if (dump_thread_cgroup(NULL, &he->root_cg_set, NULL, -1)) return -1; he->root_ids = crt.i.ids; @@ -242,6 +391,17 @@ int prepare_inventory(InventoryEntry *he) he->has_network_lock_method = true; he->network_lock_method = opts.network_lock_method; + /** + * This contains the criu_run_id during dumping of the process. + * For things like removing network locking (nftables) this + * information is needed to identify the name of the network + * locking table. + */ + he->dump_criu_run_id = xstrdup(criu_run_id); + + if (!he->dump_criu_run_id) + return -1; + return 0; } @@ -557,7 +717,7 @@ struct cr_img *img_from_fd(int fd) * This is used when opts.stream is enabled for picking the right streamer * socket name. `mode` is ignored when opts.stream is not enabled. */ -int open_image_dir(char *dir, int mode) +int open_image_dir(const char *dir, int mode) { int fd, ret; diff --git a/criu/img-streamer.c b/criu/img-streamer.c index 7e36eae01..305e6fae5 100644 --- a/criu/img-streamer.c +++ b/criu/img-streamer.c @@ -12,6 +12,7 @@ #include "rst-malloc.h" #include "common/scm.h" #include "common/lock.h" +#include "action-scripts.h" /* * We use different path names for the dump and restore sockets because: @@ -49,10 +50,17 @@ static const char *socket_name_for_mode(int mode) int img_streamer_init(const char *image_dir, int mode) { struct sockaddr_un addr; + int pre_stream_ret; int sockfd; img_streamer_mode = mode; + pre_stream_ret = run_scripts(ACT_PRE_STREAM); + if (pre_stream_ret != 0) { + pr_err("Pre-stream script failed with %d!\n", pre_stream_ret); + return -1; + } + sockfd = socket(AF_UNIX, SOCK_STREAM, 0); if (sockfd < 0) { pr_perror("Unable to instantiate UNIX socket"); diff --git a/criu/include/action-scripts.h b/criu/include/action-scripts.h index c2e8850aa..6a331a32f 100644 --- a/criu/include/action-scripts.h +++ b/criu/include/action-scripts.h @@ -4,6 +4,7 @@ #include "asm/int.h" enum script_actions { + ACT_PRE_STREAM, ACT_PRE_DUMP, ACT_POST_DUMP, ACT_PRE_RESTORE, @@ -16,6 +17,7 @@ enum script_actions { ACT_PRE_RESUME, ACT_ORPHAN_PTS_MASTER, ACT_STATUS_READY, + ACT_QUERY_EXT_FILES, ACT_MAX }; @@ -24,6 +26,8 @@ extern int add_script(char *path); extern int add_rpc_notify(int sk); extern int run_scripts(enum script_actions); extern int rpc_send_fd(enum script_actions, int fd); +extern int rpc_query_external_files(void); +extern int exec_rpc_query_external_files(char *name, int sk); extern int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd); #endif /* __CR_ACTION_SCRIPTS_H__ */ diff --git a/criu/include/aio.h b/criu/include/aio.h index d1655739d..38e704020 100644 --- a/criu/include/aio.h +++ b/criu/include/aio.h @@ -1,7 +1,7 @@ #ifndef __CR_AIO_H__ #define __CR_AIO_H__ -#include +#include "linux/aio_abi.h" #include "images/mm.pb-c.h" unsigned int aio_estimate_nr_reqs(unsigned int size); int dump_aio_ring(MmEntry *mme, struct vma_area *vma); diff --git a/criu/include/cgroup-props.h b/criu/include/cgroup-props.h index 11b677548..10a7061b8 100644 --- a/criu/include/cgroup-props.h +++ b/criu/include/cgroup-props.h @@ -10,6 +10,7 @@ typedef struct { } cgp_t; extern cgp_t cgp_global; +extern cgp_t cgp_global_v2; extern const cgp_t *cgp_get_props(const char *name); extern bool cgp_should_skip_controller(const char *name); extern bool cgp_add_dump_controller(const char *name); diff --git a/criu/include/cgroup.h b/criu/include/cgroup.h index 2e9b8933c..dc264032e 100644 --- a/criu/include/cgroup.h +++ b/criu/include/cgroup.h @@ -7,9 +7,10 @@ struct pstree_item; struct parasite_dump_cgroup_args; extern u32 root_cg_set; -int dump_task_cgroup(struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args); +int dump_thread_cgroup(const struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args, int id); int dump_cgroups(void); -int prepare_task_cgroup(struct pstree_item *); +int restore_task_cgroup(struct pstree_item *); +int prepare_cgroup_namespace(struct pstree_item *); int prepare_cgroup(void); /* Restore things like cpu_limit in known cgroups. */ int prepare_cgroup_properties(void); @@ -60,6 +61,9 @@ struct cg_controller { /* for cgroup list in cgroup.c */ struct list_head l; + + /* controller is a threaded cgroup or not */ + int is_threaded; }; struct cg_controller *new_controller(const char *name); @@ -87,9 +91,12 @@ struct cg_ctl { */ struct list_head; struct parasite_dump_cgroup_args; -extern int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *l, unsigned int *n); +extern int parse_thread_cgroup(int pid, int tid, struct parasite_dump_cgroup_args *args, struct list_head *l, + unsigned int *n); extern void put_ctls(struct list_head *); int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups); +int stop_cgroupd(void); + #endif /* __CR_CGROUP_H__ */ diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index bf1a762cc..8c5707b41 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -1,10 +1,12 @@ #ifndef __CR_OPTIONS_H__ #define __CR_OPTIONS_H__ -#include #include +#include #include "common/config.h" #include "common/list.h" +#include "int.h" +#include "image.h" /* Configuration and CLI parsing order defines */ #define PARSING_GLOBAL_CONF 1 @@ -65,9 +67,18 @@ struct cg_root_opt { enum NETWORK_LOCK_METHOD { NETWORK_LOCK_IPTABLES, NETWORK_LOCK_NFTABLES, + NETWORK_LOCK_SKIP, }; +/** + * CRIU currently defaults to the iptables locking backend. + * + * It is, however, possible to change this by defining + * NETWORK_LOCK_DEFAULT to a different value on the command-line. + */ +#ifndef NETWORK_LOCK_DEFAULT #define NETWORK_LOCK_DEFAULT NETWORK_LOCK_IPTABLES +#endif /* * Ghost file size we allow to carry by default. @@ -93,6 +104,9 @@ enum FILE_VALIDATION_OPTIONS { /* This constant dictates which file validation method should be tried by default. */ #define FILE_VALIDATION_DEFAULT FILE_VALIDATION_BUILD_ID +/* This constant dictates that criu use fiemap to copy ghost file by default.*/ +#define FIEMAP_DEFAULT 1 + struct irmap; struct irmap_path_opt { @@ -111,7 +125,8 @@ enum criu_mode { CR_SERVICE, CR_SWRK, CR_DEDUP, - CR_CPUINFO, + CR_CPUINFO_DUMP, + CR_CPUINFO_CHECK, CR_EXEC_DEPRECATED, CR_SHOW_DEPRECATED, }; @@ -165,6 +180,7 @@ struct cr_options { int enable_external_masters; bool aufs; /* auto-detected, not via cli */ bool overlayfs; + int ghost_fiemap; #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED bool has_binfmt_misc; /* auto-detected */ #endif @@ -179,6 +195,8 @@ struct cr_options { bool lazy_pages; char *work_dir; int network_lock_method; + int skip_file_rwx_check; + int allow_uprobes; /* * When we scheduler for removal some functionality we first @@ -209,6 +227,26 @@ struct cr_options { enum criu_mode mode; int mntns_compat_mode; + + /* Remember the program name passed to main() so we can use it in + * error messages elsewhere. + */ + char *argv_0; + /* + * This contains the eUID of the current CRIU user. It + * will only be set to a non-zero value if CRIU has + * the necessary capabilities to run as non root. + * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN + */ + uid_t uid; + /* This contains the value from capget()->effective */ + u32 cap_eff[_LINUX_CAPABILITY_U32S_3]; + /* + * If CRIU should be running as non-root with the help of + * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN the user should + * explicitly request it as it comes with many limitations. + */ + int unprivileged; }; extern struct cr_options opts; diff --git a/criu/include/criu-log.h b/criu/include/criu-log.h index ae2f38489..9d52fbdb1 100644 --- a/criu/include/criu-log.h +++ b/criu/include/criu-log.h @@ -26,7 +26,6 @@ extern int log_init(const char *output); extern void log_fini(void); extern int log_init_by_pid(pid_t pid); -extern void log_closedir(void); extern int log_keep_err(void); extern char *log_first_err(void); diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index 886832eaa..c3bea1385 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -56,6 +56,16 @@ enum { CR_PLUGIN_HOOK__RESUME_DEVICES_LATE = 9, + CR_PLUGIN_HOOK__PAUSE_DEVICES = 10, + + CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11, + + CR_PLUGIN_HOOK__POST_FORKING = 12, + + CR_PLUGIN_HOOK__RESTORE_INIT = 13, + + CR_PLUGIN_HOOK__DUMP_DEVICES_LATE = 14, + CR_PLUGIN_HOOK__MAX }; @@ -64,7 +74,7 @@ enum { DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_UNIX_SK, int fd, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_UNIX_SK, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_FILE, int fd, int id); -DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id, bool *retry_needed); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_MOUNT, char *mountpoint, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT, int id, char *mountpoint, char *old_root, int *is_file); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_LINK, int index, int type, char *kind); @@ -72,6 +82,11 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, int fd, const struct DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const uint64_t addr, const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_INIT, void); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, int id); enum { CR_PLUGIN_STAGE__DUMP, @@ -146,5 +161,6 @@ typedef int(cr_plugin_handle_device_vma_t)(int fd, const struct stat *stat); typedef int(cr_plugin_update_vma_map_t)(const char *path, const uint64_t addr, const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd); typedef int(cr_plugin_resume_devices_late_t)(int pid); +typedef int(cr_plugin_post_forking_t)(void); #endif /* __CRIU_PLUGIN_H__ */ diff --git a/criu/include/crtools.h b/criu/include/crtools.h index b9309654f..b54b9d929 100644 --- a/criu/include/crtools.h +++ b/criu/include/crtools.h @@ -26,6 +26,7 @@ extern int cr_pre_dump_tasks(pid_t pid); extern int cr_restore_tasks(void); extern int convert_to_elf(char *elf_path, int fd_core); extern int cr_check(void); +extern int check_caps(void); extern int cr_dedup(void); extern int cr_lazy_pages(bool daemon); diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index f33918de8..e987c18ce 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -19,19 +19,15 @@ enum faults { FI_HUGE_ANON_SHMEM_ID = 132, FI_CANNOT_MAP_VDSO = 133, FI_CORRUPT_EXTREGS = 134, + FI_DONT_USE_PAGEMAP_SCAN = 135, + FI_DUMP_CRASH = 136, + FI_COMPEL_INTERRUPT_ONLY_MODE = 137, + FI_PLUGIN_CUDA_FORCE_ENABLE = 138, FI_MAX, }; static inline bool __fault_injected(enum faults f, enum faults fi_strategy) { - /* - * Temporary workaround for Xen guests. Breakpoints degrade - * performance linearly, so until we find out the reason, - * let's disable them. - */ - if (f == FI_NO_BREAKPOINTS) - return true; - return fi_strategy == f; } diff --git a/criu/include/fs-magic.h b/criu/include/fs-magic.h index ad34f4891..ffc0455d5 100644 --- a/criu/include/fs-magic.h +++ b/criu/include/fs-magic.h @@ -57,4 +57,8 @@ #define OVERLAYFS_SUPER_MAGIC 0x794c7630 #endif +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + #endif /* __CR_FS_MAGIC_H__ */ diff --git a/criu/include/hugetlb.h b/criu/include/hugetlb.h index c0e83652b..9aee5bed3 100644 --- a/criu/include/hugetlb.h +++ b/criu/include/hugetlb.h @@ -4,6 +4,11 @@ #include #include +#include "vma.h" + +#define ANON_HUGEPAGE_PREFIX "/anon_hugepage" +#define ANON_HUGEPAGE_PREFIX_LEN (sizeof(ANON_HUGEPAGE_PREFIX) - 1) + enum hugepage_size { HUGETLB_16KB, HUGETLB_64KB, @@ -46,6 +51,7 @@ struct htlb_info { extern struct htlb_info hugetlb_info[HUGETLB_MAX]; int is_hugetlb_dev(dev_t dev, int *hugetlb_size_flag); +int can_dump_with_memfd_hugetlb(dev_t dev, int *hugetlb_size_flag, const char *file_path, struct vma_area *vma); unsigned long get_size_from_hugetlb_flag(int flag); #ifndef MFD_HUGETLB diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h index 9f369be64..79e1ac111 100644 --- a/criu/include/image-desc.h +++ b/criu/include/image-desc.h @@ -113,6 +113,7 @@ enum { CR_FD_PIPES, CR_FD_TTY_FILES, CR_FD_MEMFD_FILE, + CR_FD_PIDFD, CR_FD_AUTOFS, diff --git a/criu/include/image.h b/criu/include/image.h index 5cb01bde2..30e32323d 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -35,13 +35,15 @@ * - stack * the memory area is used in application stack so we * should be careful about guard page here + * - shadow stack + * the memory area is used by shadow stack * - vsyscall * special memory area injected into the task memory * space by the kernel itself, represent virtual syscall * implementation and it is specific to every kernel version, * its contents should not be dumped ever * - vdso,vvar - * the vDSO area, it might reqire additional memory + * the vDSO area, it might require additional memory * contents modification especially when tasks are * migrating between different kernel versions * - heap @@ -66,6 +68,18 @@ * processing exiting with error; while the rest of bits * are part of image ABI, this particular one must never * be used in image. + * - guard + * stands for a fake VMA (not represented in the kernel + * by a struct vm_area_struct). Used to keep an information + * about virtual address space ranges covered by + * MADV_GUARD_INSTALL guards. These ones must be always at + * the end of the vma_area_list and properly skipped a.e. + * - uprobes + * stands for a "[uprobes]" vma that's automatically mapped by + * the kernel when an active uprobe is hit. Contents of this vma + * are not dumped and neither are its madvise bits restored, + * because the kernel is in complete control of this vma. This is + * just used to track the existence of the uprobes vma. */ #define VMA_AREA_NONE (0 << 0) #define VMA_AREA_REGULAR (1 << 0) @@ -84,6 +98,9 @@ #define VMA_AREA_VVAR (1 << 12) #define VMA_AREA_AIORING (1 << 13) #define VMA_AREA_MEMFD (1 << 14) +#define VMA_AREA_SHSTK (1 << 15) +#define VMA_AREA_GUARD (1 << 16) +#define VMA_AREA_UPROBES (1 << 17) #define VMA_EXT_PLUGIN (1 << 27) #define VMA_CLOSE (1 << 28) @@ -97,6 +114,8 @@ #define CR_PARENT_LINK "parent" +#define OPT_ALLOW_UPROBES "allow-uprobes" + extern bool ns_per_id; extern bool img_common_magic; @@ -146,7 +165,7 @@ static inline int img_raw_fd(struct cr_img *img) extern off_t img_raw_size(struct cr_img *img); -extern int open_image_dir(char *dir, int mode); +extern int open_image_dir(const char *dir, int mode); extern void close_image_dir(void); /* * Return -1 -- parent symlink points to invalid target @@ -174,4 +193,8 @@ extern int read_img_str(struct cr_img *, char **pstr, int size); extern void close_image(struct cr_img *); +extern int add_inventory_plugin(const char *name); +extern int check_inventory_plugins(void); +extern bool check_and_remove_inventory_plugin(const char *name, size_t n); + #endif /* __CR_IMAGE_H__ */ diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index 83d867e75..e4922f401 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -7,6 +7,7 @@ #include "asm/kerndat.h" #include "util-vdso.h" #include "hugetlb.h" +#include struct stat; @@ -82,6 +83,16 @@ struct kerndat_s { bool has_openat2; bool has_rseq; bool has_ptrace_get_rseq_conf; + struct __ptrace_rseq_configuration libc_rseq_conf; + bool has_ipv6_freebind; + bool has_membarrier_get_registrations; + bool has_pagemap_scan; + bool has_shstk; + bool has_close_range; + bool has_timer_cr_ids; + bool has_breakpoints; + bool has_madv_guard; + bool has_pagemap_scan_guard_pages; }; extern struct kerndat_s kdat; @@ -104,4 +115,6 @@ extern int kerndat_fs_virtualized(unsigned int which, u32 kdev); extern int kerndat_has_nspid(void); +extern void kerndat_warn_about_madv_guards(void); + #endif /* __CR_KERNDAT_H__ */ diff --git a/criu/include/linux/aio_abi.h b/criu/include/linux/aio_abi.h new file mode 100644 index 000000000..d9ce78720 --- /dev/null +++ b/criu/include/linux/aio_abi.h @@ -0,0 +1,14 @@ +#ifndef __LINUX__AIO_ABI_H +#define __LINUX__AIO_ABI_H + +typedef __kernel_ulong_t aio_context_t; + +/* read() from /dev/aio returns these structures. */ +struct io_event { + __u64 data; /* the data field from the iocb */ + __u64 obj; /* what iocb this event came from */ + __s64 res; /* result code for this event */ + __s64 res2; /* secondary result */ +}; + +#endif /* __LINUX__AIO_ABI_H */ diff --git a/criu/include/linux/mount.h b/criu/include/linux/mount.h index 9a3a28b10..fefafa89e 100644 --- a/criu/include/linux/mount.h +++ b/criu/include/linux/mount.h @@ -4,32 +4,40 @@ #include "common/config.h" #include "compel/plugins/std/syscall-codes.h" -#ifdef CONFIG_HAS_FSCONFIG -#include -#else +/* Copied from /usr/include/sys/mount.h */ + +#ifndef FSOPEN_CLOEXEC +/* The type of fsconfig call made. */ enum fsconfig_command { - FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ - FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ - FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ - FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ - FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ - FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ - FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ + FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ +#define FSCONFIG_SET_FLAG FSCONFIG_SET_FLAG + FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ +#define FSCONFIG_SET_STRING FSCONFIG_SET_STRING + FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ +#define FSCONFIG_SET_BINARY FSCONFIG_SET_BINARY + FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ +#define FSCONFIG_SET_PATH FSCONFIG_SET_PATH + FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ +#define FSCONFIG_SET_PATH_EMPTY FSCONFIG_SET_PATH_EMPTY + FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ +#define FSCONFIG_SET_FD FSCONFIG_SET_FD + FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ +#define FSCONFIG_CMD_CREATE FSCONFIG_CMD_CREATE FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ +#define FSCONFIG_CMD_RECONFIGURE FSCONFIG_CMD_RECONFIGURE }; + +#endif // FSOPEN_CLOEXEC + +/* fsopen flags. With the redundant definition, we check if the kernel, + * glibc value and our value still match. + */ +#define FSOPEN_CLOEXEC 0x00000001 + +#ifndef MS_MGC_VAL +/* Magic mount flag number. Has to be or-ed to the flag values. */ +#define MS_MGC_VAL 0xc0ed0000 /* Magic flag number to indicate "new" flags */ +#define MS_MGC_MSK 0xffff0000 /* Magic flag number mask */ #endif -static inline int sys_fsopen(const char *fsname, unsigned int flags) -{ - return syscall(__NR_fsopen, fsname, flags); -} -static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) -{ - return syscall(__NR_fsconfig, fd, cmd, key, value, aux); -} -static inline int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) -{ - return syscall(__NR_fsmount, fd, flags, attr_flags); -} - #endif diff --git a/criu/include/linux/rseq.h b/criu/include/linux/rseq.h index a47876e66..5ceefbf8e 100644 --- a/criu/include/linux/rseq.h +++ b/criu/include/linux/rseq.h @@ -9,7 +9,12 @@ #endif #endif -#ifndef __GLIBC_HAVE_KERNEL_RSEQ +#include +#include + +#include "common/config.h" + +#ifdef CONFIG_HAS_NO_LIBC_RSEQ_DEFS /* * linux/rseq.h * @@ -18,9 +23,6 @@ * Copyright (c) 2015-2018 Mathieu Desnoyers */ -#include -#include - enum rseq_cpu_id_state { RSEQ_CPU_ID_UNINITIALIZED = -1, RSEQ_CPU_ID_REGISTRATION_FAILED = -2, @@ -41,13 +43,20 @@ enum rseq_cs_flags { RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), }; +#endif /* CONFIG_HAS_NO_LIBC_RSEQ_DEFS */ +/* + * Let's use our own definition of struct rseq_cs because some distros + * (for example Mariner GNU/Linux) declares this structure their-own way. + * This makes trouble with inconsistency between printf formatters and + * struct rseq_cs field types. + */ /* * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always * contained within a single cache-line. It is usually declared as * link-time constant data. */ -struct rseq_cs { +struct criu_rseq_cs { /* Version of this structure. */ __u32 version; /* enum rseq_cs_flags */ @@ -57,7 +66,6 @@ struct rseq_cs { __u64 post_commit_offset; __u64 abort_ip; } __attribute__((aligned(4 * sizeof(__u64)))); -#endif /* __GLIBC_HAVE_KERNEL_RSEQ */ /* * We have to have our own copy of struct rseq definition because diff --git a/criu/include/log.h b/criu/include/log.h index 85e6dc2e7..cbed33007 100644 --- a/criu/include/log.h +++ b/criu/include/log.h @@ -60,6 +60,8 @@ void flush_early_log_buffer(int fd); #define pr_perror(fmt, ...) pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) +#define pr_pwarn(fmt, ...) pr_warn(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) + #endif /* CR_NOGLIBC */ #endif /* __CR_LOG_H__ */ diff --git a/criu/include/magic.h b/criu/include/magic.h index 22d7218e4..6f0aff26d 100644 --- a/criu/include/magic.h +++ b/criu/include/magic.h @@ -29,7 +29,7 @@ /* * The magic-s below correspond to coordinates - * of various Russian towns in the NNNNEEEE form. + * of various towns in the NNNNEEEE form. */ #define INVENTORY_MAGIC 0x58313116 /* Veliky Novgorod */ @@ -100,6 +100,7 @@ #define BPFMAP_FILE_MAGIC 0x57506142 /* Alapayevsk */ #define BPFMAP_DATA_MAGIC 0x64324033 /* Arkhangelsk */ #define APPARMOR_MAGIC 0x59423047 /* Nikolskoye */ +#define PIDFD_MAGIC 0x54435556 /* Ufa */ #define IFADDR_MAGIC RAW_IMAGE_MAGIC #define ROUTE_MAGIC RAW_IMAGE_MAGIC diff --git a/criu/include/mem.h b/criu/include/mem.h index 03574ea3d..e9ce3518a 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -7,6 +7,7 @@ #include "pid.h" #include "proc_parse.h" #include "inventory.pb-c.h" +#include "pagemap-cache.h" struct parasite_ctl; struct vm_area_list; @@ -30,10 +31,12 @@ extern int do_task_reset_dirty_track(int pid); extern unsigned long dump_pages_args_size(struct vm_area_list *vmas); extern int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vma_area_list, struct mem_dump_ctl *mdc, struct parasite_ctl *ctl); +extern int collect_madv_guards(pid_t pid, struct vm_area_list *vma_area_list); #define PME_PRESENT (1ULL << 63) #define PME_SWAP (1ULL << 62) #define PME_FILE (1ULL << 61) +#define PME_GUARD_REGION (1ULL << 58) #define PME_SOFT_DIRTY (1ULL << 55) #define PME_PSHIFT_BITS (6) #define PME_STATUS_BITS (3) @@ -47,5 +50,12 @@ int open_vmas(struct pstree_item *t); int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); int unmap_guard_pages(struct pstree_item *t); int prepare_mappings(struct pstree_item *t); -bool should_dump_page(VmaEntry *vmae, u64 pme); + +struct page_info { + u64 next; + bool softdirty; +}; + +int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *page_info); + #endif /* __CR_MEM_H__ */ diff --git a/criu/include/memfd.h b/criu/include/memfd.h index 1b1dc79bb..78d810019 100644 --- a/criu/include/memfd.h +++ b/criu/include/memfd.h @@ -1,7 +1,9 @@ #ifndef __CR_MEMFD_H__ #define __CR_MEMFD_H__ +#include #include + #include "int.h" #include "common/config.h" @@ -12,7 +14,7 @@ extern int is_memfd(dev_t dev); extern int dump_one_memfd_cond(int lfd, u32 *id, struct fd_parms *parms); extern const struct fdtype_ops memfd_dump_ops; -extern int memfd_open(struct file_desc *d, u32 *fdflags); +extern int memfd_open(struct file_desc *d, u32 *fdflags, bool filemap); extern struct collect_image_info memfd_cinfo; extern struct file_desc *collect_memfd(u32 id); extern int apply_memfd_seals(void); diff --git a/criu/include/mman.h b/criu/include/mman.h index 8ca71fadf..43e0b6cc7 100644 --- a/criu/include/mman.h +++ b/criu/include/mman.h @@ -4,6 +4,9 @@ #ifndef MAP_HUGETLB #define MAP_HUGETLB 0x40000 #endif +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif #ifndef MADV_HUGEPAGE #define MADV_HUGEPAGE 14 #endif @@ -13,5 +16,11 @@ #ifndef MADV_DONTDUMP #define MADV_DONTDUMP 16 #endif +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif +#ifndef MADV_GUARD_INSTALL +#define MADV_GUARD_INSTALL 102 +#endif #endif /* __CR_MMAN_H__ */ diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h index e2ea6e17f..183a3b852 100644 --- a/criu/include/namespaces.h +++ b/criu/include/namespaces.h @@ -1,6 +1,8 @@ #ifndef __CR_NS_H__ #define __CR_NS_H__ +#include + #include "common/compiler.h" #include "files.h" #include "common/list.h" @@ -224,4 +226,19 @@ extern int add_ns_shared_cb(int (*actor)(void *data), void *data); extern struct ns_id *get_socket_ns(int lfd); extern struct ns_id *lookup_ns_by_kid(unsigned int kid, struct ns_desc *nd); +struct unsc_msg { + struct msghdr h; + /* + * 0th is the call address + * 1st is the flags + * 2nd is the optional (NULL in response) arguments + */ + struct iovec iov[3]; + char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; +}; + +extern void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid); +extern void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd); +extern int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)); + #endif /* __CR_NS_H__ */ diff --git a/criu/include/net.h b/criu/include/net.h index 0da4cad13..7c5ede21e 100644 --- a/criu/include/net.h +++ b/criu/include/net.h @@ -31,7 +31,7 @@ extern int collect_net_namespaces(bool for_dump); extern int network_lock(void); extern void network_unlock(void); -extern int network_lock_internal(void); +extern int network_lock_internal(bool restore); extern struct ns_desc net_ns_desc; @@ -50,7 +50,6 @@ extern int kerndat_has_newifindex(void); extern int kerndat_link_nsid(void); extern int net_get_nsid(int rtsk, int fd, int *nsid); extern struct ns_id *net_get_root_ns(void); -extern int kerndat_nsid(void); extern void check_has_netns_ioc(int fd, bool *kdat_val, const char *name); extern int net_set_ext(struct ns_id *ns); extern struct ns_id *get_root_netns(void); diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h index 15178c015..65292b7ab 100644 --- a/criu/include/page-pipe.h +++ b/criu/include/page-pipe.h @@ -92,9 +92,9 @@ struct kernel_pipe_buffer { struct page_pipe_buf { int p[2]; /* pipe with pages */ unsigned int pipe_size; /* how many pages can be fit into pipe */ - unsigned int pipe_off; /* where this buf is started in a pipe */ - unsigned int pages_in; /* how many pages are there */ unsigned int nr_segs; /* how many iov-s are busy */ + unsigned long pipe_off; /* where this buf is started in a pipe */ + unsigned long pages_in; /* how many pages are there */ #define PPB_LAZY (1 << 0) unsigned int flags; struct iovec *iov; /* vaddr:len map */ @@ -149,7 +149,7 @@ struct pipe_read_dest { }; extern int pipe_read_dest_init(struct pipe_read_dest *prd); -extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned int *nr_pages, +extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned long *nr_pages, unsigned int ppb_flags); #endif /* __CR_PAGE_PIPE_H__ */ diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h index 36fe67092..0d9b35019 100644 --- a/criu/include/page-xfer.h +++ b/criu/include/page-xfer.h @@ -69,9 +69,9 @@ extern int check_parent_page_xfer(int fd_type, unsigned long id); */ /* async request/receive of remote pages */ -extern int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages); +extern int request_remote_pages(unsigned long img_id, unsigned long addr, unsigned long nr_pages); -typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, int nr_pages, void *); -extern int page_server_start_read(void *buf, int nr_pages, ps_async_read_complete complete, void *priv, unsigned flags); +typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, unsigned long nr_pages, void *); +extern int page_server_start_read(void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv, unsigned flags); #endif /* __CR_PAGE_XFER__H__ */ diff --git a/criu/include/pagemap-cache.h b/criu/include/pagemap-cache.h index 1d8bbffaf..875e69e56 100644 --- a/criu/include/pagemap-cache.h +++ b/criu/include/pagemap-cache.h @@ -1,10 +1,12 @@ #ifndef __CR_PAGEMAP_H__ #define __CR_PAGEMAP_H__ +#include #include #include "int.h" #include "common/list.h" +#include "pagemap_scan.h" struct vma_area; @@ -15,9 +17,15 @@ typedef struct { unsigned long start; /* start of area */ unsigned long end; /* end of area */ const struct list_head *vma_head; /* list head of VMAs we're serving */ + int fd; /* file to read PMs from */ + u64 *map; /* local buffer */ size_t map_len; /* length of a buffer */ - int fd; /* file to read PMs from */ + + struct page_region *regs; /* buffer for the PAGEMAP_SCAN ioctl */ + size_t regs_len; /* actual length of regs */ + size_t regs_max_len; /* maximum length of regs */ + size_t regs_idx; /* current index in the regs array */ } pmc_t; #define PMC_INIT \ @@ -26,7 +34,8 @@ typedef struct { } extern int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size); -extern u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma); +extern int pmc_get_map(pmc_t *pmc, const struct vma_area *vma); extern void pmc_fini(pmc_t *pmc); +extern int pmc_fill(pmc_t *pmc, u64 start, u64 end); #endif /* __CR_PAGEMAP_H__ */ diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index 8c7180559..4cbc87cc6 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -44,7 +44,7 @@ struct page_read { /* reads page from current pagemap */ - int (*read_pages)(struct page_read *, unsigned long vaddr, int nr, void *, unsigned flags); + int (*read_pages)(struct page_read *, unsigned long vaddr, unsigned long nr, void *, unsigned flags); /* Advance page_read to the next entry */ int (*advance)(struct page_read *pr); void (*close)(struct page_read *); @@ -52,12 +52,15 @@ struct page_read { int (*sync)(struct page_read *pr); int (*seek_pagemap)(struct page_read *pr, unsigned long vaddr); void (*reset)(struct page_read *pr); - int (*io_complete)(struct page_read *, unsigned long vaddr, int nr); - int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags); + int (*io_complete)(struct page_read *, unsigned long vaddr, unsigned long nr); + int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags); /* Whether or not pages can be read in PIE code */ bool pieok; + /* Whether or not disable image deduplication*/ + bool disable_dedup; + /* Private data of reader */ struct cr_img *pmi; struct cr_img *pi; @@ -112,6 +115,8 @@ int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta); */ extern void dup_page_read(struct page_read *src, struct page_read *dst); +extern void page_read_disable_dedup(struct page_read *pr); + extern int dedup_one_iovec(struct page_read *pr, unsigned long base, unsigned long len); static inline unsigned long pagemap_len(PagemapEntry *pe) diff --git a/criu/include/pagemap_scan.h b/criu/include/pagemap_scan.h new file mode 100644 index 000000000..9046e01ed --- /dev/null +++ b/criu/include/pagemap_scan.h @@ -0,0 +1,69 @@ +#ifndef __CR_PAGEMAP_SCAN_H__ +#define __CR_PAGEMAP_SCAN_H__ + +#ifndef PAGEMAP_SCAN +#include +#include "int.h" + +/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */ +#define PAGE_IS_WPALLOWED (1 << 0) +#define PAGE_IS_WRITTEN (1 << 1) +#define PAGE_IS_FILE (1 << 2) +#define PAGE_IS_PRESENT (1 << 3) +#define PAGE_IS_SWAPPED (1 << 4) +#define PAGE_IS_PFNZERO (1 << 5) +#define PAGE_IS_HUGE (1 << 6) +#define PAGE_IS_SOFT_DIRTY (1 << 7) +#define PAGE_IS_GUARD (1 << 8) + +/* + * struct page_region - Page region with flags + * @start: Start of the region + * @end: End of the region (exclusive) + * @categories: PAGE_IS_* category bitmask for the region + */ +struct page_region { + u64 start; + u64 end; + u64 categories; +}; + +#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) + +/* Flags for PAGEMAP_SCAN ioctl */ +#define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */ +#define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */ + +/* + * struct pm_scan_arg - Pagemap ioctl argument + * @size: Size of the structure + * @flags: Flags for the IOCTL + * @start: Starting address of the region + * @end: Ending address of the region + * @walk_end Address where the scan stopped (written by kernel). + * walk_end == end (address tags cleared) informs that the scan completed on entire range. + * @vec: Address of page_region struct array for output + * @vec_len: Length of the page_region struct array + * @max_pages: Optional limit for number of returned pages (0 = disabled) + * @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1 + * @category_mask: Skip pages for which any category doesn't match + * @category_anyof_mask: Skip pages for which no category matches + * @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned + */ +struct pm_scan_arg { + u64 size; + u64 flags; + u64 start; + u64 end; + u64 walk_end; + u64 vec; + u64 vec_len; + u64 max_pages; + u64 category_inverted; + u64 category_mask; + u64 category_anyof_mask; + u64 return_mask; +}; +#endif /* PAGEMAP_SCAN */ + +#endif /* __CR_PAGEMAP_SCAN_H__ */ diff --git a/criu/include/parasite-syscall.h b/criu/include/parasite-syscall.h index 4540e11ee..4a8ec2fee 100644 --- a/criu/include/parasite-syscall.h +++ b/criu/include/parasite-syscall.h @@ -21,13 +21,6 @@ struct rt_sigframe; struct parasite_ctl; struct parasite_thread_ctl; -extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *); -extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *); - -struct proc_posix_timers_stat; -extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, - struct pstree_item *); - extern int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc); extern int parasite_dump_creds(struct parasite_ctl *ctl, CredsEntry *ce); extern int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core); diff --git a/criu/include/parasite.h b/criu/include/parasite.h index d2a06889f..176357711 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -63,7 +63,7 @@ struct parasite_dump_pages_args { unsigned int add_prot; unsigned int off; unsigned int nr_segs; - unsigned int nr_pages; + unsigned long nr_pages; }; static inline struct parasite_vma_entry *pargs_vmas(struct parasite_dump_pages_args *a) @@ -118,6 +118,8 @@ static inline int posix_timers_dump_size(int timer_n) */ struct parasite_dump_misc { + bool has_membarrier_get_registrations; /* this is sent from criu to parasite. */ + unsigned long brk; u32 pid; @@ -128,6 +130,7 @@ struct parasite_dump_misc { int dumpable; int thp_disabled; int child_subreaper; + int membarrier_registration_mask; }; /* @@ -145,9 +148,11 @@ struct parasite_dump_creds { u32 cap_prm[CR_CAP_SIZE]; u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; + u32 cap_amb[CR_CAP_SIZE]; int uids[4]; int gids[4]; + int no_new_privs; unsigned int secbits; unsigned int ngroups; /* @@ -241,7 +246,12 @@ struct parasite_dump_cgroup_args { * * The string is null terminated. */ - char contents[1 << 12]; + char contents[(1 << 12) - 32]; + /* + * Contains the path to thread cgroup procfs. + * "self/task//cgroup" + */ + char thread_cgrp[32]; }; #endif /* !__ASSEMBLY__ */ diff --git a/criu/include/pid.h b/criu/include/pid.h index 49cb2d322..b2b7a361a 100644 --- a/criu/include/pid.h +++ b/criu/include/pid.h @@ -31,6 +31,10 @@ struct pid { pid_t real; int state; /* TASK_XXX constants */ + /* If an item is in stopped state it has a signal number + * that caused task to stop. + */ + int stop_signo; /* * The @virt pid is one which used in the image itself and keeps diff --git a/criu/include/pidfd.h b/criu/include/pidfd.h new file mode 100644 index 000000000..bcc0fb45a --- /dev/null +++ b/criu/include/pidfd.h @@ -0,0 +1,16 @@ +#ifndef __CR_PIDFD_H__ +#define __CR_PIDFD_H__ + +#include "files.h" +#include "pidfd.pb-c.h" + +extern const struct fdtype_ops pidfd_dump_ops; +extern struct collect_image_info pidfd_cinfo; +extern int is_pidfd_link(char *link); +extern void init_dead_pidfd_hash(void); +struct pidfd_dump_info { + PidfdEntry pidfe; + pid_t pid; +}; + +#endif /* __CR_PIDFD_H__ */ diff --git a/criu/include/prctl.h b/criu/include/prctl.h index c843f40a7..2966659da 100644 --- a/criu/include/prctl.h +++ b/criu/include/prctl.h @@ -30,6 +30,21 @@ #ifndef PR_SET_DUMPABLE #define PR_SET_DUMPABLE 4 #endif +#ifndef PR_GET_NO_NEW_PRIVS +#define PR_GET_NO_NEW_PRIVS 39 +#endif +#ifndef PR_SET_NO_NEW_PRIVS +#define PR_SET_NO_NEW_PRIVS 38 +#endif +#ifndef PR_CAP_AMBIENT +#define PR_CAP_AMBIENT 47 +#endif +#ifndef PR_CAP_AMBIENT_IS_SET +#define PR_CAP_AMBIENT_IS_SET 1 +#endif +#ifndef PR_CAP_AMBIENT_RAISE +#define PR_CAP_AMBIENT_RAISE 2 +#endif #ifndef PR_SET_MM #define PR_SET_MM 35 @@ -82,4 +97,11 @@ struct prctl_mm_map { #define PR_GET_THP_DISABLE 42 #endif +#ifndef PR_TIMER_CREATE_RESTORE_IDS +#define PR_TIMER_CREATE_RESTORE_IDS 77 +# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0 +# define PR_TIMER_CREATE_RESTORE_IDS_ON 1 +# define PR_TIMER_CREATE_RESTORE_IDS_GET 2 +#endif + #endif /* __CR_PRCTL_H__ */ diff --git a/criu/include/proc_parse.h b/criu/include/proc_parse.h index 0c334a190..76d3242d2 100644 --- a/criu/include/proc_parse.h +++ b/criu/include/proc_parse.h @@ -81,6 +81,7 @@ struct proc_status_creds { u32 cap_prm[PROC_CAP_SIZE]; u32 cap_eff[PROC_CAP_SIZE]; u32 cap_bnd[PROC_CAP_SIZE]; + u32 cap_amb[PROC_CAP_SIZE]; }; #define INVALID_UID ((uid_t)-1) @@ -104,4 +105,6 @@ extern int parse_uptime(uint64_t *upt); extern int parse_timens_offsets(struct timespec *boff, struct timespec *moff); +extern bool found_uprobes_vma(void); + #endif /* __CR_PROC_PARSE_H__ */ diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h index 3824de101..c4241be55 100644 --- a/criu/include/protobuf-desc.h +++ b/criu/include/protobuf-desc.h @@ -70,6 +70,7 @@ enum { PB_BPFMAP_FILE, PB_BPFMAP_DATA, PB_APPARMOR, + PB_PIDFD, /* PB_AUTOGEN_STOP */ diff --git a/criu/include/pstree.h b/criu/include/pstree.h index 8ae750e1a..b750a919e 100644 --- a/criu/include/pstree.h +++ b/criu/include/pstree.h @@ -63,7 +63,7 @@ struct dmp_info { struct parasite_ctl *parasite_ctl; struct parasite_thread_ctl **thread_ctls; uint64_t *thread_sp; - struct rseq_cs *thread_rseq_cs; + struct criu_rseq_cs *thread_rseq_cs; /* * Although we don't support dumping different struct creds in general, @@ -104,6 +104,7 @@ extern void pstree_insert_pid(struct pid *pid_node); extern struct pid *pstree_pid_by_virt(pid_t pid); extern struct pstree_item *root_item; +extern bool has_children(struct pstree_item *item); extern struct pstree_item *pstree_item_next(struct pstree_item *item); #define for_each_pstree_item(pi) for (pi = root_item; pi != NULL; pi = pstree_item_next(pi)) diff --git a/criu/include/rbtree.h b/criu/include/rbtree.h index ba0a8100e..6981aa8f9 100644 --- a/criu/include/rbtree.h +++ b/criu/include/rbtree.h @@ -14,7 +14,7 @@ #define RB_MASK 3 struct rb_node { - unsigned long rb_parent_color; /* Keeps both parent anc color */ + unsigned long rb_parent_color; /* Keeps both parent and color */ struct rb_node *rb_right; struct rb_node *rb_left; } __aligned(sizeof(long)); diff --git a/criu/include/restore.h b/criu/include/restore.h index 8ef0dbddf..189051826 100644 --- a/criu/include/restore.h +++ b/criu/include/restore.h @@ -7,4 +7,57 @@ extern int arch_set_thread_regs_nosigrt(struct pid *pid); +struct task_restore_args; +struct pstree_item; +struct rst_shstk_info; + +#ifndef arch_shstk_prepare +static inline int arch_shstk_prepare(struct pstree_item *item, + CoreEntry *core, + struct task_restore_args *ta) +{ + return 0; +} +#define arch_shstk_prepare arch_shstk_prepare +#endif + +#ifndef arch_shstk_unlock +static inline int arch_shstk_unlock(struct pstree_item *item, + CoreEntry *core, pid_t pid) +{ + return 0; +} +#define arch_shstk_unlock arch_shstk_unlock +#endif + +#ifndef arch_shstk_trampoline +static inline int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, + int (*func)(void *arg), void *arg) +{ + return func(arg); +} +#define arch_shstk_trampoline arch_shstk_trampoline +#endif + +#ifndef shstk_restorer_stack_size +static always_inline long shstk_restorer_stack_size(void) +{ + return 0; +} +#endif + +#ifndef shstk_set_restorer_stack +static always_inline long shstk_set_restorer_stack(struct rst_shstk_info *info, void *ptr) +{ + return 0; +} +#endif + +#ifndef shstk_min_mmap_addr +static always_inline long shstk_min_mmap_addr(struct rst_shstk_info *info, unsigned long def) +{ + return def; +} +#endif + #endif diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 325804e44..14c0a3768 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -56,6 +56,10 @@ struct restore_posix_timer { int overrun; }; +#ifndef rst_shstk_info +struct rst_shstk_info {}; +#endif + /* * We should be able to construct fpu sigframe in sigreturn_prep_fpu_frame, * so the mem_zone.rt_sigframe should be 64-bytes aligned. To make things @@ -71,8 +75,8 @@ struct thread_creds_args { u32 cap_prm[CR_CAP_SIZE]; u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; + u32 cap_amb[CR_CAP_SIZE]; - unsigned int secbits; char *lsm_profile; unsigned int *groups; char *lsm_sockcreate; @@ -120,7 +124,11 @@ struct thread_restore_args { unsigned int seccomp_filters_n; bool seccomp_force_tsync; + struct rst_shstk_info shstk; + char comm[TASK_COMM_LEN]; + int cg_set; + int cgroupd_sk; } __aligned(64); typedef long (*thread_restore_fcall_t)(struct thread_restore_args *args); @@ -142,7 +150,7 @@ struct task_restore_args { struct timeval logstart; int uffd; - bool has_thp_enabled; + bool thp_disabled; /* threads restoration */ int nr_threads; /* number of threads */ @@ -162,6 +170,7 @@ struct task_restore_args { struct restore_posix_timer *posix_timers; unsigned int posix_timers_n; + bool posix_timer_cr_ids; struct restore_timerfd *timerfd; unsigned int timerfd_n; @@ -228,6 +237,7 @@ struct task_restore_args { #endif int lsm_type; int child_subreaper; + int membarrier_registration_mask; bool has_clone3_set_tid; /* @@ -235,6 +245,11 @@ struct task_restore_args { * unregister it before memory restoration procedure */ struct rst_rseq_param libc_rseq; + + uid_t uid; + u32 cap_eff[CR_CAP_SIZE]; + + struct rst_shstk_info shstk; } __aligned(64); /* @@ -326,4 +341,27 @@ enum { #define __r_sym(name) restorer_sym##name #define restorer_sym(rblob, name) (void *)(rblob + __r_sym(name)) +#ifndef arch_shstk_switch_to_restorer +static inline int arch_shstk_switch_to_restorer(struct rst_shstk_info *shstk) +{ + return 0; +} +#define arch_shstk_switch_to_restorer arch_shstk_switch_to_restorer +#endif + +#ifndef arch_shstk_restore +static inline int arch_shstk_restore(struct rst_shstk_info *shstk) +{ + return 0; +} +#define arch_shstk_restore arch_shstk_restore +#endif + +#ifndef shstk_vma_restore +static always_inline int shstk_vma_restore(VmaEntry *vma_entry) +{ + return -1; +} +#endif + #endif /* __CR_RESTORER_H__ */ diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index d0a3db6c5..deb297e5f 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -1,6 +1,7 @@ #ifndef __CR_RST_INFO_H__ #define __CR_RST_INFO_H__ +#include "asm/restore.h" #include "common/lock.h" #include "common/list.h" #include "vma.h" @@ -14,6 +15,7 @@ struct task_entries { futex_t start; atomic_t cr_err; mutex_t userns_sync_lock; + mutex_t cgroupd_sync_lock; mutex_t last_pid_mutex; }; @@ -22,7 +24,7 @@ struct fdt { pid_t pid; /* Who should restore this fd table */ /* * The fd table is ready for restoing, if fdt_lock is equal to nr - * The fdt table was restrored, if fdt_lock is equal to nr + 1 + * The fdt table was restored, if fdt_lock is equal to nr + 1 */ futex_t fdt_lock; }; @@ -32,6 +34,11 @@ struct rst_rseq { uint64_t rseq_cs_pointer; }; +#ifndef ARCH_RST_INFO +struct rst_arch_info { +}; +#endif + struct rst_info { struct list_head fds; @@ -73,11 +80,14 @@ struct rst_info { */ bool has_old_seccomp_filter; - bool has_thp_enabled; - struct rst_rseq *rseqe; + futex_t shstk_enable; + futex_t shstk_unlock; + void *breakpoint; + + struct rst_arch_info arch_info; }; extern struct task_entries *task_entries; diff --git a/criu/include/seize.h b/criu/include/seize.h index cf7366cb0..fc7facad3 100644 --- a/criu/include/seize.h +++ b/criu/include/seize.h @@ -2,8 +2,14 @@ #define __CR_SEIZE_H__ extern int collect_pstree(void); +extern int checkpoint_devices(void); +struct pstree_item; extern void pstree_switch_state(struct pstree_item *root_item, int st); extern const char *get_real_freezer_state(void); extern bool alarm_timeouted(void); +extern char *task_comm_info(pid_t pid, char *comm, size_t size); +extern char *__task_comm_info(pid_t pid); +extern void set_compel_interrupt_only_mode(void); + #endif diff --git a/criu/include/servicefd.h b/criu/include/servicefd.h index c6979de7f..4265d94ed 100644 --- a/criu/include/servicefd.h +++ b/criu/include/servicefd.h @@ -24,6 +24,7 @@ enum sfd_type { */ ROOT_FD_OFF, /* Root of the namespace we dump/restore */ CGROUP_YARD, + CGROUPD_SK, /* Socket for cgroupd to fix up thread's cgroup controller */ USERNSD_SK, /* Socket for usernsd */ NS_FD_OFF, /* Node's net namespace fd */ TRANSPORT_FD_OFF, /* to transfer file descriptors */ diff --git a/criu/include/setproctitle.h b/criu/include/setproctitle.h index bc634331b..a4873578a 100644 --- a/criu/include/setproctitle.h +++ b/criu/include/setproctitle.h @@ -1,19 +1,7 @@ #ifndef __CR_SETPROCTITLE_H__ #define __CR_SETPROCTITLE_H__ -#ifdef CONFIG_HAS_LIBBSD -#include -#else - -/* - * setproctitle_init is in the libbsd since v0.6.0. This macro allows to - * compile criu with libbsd<0.6.0. - */ -#ifndef CONFIG_HAS_SETPROCTITLE_INIT -#define setproctitle_init(argc, argv, envp) -#endif - -#define setproctitle(fmt, ...) -#endif +extern void __setproctitle_init(int argc, char *argv[], char *envp[]); +extern void __setproctitle(const char *fmt, ...); #endif /* __CR_SETPROCTITLE_H__ */ diff --git a/criu/include/shmem.h b/criu/include/shmem.h index 813ef630e..15cab1146 100644 --- a/criu/include/shmem.h +++ b/criu/include/shmem.h @@ -4,13 +4,14 @@ #include "int.h" #include "common/lock.h" #include "images/vma.pb-c.h" +#include "pagemap-cache.h" struct vma_area; extern int collect_shmem(int pid, struct vma_area *vma); extern int collect_sysv_shmem(unsigned long shmid, unsigned long size); extern int cr_dump_shmem(void); -extern int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map); +extern int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc); extern int fixup_sysv_shmems(void); extern int dump_one_memfd_shmem(int fd, unsigned long shmid, unsigned long size); extern int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid); diff --git a/criu/include/sigact.h b/criu/include/sigact.h new file mode 100644 index 000000000..4df011f96 --- /dev/null +++ b/criu/include/sigact.h @@ -0,0 +1,14 @@ +#ifndef __CR_SIGACT_H__ +#define __CR_SIGACT_H__ + +#include "images/core.pb-c.h" + +extern rt_sigaction_t sigchld_act; + +struct parasite_ctl; +struct pstree_item; + +extern int prepare_sigactions(CoreEntry *core); +extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *); + +#endif diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h index 5dd2a6551..69ee8589e 100644 --- a/criu/include/sk-inet.h +++ b/criu/include/sk-inet.h @@ -69,6 +69,7 @@ extern int inet_connect(int sk, struct inet_sk_info *); #ifdef CR_NOGLIBC #define setsockopt sys_setsockopt +#define pr_perror(fmt, ...) pr_err(fmt ": errno %d\n", ##__VA_ARGS__, -ret) #endif static inline void tcp_repair_off(int fd) { @@ -76,7 +77,7 @@ static inline void tcp_repair_off(int fd) ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux)); if (ret < 0) - pr_err("Failed to turn off repair mode on socket: %m\n"); + pr_perror("Failed to turn off repair mode on socket %d", fd); } extern void tcp_locked_conn_add(struct inet_sk_info *); @@ -86,6 +87,9 @@ extern void cpt_unlock_tcp_connections(void); extern int dump_one_tcp(int sk, struct inet_sk_desc *sd, SkOptsEntry *soe); extern int restore_one_tcp(int sk, struct inet_sk_info *si); +extern int dump_tcp_opts(int sk, TcpOptsEntry *toe); +extern int restore_tcp_opts(int sk, TcpOptsEntry *toe); + #define SK_EST_PARAM "tcp-established" #define SK_INFLIGHT_PARAM "skip-in-flight" #define SK_CLOSE_PARAM "tcp-close" diff --git a/criu/include/sockets.h b/criu/include/sockets.h index 399d38664..6c81d3edd 100644 --- a/criu/include/sockets.h +++ b/criu/include/sockets.h @@ -25,8 +25,9 @@ struct socket_desc { }; extern int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *); -extern int dump_socket_opts(int sk, SkOptsEntry *soe); +extern int dump_socket_opts(int sk, int family, SkOptsEntry *soe); extern int restore_socket_opts(int sk, SkOptsEntry *soe); +extern int sk_setbufs(int sk, uint32_t *bufs); extern void release_skopts(SkOptsEntry *); extern int restore_prepare_socket(int sk); extern void preload_socket_modules(void); diff --git a/criu/include/string.h b/criu/include/string.h index e11a42058..4c71d961c 100644 --- a/criu/include/string.h +++ b/criu/include/string.h @@ -3,18 +3,9 @@ #include -#ifdef CONFIG_HAS_LIBBSD -#include -#endif - #include "common/config.h" -#ifndef CONFIG_HAS_STRLCPY -extern size_t strlcpy(char *dest, const char *src, size_t size); -#endif - -#ifndef CONFIG_HAS_STRLCAT -extern size_t strlcat(char *dest, const char *src, size_t count); -#endif +extern size_t __strlcpy(char *dest, const char *src, size_t size); +extern size_t __strlcat(char *dest, const char *src, size_t count); #endif /* __CR_STRING_H__ */ diff --git a/criu/include/sysctl.h b/criu/include/sysctl.h index ac7924dcd..2d689a9a0 100644 --- a/criu/include/sysctl.h +++ b/criu/include/sysctl.h @@ -34,8 +34,9 @@ enum { /* * Some entries might be missing mark them as optional. */ -#define CTL_FLAGS_OPTIONAL 1 -#define CTL_FLAGS_HAS 2 -#define CTL_FLAGS_READ_EIO_SKIP 4 +#define CTL_FLAGS_OPTIONAL 1 +#define CTL_FLAGS_HAS 2 +#define CTL_FLAGS_READ_EIO_SKIP 4 +#define CTL_FLAGS_IPC_EACCES_SKIP 8 #endif /* __CR_SYSCTL_H__ */ diff --git a/criu/include/timer.h b/criu/include/timer.h new file mode 100644 index 000000000..d1deb6051 --- /dev/null +++ b/criu/include/timer.h @@ -0,0 +1,17 @@ +#ifndef __CR_TIMER_H__ +#define __CR_TIMER_H__ + +#include "images/core.pb-c.h" + +struct task_restore_args; +struct pstree_item; +struct parasite_ctl; +struct proc_posix_timers_stat; + +extern int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core); +extern int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core); + +extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item); +extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, + struct pstree_item *item); +#endif diff --git a/criu/include/util-caps.h b/criu/include/util-caps.h new file mode 100644 index 000000000..7ccd162f5 --- /dev/null +++ b/criu/include/util-caps.h @@ -0,0 +1,58 @@ +#ifndef __CR_UTIL_CAPS_H__ +#define __CR_UTIL_CAPS_H__ + +#include + +#ifndef CAP_CHECKPOINT_RESTORE +#define CAP_CHECKPOINT_RESTORE 40 +#endif + +static inline bool has_capability(int cap, u32 *cap_eff) +{ + int mask = CAP_TO_MASK(cap); + int index = CAP_TO_INDEX(cap); + u32 effective; + + effective = cap_eff[index]; + + if (!(mask & effective)) { + pr_debug("Effective capability %d missing\n", cap); + return false; + } + + return true; +} + +static inline bool has_cap_checkpoint_restore(u32 *cap_eff) +{ + /* + * Everything guarded by CAP_CHECKPOINT_RESTORE is also + * guarded by CAP_SYS_ADMIN. Check for both capabilities. + */ + if (has_capability(CAP_CHECKPOINT_RESTORE, cap_eff) || has_capability(CAP_SYS_ADMIN, cap_eff)) + return true; + + return false; +} + +static inline bool has_cap_net_admin(u32 *cap_eff) +{ + return has_capability(CAP_NET_ADMIN, cap_eff); +} + +static inline bool has_cap_sys_chroot(u32 *cap_eff) +{ + return has_capability(CAP_SYS_CHROOT, cap_eff); +} + +static inline bool has_cap_setuid(u32 *cap_eff) +{ + return has_capability(CAP_SETUID, cap_eff); +} + +static inline bool has_cap_sys_resource(u32 *cap_eff) +{ + return has_capability(CAP_SYS_RESOURCE, cap_eff); +} + +#endif /* __CR_UTIL_CAPS_H__ */ diff --git a/criu/include/util-vdso.h b/criu/include/util-vdso.h index c4386cf8e..9fd9a6de4 100644 --- a/criu/include/util-vdso.h +++ b/criu/include/util-vdso.h @@ -30,6 +30,7 @@ struct vdso_symbol { struct vdso_symtable { unsigned long vdso_size; unsigned long vvar_size; + unsigned long vvar_vclock_size; struct vdso_symbol symbols[VDSO_SYMBOL_MAX]; bool vdso_before_vvar; /* order of vdso/vvar pair */ }; diff --git a/criu/include/util.h b/criu/include/util.h index 4e29c079e..55ad5b63c 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -21,6 +21,8 @@ #include "log.h" #include "common/err.h" +#include "compel/infect-util.h" + #define PREF_SHIFT_OP(pref, op, size) ((size)op(pref##BYTES_SHIFT)) #define KBYTES_SHIFT 10 #define MBYTES_SHIFT 20 @@ -170,6 +172,7 @@ extern pid_t fork_and_ptrace_attach(int (*child_setup)(void)); extern int cr_daemon(int nochdir, int noclose, int close_fd); extern int status_ready(void); extern int is_root_user(void); +extern int close_fds(int minfd); extern int set_proc_self_fd(int fd); @@ -263,6 +266,10 @@ bool is_path_prefix(const char *path, const char *prefix); FILE *fopenat(int dirfd, char *path, char *cflags); void split(char *str, char token, char ***out, int *n); +int cr_fchown(int fd, uid_t new_uid, gid_t new_gid); +int cr_fchperm(int fd, uid_t new_uid, gid_t new_gid, mode_t new_mode); +int cr_fchpermat(int dirfd, const char *path, uid_t new_uid, gid_t new_gid, mode_t new_mode, int flags); + int fd_has_data(int lfd); int make_yard(char *path); @@ -274,8 +281,6 @@ static inline int sk_wait_data(int sk) } void fd_set_nonblocking(int fd, bool on); -void tcp_nodelay(int sk, bool on); -void tcp_cork(int sk, bool on); const char *ns_to_string(unsigned int ns); @@ -384,7 +389,14 @@ static inline void print_stack_trace(pid_t pid) extern int mount_detached_fs(const char *fsname); -extern char *get_legacy_iptables_bin(bool ipv6); +extern int cr_fsopen(const char *fsname, unsigned int flags); +extern int cr_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux); +extern int cr_fsmount(int fd, unsigned int flags, unsigned int attr_flags); +extern void fsfd_dump_messages(int fd); + +extern char *get_legacy_iptables_bin(bool ipv6, bool restore); + +extern int set_opts_cap_eff(void); extern ssize_t read_all(int fd, void *buf, size_t size); extern ssize_t write_all(int fd, const void *buf, size_t size); @@ -396,15 +408,27 @@ static inline void cleanup_freep(void *p) free(*pp); } +#define cleanup_file __attribute__((cleanup(cleanup_filep))) +static inline void cleanup_filep(FILE **f) +{ + FILE *file = *f; + if (file) + (void)fclose(file); +} + extern int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args); /* * criu_run_id is a unique value of the current run. It can be used to * generate resource ID-s to avoid conflicts with other CRIU processes. */ -extern uint64_t criu_run_id; +extern char criu_run_id[RUN_ID_HASH_LENGTH]; extern void util_init(void); +#define NO_DUMP_CRIU_RUN_ID 0x7f +extern char dump_criu_run_id[RUN_ID_HASH_LENGTH]; extern char *resolve_mountpoint(char *path); +extern int cr_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); + #endif /* __CR_UTIL_H__ */ diff --git a/criu/include/vma.h b/criu/include/vma.h index 106c56af2..b8ddfc142 100644 --- a/criu/include/vma.h +++ b/criu/include/vma.h @@ -106,6 +106,7 @@ static inline bool vma_entry_is_private(VmaEntry *entry, unsigned long task_size return (vma_entry_is(entry, VMA_AREA_REGULAR) && (vma_entry_is(entry, VMA_ANON_PRIVATE) || vma_entry_is(entry, VMA_FILE_PRIVATE)) && (entry->end <= task_size)) || + vma_entry_is(entry, VMA_AREA_SHSTK) || vma_entry_is(entry, VMA_AREA_AIORING); } @@ -122,8 +123,8 @@ static inline struct vma_area *vma_next(struct vma_area *vma) static inline bool vma_entry_can_be_lazy(VmaEntry *e) { return ((e->flags & MAP_ANONYMOUS) && (e->flags & MAP_PRIVATE) && !(e->flags & MAP_LOCKED) && - !(vma_entry_is(e, VMA_AREA_VDSO)) && !(vma_entry_is(e, VMA_AREA_VSYSCALL)) && - !(e->flags & MAP_HUGETLB)); + !(vma_entry_is(e, VMA_AREA_VDSO)) && !(vma_entry_is(e, VMA_AREA_VVAR)) && + !(vma_entry_is(e, VMA_AREA_VSYSCALL)) && !(e->flags & MAP_HUGETLB)); } #endif /* __CR_VMA_H__ */ diff --git a/criu/ipc_ns.c b/criu/ipc_ns.c index 4fe082fbb..7e95be8c5 100644 --- a/criu/ipc_ns.c +++ b/criu/ipc_ns.c @@ -292,6 +292,8 @@ static void pr_info_ipc_shm(const IpcShmEntry *shm) static int ipc_sysctl_req(IpcVarEntry *e, int op) { + int i; + struct sysctl_req req[] = { { "kernel/sem", e->sem_ctls, CTL_U32A(e->n_sem_ctls) }, { "kernel/msgmax", &e->msg_ctlmax, CTL_U32 }, @@ -332,6 +334,9 @@ static int ipc_sysctl_req(IpcVarEntry *e, int op) if (e->has_shm_next_id) req[nr++] = req[16]; + for (i = 0; i < nr; i++) + req[i].flags = CTL_FLAGS_IPC_EACCES_SKIP; + return sysctl_op(req, nr, op, CLONE_NEWIPC); } @@ -570,7 +575,7 @@ static int prepare_ipc_sem_desc(struct cr_img *img, const IpcSemEntry *sem) { int ret, id; struct sysctl_req req[] = { - { "kernel/sem_next_id", &sem->desc->id, CTL_U32 }, + { "kernel/sem_next_id", &sem->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct semid_ds semid; @@ -703,7 +708,7 @@ static int prepare_ipc_msg_queue(struct cr_img *img, const IpcMsgEntry *msq) { int ret, id; struct sysctl_req req[] = { - { "kernel/msg_next_id", &msq->desc->id, CTL_U32 }, + { "kernel/msg_next_id", &msq->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct msqid_ds msqid; @@ -841,7 +846,7 @@ static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm) { int ret, id, hugetlb_flag = 0; struct sysctl_req req[] = { - { "kernel/shm_next_id", &shm->desc->id, CTL_U32 }, + { "kernel/shm_next_id", &shm->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, }; struct shmid_ds shmid; diff --git a/criu/irmap.c b/criu/irmap.c index 7b9d77bc1..d2c5d588a 100644 --- a/criu/irmap.c +++ b/criu/irmap.c @@ -67,6 +67,7 @@ static struct irmap hints[] = { .path = "/var/log", .nr_kids = -1, }, + { .path = "/usr/share/dbus-1/services", .nr_kids = -1 }, { .path = "/usr/share/dbus-1/system-services", .nr_kids = -1 }, { .path = "/var/lib/polkit-1/localauthority", .nr_kids = -1 }, { .path = "/usr/share/polkit-1/actions", .nr_kids = -1 }, @@ -101,7 +102,7 @@ static int irmap_update_stat(struct irmap *i) pr_debug("Refresh stat for %s\n", i->path); if (fstatat(mntns_root, i->path + 1, &st, AT_SYMLINK_NOFOLLOW)) { - pr_perror("Can't stat %s", i->path); + pr_pwarn("Can't stat %s", i->path); return -1; } @@ -136,7 +137,7 @@ static int irmap_update_dir(struct irmap *t) pr_debug("Refilling %s dir\n", t->path); fd = openat(mntns_root, t->path + 1, O_RDONLY); if (fd < 0) { - pr_perror("Can't open %s", t->path); + pr_pwarn("Can't open %s", t->path); return -1; } @@ -499,8 +500,13 @@ int irmap_scan_path_add(char *path) return -1; } - o->ir->path = path; + o->ir->path = xstrdup(path); + if (!o->ir->path) { + xfree(o->ir); + xfree(o); + return -1; + } o->ir->nr_kids = -1; - list_add(&o->node, &opts.irmap_scan_paths); + list_add_tail(&o->node, &opts.irmap_scan_paths); return 0; } diff --git a/criu/kerndat.c b/criu/kerndat.c index b8b6bc95d..2dc2f77d5 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -12,15 +12,17 @@ #include #include #include -#include /* for sockaddr_in and inet_ntoa() */ +#include #include #include #include #include +#include #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) #include #endif +#include #include "common/config.h" #include "int.h" @@ -29,6 +31,7 @@ #include "kerndat.h" #include "fs-magic.h" #include "mem.h" +#include "mman.h" #include "common/compiler.h" #include "sysctl.h" #include "cr_options.h" @@ -51,13 +54,24 @@ #include "sched.h" #include "memfd.h" #include "mount-v2.h" +#include "util-caps.h" +#include "pagemap_scan.h" struct kerndat_s kdat = {}; +volatile int dummy_var; static int check_pagemap(void) { - int ret, fd; + int ret, fd, retry; u64 pfn = 0; + struct pm_scan_arg args = { + .size = sizeof(struct pm_scan_arg), + .flags = 0, + .category_inverted = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_mask = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED, + .return_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_SOFT_DIRTY, + }; fd = __open_proc(PROC_SELF, EPERM, O_RDONLY, "pagemap"); if (fd < 0) { @@ -70,11 +84,44 @@ static int check_pagemap(void) return -1; } - /* Get the PFN of some present page. Stack is here, so try it :) */ - ret = pread(fd, &pfn, sizeof(pfn), (((unsigned long)&ret) / page_size()) * sizeof(pfn)); - if (ret != sizeof(pfn)) { - pr_perror("Can't read pagemap"); - return -1; + if (ioctl(fd, PAGEMAP_SCAN, &args) == 0) { + pr_debug("PAGEMAP_SCAN is supported\n"); + kdat.has_pagemap_scan = true; + + args.return_mask |= PAGE_IS_GUARD; + if (ioctl(fd, PAGEMAP_SCAN, &args) == 0) + kdat.has_pagemap_scan_guard_pages = true; + } else { + switch (errno) { + case EINVAL: + case ENOTTY: + pr_debug("PAGEMAP_SCAN isn't supported\n"); + break; + default: + pr_perror("PAGEMAP_SCAN failed with unexpected errno"); + return -1; + } + } + + retry = 3; + while (retry--) { + ++dummy_var; + /* Get the PFN of a page likely to be present. */ + ret = pread(fd, &pfn, sizeof(pfn), PAGE_PFN((uintptr_t)&dummy_var) * sizeof(pfn)); + if (ret != sizeof(pfn)) { + pr_perror("Can't read pagemap"); + close(fd); + return -1; + } + /* The page can be swapped out by the time the read occurs, + * in which case the rest of the bits are a swap type + offset + * (which could be zero even if not hidden). + * Retry if this happens. */ + if (pfn & PME_PRESENT) + break; + pr_warn("got non-present PFN %#lx for the dummy data page; %s\n", (unsigned long)pfn, + retry ? "retrying" : "giving up"); + pfn = 0; } close(fd); @@ -420,10 +467,6 @@ static int kerndat_get_dirty_track(void) } else { no_dt: pr_info("Dirty tracking support is OFF\n"); - if (opts.track_mem) { - pr_err("Tracking memory is not available\n"); - return -1; - } } return 0; @@ -467,8 +510,15 @@ static int get_last_cap(void) struct sysctl_req req[] = { { "kernel/cap_last_cap", &kdat.last_cap, CTL_U32 }, }; + int ret; - return sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0); + ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0); + if (ret || kdat.last_cap < 32 * CR_CAP_SIZE) + return ret; + + pr_err("Kernel reports more capabilities than this CRIU supports: %u > %u\n", + kdat.last_cap, 32 * CR_CAP_SIZE - 1); + return -1; } static bool kerndat_has_memfd_create(void) @@ -502,7 +552,7 @@ static bool kerndat_has_memfd_hugetlb(void) if (ret >= 0) { kdat.has_memfd_hugetlb = true; close(ret); - } else if (ret == -1 && (errno == EINVAL || errno == ENOENT)) { + } else if (ret == -1 && (errno == EINVAL || errno == ENOENT || errno == ENOSYS)) { kdat.has_memfd_hugetlb = false; } else { pr_perror("Unexpected error from memfd_create(\"\", MFD_HUGETLB)"); @@ -601,7 +651,7 @@ static int kerndat_loginuid(void) static int kerndat_iptables_has_xtlocks(void) { int fd; - char *argv[4] = { "sh", "-c", "iptables -w -L", NULL }; + char *argv[4] = { "sh", "-c", "iptables -n -w -L", NULL }; fd = open("/dev/null", O_RDWR); if (fd < 0) { @@ -617,29 +667,52 @@ static int kerndat_iptables_has_xtlocks(void) return 0; } -int kerndat_tcp_repair(void) -{ - int sock, clnt = -1, yes = 1, exit_code = -1; - struct sockaddr_in addr; - socklen_t aux; +/* + * Unfortunately in C htonl() is not constexpr and cannot be used in a static + * initialization below. + */ +#define constant_htonl(x) \ + (__BYTE_ORDER == __BIG_ENDIAN ? (x) : \ + (((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \ + (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - inet_pton(AF_INET, "127.0.0.1", &(addr.sin_addr)); - addr.sin_port = 0; +static int kerndat_tcp_repair(void) +{ + static const struct sockaddr_in loopback_ip4 = { + .sin_family = AF_INET, + .sin_port = 0, + .sin_addr = { constant_htonl(INADDR_LOOPBACK) }, + }; + static const struct sockaddr_in6 loopback_ip6 = { + .sin6_family = AF_INET6, + .sin6_port = 0, + .sin6_addr = IN6ADDR_LOOPBACK_INIT, + }; + int sock, clnt = -1, yes = 1, exit_code = -1; + const struct sockaddr *addr; + struct sockaddr_storage listener_addr; + socklen_t addrlen; + + addr = (const struct sockaddr *)&loopback_ip4; + addrlen = sizeof(loopback_ip4); sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock < 0 && errno == EAFNOSUPPORT) { + addr = (const struct sockaddr *)&loopback_ip6; + addrlen = sizeof(loopback_ip6); + sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); + } if (sock < 0) { pr_perror("Unable to create a socket"); return -1; } - if (bind(sock, (struct sockaddr *)&addr, sizeof(addr))) { + if (bind(sock, addr, addrlen)) { pr_perror("Unable to bind a socket"); goto err; } - aux = sizeof(addr); - if (getsockname(sock, (struct sockaddr *)&addr, &aux)) { + addrlen = sizeof(listener_addr); + if (getsockname(sock, (struct sockaddr *)&listener_addr, &addrlen)) { pr_perror("Unable to get a socket name"); goto err; } @@ -649,13 +722,13 @@ int kerndat_tcp_repair(void) goto err; } - clnt = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + clnt = socket(addr->sa_family, SOCK_STREAM, IPPROTO_TCP); if (clnt < 0) { pr_perror("Unable to create a socket"); goto err; } - if (connect(clnt, (struct sockaddr *)&addr, sizeof(addr))) { + if (connect(clnt, (const struct sockaddr *)&listener_addr, addrlen)) { pr_perror("Unable to connect a socket"); goto err; } @@ -682,20 +755,22 @@ err: return exit_code; } -int kerndat_nsid(void) +static int kerndat_nsid(void) { int nsid, sk; + kdat.has_nsid = false; + sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sk < 0) { - pr_perror("Unable to create a netlink socket"); - return -1; + pr_pwarn("Unable to create a netlink socket: NSID can't be used."); + return 0; } if (net_get_nsid(sk, getpid(), &nsid) < 0) { - pr_err("NSID is not supported\n"); + pr_warn("NSID is not supported\n"); close(sk); - return -1; + return 0; } kdat.has_nsid = true; @@ -764,7 +839,7 @@ static int kerndat_detect_stack_guard_gap(void) * (see kernel commit 1be7107fbe18ee). * * Same time there was semi-complete - * patch released which hitted a number + * patch released which hit a number * of repos (Ubuntu, Fedora) where instead * of PAGE_SIZE the 1M gap is cut off. */ @@ -927,6 +1002,7 @@ static int kerndat_has_ptrace_get_rseq_conf(void) pid_t pid; int len; struct __ptrace_rseq_configuration rseq; + int ret = 0; pid = fork_and_ptrace_attach(NULL); if (pid < 0) @@ -934,6 +1010,9 @@ static int kerndat_has_ptrace_get_rseq_conf(void) len = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, pid, sizeof(rseq), &rseq); if (len != sizeof(rseq)) { + if (kdat.has_ptrace_get_rseq_conf) + ret = 1; /* we should update kdat */ + kdat.has_ptrace_get_rseq_conf = false; pr_info("ptrace(PTRACE_GET_RSEQ_CONFIGURATION) is not supported\n"); goto out; @@ -944,16 +1023,27 @@ static int kerndat_has_ptrace_get_rseq_conf(void) * we need to pay attention to that and, possibly, make changes on the CRIU side. */ if (rseq.flags != 0) { + if (kdat.has_ptrace_get_rseq_conf) + ret = 1; /* we should update kdat */ + kdat.has_ptrace_get_rseq_conf = false; pr_err("ptrace(PTRACE_GET_RSEQ_CONFIGURATION): rseq.flags != 0\n"); } else { + if (!kdat.has_ptrace_get_rseq_conf) + ret = 1; /* we should update kdat */ + kdat.has_ptrace_get_rseq_conf = true; + + if (memcmp(&kdat.libc_rseq_conf, &rseq, sizeof(rseq))) + ret = 1; /* we should update kdat */ + + kdat.libc_rseq_conf = rseq; } out: kill(pid, SIGKILL); waitpid(pid, NULL, 0); - return 0; + return ret; } int kerndat_sockopt_buf_lock(void) @@ -964,6 +1054,8 @@ int kerndat_sockopt_buf_lock(void) int sock; sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock < 0 && errno == EAFNOSUPPORT) + sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); if (sock < 0) { pr_perror("Unable to create a socket"); return -1; @@ -1042,9 +1134,9 @@ static int kerndat_has_move_mount_set_group(void) exit_code = 0; out: if (umount2(tmpdir, MNT_DETACH)) - pr_warn("Fail to umount2 %s: %m\n", tmpdir); + pr_warn("Fail to umount2 %s: %s\n", tmpdir, strerror(errno)); if (rmdir(tmpdir)) - pr_warn("Fail to rmdir %s: %m\n", tmpdir); + pr_warn("Fail to rmdir %s: %s\n", tmpdir, strerror(errno)); return exit_code; } @@ -1064,19 +1156,84 @@ static int kerndat_has_openat2(void) return 0; } -#define KERNDAT_CACHE_FILE KDAT_RUNDIR "/criu.kdat" -#define KERNDAT_CACHE_FILE_TMP KDAT_RUNDIR "/.criu.kdat" +int __attribute__((weak)) kdat_has_shstk(void) +{ + return 0; +} +static int kerndat_has_shstk(void) +{ + int ret = kdat_has_shstk(); + + if (ret < 0) { + pr_err("kdat_has_shstk failed\n"); + return ret; + } + + kdat.has_shstk = !!ret; + return 0; +} + +#define KERNDAT_CACHE_NAME "criu.kdat" +#define KERNDAT_CACHE_FILE KDAT_RUNDIR "/" KERNDAT_CACHE_NAME + +/* + * Returns: + * -1 if kdat_file was not written due to error + * 0 if kdat_file was written + * 1 if kdat_file was not written because cache directory undefined in env (non-root mode) + */ +static int get_kerndat_filename(char **kdat_file) +{ + int ret; + + /* + * Running as non-root, even with CAP_CHECKPOINT_RESTORE, does not + * allow to write to KDAT_RUNDIR which usually is only writable by root. + * Let's write criu.kdat file to XDG_RUNTIME_DIR for non-root cases. + * Note that XDG_RUNTIME_DIR is not always defined (e.g. when executing + * via su/sudo). + */ + if (opts.unprivileged) { + const char *cache_dir = getenv("XDG_RUNTIME_DIR"); + if (!cache_dir) { + pr_warn("$XDG_RUNTIME_DIR not set. Cannot find location for kerndat file\n"); + return 1; + } + ret = asprintf(kdat_file, "%s/%s", cache_dir, KERNDAT_CACHE_NAME); + } else { + ret = asprintf(kdat_file, "%s", KERNDAT_CACHE_FILE); + } + + if (unlikely(ret < 0)) { + pr_warn("Cannot allocate memory for kerndat file name\n"); + return -1; + } + + return 0; +} + +/* + * Returns: + * -1 if error + * 0 if cache was loaded + * 1 if cache does not exist or is stale or cache directory undefined in env (non-root mode) + */ static int kerndat_try_load_cache(void) { + cleanup_free char *kdat_file = NULL; int fd, ret; - fd = open(KERNDAT_CACHE_FILE, O_RDONLY); + ret = get_kerndat_filename(&kdat_file); + if (ret) + return ret; + + fd = open(kdat_file, O_RDONLY); if (fd < 0) { if (ENOENT == errno) - pr_debug("File %s does not exist\n", KERNDAT_CACHE_FILE); + pr_debug("File %s does not exist\n", kdat_file); else - pr_warn("Can't load %s\n", KERNDAT_CACHE_FILE); + pr_warn("Can't load %s\n", kdat_file); return 1; } @@ -1090,12 +1247,12 @@ static int kerndat_try_load_cache(void) close(fd); if (ret != sizeof(kdat) || kdat.magic1 != KDAT_MAGIC || kdat.magic2 != KDAT_MAGIC_2) { - pr_warn("Stale %s file\n", KERNDAT_CACHE_FILE); - unlink(KERNDAT_CACHE_FILE); + pr_warn("Stale %s file\n", kdat_file); + unlink(kdat_file); return 1; } - pr_info("Loaded kdat cache from %s\n", KERNDAT_CACHE_FILE); + pr_info("Loaded kdat cache from %s\n", kdat_file); return 0; } @@ -1103,8 +1260,20 @@ static void kerndat_save_cache(void) { int fd, ret; struct statfs s; + cleanup_free char *kdat_file = NULL; + cleanup_free char *kdat_file_tmp = NULL; - fd = open(KERNDAT_CACHE_FILE_TMP, O_CREAT | O_EXCL | O_WRONLY, 0600); + if (get_kerndat_filename(&kdat_file)) + return; + + ret = asprintf(&kdat_file_tmp, "%s.tmp", kdat_file); + + if (unlikely(ret < 0)) { + pr_warn("Cannot allocate memory for kerndat file name\n"); + return; + } + + fd = open(kdat_file_tmp, O_CREAT | O_EXCL | O_WRONLY, 0600); if (fd < 0) /* * It can happen that we race with some other criu @@ -1113,6 +1282,10 @@ static void kerndat_save_cache(void) */ return; + /* + * If running as root we store the cache file on a tmpfs (/run), + * because the file should be gone after reboot. + */ if (fstatfs(fd, &s) < 0 || s.f_type != TMPFS_MAGIC) { pr_warn("Can't keep kdat cache on non-tempfs\n"); close(fd); @@ -1126,20 +1299,21 @@ static void kerndat_save_cache(void) */ kdat.magic1 = KDAT_MAGIC; kdat.magic2 = KDAT_MAGIC_2; + ret = write(fd, &kdat, sizeof(kdat)); close(fd); if (ret == sizeof(kdat)) - ret = rename(KERNDAT_CACHE_FILE_TMP, KERNDAT_CACHE_FILE); + ret = rename(kdat_file_tmp, kdat_file); else { ret = -1; errno = EIO; } if (ret < 0) { - pr_perror("Couldn't save %s", KERNDAT_CACHE_FILE); + pr_perror("Couldn't save %s", kdat_file); unl: - unlink(KERNDAT_CACHE_FILE_TMP); + unlink(kdat_file); } } @@ -1147,6 +1321,14 @@ static int kerndat_uffd(void) { int uffd, err = 0; + if (opts.unprivileged) + /* + * If running as non-root uffd_open() fails with + * 'Operation not permitted'. Just ignore uffd for + * non-root for now. + */ + return 0; + kdat.uffd_features = 0; uffd = uffd_open(0, &kdat.uffd_features, &err); @@ -1239,6 +1421,8 @@ int kerndat_has_thp_disable(void) parse_vmflags(str, &flags, &madv, &io_pf); kdat.has_thp_disable = !(madv & (1 << MADV_NOHUGEPAGE)); + if (!kdat.has_thp_disable) + pr_warn("prctl PR_SET_THP_DISABLE sets MADV_NOHUGEPAGE\n"); break; } } @@ -1282,17 +1466,20 @@ static bool kerndat_has_clone3_set_tid(void) */ pid = syscall(__NR_clone3, &args, sizeof(args)); - if (pid == -1 && (errno == ENOSYS || errno == E2BIG)) { - kdat.has_clone3_set_tid = false; - return 0; - } - if (pid == -1 && errno == EINVAL) { - kdat.has_clone3_set_tid = true; - } else { - pr_perror("Unexpected error from clone3"); + if (pid != -1) { + pr_err("Unexpected success: clone3() returned %d\n", pid); return -1; } + if (errno == ENOSYS || errno == E2BIG) + return 0; + + if (errno != EINVAL) { + pr_pwarn("Unexpected error from clone3"); + return 0; + } + + kdat.has_clone3_set_tid = true; return 0; } @@ -1420,7 +1607,9 @@ static int __has_nftables_concat(void *arg) return 1; if (NFT_RUN_CMD(nft, "create table inet CRIU")) { - pr_err("Can't create nftables table\n"); + pr_warn("Can't create nftables table\n"); + *has = false; /* kdat.has_nftables_concat = false */ + ret = 0; goto nft_ctx_free_out; } @@ -1456,6 +1645,214 @@ static int kerndat_has_nftables_concat(void) #endif } +#ifndef IPV6_FREEBIND +#define IPV6_FREEBIND 78 +#endif + +static int __kerndat_has_ipv6_freebind(int sk) +{ + int val = 1; + + if (setsockopt(sk, SOL_IPV6, IPV6_FREEBIND, &val, sizeof(int)) == -1) { + if (errno == ENOPROTOOPT) { + kdat.has_ipv6_freebind = false; + return 0; + } + pr_perror("Unable to setsockopt ipv6_freebind"); + return -1; + } + + kdat.has_ipv6_freebind = true; + return 0; +} + +static int kerndat_has_ipv6_freebind(void) +{ + int sk, ret; + + if (!kdat.ipv6) { + kdat.has_ipv6_freebind = false; + return 0; + } + + sk = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); + if (sk == -1) { + pr_perror("Unable to create a ipv6 dgram socket"); + return -1; + } + + ret = __kerndat_has_ipv6_freebind(sk); + close(sk); + return ret; +} + +#define MEMBARRIER_CMDBIT_GET_REGISTRATIONS 9 + +static int kerndat_has_membarrier_get_registrations(void) +{ + int ret = syscall(__NR_membarrier, 1 << MEMBARRIER_CMDBIT_GET_REGISTRATIONS, 0); + if (ret < 0) { + if (errno != EINVAL) { + return ret; + } + + kdat.has_membarrier_get_registrations = false; + } else { + kdat.has_membarrier_get_registrations = true; + } + + return 0; +} + +static int kerndat_has_close_range(void) +{ + /* fd is greater than max_fd, so close_range should return EINVAL. */ + if (cr_close_range(2, 1, 0) == 0) { + pr_err("close_range succeeded unexpectedly\n"); + return -1; + } + + if (errno == ENOSYS) { + pr_debug("close_range isn't supported\n"); + return 0; + } + if (errno != EINVAL) { + pr_perror("close_range returned unexpected error code"); + return -1; + } + + kdat.has_close_range = true; + return 0; +} + +static int kerndat_has_timer_cr_ids(void) +{ + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_GET, 0, 0, 0) == -1) { + if (errno == EINVAL) { + pr_debug("PR_TIMER_CREATE_RESTORE_IDS isn't supported\n"); + return 0; + } + pr_perror("prctl returned unexpected error code"); + return -1; + } + + kdat.has_timer_cr_ids = true; + return 0; +} + +static void breakpoint_func(void) +{ + if (raise(SIGSTOP)) + pr_perror("Unable to kill itself with SIGSTOP"); + exit(1); +} + +/* + * kerndat_breakpoints checks that hardware breakpoints work as they should. + * In some cases, they might not work in virtual machines if the hypervisor + * doesn't virtualize them. For example, they don't work in AMD SEV virtual + * machines if the Debug Virtualization extension isn't supported or isn't + * enabled in SEV_FEATURES. + */ +static int kerndat_breakpoints(void) +{ + int status, ret, exit_code = -1; + pid_t pid; + + pid = fork(); + if (pid == -1) { + pr_perror("fork"); + return -1; + } + if (pid == 0) { + if (ptrace(PTRACE_TRACEME, 0, 0, 0)) { + pr_perror("ptrace(PTRACE_TRACEME)"); + exit(1); + } + raise(SIGSTOP); + breakpoint_func(); + exit(1); + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid for initial stop"); + goto err; + } + if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGSTOP) { + pr_err("Child didn't stop as expected: status=%x\n", status); + goto err; + } + ret = ptrace_set_breakpoint(pid, &breakpoint_func); + if (ret < 0) { + pr_err("Failed to set breakpoint\n"); + goto err; + } + if (ret == 0) { + pr_debug("Hardware breakpoints appear to be disabled\n"); + goto out; + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("waitpid for breakpoint trigger"); + goto err; + } + if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP) { + pr_warn("Hardware breakpoints don't seem to work (status=%x)\n", status); + goto out; + } + kdat.has_breakpoints = true; +out: + exit_code = 0; +err: + if (kill(pid, SIGKILL)) { + pr_perror("Failed to kill the child process"); + exit_code = -1; + } + if (waitpid(pid, &status, 0) == -1) { + pr_perror("Failed to wait for the child process"); + exit_code = -1; + } + if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL) { + pr_err("The child exited with unexpected code: %x\n", status); + exit_code = -1; + } + return exit_code; +} + +static int kerndat_has_madv_guard(void) +{ + void *map; + + map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (map == MAP_FAILED) { + pr_perror("Can't mmap a page for has_madv_guard feature test"); + return -1; + } + + if (madvise(map, PAGE_SIZE, MADV_GUARD_INSTALL)) { + if (errno != EINVAL) { + pr_perror("madvise failed (has_madv_guard check)"); + goto mmap_cleanup; + } + } else { + kdat.has_madv_guard = true; + } + + munmap(map, PAGE_SIZE); + return 0; + +mmap_cleanup: + munmap(map, PAGE_SIZE); + return -1; +} + +void kerndat_warn_about_madv_guards(void) +{ + if (kdat.has_madv_guard && !kdat.has_pagemap_scan_guard_pages) + pr_warn("ioctl(PAGEMAP_SCAN) doesn't support PAGE_IS_GUARD flag. " + "CRIU dump will fail if dumped processes use madvise(MADV_GUARD_INSTALL). " + "Please, consider updating your kernel.\n"); +} + /* * Some features depend on resource that can be dynamically changed * at the OS runtime. There are cases that we cannot determine the @@ -1476,12 +1873,63 @@ int kerndat_try_load_new(void) if (ret < 0) return ret; + ret = kerndat_has_ptrace_get_rseq_conf(); + if (ret < 0) { + pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); + return ret; + } + + ret = kerndat_has_shstk(); + if (ret < 0) { + pr_err("kerndat_has_shstk failed when initializing kerndat.\n"); + return ret; + } + /* New information is found, we need to save to the cache */ if (ret) kerndat_save_cache(); return 0; } +static int root_only_init(void) +{ + int ret = 0; + + if (opts.unprivileged) + return 0; + + if (!ret && kerndat_loginuid()) { + pr_err("kerndat_loginuid failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_tun_netns()) { + pr_err("kerndat_tun_netns failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_socket_unix_file()) { + pr_err("kerndat_socket_unix_file failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_link_nsid()) { + pr_err("kerndat_link_nsid failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_socket_netns()) { + pr_err("kerndat_socket_netns failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_nftables_concat()) { + pr_err("kerndat_has_nftables_concat failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_move_mount_set_group()) { + pr_err("kerndat_has_move_mount_set_group failed when initializing kerndat.\n"); + ret = -1; + } + + return ret; +} + int kerndat_init(void) { int ret; @@ -1499,7 +1947,16 @@ int kerndat_init(void) memset(&kdat, 0, sizeof(kdat)); preload_socket_modules(); - preload_netfilter_modules(); + if (!opts.unprivileged) + /* + * This uses 'iptables -L' to implicitly load necessary modules. + * If the non nft backed iptables is used it does a + * openat(AT_FDCWD, "/run/xtables.lock", O_RDONLY|O_CREAT, 0600) = -1 EACCES + * which will fail as non-root. There are no capabilities to + * change this. The iptables nft backend fails with + * openat(AT_FDCWD, "/proc/net/ip_tables_names", O_RDONLY) = -1 EACCES + */ + preload_netfilter_modules(); if (check_pagemap()) { pr_err("check_pagemap failed when initializing kerndat.\n"); @@ -1537,10 +1994,14 @@ int kerndat_init(void) pr_err("get_ipv6 failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_loginuid()) { - pr_err("kerndat_loginuid failed when initializing kerndat.\n"); + if (!ret && kerndat_nsid()) { + pr_err("kerndat_nsid failed when initializing kerndat.\n"); ret = -1; } + + if (!ret && root_only_init()) + ret = -1; + if (!ret && kerndat_iptables_has_xtlocks()) { pr_err("kerndat_iptables_has_xtlocks failed when initializing kerndat.\n"); ret = -1; @@ -1553,22 +2014,6 @@ int kerndat_init(void) pr_err("kerndat_compat_restore failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_tun_netns()) { - pr_err("kerndat_tun_netns failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_socket_unix_file()) { - pr_err("kerndat_socket_unix_file failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_nsid()) { - pr_err("kerndat_nsid failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_link_nsid()) { - pr_err("kerndat_link_nsid failed when initializing kerndat.\n"); - ret = -1; - } if (!ret && kerndat_has_memfd_create()) { pr_err("kerndat_has_memfd_create failed when initializing kerndat.\n"); ret = -1; @@ -1599,10 +2044,6 @@ int kerndat_init(void) pr_err("kerndat_vdso_preserves_hint failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_socket_netns()) { - pr_err("kerndat_socket_netns failed when initializing kerndat.\n"); - ret = -1; - } if (!ret && kerndat_x86_has_ptrace_fpu_xsave_bug()) { pr_err("kerndat_x86_has_ptrace_fpu_xsave_bug failed when initializing kerndat.\n"); ret = -1; @@ -1627,7 +2068,7 @@ int kerndat_init(void) pr_err("has_time_namespace failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_has_newifindex()) { + if (!ret && (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) && kerndat_has_newifindex()) { pr_err("kerndat_has_newifindex failed when initializing kerndat.\n"); ret = -1; } @@ -1641,18 +2082,10 @@ int kerndat_init(void) pr_err("kerndat_has_nspid failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_has_nftables_concat()) { - pr_err("kerndat_has_nftables_concat failed when initializing kerndat.\n"); - ret = -1; - } if (!ret && kerndat_sockopt_buf_lock()) { pr_err("kerndat_sockopt_buf_lock failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_has_move_mount_set_group()) { - pr_err("kerndat_has_move_mount_set_group failed when initializing kerndat.\n"); - ret = -1; - } if (!ret && kerndat_has_openat2()) { pr_err("kerndat_has_openat2 failed when initializing kerndat.\n"); ret = -1; @@ -1661,10 +2094,38 @@ int kerndat_init(void) pr_err("kerndat_has_rseq failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_has_ptrace_get_rseq_conf()) { + if (!ret && (kerndat_has_ptrace_get_rseq_conf() < 0)) { pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); ret = -1; } + if (!ret && (kerndat_has_ipv6_freebind() < 0)) { + pr_err("kerndat_has_ipv6_freebind failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_membarrier_get_registrations()) { + pr_err("kerndat_has_membarrier_get_registrations failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_shstk()) { + pr_err("kerndat_has_shstk failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_close_range()) { + pr_err("kerndat_has_close_range has failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_timer_cr_ids()) { + pr_err("kerndat_has_timer_cr_ids has failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_breakpoints()) { + pr_err("kerndat_breakpoints has failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_has_madv_guard()) { + pr_err("kerndat_has_madv_guard has failed when initializing kerndat.\n"); + ret = -1; + } kerndat_lsm(); kerndat_mmap_min_addr(); diff --git a/criu/libnetlink.c b/criu/libnetlink.c index f0304b0db..c7a84a44d 100644 --- a/criu/libnetlink.c +++ b/criu/libnetlink.c @@ -214,8 +214,3 @@ int __wrap_nlmsg_parse(struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], in return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), policy); } - -int32_t nla_get_s32(const struct nlattr *nla) -{ - return *(const int32_t *)nla_data(nla); -} diff --git a/criu/log.c b/criu/log.c index c4ce90ec0..bf6f657f2 100644 --- a/criu/log.c +++ b/criu/log.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -71,7 +72,8 @@ static void print_ts(void) gettimeofday(&t, NULL); timediff(&start, &t); - snprintf(buffer, TS_BUF_OFF, "(%02u.%06u)", (unsigned)t.tv_sec, (unsigned)t.tv_usec); + snprintf(buffer, TS_BUF_OFF, "(%02u.%06u", (unsigned)t.tv_sec, (unsigned)t.tv_usec); + buffer[TS_BUF_OFF - 2] = ')'; /* this will overwrite the last digit if tv_sec>=100 */ buffer[TS_BUF_OFF - 1] = ' '; /* kill the '\0' produced by snprintf */ } @@ -113,6 +115,9 @@ static struct str_and_lock *first_err; int log_keep_err(void) { + if (first_err) + return 0; + first_err = shmalloc(sizeof(struct str_and_lock)); if (first_err == NULL) return -1; @@ -131,10 +136,11 @@ static void log_note_err(char *msg) * anyway, so it doesn't make much sense to try hard * and optimize this out. */ - mutex_lock(&first_err->l); - if (first_err->s[0] == '\0') - strlcpy(first_err->s, msg, sizeof(first_err->s)); - mutex_unlock(&first_err->l); + if (mutex_trylock(&first_err->l)) { + if (first_err->s[0] == '\0') + __strlcpy(first_err->s, msg, sizeof(first_err->s)); + mutex_unlock(&first_err->l); + } } } @@ -184,7 +190,7 @@ void flush_early_log_buffer(int fd) * with reading the log_level. */ struct early_log_hdr *hdr = (void *)early_log_buffer + pos; - pos += sizeof(hdr); + pos += sizeof(*hdr); if (hdr->level <= current_loglevel) { size_t size = 0; while (size < hdr->len) { @@ -196,7 +202,7 @@ void flush_early_log_buffer(int fd) } pos += hdr->len; } - if (early_log_buf_off == EARLY_LOG_BUF_LEN) + if ((early_log_buf_off + sizeof(struct early_log_hdr)) >= EARLY_LOG_BUF_LEN) pr_warn("The early log buffer is full, some messages may have been lost\n"); early_log_buf_off = 0; } @@ -314,10 +320,10 @@ unsigned int log_get_loglevel(void) static void early_vprint(const char *format, unsigned int loglevel, va_list params) { - unsigned int log_size = 0; + int log_size = 0, log_space; struct early_log_hdr *hdr; - if ((early_log_buf_off + sizeof(hdr)) >= EARLY_LOG_BUF_LEN) + if ((early_log_buf_off + sizeof(*hdr)) >= EARLY_LOG_BUF_LEN) return; /* Save loglevel */ @@ -325,7 +331,8 @@ static void early_vprint(const char *format, unsigned int loglevel, va_list para hdr = (void *)early_log_buffer + early_log_buf_off; hdr->level = loglevel; /* Skip the log entry size */ - early_log_buf_off += sizeof(hdr); + early_log_buf_off += sizeof(*hdr); + log_space = EARLY_LOG_BUF_LEN - early_log_buf_off; if (loglevel >= LOG_TIMESTAMP) { /* * If logging is not yet setup we just write zeros @@ -333,12 +340,17 @@ static void early_vprint(const char *format, unsigned int loglevel, va_list para * keep the same format as the other messages on * log levels with timestamps (>=LOG_TIMESTAMP). */ - log_size = snprintf(early_log_buffer + early_log_buf_off, sizeof(early_log_buffer) - early_log_buf_off, + log_size = snprintf(early_log_buffer + early_log_buf_off, log_space, "(00.000000) "); } - log_size += vsnprintf(early_log_buffer + early_log_buf_off + log_size, - sizeof(early_log_buffer) - early_log_buf_off - log_size, format, params); + if (log_size < log_space) + log_size += vsnprintf(early_log_buffer + early_log_buf_off + log_size, + log_space - log_size, format, params); + if (log_size > log_space) { + /* vsnprintf always add the terminating null byte. */ + log_size = log_space - 1; + } /* Save log entry size */ hdr->len = log_size; @@ -397,15 +409,28 @@ void print_on_level(unsigned int loglevel, const char *format, ...) int write_pidfile(int pid) { - int fd; + int fd, ret, exit_code = -1; fd = open(opts.pidfile, O_WRONLY | O_EXCL | O_CREAT, 0600); if (fd == -1) { - pr_perror("Can't open %s", opts.pidfile); + pr_perror("pidfile: Can't open %s", opts.pidfile); return -1; } - dprintf(fd, "%d", pid); + ret = dprintf(fd, "%d", pid); + if (ret < 0) { + pr_perror("pidfile: Can't write pid %d to %s", pid, opts.pidfile); + goto close; + } + + if (ret == 0) { + pr_err("pidfile: Can't write pid %d to %s\n", pid, opts.pidfile); + goto close; + } + + pr_debug("pidfile: Wrote pid %d to %s (%d bytes)\n", pid, opts.pidfile, ret); + exit_code = 0; +close: close(fd); - return 0; + return exit_code; } diff --git a/criu/lsm.c b/criu/lsm.c index d1b73cc79..5faf3e5b2 100644 --- a/criu/lsm.c +++ b/criu/lsm.c @@ -29,7 +29,9 @@ static int apparmor_get_label(pid_t pid, char **profile_name) FILE *f; char *space; - f = fopen_proc(pid, "attr/current"); + f = fopen_proc(pid, "attr/apparmor/current"); + if (!f) + f = fopen_proc(pid, "attr/current"); if (!f) return -1; @@ -370,7 +372,7 @@ int render_lsm_profile(char *profile, char **val) case LSMTYPE__APPARMOR: return render_aa_profile(val, profile); case LSMTYPE__SELINUX: - if (asprintf(val, "%s", profile) < 0) { + if (asprintf(val, "%s", opts.lsm_supplied ? opts.lsm_profile : profile) < 0) { *val = NULL; return -1; } diff --git a/criu/mem.c b/criu/mem.c index 136439518..9e8740c07 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -10,6 +10,7 @@ #include "cr_options.h" #include "servicefd.h" #include "mem.h" +#include "mman.h" #include "parasite-syscall.h" #include "parasite.h" #include "page-pipe.h" @@ -99,7 +100,7 @@ static inline bool __page_in_parent(bool dirty) return opts.track_mem && opts.img_parent && !dirty; } -bool should_dump_page(VmaEntry *vmae, u64 pme) +static bool should_dump_entire_vma(VmaEntry *vmae) { /* * vDSO area must be always dumped because on restore @@ -107,30 +108,83 @@ bool should_dump_page(VmaEntry *vmae, u64 pme) */ if (vma_entry_is(vmae, VMA_AREA_VDSO)) return true; - /* - * In turn VVAR area is special and referenced from - * vDSO area by IP addressing (at least on x86) thus - * never ever dump its content but always use one provided - * by the kernel on restore, ie runtime VVAR area must - * be remapped into proper place.. - */ - if (vma_entry_is(vmae, VMA_AREA_VVAR)) - return false; - - /* - * Optimisation for private mapping pages, that haven't - * yet being COW-ed - */ - if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) - return false; if (vma_entry_is(vmae, VMA_AREA_AIORING)) return true; - if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) - return true; return false; } +/* + * should_dump_page writes vaddr in page_info->next if an addressed page has to be dumped. + * Otherwise, it writes an address that has to be inspected next. + */ +int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *page_info) +{ + if (!page_info) + goto err; + + if (vaddr >= pmc->end && pmc_fill(pmc, vaddr, vmae->end)) + goto err; + + if (pmc->regs) { + while (1) { + if (pmc->regs_idx == pmc->regs_len) { + page_info->next = pmc->end; + return 0; + } + + if (vaddr < pmc->regs[pmc->regs_idx].end) + break; + pmc->regs_idx++; + } + + if (vaddr < pmc->regs[pmc->regs_idx].start) { + page_info->next = pmc->regs[pmc->regs_idx].start; + return 0; + } + + if (pmc->regs[pmc->regs_idx].categories & PAGE_IS_GUARD) + goto skip_guard_page; + + page_info->softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; + page_info->next = vaddr; + return 0; + } else { + u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)]; + + if (pme & PME_GUARD_REGION) + goto skip_guard_page; + + /* + * Optimisation for private mapping pages, that haven't + * yet being COW-ed + */ + if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) { + page_info->next = vaddr + PAGE_SIZE; + return 0; + } + + if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) { + page_info->softdirty = pme & PME_SOFT_DIRTY; + page_info->next = vaddr; + return 0; + } + + page_info->next = vaddr + PAGE_SIZE; + return 0; + } + +err: + pr_err("should_dump_page failed on vma " + "%#016" PRIx64 "-%#016" PRIx64 " vaddr=%#016" PRIx64 "\n", + vmae->start, vmae->end, vaddr); + return -1; + +skip_guard_page: + page_info->next = vaddr + PAGE_SIZE; + return 0; +} + bool page_is_zero(u64 pme) { return __page_is_zero(pme); @@ -161,28 +215,34 @@ static bool is_stack(struct pstree_item *item, unsigned long vaddr) * put the memory into the page-pipe's pipe. * * "Holes" in page-pipe are regions, that should be dumped, but - * the memory contents is present in the pagent image set. + * the memory contents is present in the parent image set. */ -static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off, +static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, pmc_t *pmc, u64 *pvaddr, bool has_parent) { - u64 *at = &map[PAGE_PFN(*off)]; - unsigned long pfn, nr_to_scan; + unsigned long nr_scanned; unsigned long pages[3] = {}; + unsigned long vaddr; + bool dump_all_pages; int ret = 0; - nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE; + dump_all_pages = should_dump_entire_vma(vma->e); - for (pfn = 0; pfn < nr_to_scan; pfn++) { - unsigned long vaddr; + nr_scanned = 0; + for (vaddr = *pvaddr; vaddr < vma->e->end; vaddr += PAGE_SIZE, nr_scanned++) { unsigned int ppb_flags = 0; + struct page_info page_info = {}; int st; - if (!should_dump_page(vma->e, at[pfn])) - continue; + /* If dump_all_pages is true, should_dump_page is called to get pme. */ + if (should_dump_page(pmc, vma->e, vaddr, &page_info)) + return -1; - vaddr = vma->e->start + *off + pfn * PAGE_SIZE; + if (!dump_all_pages && page_info.next != vaddr) { + vaddr = page_info.next - PAGE_SIZE; + continue; + } if (vma_entry_can_be_lazy(vma->e) && !is_stack(item, vaddr)) ppb_flags |= PPB_LAZY; @@ -194,7 +254,7 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct * page. The latter would be checked in page-xfer. */ - if (has_parent && page_in_parent(at[pfn] & PME_SOFT_DIRTY)) { + if (has_parent && page_in_parent(page_info.softdirty)) { ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT); st = 0; } else { @@ -214,9 +274,8 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct pages[st]++; } - *off += pfn * PAGE_SIZE; - - cnt_add(CNT_PAGES_SCANNED, nr_to_scan); + *pvaddr = vaddr; + cnt_add(CNT_PAGES_SCANNED, nr_scanned); cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]); cnt_add(CNT_PAGES_LAZY, pages[1]); cnt_add(CNT_PAGES_WRITTEN, pages[2]); @@ -246,6 +305,12 @@ prep_dump_pages_args(struct parasite_ctl *ctl, struct vm_area_list *vma_area_lis */ if (vma_entry_is(vma->e, VMA_AREA_AIORING) && skip_non_trackable) continue; + /* + * We totally ignore MAP_HUGETLB on pre-dump. + * See also generate_vma_iovs() comment. + */ + if ((vma->e->flags & MAP_HUGETLB) && skip_non_trackable) + continue; if (vma->e->prot & PROT_READ) continue; @@ -271,7 +336,7 @@ static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl, struct pa list_for_each_entry(ppb, &pp->bufs, l) { args->nr_segs = ppb->nr_segs; args->nr_pages = ppb->pages_in; - pr_debug("PPB: %d pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, + pr_debug("PPB: %ld pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, args->off); ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl); @@ -350,12 +415,31 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str struct page_xfer *xfer, struct parasite_dump_pages_args *args, struct parasite_ctl *ctl, pmc_t *pmc, bool has_parent, bool pre_dump, int parent_predump_mode) { - u64 off = 0; - u64 *map; + u64 vaddr; int ret; if (!vma_area_is_private(vma, kdat.task_size) && !vma_area_is(vma, VMA_ANON_SHARED)) return 0; + /* + * In turn VVAR area is special and referenced from + * vDSO area by IP addressing (at least on x86) thus + * never ever dump its content but always use one provided + * by the kernel on restore, ie runtime VVAR area must + * be remapped into proper place.. + */ + if (vma_entry_is(vma->e, VMA_AREA_VVAR)) + return 0; + + /* + * 9651fcedf7b9 ("mm: add MAP_DROPPABLE for designating always lazily freeable mappings") + * tells us that: + * Under memory pressure, mm can just drop the pages (so that they're + * zero when read back again). + * + * Let's just skip MAP_DROPPABLE mappings pages dump logic. + */ + if (vma->e->flags & MAP_DROPPABLE) + return 0; /* * To facilitate any combination of pre-dump modes to run after @@ -402,21 +486,27 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str has_parent = false; } - if (vma_entry_is(vma->e, VMA_AREA_AIORING)) { + /* + * We want to completely ignore these VMA types on the pre-dump: + * 1. VMA_AREA_AIORING because it is not soft-dirty trackable (kernel writes) + * 2. MAP_HUGETLB mappings because they are not premapped and we can't use + * parent images from pre-dump stages. Instead, the content is restored from + * the parasite context using full memory image. + */ + if (vma_entry_is(vma->e, VMA_AREA_AIORING) || vma->e->flags & MAP_HUGETLB) { if (pre_dump) return 0; has_parent = false; } - map = pmc_get_map(pmc, vma); - if (!map) + if (pmc_get_map(pmc, vma)) return -1; if (vma_area_is(vma, VMA_ANON_SHARED)) - return add_shmem_area(item->pid->real, vma->e, map); - + return add_shmem_area(item->pid->real, vma->e, pmc); + vaddr = vma->e->start; again: - ret = generate_iovs(item, vma, pp, map, &off, has_parent); + ret = generate_iovs(item, vma, pp, pmc, &vaddr, has_parent); if (ret == -EAGAIN) { BUG_ON(!(pp->flags & PP_CHUNK_MODE)); @@ -509,6 +599,9 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit parent_predump_mode = mdc->parent_ie->pre_dump_mode; list_for_each_entry(vma_area, &vma_area_list->h, list) { + if (vma_area_is(vma_area, VMA_AREA_GUARD)) + continue; + ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, &pmc, has_parent, mdc->pre_dump, parent_predump_mode); if (ret < 0) @@ -769,14 +862,14 @@ static void prepare_cow_vmas_for(struct vm_area_list *vmas, struct vm_area_list /* <= here to shift from matching VMAs and ... */ while (vma->e->start <= pvma->e->start) { vma = vma_next(vma); - if (&vma->list == &vmas->h) + if ((&vma->list == &vmas->h) || vma_area_is(vma, VMA_AREA_GUARD)) return; } /* ... no == here since we must stop on matching pair */ while (pvma->e->start < vma->e->start) { pvma = vma_next(pvma); - if (&pvma->list == &pvmas->h) + if ((&pvma->list == &pvmas->h) || vma_area_is(pvma, VMA_AREA_GUARD)) return; } } @@ -835,6 +928,7 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void vma->e->start -= PAGE_SIZE; size = vma_entry_len(vma->e); + if (!vma_inherited(vma)) { int flag = 0; /* @@ -910,6 +1004,15 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void static inline bool vma_force_premap(struct vma_area *vma, struct list_head *head) { + /* + * Shadow stack VMAs cannot be mmap()ed, they must be created using + * map_shadow_stack() system call. + * Premap them to reserve virtual address space and populate them + * to have there contents available for later copying. + */ + if (vma_area_is(vma, VMA_AREA_SHSTK)) + return true; + /* * On kernels with 4K guard pages, growsdown VMAs * always have one guard page at the @@ -960,6 +1063,9 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, vo filemap_ctx_init(true); list_for_each_entry(vma, &vmas->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (task_size_check(vpid(t), vma->e)) { ret = -1; break; @@ -991,7 +1097,7 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, vo do { if (pr->pe->vaddr + pr->pe->nr_pages * PAGE_SIZE <= vma->e->start) continue; - if (pr->pe->vaddr > vma->e->end) + if (pr->pe->vaddr >= vma->e->end) vma->e->status |= VMA_NO_PROT_WRITE; break; } while (pr->advance(pr)); @@ -1021,6 +1127,7 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) unsigned int nr_shared = 0; unsigned int nr_dropped = 0; unsigned int nr_compared = 0; + unsigned int nr_enqueued = 0; unsigned int nr_lazy = 0; unsigned long va; @@ -1096,7 +1203,8 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) len >>= PAGE_SHIFT; nr_restored += len; i += len - 1; - pr_debug("Enqueue page-read\n"); + + nr_enqueued++; continue; } @@ -1165,6 +1273,9 @@ err_read: unsigned long size, i = 0; void *addr = decode_pointer(vma->premmaped_addr); + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (!vma_inherited(vma)) continue; @@ -1192,7 +1303,8 @@ err_read: pr_info("nr_restored_pages: %d\n", nr_restored); pr_info("nr_shared_pages: %d\n", nr_shared); - pr_info("nr_dropped_pages: %d\n", nr_dropped); + pr_info("nr_dropped_pages: %d\n", nr_dropped); + pr_info("nr_enqueued: %d\n", nr_enqueued); pr_info("nr_lazy: %d\n", nr_lazy); return 0; @@ -1204,8 +1316,6 @@ err_addr: static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) { - MmEntry *mm = rsti(t)->mm; - /* * There is no need to disable it if the page read doesn't * have parent. In this case VMA will be empty until @@ -1228,8 +1338,6 @@ static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) pr_perror("Cannot disable THP"); return -1; } - if (!(mm->has_thp_disabled && mm->thp_disabled)) - rsti(t)->has_thp_enabled = true; return 0; } @@ -1431,3 +1539,72 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta) return prepare_vma_ios(t, ta); } + +int collect_madv_guards(pid_t pid, struct vm_area_list *vma_area_list) +{ + int pagemap_fd = -1; + struct page_region *regs = NULL; + long regs_len = 0; + int i, ret = -1; + + struct pm_scan_arg args = { + .size = sizeof(struct pm_scan_arg), + .flags = 0, + .start = 0, + .end = kdat.task_size, + .walk_end = 0, + .vec_len = 1000, /* this should be enough for most cases */ + .max_pages = 0, + .category_mask = PAGE_IS_GUARD, + .return_mask = PAGE_IS_GUARD, + }; + + if (!kdat.has_pagemap_scan_guard_pages) { + ret = 0; + goto out; + } + + pagemap_fd = open_proc(pid, "pagemap"); + if (pagemap_fd < 0) + goto out; + + regs = xmalloc(args.vec_len * sizeof(struct page_region)); + if (!regs) + goto out; + args.vec = (long)regs; + + do { + /* start from where we finished the last time */ + args.start = args.walk_end; + regs_len = ioctl(pagemap_fd, PAGEMAP_SCAN, &args); + if (regs_len == -1) { + pr_perror("PAGEMAP_SCAN"); + goto out; + } + + for (i = 0; i < regs_len; i++) { + struct vma_area *vma; + + BUG_ON(!(regs[i].categories & PAGE_IS_GUARD)); + + vma = alloc_vma_area(); + if (!vma) + goto out; + + vma->e->start = regs[i].start; + vma->e->end = regs[i].end; + vma->e->status = VMA_AREA_GUARD; + + list_add_tail(&vma->list, &vma_area_list->h); + vma_area_list->nr++; + } + } while (args.walk_end != kdat.task_size); + + ret = 0; + +out: + xfree(regs); + if (pagemap_fd >= 0) + close(pagemap_fd); + return ret; +} diff --git a/criu/memfd.c b/criu/memfd.c index da2937703..9d9f0621f 100644 --- a/criu/memfd.c +++ b/criu/memfd.c @@ -46,6 +46,7 @@ struct memfd_restore_inode { int fdstore_id; unsigned int pending_seals; MemfdInodeEntry *mie; + bool was_opened_rw; }; static LIST_HEAD(memfd_inodes); @@ -91,10 +92,21 @@ static int dump_memfd_inode(int fd, struct memfd_dump_inode *inode, const char * mie.has_hugetlb_flag = true; mie.hugetlb_flag = flag | MFD_HUGETLB; } + mie.mode = st->st_mode; + mie.has_mode = true; mie.seals = fcntl(fd, F_GET_SEALS); - if (mie.seals == -1) - goto out; + if (mie.seals == -1) { + if (errno != EINVAL || ~mie.hugetlb_flag & MFD_HUGETLB) { + pr_perror("fcntl(F_GET_SEALS)"); + goto out; + } + /* Kernels before 4.16 don't allow MFD_HUGETLB | + * MFD_ALLOW_SEALING and return EINVAL for + * fcntl(MFD_HUGETLB-enabled fd). + */ + mie.seals = F_SEAL_SEAL; + } if (pb_write_one(img_from_set(glob_imgset, CR_FD_MEMFD_INODE), &mie, PB_MEMFD_INODE)) goto out; @@ -222,6 +234,7 @@ static int collect_one_memfd_inode(void *o, ProtobufCMessage *base, struct cr_im mutex_init(&inode->lock); inode->fdstore_id = -1; inode->pending_seals = 0; + inode->was_opened_rw = false; list_add_tail(&inode->list, &memfd_inodes); @@ -270,8 +283,13 @@ static int memfd_open_inode_nocache(struct memfd_restore_inode *inode) if (restore_memfd_shmem_content(fd, mie->shmid, mie->size)) goto out; - if (fchown(fd, mie->uid, mie->gid)) { - pr_perror("Can't change uid %d gid %d of memfd:%s", (int)mie->uid, (int)mie->gid, mie->name); + if (mie->has_mode) + ret = cr_fchperm(fd, mie->uid, mie->gid, mie->mode); + else + ret = cr_fchown(fd, mie->uid, mie->gid); + if (ret) { + pr_perror("Can't set permissions { uid %d gid %d mode %#o } of memfd:%s", (int)mie->uid, + (int)mie->gid, mie->has_mode ? (int)mie->mode : -1, mie->name); goto out; } @@ -305,7 +323,7 @@ static int memfd_open_inode(struct memfd_restore_inode *inode) return fd; } -int memfd_open(struct file_desc *d, u32 *fdflags) +int memfd_open(struct file_desc *d, u32 *fdflags, bool filemap) { struct memfd_info *mfi; MemfdFileEntry *mfe; @@ -315,57 +333,80 @@ int memfd_open(struct file_desc *d, u32 *fdflags) mfi = container_of(d, struct memfd_info, d); mfe = mfi->mfe; - if (inherited_fd(d, &fd)) - return fd; - pr_info("Restoring memfd id=%d\n", mfe->id); fd = memfd_open_inode(mfi->inode); if (fd < 0) - goto err; + return -1; /* Reopen the fd with original permissions */ flags = fdflags ? *fdflags : mfe->flags; + + if (filemap && (flags & O_ACCMODE) == O_RDWR) + return fd; + + if (!mfi->inode->was_opened_rw && (flags & O_ACCMODE) == O_RDWR) { + /* + * If there is only a single RW-opened fd for a memfd, it can + * be used to pass it to execveat() with AT_EMPTY_PATH to have + * its contents executed. This currently works only for the + * original fd from memfd_create() so return the original fd + * once -- in case the caller expects to be the sole opener + * and does execveat() from this memfd. + */ + if (!fcntl(fd, F_SETFL, flags)) { + mfi->inode->was_opened_rw = true; + return fd; + } + + pr_pwarn("Can't change fd flags to %#o for memfd id=%d", flags, mfe->id); + } + /* * Ideally we should call compat version open() to not force the * O_LARGEFILE file flag with regular open(). It doesn't seem that * important though. */ _fd = __open_proc(PROC_SELF, 0, flags, "fd/%d", fd); - if (_fd < 0) { + if (_fd < 0) pr_perror("Can't reopen memfd id=%d", mfe->id); - goto err; - } + else if (!filemap && (flags & O_ACCMODE) == O_RDWR) + pr_warn("execveat(fd=%d, ..., AT_EMPTY_PATH) might fail after restore; memfd id=%d\n", _fd, mfe->id); + close(fd); - fd = _fd; + return _fd; +} + +static int memfd_open_fe_fd(struct file_desc *d, int *new_fd) +{ + MemfdFileEntry *mfe; + int fd; + + if (inherited_fd(d, new_fd)) + return 0; + + fd = memfd_open(d, NULL, false); + if (fd < 0) + return -1; + + mfe = container_of(d, struct memfd_info, d)->mfe; if (restore_fown(fd, mfe->fown) < 0) goto err; if (lseek(fd, mfe->pos, SEEK_SET) < 0) { - pr_perror("Can't restore file position of memfd id=%d", mfe->id); + pr_perror("Can't restore file position of %d for memfd id=%d", fd, mfe->id); goto err; } - return fd; + *new_fd = fd; + return 0; err: - if (fd >= 0) - close(fd); + close(fd); return -1; } -static int memfd_open_fe_fd(struct file_desc *fd, int *new_fd) -{ - int tmp; - - tmp = memfd_open(fd, NULL); - if (tmp < 0) - return -1; - *new_fd = tmp; - return 0; -} - static char *memfd_d_name(struct file_desc *d, char *buf, size_t s) { MemfdInodeEntry *mie = NULL; diff --git a/criu/mount-v2.c b/criu/mount-v2.c index 5d53e9a22..1e33ac12a 100644 --- a/criu/mount-v2.c +++ b/criu/mount-v2.c @@ -443,6 +443,7 @@ err: /* Mounts root container mount. */ static int do_mount_root_v2(struct mount_info *mi) { + unsigned long mflags = mi->flags & (~MS_PROPAGATE); unsigned long flags = MS_BIND; int fd; @@ -477,6 +478,11 @@ static int do_mount_root_v2(struct mount_info *mi) return -1; } + if (mflags && mount(NULL, mi->plain_mountpoint, NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { + pr_perror("Unable to apply root mount options"); + return -1; + } + mi->mounted = true; return 0; @@ -927,8 +933,12 @@ static int move_mount_set_group(int src_id, char *source, int dst_id) static int restore_one_sharing(struct sharing_group *sg, struct mount_info *target) { + int nsfd = -1, orig_nsfd = -1, exit_code = -1; char target_path[PATH_MAX]; - int target_fd; + int target_fd = -1; + + if (!sg->master_id && !sg->shared_id) + return 0; target_fd = fdstore_get(target->mnt_fd_id); BUG_ON(target_fd < 0); @@ -943,8 +953,7 @@ static int restore_one_sharing(struct sharing_group *sg, struct mount_info *targ first = get_first_mount(sg->parent); if (move_mount_set_group(first->mnt_fd_id, NULL, target->mnt_fd_id)) { pr_err("Failed to copy sharing from %d to %d\n", first->mnt_id, target->mnt_id); - close(target_fd); - return -1; + goto err; } } else { /* @@ -956,16 +965,23 @@ static int restore_one_sharing(struct sharing_group *sg, struct mount_info *targ */ if (move_mount_set_group(-1, sg->source, target->mnt_fd_id)) { pr_err("Failed to copy sharing from source %s to %d\n", sg->source, target->mnt_id); - close(target_fd); - return -1; + goto err; } } + } + nsfd = fdstore_get(target->nsid->mnt.nsfd_id); + if (nsfd < 0) + goto err; + + if (switch_ns_by_fd(nsfd, &mnt_ns_desc, &orig_nsfd)) + goto err; + + if (sg->master_id) { /* Convert shared_id to master_id */ if (mount(NULL, target_path, NULL, MS_SLAVE, NULL)) { pr_perror("Failed to make mount %d slave", target->mnt_id); - close(target_fd); - return -1; + goto err; } } @@ -973,13 +989,16 @@ static int restore_one_sharing(struct sharing_group *sg, struct mount_info *targ if (sg->shared_id) { if (mount(NULL, target_path, NULL, MS_SHARED, NULL)) { pr_perror("Failed to make mount %d shared", target->mnt_id); - close(target_fd); - return -1; + goto err; } } - close(target_fd); - - return 0; + exit_code = 0; +err: + close_safe(&target_fd); + close_safe(&nsfd); + if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) + exit_code = -1; + return exit_code; } static int restore_one_sharing_group(struct sharing_group *sg) diff --git a/criu/mount.c b/criu/mount.c index 115e3d067..b643a7f26 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -98,7 +98,7 @@ static char *ext_mount_lookup(char *key) int len = strlen(key); char mkey[len + 6]; - sprintf(mkey, "mnt[%s]", key); + snprintf(mkey, sizeof(mkey), "mnt[%s]", key); v = external_lookup_by_key(mkey); if (IS_ERR(v)) v = NULL; @@ -826,7 +826,7 @@ static struct ns_id *find_ext_ns_id(void) for (ns = ns_ids; ns->next; ns = ns->next) if (ns->type == NS_CRIU && ns->nd == &mnt_ns_desc) { - if (!ns->mnt.mntinfo_list && !collect_mntinfo(ns, true)) + if (!ns->mnt.mntinfo_list && !collect_mntinfo(ns, false)) break; return ns; } @@ -888,7 +888,11 @@ static int resolve_external_mounts(struct mount_info *info) cut_root = cut_root_for_bind(m->root, match->root); - p = xsprintf("%s/%s", match->ns_mountpoint + 1, cut_root); + if (cut_root[0] == '\0') { + p = xstrdup(match->ns_mountpoint + 1); + } else { + p = xsprintf("%s/%s", match->ns_mountpoint + 1, cut_root); + } if (!p) return -1; @@ -1197,8 +1201,8 @@ int __check_mountpoint_fd(struct mount_info *pm, int mnt_fd, bool parse_mountinf dev == pm->s_dev_rt) return 0; - pr_err("The file system %#x %#x (%#x) %s %s is inaccessible\n", pm->s_dev, pm->s_dev_rt, dev, - pm->fstype->name, pm->ns_mountpoint); + pr_warn("The file system %#x %#x (%#x) %s %s is inaccessible\n", pm->s_dev, pm->s_dev_rt, dev, + pm->fstype->name, pm->ns_mountpoint); return -1; } @@ -1239,12 +1243,16 @@ int __open_mountpoint(struct mount_info *pm) int open_mount(unsigned int s_dev) { struct mount_info *m; + int mnt_fd; m = lookup_mnt_sdev(s_dev); if (!m) return -ENOENT; - return __open_mountpoint(m); + mnt_fd = __open_mountpoint(m); + if (mnt_fd < 0) + pr_err("Can't open mount %#x\n", s_dev); + return mnt_fd; } /* Bind-mount a mount point in a temporary place without children */ @@ -1723,38 +1731,49 @@ err: return NULL; } -/* Returns 1 in case of success, -errno in case of mount fail, and 0 on other errors */ +/* + * Returns: + * 0 - success + * -1 - error + * 1 - skip + */ static __maybe_unused int mount_cr_time_mount(struct ns_id *ns, unsigned int *s_dev, const char *source, const char *target, const char *type) { - int mnt_fd, cwd_fd, ret, exit_code = 0; + int mnt_fd, cwd_fd, exit_code = -1; struct stat st; - ret = switch_mnt_ns(ns->ns_pid, &mnt_fd, &cwd_fd); - if (ret < 0) { + if (switch_mnt_ns(ns->ns_pid, &mnt_fd, &cwd_fd)) { pr_err("Can't switch mnt_ns\n"); - goto out; + return -1; } - ret = mount(source, target, type, 0, NULL); - if (ret < 0) { - pr_perror("Unable to mount %s %s", source, target); - exit_code = -errno; - goto restore_ns; - } else { - if (stat(target, &st) < 0) { - pr_perror("Can't stat %s", target); - exit_code = 0; - } else { - *s_dev = MKKDEV(major(st.st_dev), minor(st.st_dev)); + if (mount(source, target, type, 0, NULL)) { + switch (errno) { + case EPERM: + case EBUSY: + case ENODEV: + case ENOENT: + pr_debug("Skipping %s as was unable to mount it: %s\n", type, strerror(errno)); exit_code = 1; + break; + default: + pr_perror("Unable to mount %s %s %s", type, source, target); } + goto restore_ns; } + if (stat(target, &st)) { + pr_perror("Can't stat %s", target); + goto restore_ns; + } + + *s_dev = MKKDEV(major(st.st_dev), minor(st.st_dev)); + exit_code = 0; restore_ns: - ret = restore_mnt_ns(mnt_fd, &cwd_fd); -out: - return ret < 0 ? 0 : exit_code; + if (restore_mnt_ns(mnt_fd, &cwd_fd)) + exit_code = -1; + return exit_code; } static int dump_one_fs(struct mount_info *mi) @@ -2675,9 +2694,16 @@ shared: static int do_mount_root(struct mount_info *mi) { + unsigned long mflags = mi->flags & (~MS_PROPAGATE); + if (restore_shared_options(mi, !mi->shared_id && !mi->master_id, mi->shared_id, mi->master_id)) return -1; + if (mflags && mount(NULL, service_mountpoint(mi), NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { + pr_perror("Unable to apply root mount options"); + return -1; + } + return fetch_rt_stat(mi, service_mountpoint(mi)); } @@ -2812,7 +2838,7 @@ static LIST_HEAD(mnt_remap_list); static int remap_id; struct mnt_remap_entry { - struct mount_info *mi; /* child is remaped into the root yards */ + struct mount_info *mi; /* child is remapped into the root yards */ struct mount_info *parent; /* the origin parent for the child*/ struct list_head node; }; @@ -3978,16 +4004,10 @@ int collect_mnt_namespaces(bool for_dump) if (ns) { ret = mount_cr_time_mount(ns, &s_dev, "binfmt_misc", "/" BINFMT_MISC_HOME, "binfmt_misc"); - if (ret == -EPERM) - pr_info("Can't mount binfmt_misc: EPERM. Running in user_ns?\n"); - else if (ret < 0 && ret != -EBUSY && ret != -ENODEV && ret != -ENOENT) { - pr_err("Can't mount binfmt_misc: %d %s\n", ret, strerror(-ret)); + if (ret == -1) { goto err; - } else if (ret == 0) { - ret = -1; - goto err; - } else if (ret > 0 && !add_cr_time_mount(ns->mnt.mntinfo_tree, "binfmt_misc", BINFMT_MISC_HOME, - s_dev, false)) { + } else if (ret == 0 && !add_cr_time_mount(ns->mnt.mntinfo_tree, "binfmt_misc", BINFMT_MISC_HOME, + s_dev, false)) { ret = -1; goto err; } diff --git a/criu/namespaces.c b/criu/namespaces.c index 7356fe8c2..0c9b16a87 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -28,6 +27,7 @@ #include "cgroup.h" #include "fdstore.h" #include "kerndat.h" +#include "util-caps.h" #include "protobuf.h" #include "util.h" @@ -284,7 +284,6 @@ int restore_ns(int rst, struct ns_desc *nd) int switch_mnt_ns(int pid, int *rst, int *cwd_fd) { - int ret; int fd; if (!cwd_fd) @@ -293,13 +292,12 @@ int switch_mnt_ns(int pid, int *rst, int *cwd_fd) fd = open(".", O_PATH); if (fd < 0) { pr_perror("unable to open current directory"); - return fd; + return -1; } - ret = switch_ns(pid, &mnt_ns_desc, rst); - if (ret < 0) { + if (switch_ns(pid, &mnt_ns_desc, rst)) { close(fd); - return ret; + return -1; } *cwd_fd = fd; @@ -308,23 +306,22 @@ int switch_mnt_ns(int pid, int *rst, int *cwd_fd) int restore_mnt_ns(int rst, int *cwd_fd) { - int ret = -1; + int exit_code = -1; - ret = restore_ns(rst, &mnt_ns_desc); - if (ret < 0) + if (restore_ns(rst, &mnt_ns_desc)) goto err_restore; - if (cwd_fd) { - ret = fchdir(*cwd_fd); - if (ret) - pr_perror("unable to restore current directory"); + if (cwd_fd && fchdir(*cwd_fd)) { + pr_perror("Unable to restore current directory"); + goto err_restore; } + exit_code = 0; err_restore: if (cwd_fd) close_safe(cwd_fd); - return ret; + return exit_code; } struct ns_id *ns_ids = NULL; @@ -1012,36 +1009,31 @@ int dump_user_ns(pid_t pid, int ns_id) ret = parse_id_map(pid, "uid_map", &e->uid_map); if (ret < 0) - goto err; + /* + * The uid_map and gid_map is clean up in free_userns_maps + * later, so we don't need to clean these up in error cases. + */ + return -1; + e->n_uid_map = ret; ret = parse_id_map(pid, "gid_map", &e->gid_map); if (ret < 0) - goto err; + return -1; e->n_gid_map = ret; if (check_user_ns(pid)) - goto err; + return -1; img = open_image(CR_FD_USERNS, O_DUMP, ns_id); if (!img) - goto err; + return -1; ret = pb_write_one(img, e, PB_USERNS); close_image(img); if (ret < 0) - goto err; + return -1; return 0; -err: - if (e->uid_map) { - xfree(e->uid_map[0]); - xfree(e->uid_map); - } - if (e->gid_map) { - xfree(e->gid_map[0]); - xfree(e->gid_map); - } - return -1; } void free_userns_maps(void) @@ -1217,20 +1209,9 @@ static int write_id_map(pid_t pid, UidGidExtent **extents, int n, char *id_map) return 0; } -struct unsc_msg { - struct msghdr h; - /* - * 0th is the call address - * 1st is the flags - * 2nd is the optional (NULL in response) arguments - */ - struct iovec iov[3]; - char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; -}; - static int usernsd_pid; -static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd) +inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid) { struct cmsghdr *ch; struct ucred *ucred; @@ -1268,7 +1249,10 @@ static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void ch->cmsg_type = SCM_CREDENTIALS; ucred = (struct ucred *)CMSG_DATA(ch); - ucred->pid = getpid(); + if (pid) + ucred->pid = *pid; + else + ucred->pid = getpid(); ucred->uid = getuid(); ucred->gid = getgid(); @@ -1283,7 +1267,7 @@ static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void } } -static void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) +void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) { struct cmsghdr *ch; struct ucred *ucred; @@ -1321,7 +1305,7 @@ static int usernsd(int sk) int flags, fd, ret; pid_t pid; - unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0); + unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0, NULL); if (recvmsg(sk, &um.h, 0) <= 0) { pr_perror("uns: recv req error"); return -1; @@ -1366,7 +1350,7 @@ static int usernsd(int sk) else fd = -1; - unsc_msg_init(&um, &call, &ret, NULL, 0, fd); + unsc_msg_init(&um, &call, &ret, NULL, 0, fd, NULL); if (sendmsg(sk, &um.h, 0) <= 0) { pr_perror("uns: send resp error"); return -1; @@ -1417,7 +1401,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, /* Send the request */ - unsc_msg_init(&um, &call, &flags, arg, arg_size, fd); + unsc_msg_init(&um, &call, &flags, arg, arg_size, fd, NULL); ret = sendmsg(sk, &um.h, 0); if (ret <= 0) { pr_perror("uns: send req error"); @@ -1432,7 +1416,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, /* Get the response back */ - unsc_msg_init(&um, &call, &res, NULL, 0, 0); + unsc_msg_init(&um, &call, &res, NULL, 0, 0, NULL); ret = recvmsg(sk, &um.h, 0); if (ret <= 0) { pr_perror("uns: recv resp error"); @@ -1453,14 +1437,11 @@ out: return ret; } -static int start_usernsd(void) +int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)) { int sk[2]; int one = 1; - if (!(root_ns_mask & CLONE_NEWUSER)) - return 0; - /* * Seqpacket to * @@ -1468,7 +1449,7 @@ static int start_usernsd(void) * each other easily. Stream socket require manual * messages boundaries. * - * b) Make callers note the damon death by seeing the + * b) Make callers note the daemon death by seeing the * disconnected socket. In case of dgram socket * callers would just get stuck in receiving the * response. @@ -1489,24 +1470,39 @@ static int start_usernsd(void) return -1; } - usernsd_pid = fork(); - if (usernsd_pid < 0) { - pr_perror("Can't fork usernsd"); + *pid = fork(); + if (*pid < 0) { + pr_perror("Can't unix daemon"); close(sk[0]); close(sk[1]); return -1; } - if (usernsd_pid == 0) { + if (*pid == 0) { int ret; - close(sk[0]); - ret = usernsd(sk[1]); + ret = daemon_func(sk[1]); exit(ret); } - close(sk[1]); - if (install_service_fd(USERNSD_SK, sk[0]) < 0) { + + return sk[0]; +} + +static int start_usernsd(void) +{ + int sk; + + if (!(root_ns_mask & CLONE_NEWUSER)) + return 0; + + sk = start_unix_cred_daemon(&usernsd_pid, usernsd); + if (sk < 0) { + pr_err("failed to start usernsd\n"); + return -1; + } + + if (install_service_fd(USERNSD_SK, sk) < 0) { kill(usernsd_pid, SIGKILL); waitpid(usernsd_pid, NULL, 0); return -1; @@ -1623,10 +1619,12 @@ int collect_namespaces(bool for_dump) int prepare_userns_creds(void) { - /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */ - if (setuid(0) || setgid(0) || setgroups(0, NULL)) { - pr_perror("Unable to initialize id-s"); - return -1; + if (!opts.unprivileged || has_cap_setuid(opts.cap_eff)) { + /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */ + if (setuid(0) || setgid(0) || setgroups(0, NULL)) { + pr_perror("Unable to initialize id-s"); + return -1; + } } /* diff --git a/criu/net.c b/criu/net.c index 2eff519c5..e5775a328 100644 --- a/criu/net.c +++ b/criu/net.c @@ -51,6 +51,9 @@ #include "images/netdev.pb-c.h" #include "images/inventory.pb-c.h" +#undef LOG_PREFIX +#define LOG_PREFIX "net: " + #ifndef IFLA_NEW_IFINDEX #define IFLA_NEW_IFINDEX 49 #endif @@ -108,15 +111,18 @@ int read_ns_sys_file(char *path, char *buf, int len) } rlen = read(fd, buf, len); + if (rlen == -1) + pr_perror("Can't read ns' %s", path); close(fd); if (rlen == len) { + buf[0] = '\0'; pr_err("Too small buffer to read ns sys file %s\n", path); return -1; } - if (rlen > 0) - buf[rlen - 1] = '\0'; + if (rlen >= 0) + buf[rlen] = '\0'; return rlen; } @@ -353,22 +359,23 @@ static int ipv6_conf_op(char *tgt, SysctlEntry **conf, int n, int op, SysctlEntr return net_conf_op(tgt, conf, n, op, "ipv6", req, path, ARRAY_SIZE(devconfs6), devconfs6, def_conf); } -static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) +static int unix_conf_op(SysctlEntry ***rconf, size_t *pn, int op) { int i, ret = -1, flags = 0; char path[ARRAY_SIZE(unix_conf_entries)][MAX_CONF_UNIX_PATH] = {}; struct sysctl_req req[ARRAY_SIZE(unix_conf_entries)] = {}; SysctlEntry **conf = *rconf; + size_t n = *pn; - if (*n != ARRAY_SIZE(unix_conf_entries)) { - pr_err("unix: Unexpected entries in config (%zu %zu)\n", *n, ARRAY_SIZE(unix_conf_entries)); + if (n != ARRAY_SIZE(unix_conf_entries)) { + pr_err("unix: Unexpected entries in config (%zu %zu)\n", n, ARRAY_SIZE(unix_conf_entries)); return -EINVAL; } if (opts.weak_sysctls || op == CTL_READ) flags = CTL_FLAGS_OPTIONAL; - for (i = 0; i < *n; i++) { + for (i = 0; i < n; i++) { snprintf(path[i], MAX_CONF_UNIX_PATH, CONF_UNIX_FMT, unix_conf_entries[i]); req[i].name = path[i]; req[i].flags = flags; @@ -384,7 +391,7 @@ static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) } } - ret = sysctl_op(req, *n, op, CLONE_NEWNET); + ret = sysctl_op(req, n, op, CLONE_NEWNET); if (ret < 0) { pr_err("unix: Failed to %s %s/\n", (op == CTL_READ) ? "read" : "write", CONF_UNIX_BASE); return -1; @@ -393,7 +400,7 @@ static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) if (op == CTL_READ) { bool has_entries = false; - for (i = 0; i < *n; i++) { + for (i = 0; i < n; i++) { if (req[i].flags & CTL_FLAGS_HAS) { conf[i]->has_iarg = true; if (!has_entries) @@ -406,7 +413,7 @@ static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) * Unix conf is optional. */ if (!has_entries) { - *n = 0; + *pn = 0; *rconf = NULL; } } @@ -1398,7 +1405,7 @@ static int move_veth(const char *netdev, struct ns_id *ns, struct net_link *link len_val = strlen(netdev); if (len_val >= IFNAMSIZ) return -1; - strlcpy(mvreq.ifnam, netdev, IFNAMSIZ); + __strlcpy(mvreq.ifnam, netdev, IFNAMSIZ); ret = userns_call(move_veth_cb, 0, &mvreq, sizeof(mvreq), ns->net.ns_fd); if (ret < 0) @@ -1528,7 +1535,7 @@ static int changeflags(int s, char *name, short flags) { struct ifreq ifr; - strlcpy(ifr.ifr_name, name, IFNAMSIZ); + __strlcpy(ifr.ifr_name, name, IFNAMSIZ); ifr.ifr_flags = flags; if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) { @@ -2039,10 +2046,10 @@ static inline int dump_iptables(struct cr_imgset *fds) * and iptables backend is nft to prevent duplicate dumps. */ #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) - iptables_cmd = get_legacy_iptables_bin(false); + iptables_cmd = get_legacy_iptables_bin(false, false); if (kdat.ipv6) - ip6tables_cmd = get_legacy_iptables_bin(true); + ip6tables_cmd = get_legacy_iptables_bin(true, false); #endif if (!iptables_cmd) { @@ -2121,6 +2128,117 @@ nft_ctx_free_out: } #endif +static const char *ipv4_sysctl_entries[] = { + "ping_group_range", +}; + +#define IPV4_SYSCTL_BASE "net/ipv4" +#define IPV4_SYSCTL_FMT IPV4_SYSCTL_BASE"/%s" +#define MAX_IPV4_SYSCTL_OPT 32 +#define MAX_IPV4_SYSCTL_PATH (sizeof(IPV4_SYSCTL_FMT) + MAX_IPV4_SYSCTL_OPT - 2) +#define MAX_STR_IPV4_SYSCTL_LEN 200 + +static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) +{ + int i, ret = -1, flags = 0; + char path[ARRAY_SIZE(ipv4_sysctl_entries)][MAX_IPV4_SYSCTL_PATH] = {}; + struct sysctl_req req[ARRAY_SIZE(ipv4_sysctl_entries)] = {}; + SysctlEntry **sysctl = *rsysctl; + size_t n = *pn, ri; + + if (n != ARRAY_SIZE(ipv4_sysctl_entries)) { + pr_err("ipv4: Unexpected entries in sysctl (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); + return -EINVAL; + } + + if (opts.weak_sysctls || op == CTL_READ) + flags = CTL_FLAGS_OPTIONAL; + + for (i = 0, ri = 0; i < n; i++) { + snprintf(path[ri], MAX_IPV4_SYSCTL_PATH, IPV4_SYSCTL_FMT, ipv4_sysctl_entries[i]); + req[ri].name = path[ri]; + req[ri].flags = flags; + + switch (sysctl[i]->type) { + case SYSCTL_TYPE__CTL_STR: + req[ri].type = CTL_STR(MAX_STR_IPV4_SYSCTL_LEN); + + /* skip write if have no value */ + if (op == CTL_WRITE && !sysctl[i]->sarg) + continue; + + req[ri].arg = sysctl[i]->sarg; + break; + default: + pr_err("ipv4: Unknown sysctl type %d\n", sysctl[i]->type); + return -1; + } + ri++; + } + + ret = sysctl_op(req, ri, op, CLONE_NEWNET); + if (ret < 0) { + pr_err("ipv4: Failed to %s %s/\n", (op == CTL_READ) ? "read" : "write", IPV4_SYSCTL_BASE); + return -1; + } + + if (op == CTL_READ) { + bool has_entries = false; + + BUG_ON(ri != n); + for (i = 0; i < n; i++) { + if (req[i].flags & CTL_FLAGS_HAS) { + has_entries = true; + } else { + sysctl[i]->sarg = NULL; + } + } + + if (!has_entries) { + *pn = 0; + *rsysctl = NULL; + } + } + + return 0; +} + +static int ipv4_sysctls_ping_group_range_map_gid(SysctlEntry *ent, size_t size) +{ + int start, end, ustart, uend, ret; + + if (sscanf(ent->sarg, "%d %d", &start, &end) != 2) { + pr_err("Failed to parse ping_group_range: %s\n", ent->sarg); + return -1; + } + + /* + * The default is "1 0", which means no group + * is allowed to create ICMP Echo sockets. + */ + if (start == 1 && end == 0) { + pr_debug("The ping_group_range is set to default, skipping it.\n"); + ent->sarg = NULL; + return 0; + } + + if (!(root_ns_mask & CLONE_NEWUSER)) + return 0; + + ustart = userns_gid(start); + uend = userns_gid(end); + pr_debug("Mapping ping_group_range %d %d to userns -> %d %d\n", + start, end, ustart, uend); + + ret = snprintf(ent->sarg, size, "%d\t%d\n", ustart, uend); + if (ret < 0 || ret >= size) { + pr_err("Failed to map ping_group_range: %d\t%d\n", ustart, uend); + return -1; + } + + return 0; +} + static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) { void *buf, *o_buf; @@ -2135,6 +2253,10 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) int size6 = ARRAY_SIZE(devconfs6); char def_stable_secret[MAX_STR_CONF_LEN + 1] = {}; char all_stable_secret[MAX_STR_CONF_LEN + 1] = {}; + SysctlEntry *ipv4_sysctls = NULL; + size_t ipv4_sysctl_size = ARRAY_SIZE(ipv4_sysctl_entries); + char ping_group_range[MAX_STR_IPV4_SYSCTL_LEN + 1] = {}; + int ping_group_range_id = -1; NetnsId *ids; struct netns_id *p; @@ -2142,10 +2264,16 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) list_for_each_entry(p, &ns->net.ids, node) i++; + /* + * Here we allocate one single big buffer for storing multiple arrays + * of protobuf entries and pointers to entries in it and we later use + * xptr_pull_s to claim a part of this buffer of proper size for each + * particular array. Next we read data from sysctl files to those + * arrays and then finally save them into images. + */ o_buf = buf = xmalloc(i * (sizeof(NetnsId *) + sizeof(NetnsId)) + - size4 * (sizeof(SysctlEntry *) + sizeof(SysctlEntry)) * 2 + - size6 * (sizeof(SysctlEntry *) + sizeof(SysctlEntry)) * 2 + - sizex * (sizeof(SysctlEntry *) + sizeof(SysctlEntry))); + (2 * size4 + 2 * size6 + sizex + ipv4_sysctl_size) * + (sizeof(SysctlEntry *) + sizeof(SysctlEntry))); if (!buf) goto out; @@ -2210,6 +2338,22 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) netns.unix_conf[i]->type = SYSCTL_TYPE__CTL_32; } + netns.n_ipv4_sysctl = ipv4_sysctl_size; + netns.ipv4_sysctl = xptr_pull_s(&buf, ipv4_sysctl_size * sizeof(SysctlEntry *)); + ipv4_sysctls = xptr_pull_s(&buf, ipv4_sysctl_size * sizeof(SysctlEntry)); + for (i = 0; i < ipv4_sysctl_size; i++) { + sysctl_entry__init(&ipv4_sysctls[i]); + netns.ipv4_sysctl[i] = &ipv4_sysctls[i]; + if (!strcmp(ipv4_sysctl_entries[i], "ping_group_range")) { + netns.ipv4_sysctl[i]->type = SYSCTL_TYPE__CTL_STR; + netns.ipv4_sysctl[i]->sarg = ping_group_range; + ping_group_range_id = i; + } else { + /* Need to handle this case when we have more sysctls */ + BUG(); + } + } + ret = ipv4_conf_op("default", netns.def_conf4, size4, CTL_READ, NULL); if (ret < 0) goto err_free; @@ -2228,6 +2372,16 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) if (ret < 0) goto err_free; + ret = ipv4_sysctls_op(&netns.ipv4_sysctl, &netns.n_ipv4_sysctl, CTL_READ); + if (ret < 0) + goto err_free; + + BUG_ON(ping_group_range_id == -1); + ret = ipv4_sysctls_ping_group_range_map_gid(netns.ipv4_sysctl[ping_group_range_id], + MAX_STR_IPV4_SYSCTL_LEN + 1); + if (ret < 0) + goto err_free; + ret = pb_write_one(img_from_set(fds, CR_FD_NETNS), &netns, PB_NETNS); err_free: xfree(o_buf); @@ -2360,9 +2514,19 @@ static int prepare_xtable_lock(void) static inline int restore_iptables(int pid) { + char *iptables_cmd = "iptables-restore"; + char *ip6tables_cmd = "ip6tables-restore"; + char comm[32]; int ret = -1; struct cr_img *img; +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) + iptables_cmd = get_legacy_iptables_bin(false, true); + + if (kdat.ipv6) + ip6tables_cmd = get_legacy_iptables_bin(true, true); +#endif + img = open_image(CR_FD_IPTABLES, O_RSTR, pid); if (img == NULL) return -1; @@ -2372,7 +2536,19 @@ static inline int restore_iptables(int pid) goto ipt6; } - ret = run_iptables_tool("iptables-restore -w", img_raw_fd(img), -1); + if (!iptables_cmd) { + pr_err("Can't restore iptables dump - no legacy version present\n"); + close_image(img); + return -1; + } + + if (snprintf(comm, sizeof(comm), "%s -w", iptables_cmd) >= sizeof(comm)) { + pr_err("Can't fit '%s -w' to buffer\n", iptables_cmd); + close_image(img); + return -1; + } + + ret = run_iptables_tool(comm, img_raw_fd(img), -1); close_image(img); if (ret) return ret; @@ -2383,7 +2559,19 @@ ipt6: if (empty_image(img)) goto out; - ret = run_iptables_tool("ip6tables-restore -w", img_raw_fd(img), -1); + if (!ip6tables_cmd) { + pr_err("Can't restore ip6tables dump - no legacy version present\n"); + close_image(img); + return -1; + } + + if (snprintf(comm, sizeof(comm), "%s -w", ip6tables_cmd) >= sizeof(comm)) { + pr_err("Can't fit '%s -w' to buffer\n", ip6tables_cmd); + close_image(img); + return -1; + } + + ret = run_iptables_tool(comm, img_raw_fd(img), -1); out: close_image(img); @@ -2391,58 +2579,85 @@ out: } #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) -static inline int restore_nftables(int pid) +static inline int do_restore_nftables(struct cr_img *img) { - int ret = -1; - struct cr_img *img; + int exit_code = -1; struct nft_ctx *nft; off_t img_data_size; char *buf; + if ((img_data_size = img_raw_size(img)) < 0) { + pr_err("image size mismatch\n"); + goto out; + } + + if (read_img_str(img, &buf, img_data_size) < 0) { + pr_err("Failed to read nftables data\n"); + goto out; + } + + nft = nft_ctx_new(NFT_CTX_DEFAULT); + if (!nft) { + pr_err("Failed to create nft context object\n"); + goto buf_free_out; + } + + if (nft_ctx_buffer_output(nft) || nft_ctx_buffer_error(nft)) { + pr_err("Failed to enable std/err output buffering\n"); + goto nft_ctx_free_out; + } + +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) + if (nft_run_cmd_from_buffer(nft, buf, strlen(buf))) +#elif defined(CONFIG_HAS_NFTABLES_LIB_API_1) + if (nft_run_cmd_from_buffer(nft, buf)) +#else + BUILD_BUG_ON(1); +#endif + { + pr_err("nft command error:\n%s\n%s\n", + nft_ctx_get_error_buffer(nft), buf); + goto nft_ctx_free_out; + } + + exit_code = 0; + +nft_ctx_free_out: + nft_ctx_free(nft); +buf_free_out: + xfree(buf); +out: + return exit_code; +} +#endif + +static inline int restore_nftables(int pid) +{ + int exit_code = -1; + struct cr_img *img; + img = open_image(CR_FD_NFTABLES, O_RSTR, pid); if (img == NULL) return -1; if (empty_image(img)) { /* Backward compatibility */ pr_info("Skipping nft restore, no image\n"); - ret = 0; + exit_code = 0; goto image_close_out; } - if ((img_data_size = img_raw_size(img)) < 0) - goto image_close_out; - - if (read_img_str(img, &buf, img_data_size) < 0) - goto image_close_out; - - nft = nft_ctx_new(NFT_CTX_DEFAULT); - if (!nft) - goto buf_free_out; - - if (nft_ctx_buffer_output(nft) || nft_ctx_buffer_error(nft) || -#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) - nft_run_cmd_from_buffer(nft, buf, strlen(buf))) -#elif defined(CONFIG_HAS_NFTABLES_LIB_API_1) - nft_run_cmd_from_buffer(nft, buf)) +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) + if (!do_restore_nftables(img)) + exit_code = 0; #else - { - BUILD_BUG_ON(1); - } + pr_err("Unable to restore nftables. CRIU was built without libnftables support\n"); #endif - goto nft_ctx_free_out; - ret = 0; - -nft_ctx_free_out: - nft_ctx_free(nft); -buf_free_out: - xfree(buf); image_close_out: close_image(img); - return ret; + return exit_code; } -#endif int read_net_ns_img(void) { @@ -2519,6 +2734,12 @@ static int restore_netns_conf(struct ns_id *ns) goto out; } + if ((netns)->ipv4_sysctl) { + ret = ipv4_sysctls_op(&(netns)->ipv4_sysctl, &(netns)->n_ipv4_sysctl, CTL_WRITE); + if (ret) + goto out; + } + ns->net.netns = netns; out: return ret; @@ -2771,10 +2992,8 @@ static int prepare_net_ns_second_stage(struct ns_id *ns) ret = restore_rule(nsid); if (!ret) ret = restore_iptables(nsid); -#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) if (!ret) ret = restore_nftables(nsid); -#endif } if (!ret) @@ -3000,11 +3219,45 @@ err: return ret; } -static inline int nftables_lock_network_internal(void) +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) +static inline FILE *redirect_nftables_output(struct nft_ctx *nft) +{ + FILE *fp; + int fd; + + fd = dup(log_get_fd()); + if (fd < 0) { + pr_perror("dup() to redirect nftables output failed"); + return NULL; + } + + fp = fdopen(fd, "w"); + if (!fp) { + pr_perror("fdopen() to redirect nftables output failed"); + return NULL; + } + + /** + * Without setvbuf() the output from libnftables will be + * somewhere in the log file, probably at the end. + * With setvbuf() potential output will be at the correct + * position. + */ + setvbuf(fp, NULL, _IONBF, 0); + + nft_ctx_set_output(nft, fp); + nft_ctx_set_error(nft, fp); + + return fp; +} +#endif + +static inline int nftables_lock_network_internal(bool restore) { #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) + cleanup_file FILE *fp = NULL; struct nft_ctx *nft; - int ret = 0; + int ret = 0, exit_code = -1; char table[32]; char buf[128]; @@ -3015,10 +3268,19 @@ static inline int nftables_lock_network_internal(void) if (!nft) return -1; - snprintf(buf, sizeof(buf), "create table %s", table); - if (NFT_RUN_CMD(nft, buf)) + fp = redirect_nftables_output(nft); + if (!fp) goto err2; + snprintf(buf, sizeof(buf), "create table %s", table); + ret = NFT_RUN_CMD(nft, buf); + if (ret) { + /* The network has been locked on dump. */ + if (restore && errno == EEXIST) + return 0; + goto err2; + } + snprintf(buf, sizeof(buf), "add chain %s output { type filter hook output priority 0; policy drop; }", table); if (NFT_RUN_CMD(nft, buf)) goto err1; @@ -3035,17 +3297,16 @@ static inline int nftables_lock_network_internal(void) if (NFT_RUN_CMD(nft, buf)) goto err1; - goto out; - + exit_code = 0; +out: + nft_ctx_free(nft); + return exit_code; err1: snprintf(buf, sizeof(buf), "delete table %s", table); NFT_RUN_CMD(nft, buf); err2: - ret = -1; pr_err("Locking network failed using nftables\n"); -out: - nft_ctx_free(nft); - return ret; + goto out; #else pr_err("CRIU was built without libnftables support\n"); return -1; @@ -3077,17 +3338,20 @@ static int iptables_network_lock_internal(void) return ret; } -int network_lock_internal(void) +int network_lock_internal(bool restore) { int ret = 0, nsret; + if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; + if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) return -1; if (opts.network_lock_method == NETWORK_LOCK_IPTABLES) ret = iptables_network_lock_internal(); else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) - ret = nftables_lock_network_internal(); + ret = nftables_lock_network_internal(restore); if (restore_ns(nsret, &net_ns_desc)) ret = -1; @@ -3099,6 +3363,7 @@ static inline int nftables_network_unlock(void) { #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) int ret = 0; + cleanup_file FILE *fp = NULL; struct nft_ctx *nft; char table[32]; char buf[128]; @@ -3110,6 +3375,10 @@ static inline int nftables_network_unlock(void) if (!nft) return -1; + fp = redirect_nftables_output(nft); + if (!fp) + return -1; + snprintf(buf, sizeof(buf), "delete table %s", table); if (NFT_RUN_CMD(nft, buf)) ret = -1; @@ -3122,19 +3391,53 @@ static inline int nftables_network_unlock(void) #endif } +static bool iptables_has_criu_jump_target(void) +{ + int fd, ret; + char *argv[4] = { "sh", "-c", "iptables -C INPUT -j CRIU", NULL }; + + fd = open("/dev/null", O_RDWR); + if (fd < 0) { + fd = -1; + pr_perror("failed to open /dev/null, using log fd"); + } + + ret = cr_system(fd, fd, fd, "sh", argv, CRS_CAN_FAIL); + close_safe(&fd); + return !ret; +} + static int iptables_network_unlock_internal(void) { - char conf[] = "*filter\n" - ":CRIU - [0:0]\n" - "-D INPUT -j CRIU\n" - "-D OUTPUT -j CRIU\n" - "-X CRIU\n" - "COMMIT\n"; + char delete_jump_targets[] = "*filter\n" + ":CRIU - [0:0]\n" + "-D INPUT -j CRIU\n" + "-D OUTPUT -j CRIU\n" + "COMMIT\n"; + + char delete_criu_chain[] = "*filter\n" + ":CRIU - [0:0]\n" + "-X CRIU\n" + "COMMIT\n"; + int ret = 0; - ret |= iptables_restore(false, conf, sizeof(conf) - 1); + ret |= iptables_restore(false, delete_jump_targets, sizeof(delete_jump_targets) - 1); if (kdat.ipv6) - ret |= iptables_restore(true, conf, sizeof(conf) - 1); + ret |= iptables_restore(true, delete_jump_targets, sizeof(delete_jump_targets) - 1); + + /* For compatibility with iptables-nft backend, we need to make sure that all jump + * targets have been removed before deleting the CRIU chain. + */ + if (iptables_has_criu_jump_target()) { + ret |= iptables_restore(false, delete_jump_targets, sizeof(delete_jump_targets) - 1); + if (kdat.ipv6) + ret |= iptables_restore(true, delete_jump_targets, sizeof(delete_jump_targets) - 1); + } + + ret |= iptables_restore(false, delete_criu_chain, sizeof(delete_criu_chain) - 1); + if (kdat.ipv6) + ret |= iptables_restore(true, delete_criu_chain, sizeof(delete_criu_chain) - 1); return ret; } @@ -3143,6 +3446,9 @@ static int network_unlock_internal(void) { int ret = 0, nsret; + if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; + if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) return -1; @@ -3171,7 +3477,7 @@ int network_lock(void) if (run_scripts(ACT_NET_LOCK)) return -1; - return network_lock_internal(); + return network_lock_internal(false); } void network_unlock(void) @@ -3215,7 +3521,7 @@ int macvlan_ext_add(struct external *ext) /* * The setns() syscall (called by switch_ns()) can be extremely * slow. If we call it two or more times from the same task the - * kernel will synchonously go on a very slow routine called + * kernel will synchronously go on a very slow routine called * synchronize_rcu() trying to put a reference on old namespaces. * * To avoid doing this more than once we pre-create all the @@ -3366,7 +3672,7 @@ int collect_net_namespaces(bool for_dump) struct ns_desc net_ns_desc = NS_DESC_ENTRY(CLONE_NEWNET, "net"); -struct ns_id *net_get_root_ns() +struct ns_id *net_get_root_ns(void) { static struct ns_id *root_netns = NULL; @@ -3383,7 +3689,7 @@ struct ns_id *net_get_root_ns() /* * socket_diag doesn't report unbound and unconnected sockets, - * so we have to get their network namesapces explicitly + * so we have to get their network namespaces explicitly */ struct ns_id *get_socket_ns(int lfd) { @@ -3483,7 +3789,7 @@ static int move_to_bridge(struct external *ext, void *arg) ret = -1; goto out; } - strlcpy(ifr.ifr_name, br, IFNAMSIZ); + __strlcpy(ifr.ifr_name, br, IFNAMSIZ); ret = ioctl(s, SIOCBRADDIF, &ifr); if (ret < 0) { pr_perror("Can't add interface %s to bridge %s", out, br); @@ -3495,7 +3801,7 @@ static int move_to_bridge(struct external *ext, void *arg) * $ ip link set dev up */ ifr.ifr_ifindex = 0; - strlcpy(ifr.ifr_name, out, IFNAMSIZ); + __strlcpy(ifr.ifr_name, out, IFNAMSIZ); ret = ioctl(s, SIOCGIFFLAGS, &ifr); if (ret < 0) { pr_perror("Can't get flags of interface %s", out); diff --git a/criu/netfilter.c b/criu/netfilter.c index 2212fd9f2..e2c82764f 100644 --- a/criu/netfilter.c +++ b/criu/netfilter.c @@ -48,8 +48,8 @@ void preload_netfilter_modules(void) fd = -1; pr_perror("failed to open /dev/null, using log fd for net module preload"); } - cr_system(fd, fd, fd, iptable_cmd_ipv4, (char *[]){ iptable_cmd_ipv4, "-L", "-n", NULL }, 0); - cr_system(fd, fd, fd, iptable_cmd_ipv6, (char *[]){ iptable_cmd_ipv6, "-L", "-n", NULL }, 0); + cr_system(fd, fd, fd, iptable_cmd_ipv4, (char *[]){ iptable_cmd_ipv4, "-L", "-n", NULL }, CRS_CAN_FAIL); + cr_system(fd, fd, fd, iptable_cmd_ipv6, (char *[]){ iptable_cmd_ipv6, "-L", "-n", NULL }, CRS_CAN_FAIL); close_safe(&fd); } @@ -299,7 +299,25 @@ int nftables_lock_connection(struct inet_sk_desc *sk) int nftables_get_table(char *table, int n) { - if (snprintf(table, n, "inet CRIU-%d", root_item->pid->real) < 0) { + int ret; + + switch(dump_criu_run_id[0]) { + case 0: + /* This is not a restore.*/ + ret = snprintf(table, n, "inet CRIU-%s", criu_run_id); + break; + case NO_DUMP_CRIU_RUN_ID: + /** + * This is a restore from an older image with no + * dump_criu_run_id available. Let's use the old ID. + */ + ret = snprintf(table, n, "inet CRIU-%d", root_item->pid->real); + break; + default: + ret = snprintf(table, n, "inet CRIU-%s", dump_criu_run_id); + } + + if (ret < 0) { pr_err("Cannot generate CRIU's nftables table name\n"); return -1; } diff --git a/criu/page-pipe.c b/criu/page-pipe.c index 54dc3ccc4..4601d8f9c 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -99,6 +99,7 @@ static struct page_pipe_buf *ppb_alloc(struct page_pipe *pp, unsigned int ppb_fl { struct page_pipe_buf *prev = pp_prev_ppb(pp, ppb_flags); struct page_pipe_buf *ppb; + int ppb_size = 0; ppb = xmalloc(sizeof(*ppb)); if (!ppb) @@ -120,7 +121,13 @@ static struct page_pipe_buf *ppb_alloc(struct page_pipe *pp, unsigned int ppb_fl cnt_add(CNT_PAGE_PIPES, 1); ppb->pipe_off = 0; - ppb->pipe_size = fcntl(ppb->p[0], F_GETPIPE_SZ, 0) / PAGE_SIZE; + ppb_size = fcntl(ppb->p[0], F_GETPIPE_SZ, 0); + if (ppb_size < 0) { + xfree(ppb); + pr_perror("Can't get pipe size"); + return NULL; + } + ppb->pipe_size = ppb_size / PAGE_SIZE; pp->nr_pipes++; } @@ -374,7 +381,7 @@ int pipe_read_dest_init(struct pipe_read_dest *prd) return 0; } -int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned int *nr_pages, +int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned long int *nr_pages, unsigned int ppb_flags) { struct page_pipe_buf *ppb; @@ -399,7 +406,7 @@ int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned lo } /* clamp the request if it passes the end of iovec */ - len = min((unsigned long)iov->iov_base + iov->iov_len - addr, (unsigned long)(*nr_pages) * PAGE_SIZE); + len = min((unsigned long)iov->iov_base + iov->iov_len - addr, *nr_pages * PAGE_SIZE); *nr_pages = len / PAGE_SIZE; skip += ppb->pipe_off * PAGE_SIZE; @@ -439,17 +446,17 @@ void debug_show_page_pipe(struct page_pipe *pp) pr_debug("Page pipe:\n"); pr_debug("* %u pipes %u/%u iovs:\n", pp->nr_pipes, pp->free_iov, pp->nr_iovs); list_for_each_entry(ppb, &pp->bufs, l) { - pr_debug("\tbuf %u pages, %u iovs, flags: %x pipe_off: %x :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, + pr_debug("\tbuf %lx pages, %u iovs, flags: %x pipe_off: %lx :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, ppb->pipe_off); for (i = 0; i < ppb->nr_segs; i++) { iov = &ppb->iov[i]; - pr_debug("\t\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE); + pr_debug("\t\t%p - %p\n", iov->iov_base, iov->iov_base + iov->iov_len); } } pr_debug("* %u holes:\n", pp->free_hole); for (i = 0; i < pp->free_hole; i++) { iov = &pp->holes[i]; - pr_debug("\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE); + pr_debug("\t%p - %p\n", iov->iov_base, iov->iov_base + iov->iov_len); } } diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 782d4cafc..463d4c506 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -31,7 +32,7 @@ static int page_server_sk = -1; struct page_server_iov { u32 cmd; - u32 nr_pages; + u64 nr_pages; u64 vaddr; u64 dst_id; }; @@ -157,18 +158,32 @@ static inline int send_psi(int sk, struct page_server_iov *pi) return send_psi_flags(sk, pi, 0); } +static void tcp_cork(int sk, bool on) +{ + int val = on ? 1 : 0; + if (setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val))) + pr_pwarn("Unable to set TCP_CORK=%d", val); +} + +static void tcp_nodelay(int sk, bool on) +{ + int val = on ? 1 : 0; + if (setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val))) + pr_pwarn("Unable to set TCP_NODELAY=%d", val); +} + /* page-server xfer */ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long len) { ssize_t ret, left = len; if (opts.tls) { - pr_debug("Sending %lu bytes / %lu pages\n", len, len / PAGE_SIZE); + pr_debug("Sending %lx bytes\n", len); if (tls_send_data_from_fd(p, len)) return -1; } else { - pr_debug("Splicing %lu bytes / %lu pages into socket\n", len, len / PAGE_SIZE); + pr_debug("Splicing %lx bytes into socket\n", len); while (left > 0) { ret = splice(p, NULL, xfer->sk, NULL, left, SPLICE_F_MOVE); @@ -177,7 +192,7 @@ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long le return -1; } - pr_debug("\tSpliced: %lu bytes sent\n", (unsigned long)ret); + pr_debug("\tSpliced: %lx bytes sent\n", (unsigned long)ret); left -= ret; } } @@ -273,7 +288,7 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov) * read_pagemap_page routine. */ - pr_debug("Checking %p/%zu hole\n", iov->iov_base, iov->iov_len); + pr_debug("Checking %p - %p hole\n", iov->iov_base, iov->iov_base + iov->iov_len); off = (unsigned long)iov->iov_base; end = off + iov->iov_len; while (1) { @@ -285,7 +300,8 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov) return -1; } - pr_debug("\tFound %" PRIx64 "/%lu\n", p->pe->vaddr, pagemap_len(p->pe)); + pr_debug("\tFound %" PRIx64 " - %" PRIx64 "\n", + p->pe->vaddr, p->pe->vaddr + pagemap_len(p->pe)); /* * The pagemap entry in parent may happen to be @@ -311,6 +327,7 @@ static int write_pagemap_loc(struct page_xfer *xfer, struct iovec *iov, u32 flag pe.nr_pages = iov->iov_len / PAGE_SIZE; pe.has_flags = true; pe.flags = flags; + pe.has_nr_pages = true; if (flags & PE_PRESENT) { if (opts.auto_dedup && xfer->parent != NULL) { @@ -324,7 +341,8 @@ static int write_pagemap_loc(struct page_xfer *xfer, struct iovec *iov, u32 flag if (xfer->parent != NULL) { ret = check_pagehole_in_parent(xfer->parent, iov); if (ret) { - pr_err("Hole %p/%zu not found in parent\n", iov->iov_base, iov->iov_len); + pr_err("Hole %p - %p not found in parent\n", + iov->iov_base, iov->iov_base + iov->iov_len); return -1; } } @@ -834,7 +852,7 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p BUG_ON(iov.iov_base < (void *)xfer->offset); iov.iov_base -= xfer->offset; - pr_debug("\t p %p [%u]\n", iov.iov_base, (unsigned int)(iov.iov_len / PAGE_SIZE)); + pr_debug("\t p %p - %p\n", iov.iov_base, iov.iov_base + iov.iov_len); flags = ppb_xfer_flags(xfer, ppb); @@ -870,7 +888,7 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) list_for_each_entry(ppb, &pp->bufs, l) { unsigned int i; - pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs); + pr_debug("\tbuf %lx/%d\n", ppb->pages_in, ppb->nr_segs); for (i = 0; i < ppb->nr_segs; i++) { struct iovec iov = ppb->iov[i]; @@ -882,7 +900,7 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) BUG_ON(iov.iov_base < (void *)xfer->offset); iov.iov_base -= xfer->offset; - pr_debug("\tp %p [%u]\n", iov.iov_base, (unsigned int)(iov.iov_len / PAGE_SIZE)); + pr_debug("\tp %p - %p\n", iov.iov_base, iov.iov_base + iov.iov_len); flags = ppb_xfer_flags(xfer, ppb); @@ -1055,7 +1073,8 @@ static int page_server_add(int sk, struct page_server_iov *pi, u32 flags) struct page_xfer *lxfer = &cxfer.loc_xfer; struct iovec iov; - pr_debug("Adding %" PRIx64 "/%u\n", pi->vaddr, pi->nr_pages); + pr_debug("Adding %" PRIx64 " - %" PRIx64 "\n", + pi->vaddr, pi->vaddr + pi->nr_pages * PAGE_SIZE); if (prep_loc_xfer(pi)) return -1; @@ -1120,13 +1139,17 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi) { struct pstree_item *item; struct page_pipe *pp; - unsigned long len; + unsigned long len, nr_pages; int ret; item = pstree_item_by_virt(pi->dst_id); pp = dmpi(item)->mem_pp; - ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, &pi->nr_pages, PPB_LAZY); + /* page_pipe_read() uses 'unsigned long *' but pi->nr_pages is u64. + * Use a temporary variable to fix the incompatible pointer type + * on 32-bit platforms (e.g. armv7). */ + nr_pages = pi->nr_pages; + ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, &nr_pages, PPB_LAZY); if (ret) return ret; @@ -1135,6 +1158,7 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi) * .dst_id all remain intact. */ + pi->nr_pages = nr_pages; if (pi->nr_pages == 0) { pr_debug("no iovs found, zero pages\n"); return -1; @@ -1332,7 +1356,7 @@ static int fill_page_pipe(struct page_read *pr, struct page_pipe *pp) static int page_pipe_from_pagemap(struct page_pipe **pp, int pid) { struct page_read pr; - int nr_pages = 0; + unsigned long nr_pages = 0; if (open_page_read(pid, &pr, PR_TASK) <= 0) { pr_err("Failed to open page read for %d\n", pid); @@ -1406,7 +1430,7 @@ int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd) if (opts.ps_socket != -1) { ask = opts.ps_socket; - pr_info("Re-using ps socket %d\n", ask); + pr_info("Reusing ps socket %d\n", ask); goto no_server; } @@ -1452,7 +1476,7 @@ static int connect_to_page_server(void) if (opts.ps_socket != -1) { page_server_sk = opts.ps_socket; - pr_info("Re-using ps socket %d\n", page_server_sk); + pr_info("Reusing ps socket %d\n", page_server_sk); goto out; } @@ -1535,13 +1559,13 @@ struct ps_async_read { static LIST_HEAD(async_reads); -static inline void async_read_set_goal(struct ps_async_read *ar, int nr_pages) +static inline void async_read_set_goal(struct ps_async_read *ar, unsigned long nr_pages) { ar->goal = sizeof(ar->pi) + nr_pages * PAGE_SIZE; ar->nr_pages = nr_pages; } -static void init_ps_async_read(struct ps_async_read *ar, void *buf, int nr_pages, ps_async_read_complete complete, +static void init_ps_async_read(struct ps_async_read *ar, void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv) { ar->pages = buf; @@ -1551,7 +1575,7 @@ static void init_ps_async_read(struct ps_async_read *ar, void *buf, int nr_pages async_read_set_goal(ar, nr_pages); } -static int page_server_start_async_read(void *buf, int nr_pages, ps_async_read_complete complete, void *priv) +static int page_server_start_async_read(void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv) { struct ps_async_read *ar; @@ -1651,7 +1675,7 @@ int connect_to_page_server_to_recv(int epfd) return epoll_add_rfd(epfd, &ps_rfd); } -int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages) +int request_remote_pages(unsigned long img_id, unsigned long addr, unsigned long nr_pages) { struct page_server_iov pi = { .cmd = PS_IOV_GET, @@ -1668,7 +1692,7 @@ int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages) return 0; } -static int page_server_start_sync_read(void *buf, int nr, ps_async_read_complete complete, void *priv) +static int page_server_start_sync_read(void *buf, unsigned long nr, ps_async_read_complete complete, void *priv) { struct ps_async_read ar; int ret = 1; @@ -1679,7 +1703,7 @@ static int page_server_start_sync_read(void *buf, int nr, ps_async_read_complete return ret; } -int page_server_start_read(void *buf, int nr, ps_async_read_complete complete, void *priv, unsigned flags) +int page_server_start_read(void *buf, unsigned long nr, ps_async_read_complete complete, void *priv, unsigned flags) { if (flags & PR_ASYNC) return page_server_start_async_read(buf, nr, complete, priv); diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c index 00f088ff3..457c0d649 100644 --- a/criu/pagemap-cache.c +++ b/criu/pagemap-cache.c @@ -1,5 +1,6 @@ #include #include +#include #include "page.h" #include "pagemap-cache.h" @@ -10,6 +11,7 @@ #include "vma.h" #include "mem.h" #include "kerndat.h" +#include "fault-injection.h" #undef LOG_PREFIX #define LOG_PREFIX "pagemap-cache: " @@ -22,6 +24,8 @@ #define PAGEMAP_LEN(addr) (PAGE_PFN(addr) * sizeof(u64)) +#define PAGE_REGIONS_MAX_NR 32768 + /* * It's a workaround for a kernel bug. In the 3.19 kernel when pagemap are read * for a few vma-s for one read call, it returns incorrect data. @@ -50,10 +54,23 @@ int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t siz pmc->pid = pid; pmc->map_len = PAGEMAP_LEN(map_size); pmc->vma_head = vma_head; + pmc->regs_max_len = PAGE_PFN(map_size); + if (pmc->regs_max_len > PAGE_REGIONS_MAX_NR) + pmc->regs_max_len = PAGE_REGIONS_MAX_NR; + pmc->regs_len = 0; + pmc->regs_idx = 0; + pmc->regs = NULL; + pmc->map = NULL; - pmc->map = xmalloc(pmc->map_len); - if (!pmc->map) - goto err; + if (kdat.has_pagemap_scan && !fault_injected(FI_DONT_USE_PAGEMAP_SCAN)) { + pmc->regs = xmalloc(pmc->regs_max_len * sizeof(struct page_region)); + if (!pmc->regs) + goto err; + } else { + pmc->map = xmalloc(pmc->map_len); + if (!pmc->map) + goto err; + } if (pagemap_cache_disabled) pr_warn_once("The pagemap cache is disabled\n"); @@ -87,17 +104,11 @@ err: return -1; } -static inline u64 *__pmc_get_map(pmc_t *pmc, unsigned long addr) -{ - return &pmc->map[PAGE_PFN(addr - pmc->start)]; -} - static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) { unsigned long low = vma->e->start & PMC_MASK; unsigned long high = low + PMC_SIZE; size_t len = vma_area_len(vma); - size_t size_map; if (high > kdat.task_size) high = kdat.task_size; @@ -115,7 +126,7 @@ static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) * fit in solid manner, iow -- either the whole vma fits * the cache window, either plain read is used. * - * The benefit (apart redusing the number of read() calls) + * The benefit (apart reducing the number of read() calls) * is to walk page tables less. */ if (!pagemap_cache_disabled && len < PMC_SIZE && (vma->e->start - low) < PMC_SIZE_GAP) { @@ -149,39 +160,89 @@ static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) pr_debug("\t%d: simple mode [l:%lx h:%lx]\n", pmc->pid, pmc->start, pmc->end); } + return pmc_fill(pmc, pmc->start, pmc->end); +} + +int pmc_fill(pmc_t *pmc, u64 start, u64 end) +{ + size_t size_map, off; + + pmc->start = start; + pmc->end = end; + size_map = PAGEMAP_LEN(pmc->end - pmc->start); BUG_ON(pmc->map_len < size_map); BUG_ON(pmc->fd < 0); - if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) { - pmc_zap(pmc); - pr_perror("Can't read %d's pagemap file", pmc->pid); - return -1; + if (pmc->regs) { + struct pm_scan_arg args = { + .size = sizeof(struct pm_scan_arg), + .flags = 0, + .start = pmc->start, + .end = pmc->end, + .vec = (long)pmc->regs, + .vec_len = pmc->regs_max_len, + .max_pages = 0, + /* + * Request pages that are in RAM or swap, excluding + * zero-filled and file-backed pages. + */ + .category_inverted = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_mask = PAGE_IS_PFNZERO | PAGE_IS_FILE, + .category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED, + .return_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_SOFT_DIRTY, + }; + long ret; + + if (kdat.has_pagemap_scan_guard_pages) + args.return_mask |= PAGE_IS_GUARD; + + ret = ioctl(pmc->fd, PAGEMAP_SCAN, &args); + if (ret == -1) { + pr_perror("PAGEMAP_SCAN"); + pmc_zap(pmc); + return -1; + } + pmc->regs_len = ret; + pmc->regs_idx = 0; + pmc->end = args.walk_end; + } else { + for (off = 0; off != size_map;) { + ssize_t ret; + char *ptr = (char *)pmc->map; + + ret = pread(pmc->fd, ptr + off, size_map - off, PAGEMAP_PFN_OFF(pmc->start) + off); + if (ret == -1) { + pmc_zap(pmc); + pr_perror("Can't read %d's pagemap file", pmc->pid); + return -1; + } + off += ret; + } } return 0; } -u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma) +int pmc_get_map(pmc_t *pmc, const struct vma_area *vma) { /* Hit */ if (likely(pmc->start <= vma->e->start && pmc->end >= vma->e->end)) - return __pmc_get_map(pmc, vma->e->start); + return 0; /* Miss, refill the cache */ if (pmc_fill_cache(pmc, vma)) { pr_err("Failed to fill cache for %d (%lx-%lx)\n", pmc->pid, (long)vma->e->start, (long)vma->e->end); - return NULL; + return -1; } - - /* Hit for sure */ - return __pmc_get_map(pmc, vma->e->start); + return 0; } void pmc_fini(pmc_t *pmc) { close_safe(&pmc->fd); xfree(pmc->map); + xfree(pmc->regs); pmc_reset(pmc); } diff --git a/criu/pagemap.c b/criu/pagemap.c index 83f69bba3..6c9c4f7fe 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -168,15 +168,15 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) return 0; } -static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, int nr) +static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, unsigned long int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%u vs %lx:%u\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%" PRIx64 " vs %lx:%lx\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } -static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int read_parent_page(struct page_read *pr, unsigned long vaddr, unsigned long int nr, void *buf, unsigned flags) { struct page_read *ppr = pr->parent; int ret; @@ -195,7 +195,7 @@ static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, v */ do { - int p_nr; + unsigned long int p_nr; pr_debug("\tpr%lu-%u Read from parent\n", pr->img_id, pr->id); ret = ppr->seek_pagemap(ppr, vaddr); @@ -210,7 +210,7 @@ static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, v * read as much as we can. */ p_nr = ppr->pe->nr_pages - (vaddr - ppr->pe->vaddr) / PAGE_SIZE; - pr_info("\tparent has %u pages in\n", p_nr); + pr_info("\tparent has %lu pages in\n", p_nr); if (p_nr > nr) p_nr = nr; @@ -261,7 +261,7 @@ static int read_local_page(struct page_read *pr, unsigned long vaddr, unsigned l break; } - if (opts.auto_dedup) { + if (opts.auto_dedup && !pr->disable_dedup) { ret = punch_hole(pr, pr->pi_off, len, false); if (ret == -1) return -1; @@ -374,7 +374,7 @@ int pagemap_enqueue_iovec(struct page_read *pr, void *buf, unsigned long len, st return 0; } -static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { int ret; unsigned long len = nr * PAGE_SIZE; @@ -402,7 +402,7 @@ static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, int * We cannot use maybe_read_page_local() for streaming images as it uses * pread(), seeking in the file. Instead, we use this custom page reader. */ -static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { unsigned long len = nr * PAGE_SIZE; int fd; @@ -445,7 +445,7 @@ static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vadd return ret; } -static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_pages, void *priv) +static int read_page_complete(unsigned long img_id, unsigned long vaddr, unsigned long int nr_pages, void *priv) { int ret = 0; struct page_read *pr = priv; @@ -463,7 +463,7 @@ static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_ return ret; } -static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { int ret; @@ -474,9 +474,9 @@ static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, int return ret; } -static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) +static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) { - pr_info("pr%lu-%u Read %lx %u pages\n", pr->img_id, pr->id, vaddr, nr); + pr_info("pr%lu-%u Read %lx %lu pages\n", pr->img_id, pr->id, vaddr, nr); pagemap_bound_check(pr->pe, vaddr, nr); if (pagemap_in_parent(pr->pe)) { @@ -682,6 +682,9 @@ static void init_compat_pagemap_entry(PagemapEntry *pe) pe->flags |= PE_PARENT; else if (!pe->has_flags) pe->flags = PE_PRESENT; + + if (!pe->has_nr_pages) + pe->nr_pages = pe->compat_nr_pages; } /* @@ -792,6 +795,7 @@ int open_page_read_at(int dfd, unsigned long img_id, struct page_read *pr, int p pr->bunch.iov_base = NULL; pr->pmes = NULL; pr->pieok = false; + pr->disable_dedup = false; pr->pmi = open_image_at(dfd, i_typ, O_RSTR, img_id); if (!pr->pmi) @@ -852,6 +856,14 @@ int open_page_read(unsigned long img_id, struct page_read *pr, int pr_flags) #define DUP_IDS_BASE 1000 +void page_read_disable_dedup(struct page_read *pr) +{ + pr_debug("disable dedup, id: %d\n", pr->id); + pr->disable_dedup = true; + if (pr->parent) + page_read_disable_dedup(pr->parent); +} + void dup_page_read(struct page_read *src, struct page_read *dst) { static int dup_ids = 1; diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index ee4fa86f4..e19847b37 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -9,7 +9,6 @@ #include "common/compiler.h" #include "types.h" #include "protobuf.h" -#include "images/sa.pb-c.h" #include "images/timer.pb-c.h" #include "images/creds.pb-c.h" #include "images/core.pb-c.h" @@ -104,17 +103,24 @@ static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c BUILD_BUG_ON(sizeof(ce->cap_prm[0]) != sizeof(c->cap_prm[0])); BUILD_BUG_ON(sizeof(ce->cap_eff[0]) != sizeof(c->cap_eff[0])); BUILD_BUG_ON(sizeof(ce->cap_bnd[0]) != sizeof(c->cap_bnd[0])); + BUILD_BUG_ON(sizeof(ce->cap_amb[0]) != sizeof(c->cap_amb[0])); BUG_ON(ce->n_cap_inh != CR_CAP_SIZE); BUG_ON(ce->n_cap_prm != CR_CAP_SIZE); BUG_ON(ce->n_cap_eff != CR_CAP_SIZE); BUG_ON(ce->n_cap_bnd != CR_CAP_SIZE); + BUG_ON(ce->n_cap_amb != CR_CAP_SIZE); memcpy(ce->cap_inh, c->cap_inh, sizeof(c->cap_inh[0]) * CR_CAP_SIZE); memcpy(ce->cap_prm, c->cap_prm, sizeof(c->cap_prm[0]) * CR_CAP_SIZE); memcpy(ce->cap_eff, c->cap_eff, sizeof(c->cap_eff[0]) * CR_CAP_SIZE); memcpy(ce->cap_bnd, c->cap_bnd, sizeof(c->cap_bnd[0]) * CR_CAP_SIZE); + memcpy(ce->cap_amb, c->cap_amb, sizeof(c->cap_amb[0]) * CR_CAP_SIZE); + if (c->no_new_privs > 0) { + ce->no_new_privs = c->no_new_privs; + ce->has_no_new_privs = true; + } ce->secbits = c->secbits; ce->n_groups = c->ngroups; @@ -195,13 +201,13 @@ int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, struct parasit ret = compel_get_thread_regs(tctl, save_task_regs, core); if (ret) { pr_err("Can't obtain regs for thread %d\n", pid); - goto err_rth; + return -1; } ret = compel_arch_fetch_thread_area(tctl); if (ret) { pr_err("Can't obtain thread area of %d\n", pid); - goto err_rth; + return -1; } compel_arch_get_tls_thread(tctl, &args->tls); @@ -211,223 +217,17 @@ int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, struct parasit ret = compel_run_in_thread(tctl, PARASITE_CMD_DUMP_THREAD); if (ret) { pr_err("Can't init thread in parasite %d\n", pid); - goto err_rth; + return -1; } ret = alloc_groups_copy_creds(creds, pc); if (ret) { pr_err("Can't copy creds for thread %d\n", pid); - goto err_rth; + return -1; } - compel_release_thread(tctl); - tid->ns[0].virt = args->tid; return dump_thread_core(pid, core, args); - -err_rth: - compel_release_thread(tctl); - return -1; -} - -int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *item) -{ - TaskCoreEntry *tc = item->core[0]->tc; - struct parasite_dump_sa_args *args; - int ret, sig; - SaEntry *sa, **psa; - - args = compel_parasite_args(ctl, struct parasite_dump_sa_args); - - ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_SIGACTS, ctl); - if (ret < 0) - return ret; - - psa = xmalloc((SIGMAX - 2) * (sizeof(SaEntry *) + sizeof(SaEntry))); - if (!psa) - return -1; - - sa = (SaEntry *)(psa + SIGMAX - 2); - - tc->n_sigactions = SIGMAX - 2; - tc->sigactions = psa; - - for (sig = 1; sig <= SIGMAX; sig++) { - int i = sig - 1; - - if (sig == SIGSTOP || sig == SIGKILL) - continue; - - sa_entry__init(sa); - ASSIGN_TYPED(sa->sigaction, encode_pointer(args->sas[i].rt_sa_handler)); - ASSIGN_TYPED(sa->flags, args->sas[i].rt_sa_flags); - ASSIGN_TYPED(sa->restorer, encode_pointer(args->sas[i].rt_sa_restorer)); -#ifdef CONFIG_MIPS - sa->has_mask_extended = 1; - BUILD_BUG_ON(sizeof(sa->mask) * 2 != sizeof(args->sas[0].rt_sa_mask.sig)); - memcpy(&sa->mask, &(args->sas[i].rt_sa_mask.sig[0]), sizeof(sa->mask)); - memcpy(&sa->mask_extended, &(args->sas[i].rt_sa_mask.sig[1]), sizeof(sa->mask)); -#else - BUILD_BUG_ON(sizeof(sa->mask) != sizeof(args->sas[0].rt_sa_mask.sig)); - memcpy(&sa->mask, args->sas[i].rt_sa_mask.sig, sizeof(sa->mask)); -#endif - sa->has_compat_sigaction = true; - sa->compat_sigaction = !compel_mode_native(ctl); - - *(psa++) = sa++; - } - - return 0; -} - -static void encode_itimer(struct itimerval *v, ItimerEntry *ie) -{ - ie->isec = v->it_interval.tv_sec; - ie->iusec = v->it_interval.tv_usec; - ie->vsec = v->it_value.tv_sec; - ie->vusec = v->it_value.tv_usec; -} - -int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item) -{ - CoreEntry *core = item->core[0]; - struct parasite_dump_itimers_args *args; - int ret; - - args = compel_parasite_args(ctl, struct parasite_dump_itimers_args); - - ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_ITIMERS, ctl); - if (ret < 0) - return ret; - - encode_itimer((&args->real), (core->tc->timers->real)); - encode_itimer((&args->virt), (core->tc->timers->virt)); - encode_itimer((&args->prof), (core->tc->timers->prof)); - - return 0; -} - -static int core_alloc_posix_timers(TaskTimersEntry *tte, int n, PosixTimerEntry **pte) -{ - int sz; - - /* - * Will be free()-ed in core_entry_free() - */ - - sz = n * (sizeof(PosixTimerEntry *) + sizeof(PosixTimerEntry)); - tte->posix = xmalloc(sz); - if (!tte->posix) - return -1; - - tte->n_posix = n; - *pte = (PosixTimerEntry *)(tte->posix + n); - return 0; -} - -static int encode_notify_thread_id(pid_t rtid, struct pstree_item *item, PosixTimerEntry *pte) -{ - pid_t vtid = 0; - int i; - - if (rtid == 0) - return 0; - - if (!(root_ns_mask & CLONE_NEWPID)) { - /* Non-pid-namespace case */ - pte->notify_thread_id = rtid; - pte->has_notify_thread_id = true; - return 0; - } - - /* Pid-namespace case */ - if (!kdat.has_nspid) { - pr_err("Have no NSpid support to dump notify thread id in pid namespace\n"); - return -1; - } - - for (i = 0; i < item->nr_threads; i++) { - if (item->threads[i].real != rtid) - continue; - - vtid = item->threads[i].ns[0].virt; - break; - } - - if (vtid == 0) { - pr_err("Unable to convert the notify thread id %d\n", rtid); - return -1; - } - - pte->notify_thread_id = vtid; - pte->has_notify_thread_id = true; - return 0; -} - -static int encode_posix_timer(struct pstree_item *item, struct posix_timer *v, struct proc_posix_timer *vp, - PosixTimerEntry *pte) -{ - pte->it_id = vp->spt.it_id; - pte->clock_id = vp->spt.clock_id; - pte->si_signo = vp->spt.si_signo; - pte->it_sigev_notify = vp->spt.it_sigev_notify; - pte->sival_ptr = encode_pointer(vp->spt.sival_ptr); - - pte->overrun = v->overrun; - - pte->isec = v->val.it_interval.tv_sec; - pte->insec = v->val.it_interval.tv_nsec; - pte->vsec = v->val.it_value.tv_sec; - pte->vnsec = v->val.it_value.tv_nsec; - - if (encode_notify_thread_id(vp->spt.notify_thread_id, item, pte)) - return -1; - - return 0; -} - -int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, - struct pstree_item *item) -{ - CoreEntry *core = item->core[0]; - TaskTimersEntry *tte = core->tc->timers; - PosixTimerEntry *pte; - struct proc_posix_timer *temp; - struct parasite_dump_posix_timers_args *args; - int ret, exit_code = -1; - int args_size; - int i; - - if (core_alloc_posix_timers(tte, proc_args->timer_n, &pte)) - return -1; - - args_size = posix_timers_dump_size(proc_args->timer_n); - args = compel_parasite_args_s(ctl, args_size); - args->timer_n = proc_args->timer_n; - - i = 0; - list_for_each_entry(temp, &proc_args->timers, list) { - args->timer[i].it_id = temp->spt.it_id; - i++; - } - - ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_POSIX_TIMERS, ctl); - if (ret < 0) - goto end_posix; - - i = 0; - list_for_each_entry(temp, &proc_args->timers, list) { - posix_timer_entry__init(&pte[i]); - if (encode_posix_timer(item, &args->timer[i], temp, &pte[i])) - goto end_posix; - tte->posix[i] = &pte[i]; - i++; - } - - exit_code = 0; -end_posix: - free_posix_timers(proc_args); - return exit_code; } int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc) @@ -435,6 +235,7 @@ int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_mis struct parasite_dump_misc *ma; ma = compel_parasite_args(ctl, struct parasite_dump_misc); + ma->has_membarrier_get_registrations = kdat.has_membarrier_get_registrations; if (compel_rpc_call_sync(PARASITE_CMD_DUMP_MISC, ctl) < 0) return -1; @@ -513,6 +314,7 @@ int parasite_dump_cgroup(struct parasite_ctl *ctl, struct parasite_dump_cgroup_a struct parasite_dump_cgroup_args *ca; ca = compel_parasite_args(ctl, struct parasite_dump_cgroup_args); + memcpy(ca->thread_cgrp, cgroup->thread_cgrp, sizeof(ca->thread_cgrp)); ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_CGROUP, ctl); if (ret) { pr_err("Parasite failed to dump /proc/self/cgroup\n"); @@ -619,7 +421,7 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, ictx->flags |= INFECT_NO_MEMFD; if (fault_injected(FI_PARASITE_CONNECT)) ictx->flags |= INFECT_FAIL_CONNECT; - if (fault_injected(FI_NO_BREAKPOINTS)) + if (fault_injected(FI_NO_BREAKPOINTS) || !kdat.has_breakpoints) ictx->flags |= INFECT_NO_BREAKPOINTS; if (kdat.compat_cr) ictx->flags |= INFECT_COMPATIBLE; diff --git a/criu/pidfd-store.c b/criu/pidfd-store.c index b15568e08..110f7802a 100644 --- a/criu/pidfd-store.c +++ b/criu/pidfd-store.c @@ -13,6 +13,7 @@ #include "log.h" #include "util.h" #include "pidfd-store.h" +#include "sockets.h" struct pidfd_entry { pid_t pid; @@ -94,13 +95,11 @@ int init_pidfd_store_sk(pid_t pid, int sk) * This is similar to how fdstore_init() works. */ if (addrlen == sizeof(sa_family_t)) { - if (setsockopt(pidfd_store_sk, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0], sizeof(buf[0])) < 0 || - setsockopt(pidfd_store_sk, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1], sizeof(buf[1])) < 0) { - pr_perror("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE"); + if (sk_setbufs(pidfd_store_sk, buf)) { goto err; } - addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-pidfd-store-%d-%d-%" PRIx64, pid, sk, + addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-pidfd-store-%d-%d-%s", pid, sk, criu_run_id); addrlen += sizeof(addr.sun_family); diff --git a/criu/pidfd.c b/criu/pidfd.c new file mode 100644 index 000000000..ae32025b0 --- /dev/null +++ b/criu/pidfd.c @@ -0,0 +1,305 @@ +#include "common/lock.h" +#include "imgset.h" +#include "pidfd.h" +#include "fdinfo.h" +#include "pidfd.pb-c.h" +#include "protobuf.h" +#include "pstree.h" +#include +#include +#include +#include "common/bug.h" +#include "rst-malloc.h" + +#include "compel/plugins/std/syscall-codes.h" + +#undef LOG_PREFIX +#define LOG_PREFIX "pidfd: " + +#ifndef PIDFD_THREAD +#define PIDFD_THREAD O_EXCL +#endif + +struct pidfd_info { + PidfdEntry *pidfe; + struct file_desc d; + + struct dead_pidfd *dead; + struct pidfd_info *next; +}; + +struct dead_pidfd { + unsigned int ino; + int creator_id; + + struct hlist_node hash; + struct pidfd_info *list; +}; + +#define DEAD_PIDFD_HASH_SIZE 32 +static struct hlist_head dead_pidfd_hash[DEAD_PIDFD_HASH_SIZE]; + +void init_dead_pidfd_hash(void) +{ + for (int i = 0; i < DEAD_PIDFD_HASH_SIZE; i++) + INIT_HLIST_HEAD(&dead_pidfd_hash[i]); +} + +static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino) +{ + struct dead_pidfd *dead; + struct hlist_head *chain; + + chain = &dead_pidfd_hash[ino % DEAD_PIDFD_HASH_SIZE]; + hlist_for_each_entry(dead, chain, hash) { + if (dead->ino == ino) { + return dead; + } + } + + return NULL; +} + +int is_pidfd_link(char *link) +{ + /* + * pidfs was introduced in Linux 6.9 + * before which anonymous-inodes were used + */ + return is_anon_link_type(link, "[pidfd]"); +} + +static void pr_info_pidfd(char *action, PidfdEntry *pidfe) +{ + pr_info("%s: id %#08x flags %u NSpid %d ino %u\n", + action, pidfe->id, pidfe->flags, pidfe->nspid, pidfe->ino + ); +} + +static int dump_one_pidfd(int pidfd, u32 id, const struct fd_parms *p) +{ + struct pidfd_dump_info pidfd_info = {.pidfe = PIDFD_ENTRY__INIT}; + FileEntry fe = FILE_ENTRY__INIT; + + if (parse_fdinfo(pidfd, FD_TYPES__PIDFD, &pidfd_info)) + return -1; + + if (p->flags & PIDFD_THREAD) { + pr_err("PIDFD_THREAD flag is currently not supported\n"); + return -1; + } + + /* + * Check if the pid pidfd refers to is part of process tree + * This ensures the process will exist on restore. + */ + if (pidfd_info.pid != -1 && !pstree_item_by_real(pidfd_info.pid)) { + pr_err("pidfd pid %d is not a part of process tree..\n", + pidfd_info.pid); + return -1; + } + + pidfd_info.pidfe.id = id; + pidfd_info.pidfe.flags = (p->flags & ~O_RDWR); + pidfd_info.pidfe.fown = (FownEntry *)&p->fown; + + fe.type = FD_TYPES__PIDFD; + fe.id = pidfd_info.pidfe.id; + fe.pidfd = &pidfd_info.pidfe; + + pr_info_pidfd("Dumping", &pidfd_info.pidfe); + return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); +} + +const struct fdtype_ops pidfd_dump_ops = { + .type = FD_TYPES__PIDFD, + .dump = dump_one_pidfd, +}; + +static int pidfd_open(pid_t pid, int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int create_tmp_process(void) +{ + int tmp_process; + tmp_process = fork(); + if (tmp_process < 0) { + pr_perror("Could not fork"); + return -1; + } else if (tmp_process == 0) { + while(1) + sleep(1); + } + return tmp_process; +} + +static int kill_helper(pid_t pid) +{ + int status; + sigset_t blockmask, oldmask; + + /* + * Block SIGCHLD to prevent interfering from sigchld_handler() + * and to properly handle the tmp process termination without + * a race condition. A similar approach is used in cr_system(). + */ + sigemptyset(&oldmask); + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + if (sigprocmask(SIG_BLOCK, &blockmask, &oldmask) == -1) { + pr_perror("Cannot set mask of blocked signals"); + goto err; + } + + if (kill(pid, SIGKILL) < 0) { + pr_perror("Could not kill temporary process with pid: %d", pid); + goto err; + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("Could not wait on temporary process with pid: %d", pid); + goto err; + } + + /* Restore the original signal mask after tmp process has terminated */ + if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) { + pr_perror("Cannot clear blocked signals"); + goto err; + } + + if (!WIFSIGNALED(status)) { + pr_err("Expected temporary process to be terminated by a signal\n"); + goto err; + } + + if (WTERMSIG(status) != SIGKILL) { + pr_err("Expected temporary process to be terminated by SIGKILL\n"); + goto err; + } + + return 0; +err: + return -1; +} + +static int open_one_pidfd(struct file_desc *d, int *new_fd) +{ + struct pidfd_info *info, *child; + struct dead_pidfd *dead = NULL; + pid_t pid; + int pidfd; + + info = container_of(d, struct pidfd_info, d); + if (info->pidfe->nspid != -1) { + pidfd = pidfd_open(info->pidfe->nspid, info->pidfe->flags); + if (pidfd < 0) { + pr_perror("Could not open pidfd for %d", info->pidfe->nspid); + goto err_close; + } + goto out; + } + + dead = lookup_dead_pidfd(info->pidfe->ino); + BUG_ON(!dead); + + if (info->dead && info->dead->creator_id != info->pidfe->id) { + int ret = recv_desc_from_peer(&info->d, &pidfd); + if (ret != 0) { + if (ret != 1) + pr_err("Can't get fd\n"); + return ret; + } + goto out; + } + + pid = create_tmp_process(); + if (pid < 0) + goto err_close; + + for (child = dead->list; child; child = child->next) { + if (child == info) + continue; + pidfd = pidfd_open(pid, child->pidfe->flags); + if (pidfd < 0) { + pr_perror("Could not open pidfd for %d", child->pidfe->nspid); + goto err_close; + } + + if (send_desc_to_peer(pidfd, &child->d)) { + pr_perror("Can't send file descriptor"); + close(pidfd); + return -1; + } + close(pidfd); + } + + pidfd = pidfd_open(pid, info->pidfe->flags); + if (pidfd < 0) { + pr_perror("Could not open pidfd for %d", info->pidfe->nspid); + goto err_close; + } + if (kill_helper(pid)) + goto err_close; +out: + if (rst_file_params(pidfd, info->pidfe->fown, info->pidfe->flags)) { + goto err_close; + } + + *new_fd = pidfd; + return 0; +err_close: + pr_err("Can't create pidfd %#08x NSpid: %d flags: %u\n", + info->pidfe->id, info->pidfe->nspid, info->pidfe->flags); + return -1; +} + +static struct file_desc_ops pidfd_desc_ops = { + .type = FD_TYPES__PIDFD, + .open = open_one_pidfd +}; + +static int collect_one_pidfd(void *obj, ProtobufCMessage *msg, struct cr_img *i) +{ + struct dead_pidfd *dead; + struct pidfd_info *info = obj; + + info->pidfe = pb_msg(msg, PidfdEntry); + pr_info_pidfd("Collected ", info->pidfe); + + info->dead = NULL; + if (info->pidfe->nspid != -1) + goto out; + + dead = lookup_dead_pidfd(info->pidfe->ino); + if (!dead) { + dead = xmalloc(sizeof(*dead)); + if (!dead) { + pr_err("Could not allocate memory..\n"); + return -1; + } + + INIT_HLIST_NODE(&dead->hash); + dead->list = NULL; + dead->ino = info->pidfe->ino; + dead->creator_id = info->pidfe->id; + hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % DEAD_PIDFD_HASH_SIZE]); + } + + info->dead = dead; + info->next = dead->list; + dead->list = info; + if (dead->creator_id > info->pidfe->id) + dead->creator_id = info->pidfe->id; + +out: + return file_desc_add(&info->d, info->pidfe->id, &pidfd_desc_ops); +} + +struct collect_image_info pidfd_cinfo = { + .fd_type = CR_FD_PIDFD, + .pb_type = PB_PIDFD, + .priv_size = sizeof(struct pidfd_info), + .collect = collect_one_pidfd, +}; diff --git a/criu/pie/Makefile b/criu/pie/Makefile index 265dcf82b..60c7f1e94 100644 --- a/criu/pie/Makefile +++ b/criu/pie/Makefile @@ -18,6 +18,15 @@ ifeq ($(ARCH),mips) ccflags-y += -mno-abicalls -fno-pic endif +# -mshstk required for CET instructions +ifeq ($(ARCH),x86) + ccflags-y += -mshstk +endif + +ifeq ($(ARCH),riscv64) + ccflags-y += -fno-stack-protector +endif + LDS := compel/arch/$(ARCH)/scripts/compel-pack.lds.S restorer-obj-y += parasite-vdso.o ./$(ARCH_DIR)/vdso-pie.o @@ -38,6 +47,10 @@ ifeq ($(ARCH),ppc64) restorer-obj-y += ./$(ARCH_DIR)/vdso-trampoline.o endif +ifeq ($(ARCH),riscv64) + restorer-obj-y += ./$(ARCH_DIR)/vdso-lookup.o +endif + define gen-pie-rules $(1)-obj-y += $(1).o $(1)-obj-e += pie.lib.a diff --git a/criu/pie/Makefile.library b/criu/pie/Makefile.library index da2a2fab3..d96a7ac32 100644 --- a/criu/pie/Makefile.library +++ b/criu/pie/Makefile.library @@ -27,3 +27,7 @@ CFLAGS += $(CFLAGS_PIE) ifeq ($(ARCH),mips) CFLAGS += -fno-stack-protector -DCR_NOGLIBC -mno-abicalls -fno-pic endif + +ifeq ($(ARCH),riscv64) + ccflags-y += -fno-stack-protector +endif \ No newline at end of file diff --git a/criu/pie/parasite-vdso.c b/criu/pie/parasite-vdso.c index 355007fa9..f3ad3107f 100644 --- a/criu/pie/parasite-vdso.c +++ b/criu/pie/parasite-vdso.c @@ -45,6 +45,7 @@ static int remap_one(char *who, unsigned long *from, unsigned long to, size_t si static int park_at(struct vdso_maps *rt, unsigned long vdso, unsigned long vvar) { unsigned long vvar_size = rt->sym.vvar_size; + unsigned long vvar_vclock_size = rt->sym.vvar_vclock_size; unsigned long vdso_size = rt->sym.vdso_size; int ret; @@ -54,8 +55,24 @@ static int park_at(struct vdso_maps *rt, unsigned long vdso, unsigned long vvar) std_log_set_gettimeofday(NULL); /* stop using vdso for timings */ - if (vvar) + if (vvar) { + /* + * v6.13-rc1~172^2~9 splits the vvar vma in two parts vvar and + * vvar_clock. The last one is mapped right after the first + * one. + */ + if (vvar_vclock_size) { + unsigned long from; + + vvar_size -= vvar_vclock_size; + from = rt->vvar_start + vvar_size; + + ret = remap_one("rt-vvar", &from, vvar + vvar_size, vvar_vclock_size); + if (ret) + return ret; + } ret = remap_one("rt-vvar", &rt->vvar_start, vvar, vvar_size); + } if (!ret) vdso_update_gtod_addr(rt); diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index e7eb1fcb6..c966e9e62 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -14,6 +13,7 @@ #include "int.h" #include "types.h" #include +#include "linux/mount.h" #include "parasite.h" #include "fcntl.h" #include "prctl.h" @@ -101,7 +101,7 @@ static int dump_pages(struct parasite_dump_pages_args *args) } if (spliced_bytes != args->nr_pages * PAGE_SIZE) { sys_close(p); - pr_err("Can't splice all pages to pipe (%ld/%d)\n", spliced_bytes, args->nr_pages); + pr_err("Can't splice all pages to pipe (%ld/%ld)\n", spliced_bytes, args->nr_pages); return -1; } @@ -211,6 +211,63 @@ out: return ret; } +/* + * Returns a membarrier() registration command (it is a bitmask) if the process + * was registered for specified (as a bit index) membarrier()-issuing command; + * returns zero otherwise. + */ +static int get_membarrier_registration_mask(int cmd_bit) +{ + unsigned cmd = 1 << cmd_bit; + int ret; + + /* + * Issuing a barrier will be successful only if the process was registered + * for this type of membarrier. All errors are a sign that the type issued + * was not registered (EPERM) or not supported by kernel (EINVAL or ENOSYS). + */ + ret = sys_membarrier(cmd, 0, 0); + if (ret && ret != -EPERM && ret != -EINVAL && ret != -ENOSYS) { + pr_err("membarrier(1 << %d) returned %d\n", cmd_bit, ret); + return -1; + } + pr_debug("membarrier(1 << %d) returned %d\n", cmd_bit, ret); + /* + * For supported registrations, MEMBARRIER_CMD_REGISTER_xxx = MEMBARRIER_CMD_xxx << 1. + * See: enum membarrier_cmd in include/uapi/linux/membarrier.h in kernel sources. + */ + return ret ? 0 : cmd << 1; +} + +/* + * It would be better to check the following with BUILD_BUG_ON, but we might + * have an old linux/membarrier.h header without necessary enum values. + */ +#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED 3 +#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE 5 +#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ 7 +#define MEMBARRIER_CMDBIT_GET_REGISTRATIONS 9 + +static int dump_membarrier_compat(int *membarrier_registration_mask) +{ + int ret; + + *membarrier_registration_mask = 0; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED); + if (ret < 0) + return -1; + *membarrier_registration_mask |= ret; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE); + if (ret < 0) + return -1; + *membarrier_registration_mask |= ret; + ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ); + if (ret < 0) + return -1; + *membarrier_registration_mask |= ret; + return 0; +} + static int dump_misc(struct parasite_dump_misc *args) { int ret; @@ -225,6 +282,19 @@ static int dump_misc(struct parasite_dump_misc *args) args->dumpable = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0); args->thp_disabled = sys_prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); + if (args->has_membarrier_get_registrations) { + ret = sys_membarrier(1 << MEMBARRIER_CMDBIT_GET_REGISTRATIONS, 0, 0); + if (ret < 0) { + pr_err("membarrier(1 << %d) returned %d\n", MEMBARRIER_CMDBIT_GET_REGISTRATIONS, ret); + return -1; + } + args->membarrier_registration_mask = ret; + } else { + ret = dump_membarrier_compat(&args->membarrier_registration_mask); + if (ret) + return ret; + } + ret = sys_prctl(PR_GET_CHILD_SUBREAPER, (unsigned long)&args->child_subreaper, 0, 0, 0); if (ret) pr_err("PR_GET_CHILD_SUBREAPER failed (%d)\n", ret); @@ -254,6 +324,7 @@ static int dump_creds(struct parasite_dump_creds *args) args->cap_prm[i] = data[i].prm; args->cap_inh[i] = data[i].inh; args->cap_bnd[i] = 0; + args->cap_amb[i] = 0; for (j = 0; j < 32; j++) { if (j + i * 32 > args->cap_last_cap) @@ -266,8 +337,21 @@ static int dump_creds(struct parasite_dump_creds *args) if (ret) args->cap_bnd[i] |= (1 << j); } + + for (j = 0; j < 32; j++) { + if (j + i * 32 > args->cap_last_cap) + break; + ret = sys_prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, j + i * 32, 0, 0); + if (ret < 0) { + pr_err("Unable to read ambient capability %d: %d\n", j + i * 32, ret); + return -1; + } + if (ret) + args->cap_amb[i] |= (1 << j); + } } + args->no_new_privs = sys_prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); args->secbits = sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0); ret = sys_getgroups(0, NULL); @@ -745,7 +829,7 @@ static int parasite_dump_cgroup(struct parasite_dump_cgroup_args *args) return -1; } - cgroup = sys_openat(proc, "self/cgroup", O_RDONLY, 0); + cgroup = sys_openat(proc, args->thread_cgrp, O_RDONLY, 0); sys_close(proc); if (cgroup < 0) { pr_err("can't get /proc/self/cgroup fd\n"); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index f80b68359..0a8aba41b 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "linux/userfaultfd.h" @@ -27,6 +28,7 @@ #include #include #include +#include "mman.h" #include "signal.h" #include "prctl.h" #include "criu-log.h" @@ -48,7 +50,17 @@ #include "images/inventory.pb-c.h" #include "shmem.h" -#include "restorer.h" + +/* + * sys_getgroups() buffer size. Not too much, to avoid stack overflow. + */ +#define MAX_GETGROUPS_CHECKED (512 / sizeof(unsigned int)) + +/* + * Memory overhead limit for reading VMA when auto_dedup is enabled. + * An arbitrarily chosen trade-off point between speed and memory usage. + */ +#define AUTO_DEDUP_OVERHEAD_BYTES (128 << 20) #ifndef PR_SET_PDEATHSIG #define PR_SET_PDEATHSIG 1 @@ -66,6 +78,10 @@ #define FALLOC_FL_PUNCH_HOLE 0x02 #endif +#ifndef ARCH_RT_SIGRETURN_RST +#define ARCH_RT_SIGRETURN_RST ARCH_RT_SIGRETURN +#endif + #define sys_prctl_safe(opcode, val1, val2, val3) \ ({ \ long __ret = sys_prctl(opcode, val1, val2, val3, 0); \ @@ -92,7 +108,7 @@ bool fault_injected(enum faults f) * Hint: compel on aarch64 shall learn relocs for that. */ static unsigned __page_size; -unsigned page_size(void) +unsigned long page_size(void) { return __page_size; } @@ -184,37 +200,58 @@ static int lsm_set_label(char *label, char *type, int procfd) return 0; } -static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_type) +static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_type, uid_t uid) { CredsEntry *ce = &args->creds; int b, i, ret; struct cap_header hdr; struct cap_data data[_LINUX_CAPABILITY_U32S_3]; - - /* - * We're still root here and thus can do it without failures. - */ + int ruid, euid, suid, fsuid; + int rgid, egid, sgid, fsgid; /* * Setup supplementary group IDs early. */ if (args->groups) { - ret = sys_setgroups(ce->n_groups, args->groups); - if (ret) { - pr_err("Can't setup supplementary group IDs: %d\n", ret); - return -1; + /* + * We may be in an unprivileged user namespace where setgroups + * is disabled. If the current list of groups is already what + * we want, skip the call to setgroups. + */ + unsigned int gids[MAX_GETGROUPS_CHECKED]; + int n = sys_getgroups(MAX_GETGROUPS_CHECKED, gids); + if (n != ce->n_groups || memcmp(gids, args->groups, n * sizeof(*gids))) { + ret = sys_setgroups(ce->n_groups, args->groups); + if (ret) { + pr_err("Can't setgroups([%zu gids]): %d\n", ce->n_groups, ret); + return -1; + } } } + /* + * Compare xids with current values. If all match then we can skip + * setting them (which requires extra capabilities). + */ + fsuid = sys_setfsuid(-1); + fsgid = sys_setfsgid(-1); + if (sys_getresuid(&ruid, &euid, &suid) == 0 && sys_getresgid(&rgid, &egid, &sgid) == 0 && ruid == ce->uid && + euid == ce->euid && suid == ce->suid && rgid == ce->gid && egid == ce->egid && sgid == ce->sgid && + fsuid == ce->fsuid && fsgid == ce->fsgid) { + goto skip_xids; + } + /* * First -- set the SECURE_NO_SETUID_FIXUP bit not to * lose caps bits when changing xids. */ - ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0); - if (ret) { - pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret); - return -1; + if (!uid) { + ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0); + if (ret) { + pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret); + return -1; + } } /* @@ -247,15 +284,18 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ return -1; } +skip_xids: /* * Third -- restore securebits. We don't need them in any * special state any longer. */ - ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); - if (ret) { - pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); - return -1; + if (sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0) != ce->secbits) { + ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); + if (ret) { + pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); + return -1; + } } /* @@ -271,10 +311,18 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ /* already set */ continue; ret = sys_prctl(PR_CAPBSET_DROP, i + b * 32, 0, 0, 0); - if (ret) { + if (!ret) + continue; + if (!ce->has_no_new_privs || !ce->no_new_privs || args->cap_prm[b] & (1 << i)) { pr_err("Unable to drop capability %d: %d\n", i + b * 32, ret); return -1; } + /* + * If prctl(NO_NEW_PRIVS) is going to be set then it + * will prevent inheriting the capabilities not in + * the permitted set. + */ + pr_warn("Unable to drop capability %d from bset: %d (but NO_NEW_PRIVS will drop it)\n", i + b * 32, ret); } } @@ -300,6 +348,22 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ return -1; } + for (b = 0; b < CR_CAP_SIZE; b++) { + for (i = 0; i < 32; i++) { + if (b * 32 + i > args->cap_last_cap) + break; + if ((args->cap_amb[b] & (1 << i)) == 0) + /* don't set */ + continue; + ret = sys_prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i + b * 32, 0, 0); + if (!ret) + continue; + pr_err("Unable to raise ambient capability %d: %d\n", i + b * 32, ret); + return -1; + } + } + + if (lsm_type != LSMTYPE__SELINUX) { /* * SELinux does not support setting the process context for @@ -315,6 +379,14 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ if (lsm_set_label(args->lsm_sockcreate, "sockcreate", procfd) < 0) return -1; + if (ce->has_no_new_privs && ce->no_new_privs) { + ret = sys_prctl(PR_SET_NO_NEW_PRIVS, ce->no_new_privs, 0, 0, 0); + if (ret) { + pr_err("Unable to set no_new_privs=%d: %d\n", ce->no_new_privs, ret); + return -1; + } + } + return 0; } @@ -579,14 +651,99 @@ static int restore_thread_common(struct thread_restore_args *args) static void noinline rst_sigreturn(unsigned long new_sp, struct rt_sigframe *sigframe) { - ARCH_RT_SIGRETURN(new_sp, sigframe); + ARCH_RT_SIGRETURN_RST(new_sp, sigframe); +} + +static int send_cg_set(int sk, int cg_set) +{ + struct cmsghdr *ch; + struct msghdr h; + /* + * 0th is the dummy call address for compatibility with userns helper + * 1st is the cg_set + */ + struct iovec iov[2]; + char cmsg[CMSG_SPACE(sizeof(struct ucred))] = {}; + int ret, *dummy = NULL; + struct ucred *ucred; + + iov[0].iov_base = &dummy; + iov[0].iov_len = sizeof(dummy); + iov[1].iov_base = &cg_set; + iov[1].iov_len = sizeof(cg_set); + + h.msg_iov = iov; + h.msg_iovlen = sizeof(iov) / sizeof(struct iovec); + h.msg_name = NULL; + h.msg_namelen = 0; + h.msg_flags = 0; + + h.msg_control = cmsg; + h.msg_controllen = sizeof(cmsg); + ch = CMSG_FIRSTHDR(&h); + ch->cmsg_len = CMSG_LEN(sizeof(struct ucred)); + ch->cmsg_level = SOL_SOCKET; + ch->cmsg_type = SCM_CREDENTIALS; + + ucred = (struct ucred *)CMSG_DATA(ch); + /* + * We still have privilege in this namespace so we can send + * thread id instead of pid of main thread, uid, gid as 0 + * since these 2 are ignored in cgroupd + */ + ucred->pid = sys_gettid(); + ucred->uid = 0; + ucred->gid = 0; + + ret = sys_sendmsg(sk, &h, 0); + if (ret < 0) { + pr_err("Unable to send packet to cgroupd %d\n", ret); + return -1; + } + + return 0; +} + +/* + * As the cgroupd socket is shared among threads and processes, this + * should be called with task_entries->cgroupd_sync_lock held. + */ +static int recv_cg_set_restore_ack(int sk) +{ + struct cmsghdr *ch; + struct msghdr h = {}; + char cmsg[CMSG_SPACE(sizeof(struct ucred))]; + struct ucred *cred; + int ret; + + h.msg_control = cmsg; + h.msg_controllen = sizeof(cmsg); + + ret = sys_recvmsg(sk, &h, 0); + if (ret < 0) { + pr_err("Unable to receive from cgroupd %d\n", ret); + return -1; + } + + if (h.msg_controllen != sizeof(cmsg)) { + pr_err("The message from cgroupd is truncated\n"); + return -1; + } + + ch = CMSG_FIRSTHDR(&h); + cred = (struct ucred *)CMSG_DATA(ch); + if (cred->pid != sys_gettid()) { + pr_err("cred pid %d != gettid\n", cred->pid); + return -1; + } + return 0; } /* * Threads restoration via sigreturn. Note it's locked * routine and calls for unlock at the end. */ -long __export_restore_thread(struct thread_restore_args *args) +__visible long __export_restore_thread(struct thread_restore_args *args) { struct rt_sigframe *rt_sigframe; k_rtsigset_t to_block; @@ -599,6 +756,10 @@ long __export_restore_thread(struct thread_restore_args *args) goto core_restore_end; } + /* restore original shadow stack */ + if (arch_shstk_restore(&args->shstk)) + goto core_restore_end; + /* All signals must be handled by thread leader */ ksigfillset(&to_block); ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t)); @@ -609,6 +770,24 @@ long __export_restore_thread(struct thread_restore_args *args) rt_sigframe = (void *)&args->mz->rt_sigframe; + if (args->cg_set != -1) { + int err = 0; + + mutex_lock(&task_entries_local->cgroupd_sync_lock); + + pr_info("Restore cg_set in thread cg_set: %d\n", args->cg_set); + + err = send_cg_set(args->cgroupd_sk, args->cg_set); + if (!err) + err = recv_cg_set_restore_ack(args->cgroupd_sk); + + mutex_unlock(&task_entries_local->cgroupd_sync_lock); + sys_close(args->cgroupd_sk); + + if (err) + goto core_restore_end; + } + if (restore_thread_common(args)) goto core_restore_end; @@ -634,7 +813,7 @@ long __export_restore_thread(struct thread_restore_args *args) if (restore_seccomp(args)) BUG(); - ret = restore_creds(args->creds_args, args->ta->proc_fd, args->ta->lsm_type); + ret = restore_creds(args->creds_args, args->ta->proc_fd, args->ta->lsm_type, args->ta->uid); ret = ret || restore_dumpable_flag(&args->ta->mm); ret = ret || restore_pdeath_sig(args); if (ret) @@ -933,6 +1112,23 @@ static int vma_remap(VmaEntry *vma_entry, int uffd) pr_info("Remap %lx->%lx len %lx\n", src, dst, len); + /* + * SHSTK VMAs are a bit special, in fact we create shstk vma right in the + * shstk_vma_restore() and populate it with contents from a premapped VMA + * (which in turns is just a normal anonymous VMA!). Then, we munmap() this + * premapped VMA. After, we need to adjust vma_premmaped_start(vma_entry) + * to point to a created shstk vma and treat it as a premmaped one in vma_remap(). + */ + if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) { + if (shstk_vma_restore(vma_entry)) { + pr_err("Unable to prepare shadow stack vma for remap %lx -> %lx\n", src, dst); + return -1; + } + + /* shstk_vma_restore() modifies vma premapped address */ + src = vma_premmaped_start(vma_entry); + } + if (src - dst < len) guard = dst; else if (dst - src < len) @@ -957,7 +1153,7 @@ static int vma_remap(VmaEntry *vma_entry, int uffd) * |G|----tgt----| | * * 3. remap src to any other place. - * G prevents src from being remaped on tgt again + * G prevents src from being remapped on tgt again * | |-------------| -> |+++++src+++++| * |G|---tgt-----| | * @@ -1057,9 +1253,23 @@ static int timerfd_arm(struct task_restore_args *args) static int create_posix_timers(struct task_restore_args *args) { - int ret, i; - kernel_timer_t next_id; + int ret, i, exit_code = -1; + kernel_timer_t next_id = 0, timer_id; struct sigevent sev; + bool create_restore_ids = false; + + if (!args->posix_timers_n) + return 0; + + /* prctl returns EINVAL if PR_TIMER_CREATE_RESTORE_IDS isn't supported. */ + ret = sys_prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_ON, 0, 0, 0); + if (ret == 0) { + create_restore_ids = true; + } else if (ret != -EINVAL) { + pr_err("Can't enabled PR_TIMER_CREATE_RESTORE_IDS: %d\n", ret); + return -1; + } for (i = 0; i < args->posix_timers_n; i++) { sev.sigev_notify = args->posix_timers[i].spt.it_sigev_notify; @@ -1071,30 +1281,61 @@ static int create_posix_timers(struct task_restore_args *args) #endif sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr; + if (create_restore_ids) { + /* + * With enabled PR_TIMER_CREATE_RESTORE_IDS, the + * timer_create syscall creates a new timer with the + * specified ID. + */ + timer_id = args->posix_timers[i].spt.it_id; + ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); + if (ret < 0) { + pr_err("Can't create posix timer - %d: %d\n", i, ret); + goto out; + } + if (timer_id != args->posix_timers[i].spt.it_id) { + pr_err("Unexpected timer id %u (expected %lu)\n", + timer_id, args->posix_timers[i].spt.it_id); + goto out; + } + continue; + } + while (1) { - ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &next_id); + ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); if (ret < 0) { pr_err("Can't create posix timer - %d\n", i); - return ret; + goto out; } - if (next_id == args->posix_timers[i].spt.it_id) + if (timer_id != next_id) { + pr_err("Can't create timers, kernel don't give them consequently\n"); + goto out; + } + next_id++; + + if (timer_id == args->posix_timers[i].spt.it_id) break; - ret = sys_timer_delete(next_id); + ret = sys_timer_delete(timer_id); if (ret < 0) { - pr_err("Can't remove temporaty posix timer 0x%x\n", next_id); - return ret; - } - - if ((long)next_id > args->posix_timers[i].spt.it_id) { - pr_err("Can't create timers, kernel don't give them consequently\n"); - return -1; + pr_err("Can't remove temporaty posix timer 0x%x\n", timer_id); + goto out; } } } - return 0; + exit_code = 0; +out: + if (create_restore_ids) { + ret = sys_prctl(PR_TIMER_CREATE_RESTORE_IDS, + PR_TIMER_CREATE_RESTORE_IDS_OFF, 0, 0, 0); + if (ret != 0) { + pr_err("Can't disable PR_TIMER_CREATE_RESTORE_IDS: %d\n", ret); + exit_code = -1; + } + } + return exit_code; } static void restore_posix_timers(struct task_restore_args *args) @@ -1117,18 +1358,24 @@ unsigned long vdso_rt_size = 0; void *bootstrap_start = NULL; unsigned int bootstrap_len = 0; -void __export_unmap(void) +__visible void __export_unmap(void) { sys_munmap(bootstrap_start, bootstrap_len - vdso_rt_size); } -static void unregister_libc_rseq(struct rst_rseq_param *rseq) +static int unregister_libc_rseq(struct rst_rseq_param *rseq) { - if (!rseq->rseq_abi_pointer) - return; + long ret; - /* can't fail if rseq is registered */ - sys_rseq(decode_pointer(rseq->rseq_abi_pointer), rseq->rseq_abi_size, 1, rseq->signature); + if (!rseq->rseq_abi_pointer) + return 0; + + ret = sys_rseq(decode_pointer(rseq->rseq_abi_pointer), rseq->rseq_abi_size, 1, rseq->signature); + if (ret) { + pr_err("Failed to unregister libc rseq %ld\n", ret); + return -1; + } + return 0; } /* @@ -1324,6 +1571,40 @@ static int fd_poll(int inotify_fd) return sys_ppoll(&pfd, 1, &tmo, NULL, sizeof(sigset_t)); } +/* + * Call preadv() but limit size of the read. Zero `max_to_read` skips the limit. + */ +static ssize_t preadv_limited(int fd, struct iovec *iovs, int nr, off_t offs, size_t max_to_read) +{ + size_t saved_last_iov_len = 0; + ssize_t ret; + + if (max_to_read) { + for (int i = 0; i < nr; ++i) { + if (iovs[i].iov_len <= max_to_read) { + max_to_read -= iovs[i].iov_len; + continue; + } + + if (!max_to_read) { + nr = i; + break; + } + + saved_last_iov_len = iovs[i].iov_len; + iovs[i].iov_len = max_to_read; + nr = i + 1; + break; + } + } + + ret = sys_preadv(fd, iovs, nr, offs); + if (saved_last_iov_len) + iovs[nr - 1].iov_len = saved_last_iov_len; + + return ret; +} + /* * In the worst case buf size should be: * sizeof(struct inotify_event) * 2 + PATH_MAX @@ -1384,6 +1665,54 @@ int cleanup_current_inotify_events(struct task_restore_args *task_args) return 0; } +/* + * Restore membarrier() registrations. + */ +static int restore_membarrier_registrations(int mask) +{ + unsigned long bitmap[1] = { mask }; + int i, err, ret = 0; + + if (!mask) + return 0; + + pr_info("Restoring membarrier() registrations %x\n", mask); + + for_each_bit(i, bitmap) { + err = sys_membarrier(1 << i, 0, 0); + if (!err) + continue; + pr_err("Can't restore membarrier(1 << %d) registration: %d\n", i, err); + ret = -1; + } + + return ret; +} + +static int restore_madv_guard_regions(struct task_restore_args *args) +{ + int i, ret; + + for (i = 0; i < args->vmas_n; i++) { + VmaEntry *vma_entry = args->vmas + i; + size_t len; + + if (!vma_entry_is(vma_entry, VMA_AREA_GUARD)) + continue; + + len = vma_entry->end - vma_entry->start; + ret = sys_madvise(vma_entry->start, len, MADV_GUARD_INSTALL); + if (ret) { + pr_err("madvise(%" PRIx64 ", %zu, MADV_GUARD_INSTALL) " + "failed with %d\n", + vma_entry->start, len, ret); + return -1; + } + } + + return 0; +} + /* * The main routine to restore task via sigreturn. * This one is very special, we never return there @@ -1391,7 +1720,7 @@ int cleanup_current_inotify_events(struct task_restore_args *task_args) * and jump execution to some predefined ip read from * core file. */ -long __export_restore_task(struct task_restore_args *args) +__visible long __export_restore_task(struct task_restore_args *args) { long ret = -1; int i; @@ -1451,6 +1780,9 @@ long __export_restore_task(struct task_restore_args *args) pr_debug("lazy-pages: uffd %d\n", args->uffd); } + if (arch_shstk_switch_to_restorer(&args->shstk)) + goto core_restore_end; + /* * Park vdso/vvar in a safe place if architecture doesn't support * mapping them with arch_prctl(). @@ -1477,7 +1809,8 @@ long __export_restore_task(struct task_restore_args *args) * for instance once the kernel will want to update (struct rseq).cpu_id field: * https://github.com/torvalds/linux/blob/ce522ba9ef7e/kernel/rseq.c#L89 */ - unregister_libc_rseq(&args->libc_rseq); + if (unregister_libc_rseq(&args->libc_rseq)) + goto core_restore_end; if (unmap_old_vmas((void *)args->premmapped_addr, args->premmapped_len, bootstrap_start, bootstrap_len, args->task_size)) @@ -1523,17 +1856,13 @@ long __export_restore_task(struct task_restore_args *args) goto core_restore_end; } - if (args->uffd > -1) { - /* re-enable THP if we disabled it previously */ - if (args->has_thp_enabled) { - int ret; - ret = sys_prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0); - if (ret) { - pr_err("Cannot re-enable THP: %d\n", ret); - goto core_restore_end; - } - } + ret = sys_prctl(PR_SET_THP_DISABLE, args->thp_disabled, 0, 0, 0); + if (ret) { + pr_err("Cannot restore THP_DISABLE=%d flag: %ld\n", args->thp_disabled, ret); + goto core_restore_end; + } + if (args->uffd > -1) { pr_debug("lazy-pages: closing uffd %d\n", args->uffd); /* * All userfaultfd configuration has finished at this point. @@ -1575,7 +1904,12 @@ long __export_restore_task(struct task_restore_args *args) while (nr) { pr_debug("Preadv %lx:%d... (%d iovs)\n", (unsigned long)iovs->iov_base, (int)iovs->iov_len, nr); - r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off); + /* + * If we're requested to punch holes in the file after reading we do + * it to save memory. Limit the reads then to an arbitrary block size. + */ + r = preadv_limited(args->vma_ios_fd, iovs, nr, rio->off, + args->auto_dedup ? AUTO_DEDUP_OVERHEAD_BYTES : 0); if (r < 0) { pr_err("Can't read pages data (%d)\n", (int)r); goto core_restore_end; @@ -1662,6 +1996,9 @@ long __export_restore_task(struct task_restore_args *args) for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) { if (vma_entry->madv & (1ul << m)) { + if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR))) + continue; + ret = sys_madvise(vma_entry->start, vma_entry_len(vma_entry), m); if (ret) { pr_err("madvise(%" PRIx64 ", %" PRIu64 ", %ld) " @@ -1673,6 +2010,13 @@ long __export_restore_task(struct task_restore_args *args) } } + /* + * Restore madvise(MADV_GUARD_INSTALL) + */ + ret = restore_madv_guard_regions(args); + if (ret) + goto core_restore_end; + /* * Tune up the task fields. */ @@ -1702,6 +2046,24 @@ long __export_restore_task(struct task_restore_args *args) .exe_fd = args->fd_exe_link, }; ret = sys_prctl(PR_SET_MM, PR_SET_MM_MAP, (long)&prctl_map, sizeof(prctl_map), 0); + if (ret) { + pr_debug("prctl PR_SET_MM_MAP failed with %d\n", (int)ret); + pr_debug(" .start_code = %" PRIx64 "\n", prctl_map.start_code); + pr_debug(" .end_code = %" PRIx64 "\n", prctl_map.end_code); + pr_debug(" .start_data = %" PRIx64 "\n", prctl_map.start_data); + pr_debug(" .end_data = %" PRIx64 "\n", prctl_map.end_data); + pr_debug(" .start_stack = %" PRIx64 "\n", prctl_map.start_stack); + pr_debug(" .start_brk = %" PRIx64 "\n", prctl_map.start_brk); + pr_debug(" .brk = %" PRIx64 "\n", prctl_map.brk); + pr_debug(" .arg_start = %" PRIx64 "\n", prctl_map.arg_start); + pr_debug(" .arg_end = %" PRIx64 "\n", prctl_map.arg_end); + pr_debug(" .env_start = %" PRIx64 "\n", prctl_map.env_start); + pr_debug(" .env_end = %" PRIx64 "\n", prctl_map.env_end); + pr_debug(" .auxv_size = %" PRIu32 "\n", prctl_map.auxv_size); + for (i = 0; i < prctl_map.auxv_size / sizeof(uint64_t); i++) + pr_debug(" .auxv[%d] = %" PRIx64 "\n", i, prctl_map.auxv[i]); + pr_debug(" .exe_fd = %" PRIu32 "\n", prctl_map.exe_fd); + } if (ret == -EINVAL) { ret = sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE, (long)args->mm.mm_start_code, 0); ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE, (long)args->mm.mm_end_code, 0); @@ -1831,6 +2193,7 @@ long __export_restore_task(struct task_restore_args *args) } if (ret != thread_args[i].pid) { pr_err("Unable to create a thread: %ld\n", ret); + sys_close(fd); mutex_unlock(&task_entries_local->last_pid_mutex); goto core_restore_end; } @@ -1855,6 +2218,9 @@ long __export_restore_task(struct task_restore_args *args) goto core_restore_end; } + if (restore_membarrier_registrations(args->membarrier_registration_mask) < 0) + goto core_restore_end; + pr_info("%ld: Restored\n", sys_getpid()); restore_finish_stage(task_entries_local, CR_STATE_RESTORE); @@ -1915,13 +2281,21 @@ long __export_restore_task(struct task_restore_args *args) * turning off TCP repair is CAP_SYS_NED_ADMIN protected, * thus restore* creds _after_ all of the above. */ - ret = restore_creds(args->t->creds_args, args->proc_fd, args->lsm_type); + ret = restore_creds(args->t->creds_args, args->proc_fd, args->lsm_type, args->uid); ret = ret || restore_dumpable_flag(&args->mm); ret = ret || restore_pdeath_sig(args->t); ret = ret || restore_child_subreaper(args->child_subreaper); futex_set_and_wake(&thread_inprogress, args->nr_threads); + /* + * Shadow stack of the leader can be locked only after all other + * threads were cloned, otherwise they may start with read-only + * shadow stack. + */ + if (arch_shstk_restore(&args->shstk)) + goto core_restore_end; + restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS); if (ret) @@ -1938,7 +2312,7 @@ long __export_restore_task(struct task_restore_args *args) * code below doesn't fail due to bad timing values. */ -#define itimer_armed(args, i) (args->itimers[i].it_interval.tv_sec || args->itimers[i].it_interval.tv_usec) +#define itimer_armed(args, i) (args->itimers[i].it_value.tv_sec || args->itimers[i].it_value.tv_usec) if (itimer_armed(args, 0)) sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL); diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index f1e3239ff..45fb6a648 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -48,10 +49,25 @@ static bool __ptr_struct_oob(uintptr_t ptr, size_t struct_size, uintptr_t start, return __ptr_oob(ptr, start, size) || __ptr_struct_end_oob(ptr, struct_size, start, size); } +/* Local strlen implementation */ +static size_t __strlen(const char *str) +{ + const char *ptr; + + if (!str) + return 0; + + ptr = str; + while (*ptr != '\0') + ptr++; + + return ptr - str; +} + /* * Elf hash, see format specification. */ -static unsigned long elf_hash(const unsigned char *name) +static unsigned long elf_sysv_hash(const unsigned char *name) { unsigned long h = 0, g; @@ -65,6 +81,15 @@ static unsigned long elf_hash(const unsigned char *name) return h; } +/* * The GNU hash format. Taken from glibc. */ +static unsigned long elf_gnu_hash(const unsigned char *name) +{ + unsigned long h = 5381; + for (unsigned char c = *name; c != '\0'; c = *++name) + h = h * 33 + c; + return h; +} + #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define BORD ELFDATA2MSB /* 0x02 */ #else @@ -73,30 +98,51 @@ static unsigned long elf_hash(const unsigned char *name) static int has_elf_identity(Ehdr_t *ehdr) { - /* - * See Elf specification for this magic values. - */ + /* check ELF magic */ + + if (ehdr->e_ident[EI_MAG0] != ELFMAG0 || + ehdr->e_ident[EI_MAG1] != ELFMAG1 || + ehdr->e_ident[EI_MAG2] != ELFMAG2 || + ehdr->e_ident[EI_MAG3] != ELFMAG3) { + pr_err("Invalid ELF magic\n"); + return false; + }; + + /* check ELF class */ #if defined(CONFIG_VDSO_32) - static const char elf_ident[] = { - 0x7f, 0x45, 0x4c, 0x46, 0x01, BORD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + if (ehdr->e_ident[EI_CLASS] != ELFCLASS32) { + pr_err("Unsupported ELF class: %d\n", ehdr->e_ident[EI_CLASS]); + return false; }; #else - static const char elf_ident[] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, BORD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + if (ehdr->e_ident[EI_CLASS] != ELFCLASS64) { + pr_err("Unsupported ELF class: %d\n", ehdr->e_ident[EI_CLASS]); + return false; }; #endif - BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident)); - - if (memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) { - pr_err("ELF header magic mismatch\n"); + /* check ELF data encoding */ + if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) { + pr_err("Unsupported ELF data encoding: %d\n", ehdr->e_ident[EI_DATA]); return false; - } + }; + /* check ELF version */ + if (ehdr->e_ident[EI_VERSION] != EV_CURRENT) { + pr_err("Unsupported ELF version: %d\n", ehdr->e_ident[EI_VERSION]); + return false; + }; + /* check ELF OSABI */ + if (ehdr->e_ident[EI_OSABI] != ELFOSABI_NONE && + ehdr->e_ident[EI_OSABI] != ELFOSABI_LINUX) { + pr_err("Unsupported OSABI version: %d\n", ehdr->e_ident[EI_OSABI]); + return false; + }; return true; } -static int parse_elf_phdr(uintptr_t mem, size_t size, Phdr_t **dynamic, Phdr_t **load) +static int parse_elf_phdr(uintptr_t mem, size_t size, + Phdr_t **dynamic, Phdr_t **load, bool *is_32bit) { Ehdr_t *ehdr = (void *)mem; uintptr_t addr; @@ -111,6 +157,8 @@ static int parse_elf_phdr(uintptr_t mem, size_t size, Phdr_t **dynamic, Phdr_t * if (!has_elf_identity(ehdr)) return -EINVAL; + *is_32bit = ehdr->e_ident[EI_CLASS] != ELFCLASS64; + addr = mem + ehdr->e_phoff; if (__ptr_oob(addr, mem, size)) goto err_oob; @@ -149,11 +197,14 @@ err_oob: * Output parameters are: * @dyn_strtab - address of the symbol table * @dyn_symtab - address of the string table section - * @dyn_hash - address of the symbol hash table + * @dyn_hash - address of the symbol hash table + * @use_gnu_hash - the format of hash DT_HASH or DT_GNU_HASH */ -static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, Dyn_t **dyn_strtab, Dyn_t **dyn_symtab, - Dyn_t **dyn_hash) +static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, + Dyn_t **dyn_strtab, Dyn_t **dyn_symtab, + Dyn_t **dyn_hash, bool *use_gnu_hash) { + Dyn_t *dyn_gnu_hash = NULL, *dyn_sysv_hash = NULL; Dyn_t *dyn_syment = NULL; Dyn_t *dyn_strsz = NULL; uintptr_t addr; @@ -184,16 +235,52 @@ static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, Dyn_t dyn_syment = d; pr_debug("DT_SYMENT: %lx\n", (unsigned long)d->d_un.d_val); } else if (d->d_tag == DT_HASH) { - *dyn_hash = d; + dyn_sysv_hash = d; pr_debug("DT_HASH: %lx\n", (unsigned long)d->d_un.d_ptr); + } else if (d->d_tag == DT_GNU_HASH) { + /* + * This is complicated. + * + * Looking at the Linux kernel source, the following can be seen + * regarding which hashing style the VDSO uses on each arch: + * + * aarch64: not specified (depends on linker, can be + * only GNU hash style) + * arm: --hash-style=sysv + * loongarch: --hash-style=sysv + * mips: --hash-style=sysv + * powerpc: --hash-style=both + * riscv: --hash-style=both + * s390: --hash-style=both + * x86: --hash-style=both + * + * Some architectures are using both hash-styles, that + * is the easiest for CRIU. Some architectures are only + * using the old style (sysv), that is what CRIU supports. + * + * Starting with Linux 6.11, aarch64 unfortunately decided + * to switch from '--hash-style=sysv' to ''. Specifying + * nothing unfortunately may mean GNU hash style only and not + * 'both' (depending on the linker). + */ + dyn_gnu_hash = d; + pr_debug("DT_GNU_HASH: %lx\n", (unsigned long)d->d_un.d_ptr); } } - if (!*dyn_strtab || !*dyn_symtab || !dyn_strsz || !dyn_syment || !*dyn_hash) { + if (!*dyn_strtab || !*dyn_symtab || !dyn_strsz || !dyn_syment || + (!dyn_gnu_hash && !dyn_sysv_hash)) { pr_err("Not all dynamic entries are present\n"); return -EINVAL; } + /* + * Prefer DT_HASH over DT_GNU_HASH as it's been more tested and + * as a result more stable. + */ + *use_gnu_hash = !dyn_sysv_hash; + *dyn_hash = dyn_sysv_hash ?: dyn_gnu_hash; + return 0; err_oob: @@ -208,60 +295,156 @@ typedef unsigned long Hash_t; typedef Word_t Hash_t; #endif -static void parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, struct vdso_symtable *t, - uintptr_t dynsymbol_names, Hash_t *hash, Dyn_t *dyn_symtab) +typedef uint32_t Hash32_t; + +static bool elf_symbol_match(uintptr_t mem, size_t size, + uintptr_t dynsymbol_names, Sym_t *sym, + const char *symbol, const size_t vdso_symbol_length) +{ + uintptr_t addr = (uintptr_t)sym; + char *name; + + if (__ptr_struct_oob(addr, sizeof(Sym_t), mem, size)) + return false; + + if (ELF_ST_TYPE(sym->st_info) != STT_FUNC && ELF_ST_BIND(sym->st_info) != STB_GLOBAL) + return false; + + addr = dynsymbol_names + sym->st_name; + if (__ptr_struct_oob(addr, vdso_symbol_length, mem, size)) + return false; + name = (void *)addr; + + return !std_strncmp(name, symbol, vdso_symbol_length); +} + + +static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size, + const char *symbol, uint32_t symbol_hash, unsigned int sym_off, + uintptr_t dynsymbol_names, Dyn_t *dyn_symtab, Phdr_t *load, + uint32_t nbucket, uint32_t nchain, void *_bucket, Hash_t *chain, + const size_t vdso_symbol_length, bool use_gnu_hash) +{ + unsigned int j; + uintptr_t addr; + + addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr; + + if (use_gnu_hash) { + Hash32_t *h, hash_val, *bucket = _bucket; + + j = bucket[symbol_hash % nbucket]; + if (j == STN_UNDEF) + return 0; + + h = bucket + nbucket + (j - sym_off); + + symbol_hash |= 1; + do { + Sym_t *sym = (void *)addr + sizeof(Sym_t) * j; + + hash_val = *h++; + if ((hash_val | 1) == symbol_hash && + elf_symbol_match(mem, size, dynsymbol_names, sym, + symbol, vdso_symbol_length)) + return sym->st_value; + j++; + } while (!(hash_val & 1)); + } else { + Hash_t *bucket = _bucket; + + j = bucket[symbol_hash % nbucket]; + if (j == STN_UNDEF) + return 0; + + for (; j < nchain && j != STN_UNDEF; j = chain[j]) { + Sym_t *sym = (void *)addr + sizeof(Sym_t) * j; + + if (elf_symbol_match(mem, size, dynsymbol_names, sym, + symbol, vdso_symbol_length)) + return sym->st_value; + } + } + return 0; +} + +static int parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, + struct vdso_symtable *t, uintptr_t dynsymbol_names, + Hash_t *hash, Dyn_t *dyn_symtab, bool use_gnu_hash, + bool is_32bit) { ARCH_VDSO_SYMBOLS_LIST const char *vdso_symbols[VDSO_SYMBOL_MAX] = { ARCH_VDSO_SYMBOLS }; const size_t vdso_symbol_length = sizeof(t->symbols[0].name) - 1; - Hash_t nbucket, nchain; - Hash_t *bucket, *chain; + void *bucket = NULL; + Hash_t *chain = NULL; + uint32_t nbucket, nchain = 0; - unsigned int i, j, k; - uintptr_t addr; + unsigned int sym_off = 0; + unsigned int i = 0; - nbucket = hash[0]; - nchain = hash[1]; - bucket = &hash[2]; - chain = &hash[nbucket + 2]; + unsigned long (*elf_hash)(const unsigned char *); + + if (use_gnu_hash) { + uint32_t *gnu_hash = (uint32_t *)hash; + uint32_t bloom_sz; + + nbucket = gnu_hash[0]; + sym_off = gnu_hash[1]; + bloom_sz = gnu_hash[2]; + if (is_32bit) { + uint32_t *bloom; + bloom = (uint32_t *)&gnu_hash[4]; + bucket = (Hash_t *)(&bloom[bloom_sz]); + } else { + uint64_t *bloom; + bloom = (uint64_t *)&gnu_hash[4]; + bucket = (Hash_t *)(&bloom[bloom_sz]); + } + elf_hash = &elf_gnu_hash; + pr_debug("nbucket %lx sym_off %lx bloom_sz %lx bucket %lx\n", + (unsigned long)nbucket, (unsigned long)sym_off, + (unsigned long)bloom_sz, + (unsigned long)bucket); + } else { + nbucket = hash[0]; + nchain = hash[1]; + bucket = &hash[2]; + chain = &hash[nbucket + 2]; + elf_hash = &elf_sysv_hash; + pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n", + (unsigned long)nbucket, (unsigned long)nchain, + (unsigned long)bucket, (unsigned long)chain); + } - pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n", (long)nbucket, (long)nchain, (unsigned long)bucket, - (unsigned long)chain); for (i = 0; i < VDSO_SYMBOL_MAX; i++) { const char *symbol = vdso_symbols[i]; - k = elf_hash((const unsigned char *)symbol); + unsigned long addr, symbol_hash; + const size_t symbol_length = __strlen(symbol); - for (j = bucket[k % nbucket]; j < nchain && j != STN_UNDEF; j = chain[j]) { - Sym_t *sym; - char *name; + symbol_hash = elf_hash((const unsigned char *)symbol); + addr = elf_symbol_lookup(mem, size, symbol, symbol_hash, + sym_off, dynsymbol_names, dyn_symtab, load, + nbucket, nchain, bucket, chain, + vdso_symbol_length, use_gnu_hash); + pr_debug("symbol %s at address %lx\n", symbol, addr); + if (!addr) + continue; - addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr; - - addr += sizeof(Sym_t) * j; - if (__ptr_struct_oob(addr, sizeof(Sym_t), mem, size)) - continue; - sym = (void *)addr; - - if (ELF_ST_TYPE(sym->st_info) != STT_FUNC && ELF_ST_BIND(sym->st_info) != STB_GLOBAL) - continue; - - addr = dynsymbol_names + sym->st_name; - if (__ptr_struct_oob(addr, vdso_symbol_length, mem, size)) - continue; - name = (void *)addr; - - if (std_strncmp(name, symbol, vdso_symbol_length)) - continue; - - /* XXX: provide strncpy() implementation for PIE */ - memcpy(t->symbols[i].name, name, vdso_symbol_length); - t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr; - break; + /* XXX: provide strncpy() implementation for PIE */ + if (symbol_length > vdso_symbol_length) { + pr_err("strlen(%s) %zd, only %zd bytes available\n", + symbol, symbol_length, vdso_symbol_length); + return -EINVAL; } + memcpy(t->symbols[i].name, symbol, symbol_length); + t->symbols[i].offset = addr - load->p_vaddr; } + + return 0; } int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) @@ -271,6 +454,8 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) Dyn_t *dyn_symtab = NULL; Dyn_t *dyn_hash = NULL; Hash_t *hash = NULL; + bool use_gnu_hash; + bool is_32bit; uintptr_t dynsymbol_names; uintptr_t addr; @@ -281,7 +466,7 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) /* * We need PT_LOAD and PT_DYNAMIC here. Each once. */ - ret = parse_elf_phdr(mem, size, &dynamic, &load); + ret = parse_elf_phdr(mem, size, &dynamic, &load, &is_32bit); if (ret < 0) return ret; if (!load || !dynamic) { @@ -296,7 +481,8 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) * needed. Note that we're interested in a small set of tags. */ - ret = parse_elf_dynamic(mem, size, dynamic, &dyn_strtab, &dyn_symtab, &dyn_hash); + ret = parse_elf_dynamic(mem, size, dynamic, &dyn_strtab, &dyn_symtab, + &dyn_hash, &use_gnu_hash); if (ret < 0) return ret; @@ -310,7 +496,11 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) goto err_oob; hash = (void *)addr; - parse_elf_symbols(mem, size, load, t, dynsymbol_names, hash, dyn_symtab); + ret = parse_elf_symbols(mem, size, load, t, dynsymbol_names, hash, dyn_symtab, + use_gnu_hash, is_32bit); + + if (ret <0) + return ret; return 0; diff --git a/criu/pipes.c b/criu/pipes.c index 43ff06e3d..daada8830 100644 --- a/criu/pipes.c +++ b/criu/pipes.c @@ -434,7 +434,7 @@ int dump_one_pipe_data(struct pipe_data_dump *pd, int lfd, const struct fd_parms /* steal_pipe has to be able to fit all data from a target pipe */ if (fcntl(steal_pipe[1], F_SETPIPE_SZ, pipe_size) < 0) { pr_perror("Unable to set a pipe size"); - goto err; + goto err_close; } bytes = tee(lfd, steal_pipe[1], pipe_size, SPLICE_F_NONBLOCK); diff --git a/criu/plugin.c b/criu/plugin.c index f3fea2856..f9322a3c2 100644 --- a/criu/plugin.c +++ b/criu/plugin.c @@ -57,6 +57,11 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) __assign_hook(HANDLE_DEVICE_VMA, "cr_plugin_handle_device_vma"); __assign_hook(UPDATE_VMA_MAP, "cr_plugin_update_vma_map"); __assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late"); + __assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices"); + __assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices"); + __assign_hook(POST_FORKING, "cr_plugin_post_forking"); + __assign_hook(RESTORE_INIT, "cr_plugin_restore_init"); + __assign_hook(DUMP_DEVICES_LATE, "cr_plugin_dump_devices_late"); #undef __assign_hook @@ -254,6 +259,17 @@ int cr_plugin_init(int stage) goto err; } + if (stage == CR_PLUGIN_STAGE__RESTORE) { + int ret; + + if (check_inventory_plugins()) + goto err; + + ret = run_plugins(RESTORE_INIT); + if (ret < 0 && ret != -ENOTSUP) + goto err; + } + exit_code = 0; err: closedir(d); diff --git a/criu/proc_parse.c b/criu/proc_parse.c index b3badb6e4..f51f2e801 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -42,10 +42,12 @@ #include "fault-injection.h" #include "memfd.h" #include "hugetlb.h" +#include "pidfd.h" #include "protobuf.h" #include "images/fdinfo.pb-c.h" #include "images/mnt.pb-c.h" +#include "pidfd.pb-c.h" #include "plugin.h" #include @@ -72,6 +74,8 @@ struct buffer { static struct buffer __buf; static char *buf = __buf.buf; +/* only ever goes from false to true, if at all */ +static bool uprobes_vma_exists = false; /* * This is how AIO ring buffers look like in proc @@ -118,7 +122,8 @@ bool handle_vma_plugin(int *fd, struct stat *stat) return true; } -static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) +static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, + int *shstk) { char *tok; @@ -141,6 +146,8 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) *flags |= MAP_NORESERVE; else if (_vmflag_match(tok, "ht")) *flags |= MAP_HUGETLB; + else if (_vmflag_match(tok, "dp")) + *flags |= MAP_DROPPABLE; /* madvise() block */ if (_vmflag_match(tok, "sr")) @@ -157,11 +164,16 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) *madv |= (1ul << MADV_HUGEPAGE); else if (_vmflag_match(tok, "nh")) *madv |= (1ul << MADV_NOHUGEPAGE); + else if (_vmflag_match(tok, "wf")) + *madv |= (1ul << MADV_WIPEONFORK); /* vmsplice doesn't work for VM_IO and VM_PFNMAP mappings. */ if (_vmflag_match(tok, "io") || _vmflag_match(tok, "pf")) *io_pf = 1; + if (_vmflag_match(tok, "ss")) + *shstk = 1; + /* * Anything else is just ignored. */ @@ -172,25 +184,49 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) void parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) { - __parse_vmflags(buf, flags, madv, io_pf); + int shstk = 0; + + __parse_vmflags(buf, flags, madv, io_pf, &shstk); } static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) { int io_pf = 0; + int shstk = 0; - __parse_vmflags(buf, &vma_area->e->flags, &vma_area->e->madv, &io_pf); + __parse_vmflags(buf, &vma_area->e->flags, &vma_area->e->madv, &io_pf, + &shstk); + + if (shstk) + vma_area->e->status |= VMA_AREA_SHSTK; /* * vmsplice doesn't work for VM_IO and VM_PFNMAP mappings, the * only exception is VVAR area that mapped by the kernel as * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP + * + * The uprobes vma is also mapped by the kernel with VM_IO, among other flags */ - if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED)) + if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED) + && !vma_area_is(vma_area, VMA_AREA_UPROBES)) vma_area->e->status |= VMA_UNSUPP; if (vma_area->e->madv) vma_area->e->has_madv = true; + + /* + * We set MAP_PRIVATE flag on vma_area->e->flags right after parsing + * a first line of VMA entry in /proc//smaps file: + * 7fa84fa70000-7fa84fa95000 rw-p 00000000 00:00 0 + * but it's too early and we can't distinguish between MAP_DROPPABLE + * and MAP_PRIVATE mappings yet, as they both private mappings in nature + * and at this point we haven't yet read "VmFlags:" line in smaps. + * + * Let's detect this situation and drop MAP_PRIVATE flag while keep + * MAP_DROPPABLE, otherwise restorer's restore_mapping() helper will fail. + */ + if ((vma_area->e->flags & MAP_PRIVATE) && (vma_area->e->flags & MAP_DROPPABLE)) + vma_area->e->flags &= ~MAP_PRIVATE; } static inline int is_anon_shmem_map(dev_t dev) @@ -313,25 +349,8 @@ static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct vfi_dev = makedev(vfi->dev_maj, vfi->dev_min); - if (is_memfd(vfi_dev)) { - char tmp[PATH_MAX]; - strlcpy(tmp, fname, PATH_MAX); - strip_deleted(tmp, strlen(tmp)); - - /* - * The error EPERM will be shown in the following pr_perror(). - * It comes from the previous open() call. - */ - pr_perror("Can't open mapped [%s]", tmp); - - /* - * TODO Perhaps we could do better than failing and dump the - * memory like what is being done in shmem.c - */ - return -1; - } - if (is_hugetlb_dev(vfi_dev, &hugetlb_flag) || is_anon_shmem_map(vfi_dev)) { + vma->e->status |= VMA_AREA_REGULAR; if (!(vma->e->flags & MAP_SHARED)) vma->e->status |= VMA_ANON_PRIVATE; else @@ -359,18 +378,44 @@ static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct } if (vma_stat(vma, fd)) { - close(fd); - return -1; + goto closefd; } - if (vma->vmst->st_dev != vfi_dev || vma->vmst->st_ino != vfi->ino) { - pr_err("Failed to resolve mapping %lx filename\n", (unsigned long)vma->e->start); - close(fd); - return -1; + if (vma->vmst->st_ino != vfi->ino) { + goto errmsg; + } + + /* + * If devices don't match it could be because file is on a btrfs subvolume, + * which means that device number returned by stat will not match what is + * seen in smaps and other places. To deal with that we need a more involved + * check. + */ + if (vma->vmst->st_dev != vfi_dev) { + int mnt_id; + struct ns_id *ns; + + if (get_fd_mntid(fd, &mnt_id)) + goto errmsg; + + ns = lookup_nsid_by_mnt_id(mnt_id); + if (!ns) + goto errmsg; + + if (!phys_stat_dev_match(vma->vmst->st_dev, vfi_dev, ns, fname)) + goto errmsg; + + vma->mnt_id = mnt_id; } *vm_file_fd = fd; return 0; + +errmsg: + pr_err("Failed to resolve mapping %lx filename\n", (unsigned long)vma->e->start); +closefd: + close(fd); + return -1; } static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, struct vma_file_info *vfi, @@ -557,11 +602,20 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat } else if (!strcmp(file_path, "[vdso]")) { if (handle_vdso_vma(vma_area)) goto err; - } else if (!strcmp(file_path, "[vvar]")) { + } else if (!strcmp(file_path, "[vvar]") || + !strcmp(file_path, "[vvar_vclock]")) { if (handle_vvar_vma(vma_area)) goto err; } else if (!strcmp(file_path, "[heap]")) { vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP; + } else if (!strcmp(file_path, "[uprobes]")) { + uprobes_vma_exists = true; + if (!opts.allow_uprobes) { + pr_err("PID %d has uprobes vma. Consider using --" OPT_ALLOW_UPROBES ".\n", + pid); + goto err; + } + vma_area->e->status |= VMA_AREA_UPROBES; } else { vma_area->e->status = VMA_AREA_REGULAR; } @@ -620,17 +674,16 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat pr_info("path: %s\n", file_path); vma_area->e->status |= VMA_AREA_SYSVIPC; } else { - /* Dump shmem dev, hugetlb dev (private and share) mappings the same way as memfd - * when possible. + /* We dump memfd backed mapping, both normal and hugepage anonymous share + * mapping using memfd approach when possible. */ if (is_memfd(st_buf->st_dev) || is_anon_shmem_map(st_buf->st_dev) || - (kdat.has_memfd_hugetlb && is_hugetlb_dev(st_buf->st_dev, &hugetlb_flag))) { + can_dump_with_memfd_hugetlb(st_buf->st_dev, &hugetlb_flag, file_path, vma_area)) { vma_area->e->status |= VMA_AREA_MEMFD; vma_area->e->flags |= hugetlb_flag; if (fault_injected(FI_HUGE_ANON_SHMEM_ID)) vma_area->e->shmid += FI_HUGE_ANON_SHMEM_ID_BASE; } else if (is_hugetlb_dev(st_buf->st_dev, &hugetlb_flag)) { - /* hugetlb mapping but memfd does not support HUGETLB */ vma_area->e->flags |= hugetlb_flag; vma_area->e->flags |= MAP_ANONYMOUS; @@ -699,6 +752,10 @@ static int vma_list_add(struct vma_area *vma_area, struct vm_area_list *vma_area */ pr_debug("Device file mapping %016" PRIx64 "-%016" PRIx64 " supported via device plugins\n", vma_area->e->start, vma_area->e->end); + } else if (vma_area->e->status & VMA_AREA_UPROBES) { + pr_debug("Skipping uprobes vma %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, + vma_area->e->end); + return 0; } else if (vma_area->e->status & VMA_UNSUPP) { pr_err("Unsupported mapping found %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, vma_area->e->end); @@ -750,7 +807,7 @@ static int task_size_check(pid_t pid, VmaEntry *entry) int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t dump_filemap) { - struct vma_area *vma_area = NULL; + struct vma_area *vma_area = NULL, *prev_vma_area = NULL; unsigned long start, end, pgoff, prev_end = 0; char r, w, x, s; int ret = -1, vm_file_fd = -1; @@ -792,8 +849,22 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t du continue; } - if (vma_area && vma_list_add(vma_area, vma_area_list, &prev_end, &vfi, &prev_vfi)) - goto err; + if (vma_area && vma_area_is(vma_area, VMA_AREA_VVAR) && + prev_vma_area && vma_area_is(prev_vma_area, VMA_AREA_VVAR)) { + if (prev_vma_area->e->end != vma_area->e->start) { + pr_err("two nonconsecutive vvar vma-s: " + "%" PRIx64 "-%" PRIx64 " %" PRIx64 "-%" PRIx64 "\n", + prev_vma_area->e->start, prev_vma_area->e->end, + vma_area->e->start, vma_area->e->end); + goto err; + } + /* Merge all vvar vma-s into one. */ + prev_vma_area->e->end = vma_area->e->end; + } else { + if (vma_area && vma_list_add(vma_area, vma_area_list, &prev_end, &vfi, &prev_vfi)) + goto err; + prev_vma_area = vma_area; + } if (eof) break; @@ -833,6 +904,7 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t du goto err; } + pr_debug("Handling VMA with the following smaps entry: %s\n", str); if (handle_vma(pid, vma_area, str + path_off, map_files_dir, &vfi, &prev_vfi, &vm_file_fd)) goto err; @@ -891,7 +963,7 @@ int parse_pid_stat(pid_t pid, struct proc_pid_stat *s) *tok = '\0'; *p = '\0'; - strlcpy(s->comm, tok + 1, sizeof(s->comm)); + __strlcpy(s->comm, tok + 1, sizeof(s->comm)); n = sscanf(p + 1, " %c %d %d %d %d %d %u %lu %lu %lu %lu " @@ -1028,12 +1100,13 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) cr->s.sigpnd = 0; cr->s.shdpnd = 0; + cr->s.sigblk = 0; cr->s.seccomp_mode = SECCOMP_MODE_DISABLED; if (bfdopenr(&f)) return -1; - while (done < 13) { + while (done < 14) { str = breadline(&f); if (str == NULL) break; @@ -1117,6 +1190,13 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) continue; } + if (!strncmp(str, "CapAmb:", 7)) { + if (cap_parse(str + 8, cr->cap_amb)) + goto err_parse; + done++; + continue; + } + if (!strncmp(str, "Seccomp:", 8)) { if (sscanf(str + 9, "%d", &cr->s.seccomp_mode) != 1) { goto err_parse; @@ -1144,13 +1224,23 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) goto err_parse; cr->s.sigpnd |= sigpnd; + done++; + continue; + } + if (!strncmp(str, "SigBlk:", 7)) { + unsigned long long sigblk = 0; + + if (sscanf(str + 7, "%llx", &sigblk) != 1) + goto err_parse; + cr->s.sigblk |= sigblk; + done++; continue; } } /* seccomp and nspids are optional */ - expected_done = (parsed_seccomp ? 11 : 10); + expected_done = (parsed_seccomp ? 13 : 12); if (kdat.has_nspid) expected_done++; if (done == expected_done) @@ -1387,7 +1477,7 @@ static int parse_mountinfo_ent(char *str, struct mount_info *new, char **fsname) goto err; new->mountpoint[0] = '.'; - ret = sscanf(str, "%i %i %u:%u %ms %s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, + ret = sscanf(str, "%i %i %u:%u %ms %4094s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, new->mountpoint + 1, &opt, &n); if (ret != 7) goto err; @@ -1952,10 +2042,7 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) " pos:%lli ino:%lx sdev:%x", &e->tfd, &e->events, (long long *)&e->data, (long long *)&e->pos, (long *)&e->inode, &e->dev); - if (ret < 3 || ret > 6) { - eventpoll_tfd_entry__free_unpacked(e, NULL); - goto parse_err; - } else if (ret == 3) { + if (ret == 3) { e->has_dev = false; e->has_inode = false; e->has_pos = false; @@ -1963,7 +2050,7 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) e->has_dev = true; e->has_inode = true; e->has_pos = true; - } else if (ret < 6) { + } else { eventpoll_tfd_entry__free_unpacked(e, NULL); goto parse_err; } @@ -2137,6 +2224,33 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) if (ret) goto parse_err; + entry_met = true; + continue; + } + if (fdinfo_field(str, "ino") || fdinfo_field(str, "NSpid") || fdinfo_field(str, "Pid")) { + struct pidfd_dump_info *pidfd_info = arg; + + if (type != FD_TYPES__PIDFD) + continue; + + if (fdinfo_field(str, "ino")) { + ret = sscanf(str, "%*s %u", &pidfd_info->pidfe.ino); + if (ret != 1) + goto parse_err; + } else if (fdinfo_field(str, "Pid")) { + ret = sscanf(str, "%*s %d", &pidfd_info->pid); + if (ret != 1) + goto parse_err; + } else if (fdinfo_field(str, "NSpid")) { + char *last; + + last = strrchr(str, '\t'); + if (!last || sscanf(last, "%d", &pidfd_info->pidfe.nspid) != 1) { + pr_err("Unable to parse: %s\n", str); + goto parse_err; + } + } + entry_met = true; continue; } @@ -2188,10 +2302,10 @@ static int parse_file_lock_buf(char *buf, struct file_lock *fl, bool is_blocked) char fl_flag[10], fl_type[15], fl_option[10]; if (is_blocked) { - num = sscanf(buf, "%lld: -> %s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = sscanf(buf, "%lld: -> %9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } else { - num = sscanf(buf, "%lld:%s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = sscanf(buf, "%lld:%9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } @@ -2539,7 +2653,8 @@ err: return -1; } -int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *retl, unsigned int *n) +int parse_thread_cgroup(int pid, int tid, struct parasite_dump_cgroup_args *args, struct list_head *retl, + unsigned int *n) { FILE *f; int ret; @@ -2547,7 +2662,7 @@ int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct li unsigned int n_internal = 0; struct cg_ctl *intern, *ext; - f = fopen_proc(pid, "cgroup"); + f = fopen_proc(pid, "task/%d/cgroup", tid); if (!f) return -1; @@ -2831,3 +2946,8 @@ int parse_uptime(uint64_t *upt) fclose(f); return 0; } + +bool found_uprobes_vma(void) +{ + return uprobes_vma_exists; +} diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c index ff16b9f5b..e0dbfccc2 100644 --- a/criu/protobuf-desc.c +++ b/criu/protobuf-desc.c @@ -68,6 +68,7 @@ #include "images/bpfmap-file.pb-c.h" #include "images/bpfmap-data.pb-c.h" #include "images/apparmor.pb-c.h" +#include "images/pidfd.pb-c.h" struct cr_pb_message_desc cr_pb_descs[PB_MAX]; diff --git a/criu/pstree.c b/criu/pstree.c index f4d77b3a4..cee8b5741 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -63,6 +63,7 @@ CoreEntry *core_entry_alloc(int th, int tsk) sz += CR_CAP_SIZE * sizeof(ce->cap_prm[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_eff[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_bnd[0]); + sz += CR_CAP_SIZE * sizeof(ce->cap_amb[0]); /* * @groups are dynamic and allocated * on demand. @@ -122,10 +123,12 @@ CoreEntry *core_entry_alloc(int th, int tsk) ce->n_cap_prm = CR_CAP_SIZE; ce->n_cap_eff = CR_CAP_SIZE; ce->n_cap_bnd = CR_CAP_SIZE; + ce->n_cap_amb = CR_CAP_SIZE; ce->cap_inh = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_inh[0])); ce->cap_prm = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_prm[0])); ce->cap_eff = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_eff[0])); ce->cap_bnd = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_bnd[0])); + ce->cap_amb = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_amb[0])); if (arch_alloc_thread_info(core)) { xfree(core); @@ -179,7 +182,7 @@ void free_pstree(struct pstree_item *root_item) struct pstree_item *item = root_item, *parent; while (item) { - if (!list_empty(&item->children)) { + if (has_children(item)) { item = list_first_entry(&item->children, struct pstree_item, sibling); continue; } @@ -222,6 +225,7 @@ struct pstree_item *__alloc_pstree_item(bool rst) item->pid->ns[0].virt = -1; item->pid->real = -1; item->pid->state = TASK_UNDEF; + item->pid->stop_signo = -1; item->born_sid = -1; item->pid->item = item; futex_init(&item->task_st); @@ -233,17 +237,21 @@ int init_pstree_helper(struct pstree_item *ret) { BUG_ON(!ret->parent); ret->pid->state = TASK_HELPER; - rsti(ret)->clone_flags = CLONE_FILES | CLONE_FS; - if (shared_fdt_prepare(ret) < 0) - return -1; + rsti(ret)->clone_flags = 0; + INIT_LIST_HEAD(&rsti(ret)->fds); task_entries->nr_helpers++; return 0; } +bool has_children(struct pstree_item *item) +{ + return !list_empty(&item->children); +} + /* Deep first search on children */ struct pstree_item *pstree_item_next(struct pstree_item *item) { - if (!list_empty(&item->children)) + if (has_children(item)) return list_first_entry(&item->children, struct pstree_item, sibling); while (item->parent) { @@ -380,20 +388,26 @@ static int prepare_pstree_for_shell_job(pid_t pid) } for_each_pstree_item(pi) { + if (pi->sid == current_sid) { + pr_err("Current sid %d intersects with sid of (%d) in images\n", current_sid, vpid(pi)); + return -1; + } if (pi->sid == old_sid) pi->sid = current_sid; + if (pi->pgid == current_sid) { + pr_err("Current sid %d intersects with pgid of (%d) in images\n", current_sid, + vpid(pi)); + return -1; + } if (pi->pgid == old_sid) pi->pgid = current_sid; } - - if (lookup_create_item(current_sid) == NULL) - return -1; } /* root_item is a group leader */ if (root_item->pgid == vpid(root_item)) - return 0; + goto add_fake_session_leader; old_gid = root_item->pgid; if (old_gid != current_gid) { @@ -406,14 +420,21 @@ static int prepare_pstree_for_shell_job(pid_t pid) } for_each_pstree_item(pi) { + if (current_gid != current_sid && pi->pgid == current_gid) { + pr_err("Current gid %d intersects with pgid of (%d) in images\n", current_gid, + vpid(pi)); + return -1; + } if (pi->pgid == old_gid) pi->pgid = current_gid; } - - if (lookup_create_item(current_gid) == NULL) - return -1; } + if (old_gid != current_gid && !lookup_create_item(current_gid)) + return -1; +add_fake_session_leader: + if (old_sid != current_sid && !lookup_create_item(current_sid)) + return -1; return 0; } @@ -945,7 +966,7 @@ static int prepare_pstree_kobj_ids(void) * this namespace is either inherited from the * criu or is created for the init task (only) */ - pr_err("Can't restore sub-task in NS\n"); + pr_err("Can't restore sub-task in NS (cflags %lx)\n", cflags); return -1; } } diff --git a/criu/seize.c b/criu/seize.c index 58564ca74..d0cf7b36c 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -16,6 +16,7 @@ #include "pstree.h" #include "criu-log.h" #include +#include "plugin.h" #include "proc_parse.h" #include "seccomp.h" #include "seize.h" @@ -24,13 +25,72 @@ #include "xmalloc.h" #include "util.h" +static bool compel_interrupt_only_mode; + +/* + * Disables the use of freeze cgroups for process seizing, even if explicitly + * requested via the --freeze-cgroup option or already set in a frozen state. + * This is necessary for plugins (e.g., CUDA) that do not function correctly + * when processes are frozen using cgroups. + */ +void __attribute__((used)) set_compel_interrupt_only_mode(void) +{ + compel_interrupt_only_mode = true; +} + +char *task_comm_info(pid_t pid, char *comm, size_t size) +{ + bool is_read = false; + + if (!pr_quelled(LOG_INFO)) { + int saved_errno = errno; + char path[32]; + int fd; + + snprintf(path, sizeof(path), "/proc/%d/comm", pid); + fd = open(path, O_RDONLY); + if (fd >= 0) { + ssize_t n = read(fd, comm, size); + if (n > 0) { + is_read = true; + /* Replace '\n' printed by kernel with '\0' */ + comm[n - 1] = '\0'; + } else { + pr_warn("Failed to read %s: %s\n", path, strerror(errno)); + } + close(fd); + } else { + pr_warn("Failed to open %s: %s\n", path, strerror(errno)); + } + errno = saved_errno; + } + + if (!is_read) + comm[0] = '\0'; + + return comm; +} + +/* + * NOTE: Don't run simultaneously, it uses local static buffer! + */ +char *__task_comm_info(pid_t pid) +{ + static char comm[32]; + + return task_comm_info(pid, comm, sizeof(comm)); +} + #define NR_ATTEMPTS 5 static const char frozen[] = "FROZEN"; static const char freezing[] = "FREEZING"; static const char thawed[] = "THAWED"; -enum freezer_state { FREEZER_ERROR = -1, THAWED, FROZEN, FREEZING }; +enum freezer_state { FREEZER_ERROR = -1, + THAWED, + FROZEN, + FREEZING }; /* Track if we are running on cgroup v2 system. */ static bool cgroup_v2 = false; @@ -146,12 +206,12 @@ static int freezer_write_state(int fd, enum freezer_state new_state) if (new_state == THAWED) { if (cgroup_v2) state[0] = '0'; - else if (strlcpy(state, thawed, sizeof(state)) >= sizeof(state)) + else if (__strlcpy(state, thawed, sizeof(state)) >= sizeof(state)) return -1; } else if (new_state == FROZEN) { if (cgroup_v2) state[0] = '1'; - else if (strlcpy(state, frozen, sizeof(state)) >= sizeof(state)) + else if (__strlcpy(state, frozen, sizeof(state)) >= sizeof(state)) return -1; } else { return -1; @@ -249,13 +309,13 @@ static int seize_cgroup_tree(char *root_path, enum freezer_state state) if (ret == 0) continue; if (errno != ESRCH) { - pr_perror("Unexpected error"); + pr_perror("Unexpected error for pid %d (comm %s)", pid, __task_comm_info(pid)); fclose(f); return -1; } if (!compel_interrupt_task(pid)) { - pr_debug("SEIZE %d: success\n", pid); + pr_debug("SEIZE %d (comm %s): success\n", pid, __task_comm_info(pid)); processes_to_wait++; } else if (state == FROZEN) { char buf[] = "/proc/XXXXXXXXXX/exe"; @@ -272,7 +332,7 @@ static int seize_cgroup_tree(char *root_path, enum freezer_state state) * before it compete exit procedure. The caller simply * should wait a bit and try freezing again. */ - pr_err("zombie found while seizing\n"); + pr_err("zombie %d (comm %s) found while seizing\n", pid, __task_comm_info(pid)); fclose(f); return -EAGAIN; } @@ -353,7 +413,7 @@ static int freezer_detach(void) { int i; - if (!opts.freeze_cgroup) + if (!opts.freeze_cgroup || compel_interrupt_only_mode) return 0; for (i = 0; i < processes_to_wait && processes_to_wait_pids; i++) { @@ -448,13 +508,73 @@ static int log_unfrozen_stacks(char *root) return 0; } +static int prepare_freezer_for_interrupt_only_mode(void) +{ + enum freezer_state state = THAWED; + int fd; + int exit_code = -1; + + BUG_ON(!compel_interrupt_only_mode); + + fd = freezer_open(); + if (fd < 0) + return -1; + + state = get_freezer_state(fd); + if (state == FREEZER_ERROR) { + goto err; + } + + origin_freezer_state = state == FREEZING ? FROZEN : state; + + if (state != THAWED) { + pr_warn("unfreezing cgroup for plugin compatibility\n"); + if (freezer_write_state(fd, THAWED)) + goto err; + } + + exit_code = 0; +err: + close(fd); + return exit_code; +} + +static void cgroupv1_freezer_kludges(int fd, int iter, const struct timespec *req) { + /* As per older kernel docs (freezer-subsystem.txt before + * the kernel commit ef9fe980c6fcc1821), if FREEZING is seen, + * userspace should either retry or thaw. While current + * kernel cgroup v1 docs no longer mention a need to retry, + * even recent kernels can't reliably freeze a cgroup v1. + * + * Let's keep asking the kernel to freeze from time to time. + * In addition, do occasional thaw/sleep/freeze. + * + * This is still a game of chances (the real fix belongs to the kernel) + * but these kludges might improve the probability of success. + * + * Cgroup v2 does not have this problem. + */ + switch (iter % 32) { + case 9: + case 20: + freezer_write_state(fd, FROZEN); + break; + case 31: + freezer_write_state(fd, THAWED); + nanosleep(req, NULL); + freezer_write_state(fd, FROZEN); + break; + } +} + static int freeze_processes(void) { int fd, exit_code = -1; enum freezer_state state = THAWED; static const unsigned long step_ms = 100; - unsigned long nr_attempts = (opts.timeout * 1000000) / step_ms; + /* Since opts.timeout is in seconds, multiply it by 1000 to convert to milliseconds. */ + unsigned long nr_attempts = (opts.timeout * 1000) / step_ms; unsigned long i = 0; const struct timespec req = { @@ -463,14 +583,12 @@ static int freeze_processes(void) }; if (unlikely(!nr_attempts)) { - /* - * If timeout is turned off, lets - * wait for at least 10 seconds. - */ - nr_attempts = (10 * 1000000) / step_ms; + /* If the timeout is 0, wait for at least 10 seconds. */ + nr_attempts = (10 * 1000) / step_ms; } - pr_debug("freezing processes: %lu attempts with %lu ms steps\n", nr_attempts, step_ms); + pr_debug("freezing cgroup %s: %lu x %lums attempts, timeout: %us\n", + opts.freeze_cgroup, nr_attempts, step_ms, opts.timeout); fd = freezer_open(); if (fd < 0) @@ -497,22 +615,25 @@ static int freeze_processes(void) * not read @tasks pids while freezer in * transition stage. */ - for (; i <= nr_attempts; i++) { + while (1) { state = get_freezer_state(fd); if (state == FREEZER_ERROR) { close(fd); return -1; } - if (state == FROZEN) + if (state == FROZEN || i++ == nr_attempts || alarm_timeouted()) break; - if (alarm_timeouted()) - goto err; + + if (!cgroup_v2) + cgroupv1_freezer_kludges(fd, i, &req); + nanosleep(&req, NULL); } - if (i > nr_attempts) { - pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup); + if (state != FROZEN) { + pr_err("Unable to freeze cgroup %s (%lu x %lums attempts, timeout: %us)\n", + opts.freeze_cgroup, i, step_ms, opts.timeout); if (!pr_quelled(LOG_DEBUG)) log_unfrozen_stacks(opts.freeze_cgroup); goto err; @@ -535,8 +656,10 @@ static int freeze_processes(void) } err: - if (exit_code == 0 || origin_freezer_state == THAWED) - exit_code = freezer_write_state(fd, THAWED); + if (exit_code == 0 || origin_freezer_state == THAWED) { + if (freezer_write_state(fd, THAWED)) + exit_code = -1; + } if (close(fd)) { pr_perror("Unable to thaw tasks"); @@ -584,15 +707,18 @@ static int collect_children(struct pstree_item *item) goto free; } - pr_info("Seized task %d, state %d\n", pid, ret); - c = alloc_pstree_item(); if (c == NULL) { ret = -1; goto free; } - if (!opts.freeze_cgroup) + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto free; + } + + if (!opts.freeze_cgroup || compel_interrupt_only_mode) /* fails when meets a zombie */ __ignore_value(compel_interrupt_task(pid)); @@ -615,6 +741,11 @@ static int collect_children(struct pstree_item *item) else processes_to_wait--; + if (ret == TASK_STOPPED) + c->pid->stop_signo = compel_parse_stop_signo(pid); + + pr_info("Seized task %d, state %d\n", pid, ret); + c->pid->real = pid; c->parent = item; c->pid->state = ret; @@ -646,7 +777,7 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) * the item->state is the state task was in when we seized one. */ - compel_resume_task(item->pid->real, item->pid->state, st); + compel_resume_task_sig(item->pid->real, item->pid->state, st, item->pid->stop_signo); if (st == TASK_DEAD) return; @@ -777,7 +908,8 @@ static int collect_threads(struct pstree_item *item) pr_info("\tSeizing %d's %d thread\n", item->pid->real, pid); - if (!opts.freeze_cgroup && compel_interrupt_task(pid)) + if ((!opts.freeze_cgroup || compel_interrupt_only_mode) && + compel_interrupt_task(pid)) continue; ret = compel_wait_task(pid, item_ppid(item), parse_pid_status, NULL, &t_creds.s, NULL); @@ -833,7 +965,7 @@ static int collect_loop(struct pstree_item *item, int (*collect)(struct pstree_i { int attempts = NR_ATTEMPTS, nr_inprogress = 1; - if (opts.freeze_cgroup) + if (opts.freeze_cgroup && !compel_interrupt_only_mode) attempts = 1; /* @@ -876,7 +1008,7 @@ static int collect_task(struct pstree_item *item) if (ret < 0) goto err_close; - if ((item->pid->state == TASK_DEAD) && !list_empty(&item->children)) { + if ((item->pid->state == TASK_DEAD) && has_children(item)) { pr_err("Zombie with children?! O_o Run, run, run!\n"); goto err_close; } @@ -916,7 +1048,7 @@ static int cgroup_version(void) int collect_pstree(void) { pid_t pid = root_item->pid->real; - int ret = -1; + int ret, exit_code = -1; struct proc_status_creds creds; timing_start(TIME_FREEZING); @@ -933,12 +1065,31 @@ int collect_pstree(void) pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 2 : 1); - if (opts.freeze_cgroup && freeze_processes()) - goto err; + if (opts.freeze_cgroup && !compel_interrupt_only_mode) { + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto err; + } - if (!opts.freeze_cgroup && compel_interrupt_task(pid)) { - set_cr_errno(ESRCH); - goto err; + if (freeze_processes()) + goto err; + } else { + if (opts.freeze_cgroup && prepare_freezer_for_interrupt_only_mode()) + goto err; + + /* + * Call PAUSE_DEVICES after prepare_freezer_for_interrupt_only_mode() + * to be able to checkpoint containers in a frozen state. + */ + ret = run_plugins(PAUSE_DEVICES, pid); + if (ret < 0 && ret != -ENOTSUP) { + goto err; + } + + if (compel_interrupt_task(pid)) { + set_cr_errno(ESRCH); + goto err; + } } ret = compel_wait_task(pid, -1, parse_pid_status, NULL, &creds.s, NULL); @@ -950,6 +1101,9 @@ int collect_pstree(void) else processes_to_wait--; + if (ret == TASK_STOPPED) + root_item->pid->stop_signo = compel_parse_stop_signo(pid); + pr_info("Seized task %d, state %d\n", pid, ret); root_item->pid->state = ret; @@ -961,17 +1115,35 @@ int collect_pstree(void) if (ret < 0) goto err; - if (opts.freeze_cgroup && freezer_wait_processes()) { - ret = -1; + if (opts.freeze_cgroup && !compel_interrupt_only_mode && + freezer_wait_processes()) { goto err; } - ret = 0; + exit_code = 0; timing_stop(TIME_FREEZING); timing_start(TIME_FROZEN); err: /* Freezing stage finished in time - disable timer. */ alarm(0); - return ret; + return exit_code; +} + +int checkpoint_devices(void) +{ + struct pstree_item *iter; + int ret, exit_code = -1; + + for_each_pstree_item(iter) { + if (!task_alive(iter)) + continue; + ret = run_plugins(CHECKPOINT_DEVICES, iter->pid->real); + if (ret < 0 && ret != -ENOTSUP) + goto err; + } + + exit_code = 0; +err: + return exit_code; } diff --git a/criu/servicefd.c b/criu/servicefd.c index 06a8d3eba..dfb019066 100644 --- a/criu/servicefd.c +++ b/criu/servicefd.c @@ -313,4 +313,4 @@ int clone_service_fd(struct pstree_item *me) ret = 0; return ret; -} +} \ No newline at end of file diff --git a/criu/setproctitle.c b/criu/setproctitle.c new file mode 100644 index 000000000..9e01678d2 --- /dev/null +++ b/criu/setproctitle.c @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +#ifdef CONFIG_HAS_LIBBSD +#include +#else + +#include "setproctitle.h" + +/* + * setproctitle_init is in the libbsd since v0.6.0. This macro allows to + * compile criu with libbsd<0.6.0. + */ +#ifndef CONFIG_HAS_SETPROCTITLE_INIT +#define setproctitle_init(argc, argv, envp) +#endif + +#define setproctitle(fmt, ...) +#endif + +void __setproctitle_init(int argc, char *argv[], char *envp[]) +{ + setproctitle_init(argc, argv, envp); +} + +#ifndef SPT_MAXTITLE +#define SPT_MAXTITLE 255 +#endif + +void __setproctitle(const char *fmt, ...) +{ + char buf[SPT_MAXTITLE + 1]; + va_list args; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + setproctitle("%s", buf); +} diff --git a/criu/shmem.c b/criu/shmem.c index 81e701586..bc7aa3669 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -206,26 +206,34 @@ static int expand_shmem(struct shmem_info *si, unsigned long new_size) return 0; } -static void update_shmem_pmaps(struct shmem_info *si, u64 *map, VmaEntry *vma) +static int update_shmem_pmaps(struct shmem_info *si, pmc_t *pmc, VmaEntry *vma) { unsigned long shmem_pfn, vma_pfn, vma_pgcnt; + u64 vaddr; if (!is_shmem_tracking_en()) - return; + return 0; vma_pgcnt = DIV_ROUND_UP(si->size - vma->pgoff, PAGE_SIZE); - for (vma_pfn = 0; vma_pfn < vma_pgcnt; ++vma_pfn) { - if (!should_dump_page(vma, map[vma_pfn])) + for (vma_pfn = 0, vaddr = vma->start; vma_pfn < vma_pgcnt; ++vma_pfn, vaddr += PAGE_SIZE) { + struct page_info page_info = {}; + + if (should_dump_page(pmc, vma, vaddr, &page_info)) + return -1; + + if (page_info.next != vaddr) { + vaddr = page_info.next - PAGE_SIZE; continue; + } shmem_pfn = vma_pfn + DIV_ROUND_UP(vma->pgoff, PAGE_SIZE); - if (map[vma_pfn] & PME_SOFT_DIRTY) + if (page_info.softdirty) set_pstate(si->pstate_map, shmem_pfn, PST_DIRTY); - else if (page_is_zero(map[vma_pfn])) - set_pstate(si->pstate_map, shmem_pfn, PST_ZERO); else set_pstate(si->pstate_map, shmem_pfn, PST_DUMP); } + + return 0; } int collect_sysv_shmem(unsigned long shmid, unsigned long size) @@ -648,7 +656,7 @@ err: return -1; } -int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) +int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) { struct shmem_info *si; unsigned long size = vma->pgoff + (vma->end - vma->start); @@ -662,7 +670,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) if (expand_shmem(si, size)) return -1; } - update_shmem_pmaps(si, map, vma); + + if (update_shmem_pmaps(si, pmc, vma)) + return -1; return 0; } @@ -679,7 +689,9 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) if (expand_shmem(si, size)) return -1; - update_shmem_pmaps(si, map, vma); + + if (update_shmem_pmaps(si, pmc, vma)) + return -1; return 0; } @@ -750,7 +762,7 @@ static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si) unsigned long pgaddr; int st = -1; - if (pfn >= next_hole_pfn && next_data_segment(fd, pfn, &next_data_pnf, &next_hole_pfn)) + if (fd >= 0 && pfn >= next_hole_pfn && next_data_segment(fd, pfn, &next_data_pnf, &next_hole_pfn)) goto err_xfer; if (si->pstate_map && is_shmem_tracking_en()) { @@ -808,24 +820,62 @@ static int dump_one_shmem(struct shmem_info *si) { int fd, ret = -1; void *addr; + unsigned long cur, remaining; pr_info("Dumping shared memory %ld\n", si->shmid); - fd = open_proc(si->pid, "map_files/%lx-%lx", si->start, si->end); - if (fd < 0) - goto err; + fd = __open_proc(si->pid, EPERM, O_RDONLY, "map_files/%lx-%lx", si->start, si->end); + if (fd >= 0) { + addr = mmap(NULL, si->size, PROT_READ, MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + pr_perror("Can't map shmem 0x%lx (0x%lx-0x%lx)", si->shmid, si->start, si->end); + goto errc; + } + } else { + if (errno != EPERM || !opts.unprivileged) { + goto err; + } - addr = mmap(NULL, si->size, PROT_READ, MAP_SHARED, fd, 0); - if (addr == MAP_FAILED) { - pr_err("Can't map shmem 0x%lx (0x%lx-0x%lx)\n", si->shmid, si->start, si->end); - goto errc; + pr_debug("Could not access map_files/ link, falling back to /proc/$pid/mem\n"); + + fd = open_proc(si->pid, "mem"); + if (fd < 0) { + goto err; + } + + addr = mmap(NULL, si->size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (addr == MAP_FAILED) { + pr_perror("Can't map empty space for shmem 0x%lx (0x%lx-0x%lx)", si->shmid, si->start, si->end); + goto errc; + } + + if (lseek(fd, si->start, SEEK_SET) < 0) { + pr_perror("Can't seek virtual memory"); + goto errc; + } + + cur = 0; + remaining = si->size; + do { + ret = read(fd, addr + cur, remaining); + if (ret <= 0) { + pr_perror("Can't read virtual memory"); + goto errc; + } + remaining -= ret; + cur += ret; + } while (remaining > 0); + + close(fd); + fd = -1; } ret = do_dump_one_shmem(fd, addr, si); munmap(addr, si->size); errc: - close(fd); + if (fd >= 0) + close(fd); err: return ret; } diff --git a/criu/sigact.c b/criu/sigact.c new file mode 100644 index 000000000..5174644d2 --- /dev/null +++ b/criu/sigact.c @@ -0,0 +1,319 @@ +#include "types.h" +#include "infect.h" +#include "protobuf.h" +#include "pstree.h" +#include "parasite.h" +#include "restorer.h" +#include "sigact.h" + +/* + * If parent's sigaction has blocked SIGKILL (which is non-sense), + * this parent action is non-valid and shouldn't be inherited. + * Used to mark parent_act* no more valid. + */ +static rt_sigaction_t parent_act[SIGMAX]; +#ifdef CONFIG_COMPAT +static rt_sigaction_t_compat parent_act_compat[SIGMAX]; +#endif + +static bool sa_inherited(int sig, rt_sigaction_t *sa) +{ + rt_sigaction_t *pa; + int i; + + if (current == root_item) + return false; /* XXX -- inherit from CRIU? */ + + pa = &parent_act[sig]; + + /* Omitting non-valid sigaction */ + if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) + return false; + + for (i = 0; i < _KNSIG_WORDS; i++) + if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) + return false; + + return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && + pa->rt_sa_restorer == sa->rt_sa_restorer; +} + +static void *stack32; +rt_sigaction_t sigchld_act; + +#ifdef CONFIG_COMPAT +static bool sa_compat_inherited(int sig, rt_sigaction_t_compat *sa) +{ + rt_sigaction_t_compat *pa; + int i; + + if (current == root_item) + return false; + + pa = &parent_act_compat[sig]; + + /* Omitting non-valid sigaction */ + if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) + return false; + + for (i = 0; i < _KNSIG_WORDS; i++) + if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) + return false; + + return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && + pa->rt_sa_restorer == sa->rt_sa_restorer; +} + +static int restore_compat_sigaction(int sig, SaEntry *e) +{ + rt_sigaction_t_compat act; + int ret; + + ASSIGN_TYPED(act.rt_sa_handler, (u32)e->sigaction); + ASSIGN_TYPED(act.rt_sa_flags, e->flags); + ASSIGN_TYPED(act.rt_sa_restorer, (u32)e->restorer); + BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); + memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); + + if (sig == SIGCHLD) { + memcpy(&sigchld_act, &act, sizeof(rt_sigaction_t_compat)); + return 0; + } + + if (sa_compat_inherited(sig - 1, &act)) + return 1; + + if (!stack32) { + stack32 = alloc_compat_syscall_stack(); + if (!stack32) + return -1; + } + + ret = arch_compat_rt_sigaction(stack32, sig, &act); + if (ret < 0) { + pr_err("Can't restore compat sigaction: %d\n", ret); + return ret; + } + + parent_act_compat[sig - 1] = act; + /* Mark SIGKILL blocked which makes native sigaction non-valid */ + parent_act[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; + + return 1; +} +#else +static int restore_compat_sigaction(int sig, SaEntry *e) +{ + return -1; +} +#endif + +static int restore_native_sigaction(int sig, SaEntry *e) +{ + rt_sigaction_t act; + int ret; + + ASSIGN_TYPED(act.rt_sa_handler, decode_pointer(e->sigaction)); + ASSIGN_TYPED(act.rt_sa_flags, e->flags); + ASSIGN_TYPED(act.rt_sa_restorer, decode_pointer(e->restorer)); +#ifdef CONFIG_MIPS + e->has_mask_extended = 1; + BUILD_BUG_ON(sizeof(e->mask) * 2 != sizeof(act.rt_sa_mask.sig)); + + memcpy(&(act.rt_sa_mask.sig[0]), &e->mask, sizeof(act.rt_sa_mask.sig[0])); + memcpy(&(act.rt_sa_mask.sig[1]), &e->mask_extended, sizeof(act.rt_sa_mask.sig[1])); +#else + BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); + memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); +#endif + if (sig == SIGCHLD) { + sigchld_act = act; + return 0; + } + + if (sa_inherited(sig - 1, &act)) + return 1; + + /* + * A pure syscall is used, because glibc + * sigaction overwrites se_restorer. + */ + ret = syscall(SYS_rt_sigaction, sig, &act, NULL, sizeof(k_rtsigset_t)); + if (ret < 0) { + pr_perror("Can't restore sigaction"); + return ret; + } + + parent_act[sig - 1] = act; + /* Mark SIGKILL blocked which makes compat sigaction non-valid */ +#ifdef CONFIG_COMPAT + parent_act_compat[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; +#endif + + return 1; +} + +static int prepare_sigactions_from_core(TaskCoreEntry *tc) +{ + int sig, i; + + if (tc->n_sigactions != SIGMAX - 2) { + pr_err("Bad number of sigactions in the image (%d, want %d)\n", (int)tc->n_sigactions, SIGMAX - 2); + return -1; + } + + pr_info("Restore on-core sigactions for %d\n", vpid(current)); + + for (sig = 1, i = 0; sig <= SIGMAX; sig++) { + int ret; + SaEntry *e; + bool sigaction_is_compat; + + if (sig == SIGKILL || sig == SIGSTOP) + continue; + + e = tc->sigactions[i++]; + sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; + if (sigaction_is_compat) + ret = restore_compat_sigaction(sig, e); + else + ret = restore_native_sigaction(sig, e); + + if (ret < 0) + return ret; + } + + return 0; +} + +/* Returns number of restored signals, -1 or negative errno on fail */ +static int restore_one_sigaction(int sig, struct cr_img *img, int pid) +{ + bool sigaction_is_compat; + SaEntry *e; + int ret = 0; + + BUG_ON(sig == SIGKILL || sig == SIGSTOP); + + ret = pb_read_one_eof(img, &e, PB_SIGACT); + if (ret == 0) { + if (sig != SIGMAX_OLD + 1) { /* backward compatibility */ + pr_err("Unexpected EOF %d\n", sig); + return -1; + } + pr_warn("This format of sigacts-%d.img is deprecated\n", pid); + return -1; + } + if (ret < 0) + return ret; + + sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; + if (sigaction_is_compat) + ret = restore_compat_sigaction(sig, e); + else + ret = restore_native_sigaction(sig, e); + + sa_entry__free_unpacked(e, NULL); + + return ret; +} + +static int prepare_sigactions_from_image(void) +{ + int pid = vpid(current); + struct cr_img *img; + int sig, rst = 0; + int ret = 0; + + pr_info("Restore sigacts for %d\n", pid); + + img = open_image(CR_FD_SIGACT, O_RSTR, pid); + if (!img) + return -1; + + for (sig = 1; sig <= SIGMAX; sig++) { + if (sig == SIGKILL || sig == SIGSTOP) + continue; + + ret = restore_one_sigaction(sig, img, pid); + if (ret < 0) + break; + if (ret) + rst++; + } + + pr_info("Restored %d/%d sigacts\n", rst, SIGMAX - 3 /* KILL, STOP and CHLD */); + + close_image(img); + return ret; +} + +int prepare_sigactions(CoreEntry *core) +{ + int ret; + + if (!task_alive(current)) + return 0; + + if (core->tc->n_sigactions != 0) + ret = prepare_sigactions_from_core(core->tc); + else + ret = prepare_sigactions_from_image(); + + if (stack32) { + free_compat_syscall_stack(stack32); + stack32 = NULL; + } + + return ret; +} + +int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *item) +{ + TaskCoreEntry *tc = item->core[0]->tc; + struct parasite_dump_sa_args *args; + int ret, sig; + SaEntry *sa, **psa; + + args = compel_parasite_args(ctl, struct parasite_dump_sa_args); + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_SIGACTS, ctl); + if (ret < 0) + return ret; + + psa = xmalloc((SIGMAX - 2) * (sizeof(SaEntry *) + sizeof(SaEntry))); + if (!psa) + return -1; + + sa = (SaEntry *)(psa + SIGMAX - 2); + + tc->n_sigactions = SIGMAX - 2; + tc->sigactions = psa; + + for (sig = 1; sig <= SIGMAX; sig++) { + int i = sig - 1; + + if (sig == SIGSTOP || sig == SIGKILL) + continue; + + sa_entry__init(sa); + ASSIGN_TYPED(sa->sigaction, encode_pointer(args->sas[i].rt_sa_handler)); + ASSIGN_TYPED(sa->flags, args->sas[i].rt_sa_flags); + ASSIGN_TYPED(sa->restorer, encode_pointer(args->sas[i].rt_sa_restorer)); +#ifdef CONFIG_MIPS + sa->has_mask_extended = 1; + BUILD_BUG_ON(sizeof(sa->mask) * 2 != sizeof(args->sas[0].rt_sa_mask.sig)); + memcpy(&sa->mask, &(args->sas[i].rt_sa_mask.sig[0]), sizeof(sa->mask)); + memcpy(&sa->mask_extended, &(args->sas[i].rt_sa_mask.sig[1]), sizeof(sa->mask)); +#else + BUILD_BUG_ON(sizeof(sa->mask) != sizeof(args->sas[0].rt_sa_mask.sig)); + memcpy(&sa->mask, args->sas[i].rt_sa_mask.sig, sizeof(sa->mask)); +#endif + sa->has_compat_sigaction = true; + sa->compat_sigaction = !compel_mode_native(ctl); + + *(psa++) = sa++; + } + + return 0; +} diff --git a/criu/sk-inet.c b/criu/sk-inet.c index e52b198c3..422edc656 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include "../soccr/soccr.h" @@ -42,6 +44,11 @@ #define PB_ALEN_INET 1 #define PB_ALEN_INET6 4 +/* Definition for older kernels without MPTCP support (e.g. Ubuntu 20.04) */ +#ifndef IPPROTO_MPTCP +#define IPPROTO_MPTCP 262 +#endif + static LIST_HEAD(inet_ports); struct inet_port { @@ -123,9 +130,13 @@ static int can_dump_ipproto(unsigned int ino, int proto, int type) case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: break; default: pr_err("Unsupported proto %d for socket %x\n", proto, ino); + if (proto == IPPROTO_MPTCP) + pr_err("For Go programs, consider using \"GODEBUG=multipathtcp=0\" to disable MPTCP\n"); return 0; } @@ -388,6 +399,10 @@ static int dump_ip_raw_opts(int sk, int family, int proto, IpOptsRawEntry *r) return ret; } +#ifndef IPV6_FREEBIND +#define IPV6_FREEBIND 78 +#endif + static int dump_ip_opts(int sk, int family, int type, int proto, IpOptsEntry *ioe) { int ret = 0; @@ -398,12 +413,26 @@ static int dump_ip_opts(int sk, int family, int type, int proto, IpOptsEntry *io * and fetch additional options. */ ret |= dump_ip_raw_opts(sk, family, proto, ioe->raw); - } else { - /* Due to kernel code we can use SOL_IP instead of SOL_IPV6 */ - ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); - ioe->has_freebind = ioe->freebind; } + if (family == AF_INET6) { + if (kdat.has_ipv6_freebind) + ret |= dump_opt(sk, SOL_IPV6, IPV6_FREEBIND, &ioe->freebind); + else if (type != SOCK_RAW) + /* Due to kernel code we can use SOL_IP instead of SOL_IPV6 */ + ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); + ret |= dump_opt(sk, SOL_IPV6, IPV6_RECVPKTINFO, &ioe->pktinfo); + } else { + ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); + ret |= dump_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); + ret |= dump_opt(sk, SOL_IP, IP_TOS, &ioe->tos); + ret |= dump_opt(sk, SOL_IP, IP_TTL, &ioe->ttl); + } + ioe->has_freebind = ioe->freebind; + ioe->has_pktinfo = !!ioe->pktinfo; + ioe->has_tos = !!ioe->tos; + ioe->has_ttl = !!ioe->ttl; + return ret; } @@ -434,6 +463,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa IpOptsEntry ipopts = IP_OPTS_ENTRY__INIT; IpOptsRawEntry ipopts_raw = IP_OPTS_RAW_ENTRY__INIT; SkOptsEntry skopts = SK_OPTS_ENTRY__INIT; + TcpOptsEntry tcpopts = TCP_OPTS_ENTRY__INIT; int ret = -1, err = -1, proto, aux, type; ret = do_dump_opt(lfd, SOL_SOCKET, SO_PROTOCOL, &proto, sizeof(proto)); @@ -501,6 +531,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa ie.opts = &skopts; ie.ip_opts = &ipopts; ie.ip_opts->raw = &ipopts_raw; + ie.tcp_opts = &tcpopts; ie.n_src_addr = PB_ALEN_INET; ie.n_dst_addr = PB_ALEN_INET; @@ -550,7 +581,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa if (dump_ip_opts(lfd, family, type, proto, &ipopts)) goto err; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, family, &skopts)) goto err; pr_info("Dumping inet socket at %d\n", p->fd); @@ -561,9 +592,20 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa switch (proto) { case IPPROTO_TCP: - err = (type != SOCK_RAW) ? dump_one_tcp(lfd, sk, &skopts) : 0; if (sk->shutdown) sk_encode_shutdown(&ie, sk->shutdown); + + if (type == SOCK_RAW) { + err = 0; + } else { + err = dump_tcp_opts(lfd, &tcpopts); + if (err < 0) + goto err; + + err = dump_one_tcp(lfd, sk, &skopts); + if (err < 0) + goto err; + } break; case IPPROTO_UDP: case IPPROTO_UDPLITE: @@ -787,8 +829,21 @@ int restore_ip_opts(int sk, int family, int proto, IpOptsEntry *ioe) { int ret = 0; - if (ioe->has_freebind) - ret |= restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); + if (family == AF_INET6) { + if (ioe->has_freebind) + ret |= restore_opt(sk, SOL_IPV6, IPV6_FREEBIND, &ioe->freebind); + if (ioe->has_pktinfo) + ret |= restore_opt(sk, SOL_IPV6, IPV6_RECVPKTINFO, &ioe->pktinfo); + } else { + if (ioe->has_freebind) + ret |= restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); + if (ioe->has_pktinfo) + ret |= restore_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); + if (ioe->has_tos) + ret |= restore_opt(sk, SOL_IP, IP_TOS, &ioe->tos); + if (ioe->has_ttl) + ret |= restore_opt(sk, SOL_IP, IP_TTL, &ioe->ttl); + } if (ioe->raw) ret |= restore_ip_raw_opts(sk, family, proto, ioe->raw); @@ -869,8 +924,9 @@ static int open_inet_sk(struct file_desc *d, int *new_fd) } if (ie->src_port) { - if (inet_bind(sk, ii)) - goto err; + if (ie->proto != IPPROTO_ICMP && ie->proto != IPPROTO_ICMPV6) + if (inet_bind(sk, ii)) + goto err; } /* @@ -906,6 +962,9 @@ done: if (restore_socket_opts(sk, ie->opts)) goto err; + if (ie->proto == IPPROTO_TCP && restore_tcp_opts(sk, ie->tcp_opts)) + goto err; + if (ie->has_shutdown && (ie->proto == IPPROTO_UDP || ie->proto == IPPROTO_UDPLITE || ie->proto == IPPROTO_TCP)) { if (shutdown(sk, sk_decode_shutdown(ie->shutdown))) { diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c index 754eed932..dc2baa1b8 100644 --- a/criu/sk-netlink.c +++ b/criu/sk-netlink.c @@ -161,11 +161,11 @@ static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p) ne.protocol = val; } - + ne.flags = p->flags; ne.fown = (FownEntry *)&p->fown; ne.opts = &skopts; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, AF_NETLINK, &skopts)) goto err; fe.type = FD_TYPES__NETLINKSK; diff --git a/criu/sk-packet.c b/criu/sk-packet.c index 1d2e23522..6530bff58 100644 --- a/criu/sk-packet.c +++ b/criu/sk-packet.c @@ -173,7 +173,7 @@ static int dump_one_packet_fd(int lfd, u32 id, const struct fd_parms *p) psk.fown = (FownEntry *)&p->fown; psk.opts = &skopts; - if (dump_socket_opts(lfd, &skopts)) + if (dump_socket_opts(lfd, AF_PACKET, &skopts)) return -1; psk.protocol = sd->proto; diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c index 96d5d13bf..9c8bad1c3 100644 --- a/criu/sk-tcp.c +++ b/criu/sk-tcp.c @@ -39,6 +39,8 @@ static int lock_connection(struct inet_sk_desc *sk) return iptables_lock_connection(sk); else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) return nftables_lock_connection(sk); + else if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; return -1; } @@ -50,6 +52,8 @@ static int unlock_connection(struct inet_sk_desc *sk) else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) /* All connections will be unlocked in network_unlock(void) */ return 0; + else if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; return -1; } @@ -131,7 +135,8 @@ void cpt_unlock_tcp_connections(void) static int dump_tcp_conn_state(struct inet_sk_desc *sk) { struct libsoccr_sk *socr = sk->priv; - int ret, aux; + int exit_code = -1; + int ret; struct cr_img *img; TcpStreamEntry tse = TCP_STREAM_ENTRY__INIT; char *buf; @@ -140,11 +145,11 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) ret = libsoccr_save(socr, &data, sizeof(data)); if (ret < 0) { pr_err("libsoccr_save() failed with %d\n", ret); - goto err_r; + goto err; } if (ret != sizeof(data)) { pr_err("This libsocr is not supported (%d vs %d)\n", ret, (int)sizeof(data)); - goto err_r; + goto err; } sk->state = data.state; @@ -181,43 +186,22 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) tse.rcv_wup = data.rcv_wup; } - /* - * TCP socket options - */ - - if (dump_opt(sk->rfd, SOL_TCP, TCP_NODELAY, &aux)) - goto err_opt; - - if (aux) { - tse.has_nodelay = true; - tse.nodelay = true; - } - - if (dump_opt(sk->rfd, SOL_TCP, TCP_CORK, &aux)) - goto err_opt; - - if (aux) { - tse.has_cork = true; - tse.cork = true; - } - /* * Push the stuff to image */ - img = open_image(CR_FD_TCP_STREAM, O_DUMP, sk->sd.ino); if (!img) - goto err_img; + goto err; ret = pb_write_one(img, &tse, PB_TCP_STREAM); if (ret < 0) - goto err_iw; + goto err_close; buf = libsoccr_get_queue_bytes(socr, TCP_RECV_QUEUE, SOCCR_MEM_EXCL); if (buf) { ret = write_img_buf(img, buf, tse.inq_len); if (ret < 0) - goto err_iw; + goto err_close; xfree(buf); } @@ -226,40 +210,40 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) if (buf) { ret = write_img_buf(img, buf, tse.outq_len); if (ret < 0) - goto err_iw; + goto err_close; xfree(buf); } pr_info("Done\n"); -err_iw: + exit_code = 0; +err_close: close_image(img); -err_img: -err_opt: -err_r: +err: + return exit_code; +} + +int dump_tcp_opts(int fd, TcpOptsEntry *toe) +{ + int ret = 0; + + ret |= dump_opt(fd, SOL_TCP, TCP_NODELAY, &toe->nodelay); + ret |= dump_opt(fd, SOL_TCP, TCP_CORK, &toe->cork); + ret |= dump_opt(fd, SOL_TCP, TCP_KEEPCNT, &toe->keepcnt); + ret |= dump_opt(fd, SOL_TCP, TCP_KEEPIDLE, &toe->keepidle); + ret |= dump_opt(fd, SOL_TCP, TCP_KEEPINTVL, &toe->keepintvl); + + toe->has_nodelay = !!toe->nodelay; + toe->has_cork = !!toe->cork; + toe->has_keepcnt = !!toe->keepcnt; + toe->has_keepidle = !!toe->keepidle; + toe->has_keepintvl = !!toe->keepintvl; + return ret; } int dump_one_tcp(int fd, struct inet_sk_desc *sk, SkOptsEntry *soe) { - soe->has_tcp_keepcnt = true; - if (dump_opt(fd, SOL_TCP, TCP_KEEPCNT, &soe->tcp_keepcnt)) { - pr_perror("Can't read TCP_KEEPCNT"); - return -1; - } - - soe->has_tcp_keepidle = true; - if (dump_opt(fd, SOL_TCP, TCP_KEEPIDLE, &soe->tcp_keepidle)) { - pr_perror("Can't read TCP_KEEPIDLE"); - return -1; - } - - soe->has_tcp_keepintvl = true; - if (dump_opt(fd, SOL_TCP, TCP_KEEPINTVL, &soe->tcp_keepintvl)) { - pr_perror("Can't read TCP_KEEPINTVL"); - return -1; - } - if (sk->dst_port == 0) return 0; @@ -393,6 +377,11 @@ static int restore_tcp_conn_state(int sk, struct libsoccr_sk *socr, struct inet_ if (libsoccr_restore(socr, &data, sizeof(data))) goto err_c; + /* + * Restoring TCP socket options in TcpStreamEntry is + * for backward compatibility only, newer versions + * of CRIU use TcpOptsEntry. + */ if (tse->has_nodelay && tse->nodelay) { aux = 1; if (restore_opt(sk, SOL_TCP, TCP_NODELAY, &aux)) @@ -445,6 +434,27 @@ int prepare_tcp_socks(struct task_restore_args *ta) return 0; } +int restore_tcp_opts(int sk, TcpOptsEntry *toe) +{ + int ret = 0; + + if(!toe) + return ret; + + if (toe->has_nodelay) + ret |= restore_opt(sk, SOL_TCP, TCP_NODELAY, &toe->nodelay); + if (toe->has_cork) + ret |= restore_opt(sk, SOL_TCP, TCP_CORK, &toe->cork); + if (toe->has_keepcnt) + ret |= restore_opt(sk, SOL_TCP, TCP_KEEPCNT, &toe->keepcnt); + if (toe->has_keepidle) + ret |= restore_opt(sk, SOL_TCP, TCP_KEEPIDLE, &toe->keepidle); + if (toe->has_keepintvl) + ret |= restore_opt(sk, SOL_TCP, TCP_KEEPINTVL, &toe->keepintvl); + + return ret; +} + int restore_one_tcp(int fd, struct inet_sk_info *ii) { struct libsoccr_sk *sk; @@ -483,6 +493,8 @@ static int unlock_connection_info(struct inet_sk_info *si) else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) /* All connections will be unlocked in network_unlock(void) */ return 0; + else if (opts.network_lock_method == NETWORK_LOCK_SKIP) + return 0; return -1; } diff --git a/criu/sk-unix.c b/criu/sk-unix.c index c6021bc1f..6145fe734 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -221,7 +221,7 @@ int kerndat_socket_unix_file(void) } fd = ioctl(sk, SIOCUNIXFILE); if (fd < 0 && errno != ENOENT) { - pr_warn("Unable to open a socket file: %m\n"); + pr_warn("Unable to open a socket file: %s\n", strerror(errno)); kdat.sk_unix_file = false; close(sk); return 0; @@ -497,12 +497,37 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) goto err; } + if (sk->wqlen != 0) { + /* + * There's no known way to get data out of the write + * queue of an icon socket. The only good solution for + * now is to fail the migration. + */ + pr_err("Non-empty write queue on an in-flight socket %#x\n", ue->ino); + goto err; + } + ue->peer = e->sk_desc->sd.ino; pr_debug("\t\tFixed inflight socket %u peer %u)\n", ue->ino, ue->peer); + } else if (ue->state == TCP_LISTEN) { + int i; + + for (i = 0; i < sk->nr_icons; i++) + if (sk->icons[i] == 0) { + /* + * Inode of an icon socket equal to 0 means + * it's already been closed. That means we have + * no simple way to check if it sent any data. + * The only good solution for now is to fail + * the migration. + */ + pr_err("Found a closed in-flight socket to %#x\n", ue->ino); + goto err; + } } dump: - if (dump_socket_opts(lfd, skopts)) + if (dump_socket_opts(lfd, AF_UNIX, skopts)) goto err; pr_info("Dumping unix socket at %d\n", p->fd); @@ -570,14 +595,14 @@ static int unix_resolve_name_old(int lfd, uint32_t id, struct unix_sk_desc *d, U else ns = lookup_ns_by_id(root_item->ids->mnt_ns_id, &mnt_ns_desc); if (!ns) { - ret = -ENOENT; - goto out; + pr_err("Failed to lookup ns by mnt id %d\n", ue->mnt_id); + return -1; } mntns_root = mntns_get_root_fd(ns); if (mntns_root < 0) { - ret = -ENOENT; - goto out; + pr_err("Failed to lookup mntns root for ns %d\n", ns->id); + return -1; } if (name[0] != '/') { @@ -588,15 +613,15 @@ static int unix_resolve_name_old(int lfd, uint32_t id, struct unix_sk_desc *d, U ret = resolve_rel_name(id, d, p, &ue->name_dir); if (ret < 0) - goto out; - goto postprone; + return -1; + return 0; } snprintf(rpath, sizeof(rpath), ".%s", name); if (fstatat(mntns_root, rpath, &st, 0)) { if (errno != ENOENT) { - pr_warn("Can't stat socket %#" PRIx32 "(%s), skipping: %m (err %d)\n", id, rpath, errno); - goto skip; + pr_perror("Can't stat socket %#" PRIx32 "(%s)", id, rpath); + return -1; } pr_info("unix: Dropping path %s for unlinked sk %#x\n", name, id); @@ -614,92 +639,77 @@ static int unix_resolve_name_old(int lfd, uint32_t id, struct unix_sk_desc *d, U d->deleted = deleted; -postprone: return 0; - -out: - xfree(name); - return ret; -skip: - ret = 1; - goto out; } static int unix_resolve_name(int lfd, uint32_t id, struct unix_sk_desc *d, UnixSkEntry *ue, const struct fd_parms *p) { char *name = d->name; - char path[PATH_MAX], tmp[PATH_MAX]; + char path[PATH_MAX]; struct stat st; - int fd, proc_fd, mnt_id, ret; + int fd, ret; + int exit_code = -1; if (d->namelen == 0 || name[0] == '\0') return 0; - if (kdat.sk_unix_file && (root_ns_mask & CLONE_NEWNS)) { - if (get_mnt_id(lfd, &mnt_id)) + if (!kdat.sk_unix_file) { + pr_warn("Trying to resolve unix socket with obsolete method\n"); + if (unix_resolve_name_old(lfd, id, d, ue, p)) { + pr_err("Unable to resolve unix socket name with obsolete method. " + "Try a linux kernel newer than 4.10\n"); return -1; - ue->mnt_id = mnt_id; - ue->has_mnt_id = true; + } + return 0; } fd = ioctl(lfd, SIOCUNIXFILE); if (fd < 0) { - pr_warn("Unable to get a socket file descriptor with SIOCUNIXFILE ioctl: %m\n"); - goto fallback; + pr_perror("Unable to get a socket file descriptor with SIOCUNIXFILE ioctl"); + return -1; } - ret = fstat(fd, &st); - if (ret) { + if (root_ns_mask & CLONE_NEWNS) { + struct fdinfo_common fdinfo = { .mnt_id = -1 }; + + if (parse_fdinfo(fd, FD_TYPES__UND, &fdinfo)) + goto out; + + ue->mnt_id = fdinfo.mnt_id; + ue->has_mnt_id = true; + } + + if (fstat(fd, &st)) { pr_perror("Unable to fstat socket fd"); - return -1; + goto out; } d->mode = st.st_mode; d->uid = st.st_uid; d->gid = st.st_gid; - proc_fd = get_service_fd(PROC_FD_OFF); - if (proc_fd < 0) { - pr_err("Unable to get service fd for proc\n"); - return -1; - } - - snprintf(tmp, sizeof(tmp), "self/fd/%d", fd); - ret = readlinkat(proc_fd, tmp, path, PATH_MAX); - if (ret < 0 && ret >= PATH_MAX) { - pr_perror("Unable to readlink %s", tmp); + ret = read_fd_link(fd, path, sizeof(path)); + if (ret < 0) goto out; - } - path[ret] = 0; d->deleted = strip_deleted(path, ret); if (name[0] != '/') { - ret = cut_path_ending(path, name); - if (ret) { - pr_err("Unable too resolve %s from %s\n", name, path); + if (cut_path_ending(path, name)) { + pr_err("Unable too cut %s from %s\n", name, path); goto out; } ue->name_dir = xstrdup(path); - if (!ue->name_dir) { - ret = -ENOMEM; + if (!ue->name_dir) goto out; - } pr_debug("Resolved socket relative name %s to %s/%s\n", name, ue->name_dir, name); } - ret = 0; + exit_code = 0; out: close(fd); - return ret; - -fallback: - pr_warn("Trying to resolve unix socket with obsolete method\n"); - ret = unix_resolve_name_old(lfd, id, d, ue, p); - if (ret < 0) - pr_err("Unable to resolve unix socket name with obsolete method. Try a linux kernel newer than 4.10\n"); - return ret; + return exit_code; } /* @@ -868,7 +878,8 @@ static int __dump_external_socket(struct unix_sk_desc *sk, struct unix_sk_desc * if (peer->type != SOCK_DGRAM) { show_one_unix("Ext stream not supported", peer); - pr_err("Can't dump half of stream unix connection.\n"); + pr_err("Can't dump half of stream unix connection. name: %s; peer name: %s\n", + sk->name, peer->name); return -1; } @@ -1021,8 +1032,8 @@ static struct unix_sk_info *find_queuer_for(int id) struct unix_sk_info *ui; list_for_each_entry(ui, &unix_sockets, list) { - if (ui->queuer && ui->queuer->ue->id == id) - return ui; + if (ui->queuer && ui->ue->id == id) + return ui->queuer; } return NULL; @@ -1420,32 +1431,22 @@ err_revert_and_exit: static int restore_file_perms(struct unix_sk_info *ui) { - if (ui->ue->file_perms) { - FilePermsEntry *perms = ui->ue->file_perms; - char fname[PATH_MAX]; + FilePermsEntry *perms = ui->ue->file_perms; + char fname[PATH_MAX]; - if (ui->ue->name.len >= sizeof(fname)) { - pr_err("The file name is too long\n"); - return -E2BIG; - } + if (!perms) + return 0; - memcpy(fname, ui->name, ui->ue->name.len); - fname[ui->ue->name.len] = '\0'; - - if (fchownat(AT_FDCWD, fname, perms->uid, perms->gid, 0) < 0) { - int errno_cpy = errno; - pr_perror("Unable to change file owner and group"); - return -errno_cpy; - } - - if (fchmodat(AT_FDCWD, fname, perms->mode, 0) < 0) { - int errno_cpy = errno; - pr_perror("Unable to change file mode bits"); - return -errno_cpy; - } + if (ui->ue->name.len >= sizeof(fname)) { + pr_err("The file name is too long\n"); + errno = -E2BIG; + return -1; } - return 0; + memcpy(fname, ui->name, ui->ue->name.len); + fname[ui->ue->name.len] = '\0'; + + return cr_fchpermat(AT_FDCWD, fname, perms->uid, perms->gid, perms->mode, 0); } static int keep_deleted(struct unix_sk_info *ui) diff --git a/criu/sockets.c b/criu/sockets.c index db772707b..e4adae03c 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -29,6 +29,7 @@ #include "pstree.h" #include "util.h" #include "fdstore.h" +#include "cr_options.h" #undef LOG_PREFIX #define LOG_PREFIX "sockets: " @@ -37,7 +38,7 @@ #define SOCK_DIAG_BY_FAMILY 20 #endif -#define SK_HASH_SIZE 32 +#define SK_HASH_SIZE (1 << 14) #ifndef SO_GET_FILTER #define SO_GET_FILTER SO_ATTACH_FILTER @@ -64,7 +65,7 @@ const char *socket_proto_name(unsigned int proto, char *nm, size_t size) [IPPROTO_IPV6] = __stringify_1(IPPROTO_IPV6), [IPPROTO_RSVP] = __stringify_1(IPPROTO_RSVP), [IPPROTO_GRE] = __stringify_1(IPPROTO_GRE), [IPPROTO_ESP] = __stringify_1(IPPROTO_ESP), [IPPROTO_AH] = __stringify_1(IPPROTO_AH), [IPPROTO_UDPLITE] = __stringify_1(IPPROTO_UDPLITE), - [IPPROTO_RAW] = __stringify_1(IPPROTO_RAW), + [IPPROTO_RAW] = __stringify_1(IPPROTO_RAW), [IPPROTO_ICMPV6] = __stringify_1(IPPROTO_ICMPV6), }; return __socket_const_name(nm, size, protos, ARRAY_SIZE(protos), proto); } @@ -130,10 +131,12 @@ enum socket_cl_bits { INET_UDP_CL_BIT, INET_UDPLITE_CL_BIT, INET_RAW_CL_BIT, + INET_ICMP_CL_BIT, INET6_TCP_CL_BIT, INET6_UDP_CL_BIT, INET6_UDPLITE_CL_BIT, INET6_RAW_CL_BIT, + INET6_ICMP_CL_BIT, UNIX_CL_BIT, PACKET_CL_BIT, _MAX_CL_BIT, @@ -160,6 +163,8 @@ static inline enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsign return INET_UDPLITE_CL_BIT; if (proto == IPPROTO_RAW) return INET_RAW_CL_BIT; + if (proto == IPPROTO_ICMP) + return INET_ICMP_CL_BIT; } if (family == AF_INET6) { if (proto == IPPROTO_TCP) @@ -170,6 +175,8 @@ static inline enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsign return INET6_UDPLITE_CL_BIT; if (proto == IPPROTO_RAW) return INET6_RAW_CL_BIT; + if (proto == IPPROTO_ICMPV6) + return INET6_ICMP_CL_BIT; } pr_err("Unknown pair family %d proto %d\n", family, proto); @@ -281,6 +288,12 @@ void preload_socket_modules(void) req.r.i.sdiag_protocol = IPPROTO_RAW; probe_diag(nl, &req, -ENOENT); + req.r.i.sdiag_protocol = IPPROTO_ICMP; + probe_diag(nl, &req, -ENOENT); + + req.r.i.sdiag_protocol = IPPROTO_ICMPV6; + probe_diag(nl, &req, -ENOENT); + close(nl); pr_info("Done probing\n"); } @@ -465,18 +478,33 @@ int do_restore_opt(int sk, int level, int name, void *val, int len) return 0; } -static int sk_setbufs(void *arg, int fd, pid_t pid) +int sk_setbufs(int sk, uint32_t *bufs) { - u32 *buf = (u32 *)arg; + uint32_t sndbuf = bufs[0], rcvbuf = bufs[1]; - if (restore_opt(fd, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0])) - return -1; - if (restore_opt(fd, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1])) - return -1; + if (setsockopt(sk, SOL_SOCKET, SO_SNDBUFFORCE, &sndbuf, sizeof(sndbuf)) || + setsockopt(sk, SOL_SOCKET, SO_RCVBUFFORCE, &rcvbuf, sizeof(rcvbuf))) { + if (opts.unprivileged) { + pr_info("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE, falling back to SO_SNDBUF/SO_RCVBUF\n"); + if (setsockopt(sk, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf)) || + setsockopt(sk, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf))) { + pr_perror("Unable to set socket SO_SNDBUF/SO_RCVBUF"); + return -1; + } + } else { + pr_perror("Unable to set socket SO_SNDBUFFORCE/SO_RCVBUFFORCE"); + return -1; + } + } return 0; } +static int sk_setbufs_ns(void *arg, int fd, pid_t pid) +{ + return sk_setbufs(fd, (uint32_t *)arg); +} + /* * Set sizes of buffers to maximum and prevent blocking * Caller of this fn should call other socket restoring @@ -489,7 +517,7 @@ int restore_prepare_socket(int sk) /* In kernel a bufsize has type int and a value is doubled. */ u32 maxbuf[2] = { INT_MAX / 2, INT_MAX / 2 }; - if (userns_call(sk_setbufs, 0, maxbuf, sizeof(maxbuf), sk)) + if (userns_call(sk_setbufs_ns, 0, maxbuf, sizeof(maxbuf), sk)) return -1; /* Prevent blocking on restore */ @@ -517,7 +545,7 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) pr_info("%d restore sndbuf %d rcv buf %d\n", sk, soe->so_sndbuf, soe->so_rcvbuf); /* setsockopt() multiplies the input values by 2 */ - ret |= userns_call(sk_setbufs, 0, bufs, sizeof(bufs), sk); + ret |= userns_call(sk_setbufs_ns, 0, bufs, sizeof(bufs), sk); if (soe->has_so_buf_lock) { pr_debug("\trestore buf_lock %d for socket\n", soe->so_buf_lock); @@ -569,6 +597,12 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) pr_debug("\tset keepalive for socket\n"); ret |= restore_opt(sk, SOL_SOCKET, SO_KEEPALIVE, &val); } + + /* + * Restoring TCP socket options in SkOptsEntry is + * for backward compatibility only, newer versions + * of CRIU use TcpOptsEntry. + */ if (soe->has_tcp_keepcnt) { pr_debug("\tset keepcnt for socket\n"); ret |= restore_opt(sk, SOL_TCP, TCP_KEEPCNT, &soe->tcp_keepcnt); @@ -615,7 +649,7 @@ int do_dump_opt(int sk, int level, int name, void *val, int len) return 0; } -int dump_socket_opts(int sk, SkOptsEntry *soe) +int dump_socket_opts(int sk, int family, SkOptsEntry *soe) { int ret = 0, val; struct timeval tv; @@ -631,8 +665,12 @@ int dump_socket_opts(int sk, SkOptsEntry *soe) ret |= dump_opt(sk, SOL_SOCKET, SO_PRIORITY, &soe->so_priority); soe->has_so_rcvlowat = true; ret |= dump_opt(sk, SOL_SOCKET, SO_RCVLOWAT, &soe->so_rcvlowat); - soe->has_so_mark = true; + /* + * Restoring SO_MARK requires root or CAP_NET_ADMIN. Avoid saving it + * in unprivileged mode if still has its default value. + */ ret |= dump_opt(sk, SOL_SOCKET, SO_MARK, &soe->so_mark); + soe->has_so_mark = !!soe->so_mark; ret |= dump_opt(sk, SOL_SOCKET, SO_SNDTIMEO, &tv); soe->so_snd_tmo_sec = tv.tv_sec; @@ -650,13 +688,15 @@ int dump_socket_opts(int sk, SkOptsEntry *soe) soe->so_reuseport = val ? true : false; soe->has_so_reuseport = true; - ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); - soe->has_so_passcred = true; - soe->so_passcred = val ? true : false; + if (family == AF_UNIX || family == AF_NETLINK) { + ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); + soe->has_so_passcred = true; + soe->so_passcred = val ? true : false; - ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); - soe->has_so_passsec = true; - soe->so_passsec = val ? true : false; + ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); + soe->has_so_passsec = true; + soe->so_passsec = val ? true : false; + } ret |= dump_opt(sk, SOL_SOCKET, SO_DONTROUTE, &val); soe->has_so_dontroute = true; @@ -747,6 +787,10 @@ static int inet_receive_one(struct nlmsghdr *h, struct ns_id *ns, void *arg) case IPPROTO_RAW: type = SOCK_RAW; break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + type = SOCK_DGRAM; + break; default: BUG_ON(1); return -1; @@ -771,7 +815,7 @@ static int collect_err(int err, struct ns_id *ns, void *arg) char family[32], proto[32]; char msg[256]; - snprintf(msg, sizeof(msg), "Sockects collect procedure family %s proto %s", + snprintf(msg, sizeof(msg), "Sockets collect procedure family %s proto %s", socket_family_name(gr->family, family, sizeof(family)), socket_proto_name(gr->protocol, proto, sizeof(proto))); @@ -879,6 +923,13 @@ int collect_sockets(struct ns_id *ns) if (tmp) err = tmp; + /* Collect IPv4 ICMP sockets */ + req.r.i.sdiag_family = AF_INET; + req.r.i.sdiag_protocol = IPPROTO_ICMP; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + set_collect_bit(req.r.n.sdiag_family, req.r.n.sdiag_protocol); + /* Collect IPv6 TCP sockets */ req.r.i.sdiag_family = AF_INET6; req.r.i.sdiag_protocol = IPPROTO_TCP; @@ -918,6 +969,13 @@ int collect_sockets(struct ns_id *ns) if (tmp) err = tmp; + /* Collect IPv6 ICMP sockets */ + req.r.i.sdiag_family = AF_INET6; + req.r.i.sdiag_protocol = IPPROTO_ICMPV6; + req.r.i.idiag_ext = 0; + req.r.i.idiag_states = -1; /* All */ + set_collect_bit(req.r.n.sdiag_family, req.r.n.sdiag_protocol); + req.r.p.sdiag_family = AF_PACKET; req.r.p.sdiag_protocol = 0; req.r.p.pdiag_show = PACKET_SHOW_INFO | PACKET_SHOW_MCLIST | PACKET_SHOW_FANOUT | PACKET_SHOW_RING_CFG; diff --git a/criu/string.c b/criu/string.c index 7df0b3e09..7edd35363 100644 --- a/criu/string.c +++ b/criu/string.c @@ -6,7 +6,6 @@ #include "string.h" -#ifndef CONFIG_HAS_STRLCPY /** * strlcpy - Copy a %NUL terminated string into a sized buffer * @dest: Where to copy the string to @@ -18,7 +17,7 @@ * of course, the buffer size is zero). It does not pad * out the result like strncpy() does. */ -size_t strlcpy(char *dest, const char *src, size_t size) +size_t __strlcpy(char *dest, const char *src, size_t size) { size_t ret = strlen(src); @@ -29,16 +28,14 @@ size_t strlcpy(char *dest, const char *src, size_t size) } return ret; } -#endif -#ifndef CONFIG_HAS_STRLCAT /** * strlcat - Append a length-limited, %NUL-terminated string to another * @dest: The string to be appended to * @src: The string to append to it * @count: The size of the destination buffer. */ -size_t strlcat(char *dest, const char *src, size_t count) +size_t __strlcat(char *dest, const char *src, size_t count) { size_t dsize = strlen(dest); size_t len = strlen(src); @@ -57,4 +54,3 @@ size_t strlcat(char *dest, const char *src, size_t count) dest[len] = 0; return res; } -#endif diff --git a/criu/sysctl.c b/criu/sysctl.c index b06688712..99026acf4 100644 --- a/criu/sysctl.c +++ b/criu/sysctl.c @@ -203,6 +203,17 @@ static int __userns_sysctl_op(void *arg, int proc_fd, pid_t pid) * 2. forks a task * 3. setns()es to the UTS/IPC namespace of the caller * 4. write()s to the files and exits + * + * For the IPC namespace, since + * https://github.com/torvalds/linux/commit/5563cabdde, user with + * enough capability can open IPC sysctl files and write to it. Later + * commit https://github.com/torvalds/linux/commit/1f5c135ee5 and + * https://github.com/torvalds/linux/commit/0889f44e28 bind the IPC + * namespace at the open() time so the changed value does not depend + * on the IPC namespace at the write() time. Also, the permission check + * changes a little bit which makes the above approach unusable but we + * can simply use nonuserns version for restoring as IPC sysctl as the + * restored process currently has enough capability. */ dir = open("/proc/sys", O_RDONLY, O_DIRECTORY); if (dir < 0) { @@ -335,9 +346,12 @@ out: return ret; } -static int __nonuserns_sysctl_op(struct sysctl_req *req, size_t nr_req, int op) +/* exit_code = 1 in case nonuserns failed but we want to fallback to userns approach */ +static int __nonuserns_sysctl_op(struct sysctl_req **orig_req, size_t *orig_nr_req, int op) { int ret, exit_code = -1; + struct sysctl_req *req = *orig_req; + size_t nr_req = *orig_nr_req; while (nr_req--) { int fd; @@ -351,6 +365,14 @@ static int __nonuserns_sysctl_op(struct sysctl_req *req, size_t nr_req, int op) req++; continue; } + if (errno == EACCES && (req->flags & CTL_FLAGS_IPC_EACCES_SKIP)) { + /* The remaining requests are restored using userns approach */ + *orig_req = req; + *orig_nr_req = nr_req + 1; + exit_code = 1; + goto out; + } + pr_perror("Can't open sysctl %s", req->name); goto out; } @@ -404,7 +426,16 @@ int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns) * so we can do those in process as well. */ if (!ns || ns & CLONE_NEWNET || op == CTL_READ) - return __nonuserns_sysctl_op(req, nr_req, op); + return __nonuserns_sysctl_op(&req, &nr_req, op); + + /* Try to use nonuserns for restoring IPC sysctl and fallback to + * userns approach when the returned code is 1. + */ + if (ns & CLONE_NEWIPC && op == CTL_WRITE) { + ret = __nonuserns_sysctl_op(&req, &nr_req, op); + if (ret <= 0) + return ret; + } /* * In order to avoid lots of opening of /proc/sys for each struct sysctl_req, diff --git a/criu/timens.c b/criu/timens.c index 5803fc359..257782e5a 100644 --- a/criu/timens.c +++ b/criu/timens.c @@ -5,6 +5,7 @@ #include "proc_parse.h" #include "namespaces.h" #include "timens.h" +#include "cr_options.h" #include "protobuf.h" #include "images/timens.pb-c.h" @@ -57,6 +58,9 @@ int prepare_timens(int id) struct timespec ts; struct timespec prev_moff = {}, prev_boff = {}; + if (opts.unprivileged) + return 0; + img = open_image(CR_FD_TIMENS, O_RSTR, id); if (!img) return -1; @@ -92,8 +96,8 @@ int prepare_timens(int id) ts.tv_nsec = te->monotonic->tv_nsec - ts.tv_nsec; normalize_timespec(&ts); - pr_debug("timens: monotonic %ld %ld\n", ts.tv_sec, ts.tv_nsec); - if (dprintf(fd, "%d %ld %ld\n", CLOCK_MONOTONIC, ts.tv_sec, ts.tv_nsec) < 0) { + pr_debug("timens: monotonic %" PRId64 " %ld\n", (int64_t)ts.tv_sec, ts.tv_nsec); + if (dprintf(fd, "%d %" PRId64 " %ld\n", CLOCK_MONOTONIC, (int64_t)ts.tv_sec, ts.tv_nsec) < 0) { pr_perror("Unable to set a monotonic clock offset"); goto err; } @@ -107,8 +111,8 @@ int prepare_timens(int id) ts.tv_nsec = te->boottime->tv_nsec - ts.tv_nsec; normalize_timespec(&ts); - pr_debug("timens: boottime %ld %ld\n", ts.tv_sec, ts.tv_nsec); - if (dprintf(fd, "%d %ld %ld\n", CLOCK_BOOTTIME, ts.tv_sec, ts.tv_nsec) < 0) { + pr_debug("timens: boottime %" PRId64 " %ld\n", (int64_t)ts.tv_sec, ts.tv_nsec); + if (dprintf(fd, "%d %" PRId64 " %ld\n", CLOCK_BOOTTIME, (int64_t)ts.tv_sec, ts.tv_nsec) < 0) { pr_perror("Unable to set a boottime clock offset"); goto err; } diff --git a/criu/timer.c b/criu/timer.c new file mode 100644 index 000000000..856501be6 --- /dev/null +++ b/criu/timer.c @@ -0,0 +1,402 @@ +#include "types.h" +#include "crtools.h" +#include "infect.h" +#include "protobuf.h" +#include "pstree.h" +#include "posix-timer.h" +#include "parasite.h" +#include "namespaces.h" +#include "rst-malloc.h" +#include "restorer.h" + +static inline int timeval_valid(struct timeval *tv) +{ + return (tv->tv_sec >= 0) && ((unsigned long)tv->tv_usec < USEC_PER_SEC); +} + +static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) +{ + if (ie->isec == 0 && ie->iusec == 0 && ie->vsec == 0 && ie->vusec == 0) { + memzero_p(val); + return 0; + } + + val->it_interval.tv_sec = ie->isec; + val->it_interval.tv_usec = ie->iusec; + + if (!timeval_valid(&val->it_interval)) { + pr_err("Invalid timer interval\n"); + return -1; + } + + if (ie->vsec == 0 && ie->vusec == 0) { + /* + * Remaining time was too short. Set it to + * interval to make the timer armed and work. + */ + val->it_value.tv_sec = ie->isec; + val->it_value.tv_usec = ie->iusec; + } else { + val->it_value.tv_sec = ie->vsec; + val->it_value.tv_usec = ie->vusec; + } + + if (!timeval_valid(&val->it_value)) { + pr_err("Invalid timer value\n"); + return -1; + } + + pr_info("Restored %s timer to %" PRId64 ".%" PRId64 " -> %" PRId64 ".%" PRId64 "\n", n, + (int64_t)val->it_value.tv_sec, (int64_t)val->it_value.tv_usec, + (int64_t)val->it_interval.tv_sec, (int64_t)val->it_interval.tv_usec); + + return 0; +} + +/* + * Legacy itimers restore from CR_FD_ITIMERS + */ + +int prepare_itimers_from_fd(int pid, struct task_restore_args *args) +{ + int ret = -1; + struct cr_img *img; + ItimerEntry *ie; + + if (!deprecated_ok("Itimers")) + return -1; + + img = open_image(CR_FD_ITIMERS, O_RSTR, pid); + if (!img) + return -1; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("real", ie, &args->itimers[0]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("virt", ie, &args->itimers[1]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("prof", ie, &args->itimers[2]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; +out: + close_image(img); + return ret; +} + +int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core) +{ + int ret = 0; + TaskTimersEntry *tte = core->tc->timers; + + if (!tte) + return prepare_itimers_from_fd(pid, args); + + ret |= decode_itimer("real", tte->real, &args->itimers[0]); + ret |= decode_itimer("virt", tte->virt, &args->itimers[1]); + ret |= decode_itimer("prof", tte->prof, &args->itimers[2]); + + return ret; +} + +static inline int timespec_valid(struct timespec *ts) +{ + return (ts->tv_sec >= 0) && ((unsigned long)ts->tv_nsec < NSEC_PER_SEC); +} + +static inline int decode_posix_timer(PosixTimerEntry *pte, struct restore_posix_timer *pt) +{ + pt->val.it_interval.tv_sec = pte->isec; + pt->val.it_interval.tv_nsec = pte->insec; + + if (!timespec_valid(&pt->val.it_interval)) { + pr_err("Invalid timer interval(posix)\n"); + return -1; + } + + if (pte->vsec == 0 && pte->vnsec == 0) { + /* + * Remaining time was too short. Set it to + * interval to make the timer armed and work. + */ + pt->val.it_value.tv_sec = pte->isec; + pt->val.it_value.tv_nsec = pte->insec; + } else { + pt->val.it_value.tv_sec = pte->vsec; + pt->val.it_value.tv_nsec = pte->vnsec; + } + + if (!timespec_valid(&pt->val.it_value)) { + pr_err("Invalid timer value(posix)\n"); + return -1; + } + + pt->spt.it_id = pte->it_id; + pt->spt.clock_id = pte->clock_id; + pt->spt.si_signo = pte->si_signo; + pt->spt.it_sigev_notify = pte->it_sigev_notify; + pt->spt.sival_ptr = decode_pointer(pte->sival_ptr); + pt->spt.notify_thread_id = pte->notify_thread_id; + pt->overrun = pte->overrun; + + return 0; +} + +static int cmp_posix_timer_proc_id(const void *p1, const void *p2) +{ + return ((struct restore_posix_timer *)p1)->spt.it_id - ((struct restore_posix_timer *)p2)->spt.it_id; +} + +static void sort_posix_timers(struct task_restore_args *ta) +{ + void *tmem; + + /* + * This is required for restorer's create_posix_timers(), + * it will probe them one-by-one for the desired ID, since + * kernel doesn't provide another API for timer creation + * with given ID. + */ + + if (ta->posix_timers_n > 0) { + tmem = rst_mem_remap_ptr((unsigned long)ta->posix_timers, RM_PRIVATE); + qsort(tmem, ta->posix_timers_n, sizeof(struct restore_posix_timer), cmp_posix_timer_proc_id); + } +} + +/* + * Legacy posix timers restoration from CR_FD_POSIX_TIMERS + */ + +int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) +{ + struct cr_img *img; + int ret = -1; + struct restore_posix_timer *t; + + if (!deprecated_ok("Posix timers")) + return -1; + + img = open_image(CR_FD_POSIX_TIMERS, O_RSTR, pid); + if (!img) + return -1; + + ta->posix_timer_cr_ids = kdat.has_timer_cr_ids; + ta->posix_timers_n = 0; + while (1) { + PosixTimerEntry *pte; + + ret = pb_read_one_eof(img, &pte, PB_POSIX_TIMER); + if (ret <= 0) + break; + + t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); + if (!t) + break; + + ret = decode_posix_timer(pte, t); + if (ret < 0) + break; + + posix_timer_entry__free_unpacked(pte, NULL); + ta->posix_timers_n++; + } + + close_image(img); + if (!ret) + sort_posix_timers(ta); + + return ret; +} + +int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) +{ + int i, ret = -1; + TaskTimersEntry *tte = core->tc->timers; + struct restore_posix_timer *t; + + ta->posix_timers = (struct restore_posix_timer *)rst_mem_align_cpos(RM_PRIVATE); + + if (!tte) + return prepare_posix_timers_from_fd(pid, ta); + + ta->posix_timers_n = tte->n_posix; + ta->posix_timer_cr_ids = kdat.has_timer_cr_ids; + for (i = 0; i < ta->posix_timers_n; i++) { + t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); + if (!t) + goto out; + + if (decode_posix_timer(tte->posix[i], t)) + goto out; + } + + ret = 0; + sort_posix_timers(ta); +out: + return ret; +} + +static void encode_itimer(struct itimerval *v, ItimerEntry *ie) +{ + ie->isec = v->it_interval.tv_sec; + ie->iusec = v->it_interval.tv_usec; + ie->vsec = v->it_value.tv_sec; + ie->vusec = v->it_value.tv_usec; +} + +int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item) +{ + CoreEntry *core = item->core[0]; + struct parasite_dump_itimers_args *args; + int ret; + + args = compel_parasite_args(ctl, struct parasite_dump_itimers_args); + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_ITIMERS, ctl); + if (ret < 0) + return ret; + + encode_itimer((&args->real), (core->tc->timers->real)); + encode_itimer((&args->virt), (core->tc->timers->virt)); + encode_itimer((&args->prof), (core->tc->timers->prof)); + + return 0; +} + +static int core_alloc_posix_timers(TaskTimersEntry *tte, int n, PosixTimerEntry **pte) +{ + int sz; + + /* + * Will be free()-ed in core_entry_free() + */ + + sz = n * (sizeof(PosixTimerEntry *) + sizeof(PosixTimerEntry)); + tte->posix = xmalloc(sz); + if (!tte->posix) + return -1; + + tte->n_posix = n; + *pte = (PosixTimerEntry *)(tte->posix + n); + return 0; +} + +static int encode_notify_thread_id(pid_t rtid, struct pstree_item *item, PosixTimerEntry *pte) +{ + pid_t vtid = 0; + int i; + + if (rtid == 0) + return 0; + + if (!(root_ns_mask & CLONE_NEWPID)) { + /* Non-pid-namespace case */ + pte->notify_thread_id = rtid; + pte->has_notify_thread_id = true; + return 0; + } + + /* Pid-namespace case */ + if (!kdat.has_nspid) { + pr_err("Have no NSpid support to dump notify thread id in pid namespace\n"); + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + if (item->threads[i].real != rtid) + continue; + + vtid = item->threads[i].ns[0].virt; + break; + } + + if (vtid == 0) { + pr_err("Unable to convert the notify thread id %d\n", rtid); + return -1; + } + + pte->notify_thread_id = vtid; + pte->has_notify_thread_id = true; + return 0; +} + +static int encode_posix_timer(struct pstree_item *item, struct posix_timer *v, struct proc_posix_timer *vp, + PosixTimerEntry *pte) +{ + pte->it_id = vp->spt.it_id; + pte->clock_id = vp->spt.clock_id; + pte->si_signo = vp->spt.si_signo; + pte->it_sigev_notify = vp->spt.it_sigev_notify; + pte->sival_ptr = encode_pointer(vp->spt.sival_ptr); + + pte->overrun = v->overrun; + + pte->isec = v->val.it_interval.tv_sec; + pte->insec = v->val.it_interval.tv_nsec; + pte->vsec = v->val.it_value.tv_sec; + pte->vnsec = v->val.it_value.tv_nsec; + + if (encode_notify_thread_id(vp->spt.notify_thread_id, item, pte)) + return -1; + + return 0; +} + +int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, + struct pstree_item *item) +{ + CoreEntry *core = item->core[0]; + TaskTimersEntry *tte = core->tc->timers; + PosixTimerEntry *pte; + struct proc_posix_timer *temp; + struct parasite_dump_posix_timers_args *args; + int ret, exit_code = -1; + int args_size; + int i; + + if (core_alloc_posix_timers(tte, proc_args->timer_n, &pte)) + return -1; + + args_size = posix_timers_dump_size(proc_args->timer_n); + args = compel_parasite_args_s(ctl, args_size); + args->timer_n = proc_args->timer_n; + + i = 0; + list_for_each_entry(temp, &proc_args->timers, list) { + args->timer[i].it_id = temp->spt.it_id; + i++; + } + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_POSIX_TIMERS, ctl); + if (ret < 0) + goto end_posix; + + i = 0; + list_for_each_entry(temp, &proc_args->timers, list) { + posix_timer_entry__init(&pte[i]); + if (encode_posix_timer(item, &args->timer[i], temp, &pte[i])) + goto end_posix; + tte->posix[i] = &pte[i]; + i++; + } + + exit_code = 0; +end_posix: + free_posix_timers(proc_args); + return exit_code; +} diff --git a/criu/tty.c b/criu/tty.c index 13f645f3a..9a4520d53 100644 --- a/criu/tty.c +++ b/criu/tty.c @@ -22,6 +22,7 @@ #include "rst-malloc.h" #include "log.h" #include "common/list.h" +#include "util.h" #include "util-pie.h" #include "proc_parse.h" #include "file-ids.h" @@ -258,7 +259,7 @@ static int pts_fd_get_index(int fd, const struct fd_parms *p) { int index; const struct fd_link *link = p->link; - char *pos = strrchr(link->name, '/'); + const char *pos = strrchr(link->name, '/'); if (!pos || pos == (link->name + link->len - 1)) { pr_err("Unexpected format on path %s\n", link->name + 1); @@ -398,8 +399,7 @@ static int tty_verify_active_pairs(void) { unsigned long i, unpaired_slaves = 0; - for_each_bit(i, tty_active_pairs) - { + for_each_bit(i, tty_active_pairs) { if ((i % 2) == 0) { if (test_bit(i + 1, tty_active_pairs)) { i++; @@ -817,8 +817,26 @@ static int do_restore_tty_parms(void *arg, int fd, pid_t pid) * on termios too. Just to be on the safe side. */ - if ((p->has & HAS_TERMIOS_L) && ioctl(fd, TIOCSLCKTRMIOS, &p->tl) < 0) - goto err; + if ((p->has & HAS_TERMIOS_L) && ioctl(fd, TIOCSLCKTRMIOS, &p->tl) < 0) { + struct termios t; + + if (errno != EPERM) + goto err; + + memzero(&t, sizeof(t)); + if (ioctl(fd, TIOCGLCKTRMIOS, &t) < 0) { + pr_perror("Can't get tty locked params on %#x", p->tty_id); + goto err; + } + + /* + * The ioctl(TIOCSLCKTRMIOS) requires a CRIU process to be privileged + * in the init_user_ns, but if the current "termios_locked" value equal + * to the "termios_locked" value from the image, we can safely skip setting it. + */ + if (memcmp(&t, &p->tl, sizeof(struct termios)) != 0) + goto err; + } if ((p->has & HAS_TERMIOS) && ioctl(fd, TCSETS, &p->t) < 0) goto err; @@ -868,7 +886,7 @@ static int restore_tty_params(int fd, struct tty_info *info) } if (info->tie->has_uid && info->tie->has_gid) { - if (fchown(fd, info->tie->uid, info->tie->gid)) { + if (cr_fchown(fd, info->tie->uid, info->tie->gid)) { pr_perror("Can't setup uid %d gid %d on %#x", (int)info->tie->uid, (int)info->tie->gid, info->tfe->id); return -1; diff --git a/criu/tun.c b/criu/tun.c index 573137091..9d66f9929 100644 --- a/criu/tun.c +++ b/criu/tun.c @@ -121,7 +121,7 @@ static int list_tun_link(NetDeviceEntry *nde, unsigned ns_id) if (!tl) return -1; - strlcpy(tl->name, nde->name, sizeof(tl->name)); + __strlcpy(tl->name, nde->name, sizeof(tl->name)); /* * Keep tun-flags not only for persistency fixup (see * comment below), but also for TUNSETIFF -- we must @@ -153,7 +153,7 @@ static struct tun_link *__dump_tun_link_fd(int fd, char *name, unsigned ns_id, u tl = xmalloc(sizeof(*tl)); if (!tl) goto err; - strlcpy(tl->name, name, sizeof(tl->name)); + __strlcpy(tl->name, name, sizeof(tl->name)); tl->ns_id = ns_id; INIT_LIST_HEAD(&tl->l); @@ -241,7 +241,7 @@ static int open_tun_dev(char *name, unsigned int idx, unsigned flags) } memset(&ifr, 0, sizeof(ifr)); - strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); + __strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); ifr.ifr_flags = flags; if (ioctl(fd, TUNSETIFF, &ifr)) { @@ -393,7 +393,7 @@ static int tunfile_open(struct file_desc *d, int *new_fd) } memset(&ifr, 0, sizeof(ifr)); - strlcpy(ifr.ifr_name, tl->name, sizeof(ifr.ifr_name)); + __strlcpy(ifr.ifr_name, tl->name, sizeof(ifr.ifr_name)); ifr.ifr_flags = tl->rst.flags; if (ioctl(fd, TUNSETIFF, &ifr) < 0) { @@ -455,27 +455,26 @@ int dump_tun_link(NetDeviceEntry *nde, struct cr_imgset *fds, struct nlattr **in TunLinkEntry tle = TUN_LINK_ENTRY__INIT; char spath[64]; char buf[64]; - int ret = 0; struct tun_link *tl; sprintf(spath, "class/net/%s/tun_flags", nde->name); - ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) + return -1; tle.flags = strtol(buf, NULL, 0); sprintf(spath, "class/net/%s/owner", nde->name); - ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) + return -1; tle.owner = strtol(buf, NULL, 10); sprintf(spath, "class/net/%s/group", nde->name); - ret |= read_ns_sys_file(spath, buf, sizeof(buf)); + if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) + return -1; tle.group = strtol(buf, NULL, 10); - if (ret < 0) - return ret; - tl = get_tun_link_fd(nde->name, nde->peer_nsid, tle.flags); if (!tl) - return ret; + return -1; tle.vnethdr = tl->dmp.vnethdr; tle.sndbuf = tl->dmp.sndbuf; diff --git a/criu/uffd.c b/criu/uffd.c index e07b21b69..8e12dcd63 100644 --- a/criu/uffd.c +++ b/criu/uffd.c @@ -668,12 +668,11 @@ static int remap_iovs(struct lazy_pages_info *lpi, unsigned long from, unsigned */ static int collect_iovs(struct lazy_pages_info *lpi) { + unsigned long start, end, len, nr_pages = 0; + int n_vma = 0, max_iov_len = 0, ret = -1; struct page_read *pr = &lpi->pr; struct lazy_iov *iov; MmEntry *mm; - int nr_pages = 0, n_vma = 0, max_iov_len = 0; - int ret = -1; - unsigned long start, end, len; mm = init_mm_entry(lpi); if (!mm) @@ -728,7 +727,7 @@ free_mm: return ret; } -static int uffd_io_complete(struct page_read *pr, unsigned long vaddr, int nr); +static int uffd_io_complete(struct page_read *pr, unsigned long vaddr, unsigned long nr); static int ud_open(int client, struct lazy_pages_info **_lpi) { @@ -822,7 +821,7 @@ static bool uffd_recoverable_error(int mcopy_rc) return false; } -static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, int *nr_pages, long mcopy_rc) +static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, unsigned long *nr_pages, long mcopy_rc) { if (errno == ENOSPC || errno == ESRCH) { handle_exit(lpi); @@ -844,7 +843,7 @@ static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, int return 0; } -static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, int *nr_pages) +static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, unsigned long *nr_pages) { struct uffdio_copy uffdio_copy; unsigned long len = *nr_pages * page_size(); @@ -865,12 +864,12 @@ static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, int *nr_pages) return 0; } -static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, int nr) +static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, unsigned long nr) { struct lazy_pages_info *lpi; - unsigned long addr = 0; - int req_pages, ret; + unsigned long addr = 0, req_pages; struct lazy_iov *req; + int ret; lpi = container_of(pr, struct lazy_pages_info, pr); @@ -920,7 +919,7 @@ static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, int nr return drop_iovs(lpi, addr, nr * PAGE_SIZE); } -static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, int nr_pages) +static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, unsigned long nr_pages) { struct uffdio_zeropage uffdio_zeropage; unsigned long len = page_size() * nr_pages; @@ -946,7 +945,7 @@ static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, int nr_pages) * Returns 0 for zero pages, 1 for "real" pages and negative value on * error */ -static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, int nr) +static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, unsigned long nr) { int ret; @@ -961,7 +960,7 @@ static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, int nr) return 0; } -static int uffd_handle_pages(struct lazy_pages_info *lpi, __u64 address, int nr, unsigned flags) +static int uffd_handle_pages(struct lazy_pages_info *lpi, __u64 address, unsigned long nr, unsigned flags) { int ret; @@ -1003,7 +1002,7 @@ static void update_xfer_len(struct lazy_pages_info *lpi, bool pf) static int xfer_pages(struct lazy_pages_info *lpi) { struct lazy_iov *iov; - unsigned int nr_pages; + unsigned long nr_pages; unsigned long len; int err; @@ -1098,6 +1097,8 @@ static int handle_fork(struct lazy_pages_info *parent_lpi, struct uffd_msg *msg) lpi_get(lpi->parent); + page_read_disable_dedup(&parent_lpi->pr); + page_read_disable_dedup(&lpi->pr); return 1; out: diff --git a/criu/unittest/mock.c b/criu/unittest/mock.c index e517720e4..b2d507278 100644 --- a/criu/unittest/mock.c +++ b/criu/unittest/mock.c @@ -5,6 +5,8 @@ #include #include +#include "compel/infect-util.h" + int add_external(char *key) { return 0; @@ -141,4 +143,4 @@ int check_mount_v2(void) return 0; } -uint64_t compel_run_id; +char compel_run_id[RUN_ID_HASH_LENGTH]; diff --git a/criu/util.c b/criu/util.c index 5f69465b4..2eaad35bb 100644 --- a/criu/util.c +++ b/criu/util.c @@ -24,11 +24,11 @@ #include #include #include -#include #include #include #include #include +#include #include "linux/mount.h" @@ -40,6 +40,7 @@ #include "mem.h" #include "namespaces.h" #include "criu-log.h" +#include "util-caps.h" #include "clone-noasan.h" #include "cr_options.h" @@ -53,6 +54,7 @@ #include "action-scripts.h" #include "compel/infect-util.h" +#include #define VMA_OPT_LEN 128 @@ -193,6 +195,7 @@ static void vma_opt_str(const struct vma_area *v, char *opt) opt2s(VMA_ANON_PRIVATE, "ap"); opt2s(VMA_AREA_SYSVIPC, "sysv"); opt2s(VMA_AREA_SOCKET, "sk"); + opt2s(VMA_AREA_UPROBES, "uprobes"); #undef opt2s } @@ -219,10 +222,9 @@ int close_safe(int *fd) if (*fd > -1) { ret = close(*fd); - if (!ret) - *fd = -1; - else - pr_perror("Unable to close fd %d", *fd); + if (ret) + pr_perror("Failed closing fd %d", *fd); + *fd = -1; } return ret; @@ -517,12 +519,25 @@ int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned return cr_system_userns(in, out, err, cmd, argv, flags, -1); } -static int close_fds(int minfd) +int cr_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags) +{ + return syscall(__NR_close_range, fd, max_fd, flags); +} + +int close_fds(int minfd) { DIR *dir; struct dirent *de; int fd, ret, dfd; + if (kdat.has_close_range) { + if (cr_close_range(minfd, ~0, 0)) { + pr_perror("close_range failed"); + return -1; + } + return 0; + } + dir = opendir("/proc/self/fd"); if (dir == NULL) { pr_perror("Can't open /proc/self/fd"); @@ -660,40 +675,54 @@ out: return ret; } +struct child_args { + int *sk_pair; + int (*child_setup)(void); +}; + +static int child_func(void *_args) +{ + struct child_args *args = _args; + int sk, *sk_pair = args->sk_pair; + char c = 0; + + sk = sk_pair[1]; + close(sk_pair[0]); + + if (args->child_setup && args->child_setup() != 0) + exit(1); + + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + exit(1); + } + + while (1) + sleep(1000); + exit(1); +} + pid_t fork_and_ptrace_attach(int (*child_setup)(void)) { pid_t pid; int sk_pair[2], sk; char c = 0; + struct child_args cargs = { + .sk_pair = sk_pair, + .child_setup = child_setup, + }; if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { pr_perror("socketpair"); return -1; } - pid = fork(); + pid = clone_noasan(child_func, CLONE_UNTRACED | SIGCHLD, &cargs); if (pid < 0) { pr_perror("fork"); return -1; } - if (pid == 0) { - sk = sk_pair[1]; - close(sk_pair[0]); - - if (child_setup && child_setup() != 0) - exit(1); - - if (write(sk, &c, 1) != 1) { - pr_perror("write"); - exit(1); - } - - while (1) - sleep(1000); - exit(1); - } - sk = sk_pair[0]; close(sk_pair[1]); @@ -950,6 +979,89 @@ FILE *fopenat(int dirfd, char *path, char *cflags) return fdopen(tmp, cflags); } +int cr_fchown(int fd, uid_t new_uid, gid_t new_gid) +{ + struct stat st; + + if (!fchown(fd, new_uid, new_gid)) + return 0; + if (errno != EPERM) + return -1; + + if (fstat(fd, &st) < 0) { + pr_perror("fstat() after fchown() for fd %d", fd); + goto out_eperm; + } + pr_debug("fstat(%d): uid %u gid %u\n", fd, st.st_uid, st.st_gid); + + if (new_uid != st.st_uid || new_gid != st.st_gid) + goto out_eperm; + + return 0; +out_eperm: + errno = EPERM; + return -1; +} + +int cr_fchpermat(int dirfd, const char *path, uid_t new_uid, gid_t new_gid, mode_t new_mode, int flags) +{ + struct stat st; + int ret; + + if (fchownat(dirfd, path, new_uid, new_gid, flags) < 0 && errno != EPERM) { + int errno_cpy = errno; + pr_perror("Unable to change [%d]/%s ownership to (%d, %d)", + dirfd, path, new_uid, new_gid); + errno = errno_cpy; + return -1; + } + + if (fstatat(dirfd, path, &st, flags) < 0) { + int errno_cpy = errno; + pr_perror("Unable to stat [%d]/%s", dirfd, path); + errno = errno_cpy; + return -1; + } + + if (new_uid != st.st_uid || new_gid != st.st_gid) { + errno = EPERM; + pr_perror("Unable to change [%d]/%s ownership (%d, %d) to (%d, %d)", + dirfd, path, st.st_uid, st.st_gid, new_uid, new_gid); + errno = EPERM; + return -1; + } + + if (new_mode == st.st_mode) + return 0; + + if (S_ISLNK(st.st_mode)) { + /* + * We have no lchmod() function, and fchmod() will fail on + * O_PATH | O_NOFOLLOW fd. Yes, we have fchmodat() + * function and flag AT_SYMLINK_NOFOLLOW described in + * man 2 fchmodat, but it is not currently implemented. %) + */ + return 0; + } + + if (!*path && flags & AT_EMPTY_PATH) + ret = fchmod(dirfd, new_mode); + else + ret = fchmodat(dirfd, path, new_mode, flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)); + if (ret < 0) { + int errno_cpy = errno; + pr_perror("Unable to set perms %o on [%d]/%s", new_mode, dirfd, path); + errno = errno_cpy; + } + + return ret; +} + +int cr_fchperm(int fd, uid_t new_uid, gid_t new_gid, mode_t new_mode) +{ + return cr_fchpermat(fd, "", new_uid, new_gid, new_mode, AT_EMPTY_PATH); +} + void split(char *str, char token, char ***out, int *n) { int i; @@ -1070,20 +1182,6 @@ const char *ns_to_string(unsigned int ns) } } -void tcp_cork(int sk, bool on) -{ - int val = on ? 1 : 0; - if (setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val))) - pr_perror("Unable to restore TCP_CORK (%d)", val); -} - -void tcp_nodelay(int sk, bool on) -{ - int val = on ? 1 : 0; - if (setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val))) - pr_perror("Unable to restore TCP_NODELAY (%d)", val); -} - static int get_sockaddr_in(struct sockaddr_storage *addr, char *host, unsigned short port) { memset(addr, 0, sizeof(*addr)); @@ -1425,6 +1523,9 @@ void rlimit_unlimit_nofile(void) { struct rlimit new; + if (opts.unprivileged && !has_cap_sys_resource(opts.cap_eff)) + return; + new.rlim_cur = kdat.sysctl_nr_open; new.rlim_max = kdat.sysctl_nr_open; @@ -1455,23 +1556,78 @@ void print_stack_trace(pid_t pid) } #endif +int cr_fsopen(const char *fsname, unsigned int flags) +{ + return syscall(__NR_fsopen, fsname, flags); +} + +int cr_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) +{ + int ret = syscall(__NR_fsconfig, fd, cmd, key, value, aux); + if (ret) + fsfd_dump_messages(fd); + return ret; +} + +int cr_fsmount(int fd, unsigned int flags, unsigned int attr_flags) +{ + int ret = syscall(__NR_fsmount, fd, flags, attr_flags); + if (ret) + fsfd_dump_messages(fd); + return ret; +} + +void fsfd_dump_messages(int fd) +{ + char buf[4096]; + int err, n; + + err = errno; + + for (;;) { + n = read(fd, buf, sizeof(buf) - 1); + if (n < 0) { + if (errno != ENODATA) + pr_perror("Unable to read from fs descriptor"); + break; + } + buf[n] = 0; + + switch (buf[0]) { + case 'w': + pr_warn("%s\n", buf); + break; + case 'i': + pr_info("%s\n", buf); + break; + case 'e': + /* fallthrough */ + default: + pr_err("%s\n", buf); + break; + } + } + + errno = err; +} + int mount_detached_fs(const char *fsname) { int fsfd, fd; - fsfd = sys_fsopen(fsname, 0); + fsfd = cr_fsopen(fsname, 0); if (fsfd < 0) { pr_perror("Unable to open the %s file system", fsname); return -1; } - if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { + if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { pr_perror("Unable to create the %s file system", fsname); close(fsfd); return -1; } - fd = sys_fsmount(fsfd, 0, 0); + fd = cr_fsmount(fsfd, 0, 0); if (fd < 0) pr_perror("Unable to mount the %s file system", fsname); close(fsfd); @@ -1561,7 +1717,7 @@ static int is_iptables_nft(char *bin) goto err; } - ret = cr_system(-1, pfd[1], -1, cmd[0], cmd, 0); + ret = cr_system(-1, pfd[1], -1, cmd[0], cmd, CRS_CAN_FAIL); if (ret) { pr_err("%s -V failed\n", cmd[0]); goto err; @@ -1589,44 +1745,46 @@ err: return ret; } -char *get_legacy_iptables_bin(bool ipv6) +char *get_legacy_iptables_bin(bool ipv6, bool restore) { - static char iptables_bin[2][32]; + static char iptables_bin[2][2][32]; /* 0 - means we don't know yet, * -1 - not present, * 1 - present. */ - static int iptables_present[2] = { 0, 0 }; - char bins[2][2][32] = { { "iptables-save", "iptables-legacy-save" }, - { "ip6tables-save", "ip6tables-legacy-save" } }; + static int iptables_present[2][2] = { { 0, 0 }, { 0, 0 } }; + char bins[2][2][2][32] = { { { "iptables-save", "iptables-legacy-save" }, + { "iptables-restore", "iptables-legacy-restore" } }, + { { "ip6tables-save", "ip6tables-legacy-save" }, + { "ip6tables-restore", "ip6tables-legacy-restore" } } }; int ret; - if (iptables_present[ipv6] == -1) + if (iptables_present[ipv6][restore] == -1) return NULL; - if (iptables_present[ipv6] == 1) - return iptables_bin[ipv6]; + if (iptables_present[ipv6][restore] == 1) + return iptables_bin[ipv6][restore]; - memcpy(iptables_bin[ipv6], bins[ipv6][0], strlen(bins[ipv6][0]) + 1); - ret = is_iptables_nft(iptables_bin[ipv6]); + memcpy(iptables_bin[ipv6][restore], bins[ipv6][restore][0], strlen(bins[ipv6][restore][0]) + 1); + ret = is_iptables_nft(iptables_bin[ipv6][restore]); /* * iptables on host uses nft backend (or not installed), * let's try iptables-legacy */ if (ret < 0 || ret == 1) { - memcpy(iptables_bin[ipv6], bins[ipv6][1], strlen(bins[ipv6][1]) + 1); - ret = is_iptables_nft(iptables_bin[ipv6]); + memcpy(iptables_bin[ipv6][restore], bins[ipv6][restore][1], strlen(bins[ipv6][restore][1]) + 1); + ret = is_iptables_nft(iptables_bin[ipv6][restore]); if (ret < 0 || ret == 1) { - iptables_present[ipv6] = -1; + iptables_present[ipv6][restore] = -1; return NULL; } } /* we can come here with iptables-save or iptables-legacy-save */ - iptables_present[ipv6] = 1; + iptables_present[ipv6][restore] = 1; - return iptables_bin[ipv6]; + return iptables_bin[ipv6][restore]; } /* @@ -1869,15 +2027,16 @@ int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args) return fret; } -uint64_t criu_run_id; +char criu_run_id[RUN_ID_HASH_LENGTH]; -void util_init() +void util_init(void) { - struct timespec tp; + uuid_t uuid; - clock_gettime(CLOCK_MONOTONIC, &tp); - criu_run_id = ((uint64_t)getpid() << 32) + tp.tv_sec + tp.tv_nsec; - compel_run_id = criu_run_id; + uuid_generate(uuid); + uuid_unparse(uuid, criu_run_id); + pr_info("CRIU run id = %s\n", criu_run_id); + memcpy(compel_run_id, criu_run_id, sizeof(criu_run_id)); } /* @@ -2063,3 +2222,21 @@ out: xfree(free_path); return mp_path; } + +int set_opts_cap_eff(void) +{ + struct __user_cap_header_struct cap_header; + struct __user_cap_data_struct cap_data[_LINUX_CAPABILITY_U32S_3]; + int i; + + cap_header.version = _LINUX_CAPABILITY_VERSION_3; + cap_header.pid = getpid(); + + if (capget(&cap_header, &cap_data[0])) + return -1; + + for (i = 0; i < _LINUX_CAPABILITY_U32S_3; i++) + memcpy(&opts.cap_eff[i], &cap_data[i].effective, sizeof(u32)); + + return 0; +} diff --git a/criu/vdso.c b/criu/vdso.c index 1a51f1451..2d9e57c4d 100644 --- a/criu/vdso.c +++ b/criu/vdso.c @@ -145,6 +145,9 @@ static void drop_rt_vdso(struct vm_area_list *vma_area_list, struct vdso_quarter * Also BTW search for rt-vvar to remove it later. */ list_for_each_entry(vma, &vma_area_list->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + if (vma->e->start == addr->orig_vdso) { vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VDSO; pr_debug("vdso: Restore orig vDSO status at %lx\n", (long)vma->e->start); @@ -276,6 +279,9 @@ int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid, struct vm_area_list } list_for_each_entry(vma, &vma_area_list->h, list) { + if (vma_area_is(vma, VMA_AREA_GUARD)) + continue; + /* * Defer handling marked vdso until we walked over * all vmas and restore potentially remapped vDSO @@ -310,7 +316,7 @@ static int vdso_parse_maps(pid_t pid, struct vdso_maps *s) while (1) { unsigned long start, end; - char *has_vdso, *has_vvar; + char *has_vdso, *has_vvar, *has_vvar_vclock; buf = breadline(&f); if (buf == NULL) @@ -318,13 +324,19 @@ static int vdso_parse_maps(pid_t pid, struct vdso_maps *s) if (IS_ERR(buf)) goto err; - has_vdso = strstr(buf, "[vdso]"); - if (!has_vdso) + has_vvar = NULL; + has_vvar_vclock = NULL; + do { + has_vdso = strstr(buf, "[vdso]"); + if (has_vdso) + break; has_vvar = strstr(buf, "[vvar]"); - else - has_vvar = NULL; + if (has_vvar) + break; + has_vvar_vclock = strstr(buf, "[vvar_vclock]"); + } while (0); - if (!has_vdso && !has_vvar) + if (!has_vdso && !has_vvar && !has_vvar_vclock) continue; if (sscanf(buf, "%lx-%lx", &start, &end) != 2) { @@ -339,13 +351,21 @@ static int vdso_parse_maps(pid_t pid, struct vdso_maps *s) } s->vdso_start = start; s->sym.vdso_size = end - start; - } else { + } else if (has_vvar) { if (s->vvar_start != VVAR_BAD_ADDR) { pr_err("Got second VVAR entry\n"); goto err; } s->vvar_start = start; s->sym.vvar_size = end - start; + } else { + if (s->vvar_start == VDSO_BAD_ADDR || + s->vvar_start + s->sym.vvar_size != start) { + pr_err("VVAR and VVAR_VCLOCK entries are not subsequent\n"); + goto err; + } + s->sym.vvar_vclock_size = end - start; + s->sym.vvar_size += s->sym.vvar_vclock_size; } } @@ -479,7 +499,7 @@ out_close: return ret; } -#define COMPAT_VDSO_BUF_SZ (PAGE_SIZE * 2) +#define COMPAT_VDSO_BUF_SZ (PAGE_SIZE * 4) static int vdso_fill_compat_symtable(struct vdso_maps *native, struct vdso_maps *compat) { void *vdso_mmap; diff --git a/flake.lock b/flake.lock new file mode 100644 index 000000000..90c914452 --- /dev/null +++ b/flake.lock @@ -0,0 +1,61 @@ +{ + "nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1731533236, + "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1744463964, + "narHash": "sha256-LWqduOgLHCFxiTNYi3Uj5Lgz0SR+Xhw3kr/3Xd0GPTM=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "2631b0b7abcea6e640ce31cd78ea58910d31e650", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 000000000..dc2429ffc --- /dev/null +++ b/flake.nix @@ -0,0 +1,77 @@ +{ + description = "CRIU development environment"; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, flake-utils }: + flake-utils.lib.eachDefaultSystem (system: + let + pkgs = nixpkgs.legacyPackages.${system}; + + # Dependencies for CRIU + criuDeps = with pkgs; [ + # Compiler and build essentials + gcc + gnumake + pkg-config + + # Protocol Buffers + protobuf + protobufc + python3Packages.protobuf + + # Other required libraries + libuuid + libbsd + iproute2 + nftables + libcap + libnet + libnl + libaio + gnutls + libdrm + + # ZDTM + python3Packages.pyyaml + ]; + + # Multilib support for 32-bit compatibility + # criuDeps32bit = with pkgs; [ + # glibc.dev + # glibc + # gcc-unwrapped + # ]; + + devShell = pkgs.mkShell { + buildInputs = criuDeps; # ++ (if pkgs.stdenv.isx86_64 then criuDeps32bit else []); + + shellHook = '' + echo "CRIU development environment" + echo "==============================" + echo "" + echo "Useful commands:" + echo " make - Build CRIU" + echo " make test - Run tests (requires ZDTM dependencies)" + echo "" + ''; + + # Add proper flags for multilib support + # NIX_CFLAGS_COMPILE = pkgs.lib.optional pkgs.stdenv.isx86_64 "-m32"; + + # Make sure the shell can find headers for multilib + # PKG_CONFIG_PATH = pkgs.lib.makeSearchPath "lib/pkgconfig" criuDeps; + }; + in + { + # Export the development shell + devShells.default = devShell; + + # Build CRIU package as well + packages.default = pkgs.criu; + } + ); +} diff --git a/images/Makefile b/images/Makefile index 004e22ec3..2c33152e9 100644 --- a/images/Makefile +++ b/images/Makefile @@ -2,10 +2,12 @@ proto-obj-y += stats.o proto-obj-y += core.o proto-obj-y += core-x86.o proto-obj-y += core-mips.o +proto-obj-y += core-loongarch64.o proto-obj-y += core-arm.o proto-obj-y += core-aarch64.o proto-obj-y += core-ppc64.o proto-obj-y += core-s390.o +proto-obj-y += core-riscv64.o proto-obj-y += cpuinfo.o proto-obj-y += inventory.o proto-obj-y += fdinfo.o @@ -56,7 +58,6 @@ proto-obj-y += ext-file.o proto-obj-y += cgroup.o proto-obj-y += userns.o proto-obj-y += pidns.o -proto-obj-y += google/protobuf/descriptor.o # To make protoc-c happy and compile opts.proto proto-obj-y += opts.o proto-obj-y += seccomp.o proto-obj-y += binfmt-misc.o @@ -72,6 +73,7 @@ proto-obj-y += bpfmap-file.o proto-obj-y += bpfmap-data.o proto-obj-y += apparmor.o proto-obj-y += rseq.o +proto-obj-y += pidfd.o CFLAGS += -iquote $(obj)/ @@ -88,12 +90,27 @@ endef makefile-deps := Makefile $(obj)/Makefile +# +# Generate descriptor.pb-c.c and descriptor.pb-c.h to compile opts.proto. +DESCRIPTOR_DIR := images/google/protobuf +GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf +$(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto + $(call msg-gen, $@) + $(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< + +cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d + +submrproper: + $(Q) rm -f $(DESCRIPTOR_DIR)/* +.PHONY: submrproper +mrproper: submrproper + # # Generates rules needed to compile protobuf files. define gen-proto-rules $(obj)/$(1).pb-c.c $(obj)/$(1).pb-c.h: $(obj)/$(1).proto $(addsuffix .pb-c.c,$(addprefix $(obj)/,$(2))) $(makefile-deps) $$(E) " PBCC " $$@ - $$(Q) protoc-c --proto_path=$(obj)/ --c_out=$(obj)/ $$< + $$(Q) protoc --proto_path=$(obj)/ --c_out=$(obj)/ $$< ifeq ($(PROTOUFIX),y) $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$@ $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$(patsubst %.c,%.h,$$@) diff --git a/images/cgroup.proto b/images/cgroup.proto index ee0354124..02f226835 100644 --- a/images/cgroup.proto +++ b/images/cgroup.proto @@ -24,6 +24,7 @@ message cgroup_dir_entry { message cg_controller_entry { repeated string cnames = 1; repeated cgroup_dir_entry dirs = 2; + optional bool is_threaded = 3; } message cg_member_entry { diff --git a/images/core-aarch64.proto b/images/core-aarch64.proto index 3356e6b75..a94911c0b 100644 --- a/images/core-aarch64.proto +++ b/images/core-aarch64.proto @@ -17,9 +17,38 @@ message user_aarch64_fpsimd_context_entry { required uint32 fpcr = 3; } +message user_aarch64_gcs_entry { + required uint64 gcspr_el0 = 1 [(criu).hex = true]; + required uint64 features_enabled = 2 [(criu).hex = true]; +} + +message pac_address_keys { + required uint64 apiakey_lo = 1; + required uint64 apiakey_hi = 2; + required uint64 apibkey_lo = 3; + required uint64 apibkey_hi = 4; + required uint64 apdakey_lo = 5; + required uint64 apdakey_hi = 6; + required uint64 apdbkey_lo = 7; + required uint64 apdbkey_hi = 8; + required uint64 pac_enabled_key = 9; +} + +message pac_generic_keys { + required uint64 apgakey_lo = 1; + required uint64 apgakey_hi = 2; +} + +message pac_keys { + optional pac_address_keys pac_address_keys = 6; + optional pac_generic_keys pac_generic_keys = 7; +} + message thread_info_aarch64 { required uint64 clear_tid_addr = 1[(criu).hex = true]; required uint64 tls = 2; required user_aarch64_regs_entry gpregs = 3[(criu).hex = true]; required user_aarch64_fpsimd_context_entry fpsimd = 4; + optional pac_keys pac_keys = 5; + optional user_aarch64_gcs_entry gcs = 6; } diff --git a/images/core-loongarch64.proto b/images/core-loongarch64.proto new file mode 100755 index 000000000..8258f006e --- /dev/null +++ b/images/core-loongarch64.proto @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +import "opts.proto"; + +message user_loongarch64_gpregs_entry { + repeated uint64 regs = 1; + required uint64 pc = 2; +} + +message user_loongarch64_fpregs_entry { + repeated uint64 regs = 1; + required uint64 fcc = 2; + required uint32 fcsr = 3; +} + +message thread_info_loongarch64 { + required uint64 clear_tid_addr = 1[(criu).hex = true]; + required uint64 tls = 2; + required user_loongarch64_gpregs_entry gpregs = 3[(criu).hex = true]; + required user_loongarch64_fpregs_entry fpregs = 4[(criu).hex = true]; +} diff --git a/images/core-mips.proto b/images/core-mips.proto old mode 100755 new mode 100644 diff --git a/images/core-riscv64.proto b/images/core-riscv64.proto new file mode 100644 index 000000000..1ddfdd8bd --- /dev/null +++ b/images/core-riscv64.proto @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +import "opts.proto"; + +// Refer to riscv-gnu-toolchain/linux-headers/include/asm/ptrace.h +message user_riscv64_regs_entry { + required uint64 pc = 1; + required uint64 ra = 2; + required uint64 sp = 3; + required uint64 gp = 4; + required uint64 tp = 5; + required uint64 t0 = 6; + required uint64 t1 = 7; + required uint64 t2 = 8; + required uint64 s0 = 9; + required uint64 s1 = 10; + required uint64 a0 = 11; + required uint64 a1 = 12; + required uint64 a2 = 13; + required uint64 a3 = 14; + required uint64 a4 = 15; + required uint64 a5 = 16; + required uint64 a6 = 17; + required uint64 a7 = 18; + required uint64 s2 = 19; + required uint64 s3 = 20; + required uint64 s4 = 21; + required uint64 s5 = 22; + required uint64 s6 = 23; + required uint64 s7 = 24; + required uint64 s8 = 25; + required uint64 s9 = 26; + required uint64 s10 = 27; + required uint64 s11 = 28; + required uint64 t3 = 29; + required uint64 t4 = 30; + required uint64 t5 = 31; + required uint64 t6 = 32; +} + +message user_riscv64_d_ext_entry { + repeated uint64 f = 1; + required uint32 fcsr = 2; +} + +message thread_info_riscv64 { + required uint64 clear_tid_addr = 1[(criu).hex = true]; + required uint64 tls = 2; + required user_riscv64_regs_entry gpregs = 3[(criu).hex = true]; + required user_riscv64_d_ext_entry fpsimd = 4; +} diff --git a/images/core-x86.proto b/images/core-x86.proto index 815cf21ff..762418d73 100644 --- a/images/core-x86.proto +++ b/images/core-x86.proto @@ -41,6 +41,11 @@ message user_x86_regs_entry { optional user_x86_regs_mode mode = 28 [default = NATIVE]; } +message user_x86_cet_entry { + required uint64 cet = 1[(criu).hex = true]; + required uint64 ssp = 2[(criu).hex = true]; +} + message user_x86_xsave_entry { /* standard xsave features */ required uint64 xstate_bv = 1; @@ -60,6 +65,9 @@ message user_x86_xsave_entry { /* Protected keys */ repeated uint32 pkru = 8; + /* CET */ + optional user_x86_cet_entry cet = 9; + /* * Processor trace (PT) and hardware duty cycling (HDC) * are supervisor state components and only managed by diff --git a/images/core.proto b/images/core.proto index 35079f366..1fa23868b 100644 --- a/images/core.proto +++ b/images/core.proto @@ -8,6 +8,8 @@ import "core-aarch64.proto"; import "core-ppc64.proto"; import "core-s390.proto"; import "core-mips.proto"; +import "core-loongarch64.proto"; +import "core-riscv64.proto"; import "rlimit.proto"; import "timer.proto"; @@ -40,6 +42,7 @@ message task_core_entry { optional task_timers_entry timers = 7; optional task_rlimits_entry rlimits = 8; + /* This is deprecated, should be per-thread */ optional uint32 cg_set = 9; optional signal_queue_entry signals_s = 10; @@ -60,6 +63,10 @@ message task_core_entry { // Reserved for container relative start time //optional uint64 start_time = 19; optional uint64 blk_sigset_extended = 20[(criu).hex = true]; + + optional uint32 stop_signo = 21; + + optional uint32 membarrier_registration_mask = 22 [(criu).hex = true]; } message task_kobj_ids_entry { @@ -103,6 +110,7 @@ message thread_core_entry { optional string comm = 13; optional uint64 blk_sigset_extended = 14; optional rseq_entry rseq_entry = 15; + optional uint32 cg_set = 16; } message task_rlimits_entry { @@ -118,6 +126,8 @@ message core_entry { PPC64 = 4; S390 = 5; MIPS = 6; + LOONGARCH64 = 7; + RISCV64 = 8; } required march mtype = 1; @@ -127,6 +137,8 @@ message core_entry { optional thread_info_ppc64 ti_ppc64 = 9; optional thread_info_s390 ti_s390 = 10; optional thread_info_mips ti_mips = 11; + optional thread_info_loongarch64 ti_loongarch64 = 12; + optional thread_info_riscv64 ti_riscv64 = 13; optional task_core_entry tc = 3; optional task_kobj_ids_entry ids = 4; diff --git a/images/creds.proto b/images/creds.proto index 6228f7fcb..932a40ccf 100644 --- a/images/creds.proto +++ b/images/creds.proto @@ -24,4 +24,7 @@ message creds_entry { optional string lsm_profile = 15; optional string lsm_sockcreate = 16; optional bytes apparmor_data = 17; + optional uint32 no_new_privs = 18; + + repeated uint32 cap_amb = 19; } diff --git a/images/fdinfo.proto b/images/fdinfo.proto index 88f1c1186..32ec13cf4 100644 --- a/images/fdinfo.proto +++ b/images/fdinfo.proto @@ -17,6 +17,7 @@ import "ext-file.proto"; import "sk-unix.proto"; import "fifo.proto"; import "pipe.proto"; +import "pidfd.proto"; import "tty.proto"; import "memfd.proto"; import "bpfmap-file.proto"; @@ -42,6 +43,7 @@ enum fd_types { TIMERFD = 17; MEMFD = 18; BPFMAP = 19; + PIDFD = 20; /* Any number above the real used. Not stored to image */ CTL_TTY = 65534; @@ -78,4 +80,5 @@ message file_entry { optional tty_file_entry tty = 19; optional memfd_file_entry memfd = 20; optional bpfmap_file_entry bpf = 21; + optional pidfd_entry pidfd = 22; } diff --git a/images/google/protobuf/.gitignore b/images/google/protobuf/.gitignore new file mode 100644 index 000000000..68359a786 --- /dev/null +++ b/images/google/protobuf/.gitignore @@ -0,0 +1,2 @@ +*.c +*.h diff --git a/images/google/protobuf/descriptor.proto b/images/google/protobuf/descriptor.proto deleted file mode 120000 index 07a4c9add..000000000 --- a/images/google/protobuf/descriptor.proto +++ /dev/null @@ -1 +0,0 @@ -/usr/include/google/protobuf/descriptor.proto \ No newline at end of file diff --git a/images/inventory.proto b/images/inventory.proto index a735bad1d..feed5b850 100644 --- a/images/inventory.proto +++ b/images/inventory.proto @@ -10,6 +10,13 @@ enum lsmtype { APPARMOR = 2; } +// It is not possible to distinguish between an empty repeated field +// and unset repeated field. To solve this problem and provide backwards +// compabibility, we use the 'plugins_entry' message. +message plugins_entry { + repeated string plugins = 12; +}; + message inventory_entry { required uint32 img_version = 1; optional bool fdinfo_per_id = 2; @@ -21,4 +28,10 @@ message inventory_entry { optional uint32 pre_dump_mode = 9; optional bool tcp_close = 10; optional uint32 network_lock_method = 11; + optional plugins_entry plugins_entry = 12; + // Remember the criu_run_id when CRIU dumped the process. + // This is currently used to delete the correct nftables + // network locking rule. + optional string dump_criu_run_id = 13; + optional bool allow_uprobes = 14; } diff --git a/images/memfd.proto b/images/memfd.proto index 0e625416a..bb0be4a6f 100644 --- a/images/memfd.proto +++ b/images/memfd.proto @@ -22,4 +22,5 @@ message memfd_inode_entry { required uint32 seals = 6 [(criu).flags = "seals.flags"]; required uint64 inode_id = 7; optional uint32 hugetlb_flag = 8; + optional uint32 mode = 9; }; diff --git a/images/netdev.proto b/images/netdev.proto index 748fd0200..42e2bc7d7 100644 --- a/images/netdev.proto +++ b/images/netdev.proto @@ -74,4 +74,5 @@ message netns_entry { repeated netns_id nsids = 7; optional string ext_key = 8; repeated sysctl_entry unix_conf = 9; + repeated sysctl_entry ipv4_sysctl = 10; } diff --git a/images/pagemap.proto b/images/pagemap.proto index e6d341b0f..f2436a51a 100644 --- a/images/pagemap.proto +++ b/images/pagemap.proto @@ -10,7 +10,8 @@ message pagemap_head { message pagemap_entry { required uint64 vaddr = 1 [(criu).hex = true]; - required uint32 nr_pages = 2; + required uint32 compat_nr_pages = 2; optional bool in_parent = 3; optional uint32 flags = 4 [(criu).flags = "pmap.flags" ]; + optional uint64 nr_pages = 5; } diff --git a/images/pidfd.proto b/images/pidfd.proto new file mode 100644 index 000000000..a9da3e454 --- /dev/null +++ b/images/pidfd.proto @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +import "fown.proto"; + +message pidfd_entry { + required uint32 id = 1; + required uint32 ino = 2; + required uint32 flags = 3; + required int32 nspid = 4; + required fown_entry fown = 5; +} diff --git a/images/rpc.proto b/images/rpc.proto index a6cc5da48..1a4722a9c 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -52,6 +52,7 @@ enum criu_cg_mode { enum criu_network_lock_method { IPTABLES = 1; NFTABLES = 2; + SKIP = 3; }; enum criu_pre_dump_mode { @@ -60,7 +61,8 @@ enum criu_pre_dump_mode { }; message criu_opts { - required int32 images_dir_fd = 1; + required int32 images_dir_fd = 1 [default = -1]; + optional string images_dir = 68; /* used only if images_dir_fd == -1 */ optional int32 pid = 2; /* if not set on dump, will dump requesting process */ optional bool leave_running = 3; @@ -138,6 +140,11 @@ message criu_opts { optional string lsm_mount_context = 63; optional criu_network_lock_method network_lock = 64 [default = IPTABLES]; optional bool mntns_compat_mode = 65; + optional bool skip_file_rwx_check = 66; + optional bool unprivileged = 67; + optional bool leave_stopped = 69; + optional bool display_stats = 70; + optional bool log_to_stderr = 71; /* optional bool check_mounts = 128; */ } diff --git a/images/sk-inet.proto b/images/sk-inet.proto index 594e29c66..2c709e018 100644 --- a/images/sk-inet.proto +++ b/images/sk-inet.proto @@ -5,6 +5,7 @@ syntax = "proto2"; import "opts.proto"; import "fown.proto"; import "sk-opts.proto"; +import "tcp-stream.proto"; message ip_opts_raw_entry { optional bool hdrincl = 1; @@ -17,6 +18,10 @@ message ip_opts_entry { optional bool freebind = 1; // Fields 2 and 3 are reserved for vz7 use optional ip_opts_raw_entry raw = 4; + + optional bool pktinfo = 5; + optional uint32 tos = 6; + optional uint32 ttl = 7; } message inet_sk_entry { @@ -52,4 +57,5 @@ message inet_sk_entry { optional string ifname = 17; optional uint32 ns_id = 18; optional sk_shutdown shutdown = 19; + optional tcp_opts_entry tcp_opts = 20; } diff --git a/images/sk-opts.proto b/images/sk-opts.proto index 1d24d47cc..2f9d4e5c3 100644 --- a/images/sk-opts.proto +++ b/images/sk-opts.proto @@ -26,9 +26,12 @@ message sk_opts_entry { optional bool so_reuseport = 17; optional bool so_broadcast = 18; optional bool so_keepalive = 19; + + /* These three are deprecated, use tcp_opts_entry instead */ optional uint32 tcp_keepcnt = 20; optional uint32 tcp_keepidle = 21; optional uint32 tcp_keepintvl = 22; + optional uint32 so_oobinline = 23; optional uint32 so_linger = 24; diff --git a/images/tcp-stream.proto b/images/tcp-stream.proto index c2244ba3b..3d834159f 100644 --- a/images/tcp-stream.proto +++ b/images/tcp-stream.proto @@ -4,6 +4,14 @@ syntax = "proto2"; import "opts.proto"; +message tcp_opts_entry { + optional bool cork = 1; + optional bool nodelay = 2; + optional uint32 keepcnt = 3; + optional uint32 keepidle = 4; + optional uint32 keepintvl = 5; +} + message tcp_stream_entry { required uint32 inq_len = 1; required uint32 inq_seq = 2; @@ -16,6 +24,7 @@ message tcp_stream_entry { optional uint32 rcv_wscale = 8; optional uint32 timestamp = 9; + /* These two are deprecated, use tcp_opts_entry instead */ optional bool cork = 10; optional bool nodelay = 11; diff --git a/include/common/arch/aarch64/asm/page.h b/include/common/arch/aarch64/asm/page.h index 90670d126..4555debbd 100644 --- a/include/common/arch/aarch64/asm/page.h +++ b/include/common/arch/aarch64/asm/page.h @@ -10,7 +10,7 @@ extern unsigned __page_size; extern unsigned __page_shift; -static inline unsigned page_size(void) +static inline unsigned long page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -37,7 +37,7 @@ static inline unsigned page_shift(void) #else /* CR_NOGLIBC */ -extern unsigned page_size(void); +extern unsigned long page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/arch/loongarch64/asm/atomic.h b/include/common/arch/loongarch64/asm/atomic.h new file mode 100644 index 000000000..901725439 --- /dev/null +++ b/include/common/arch/loongarch64/asm/atomic.h @@ -0,0 +1,62 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +#include +#include "common/compiler.h" + +typedef struct { + int counter; +} atomic_t; + +static inline int atomic_read(const atomic_t *v) +{ + return (*(volatile int *)&(v)->counter); +} + +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} + +static inline int __atomic_add(int i, atomic_t *v) +{ + int result; + asm volatile("amadd_db.w %1, %2, %0" : "+ZB"(v->counter), "=&r"(result) : "r"(i) : "memory"); + return result + i; +} + +static inline void atomic_add(int i, atomic_t *v) +{ + __atomic_add(i, v); +} + +static inline int atomic_add_return(int i, atomic_t *v) +{ + return __atomic_add(i, v); +} + +#define atomic_sub(i, v) atomic_add(-(int)i, v) +#define atomic_sub_return(i, v) atomic_add_return(-(int)i, v) +#define atomic_inc(v) atomic_add(1, v) +#define atomic_inc_return(v) atomic_add_return(1, v) +#define atomic_dec(v) atomic_sub(1, v) +#define atomic_dec_return(v) atomic_sub_return(1, v) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + int ret; + asm volatile("1: \n" + " ll.w %0, %1 \n" + " bne %0, %2, 2f \n" + " or $t0, %3, $zero \n" + " sc.w $t0, %1 \n" + " beqz $t0, 1b \n" + "2: \n" + " dbar 0 \n" + : "=&r"(ret), "+ZB"(ptr->counter) + : "r"(old), "r"(new) + : "t0", "memory"); + return ret; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/include/common/arch/loongarch64/asm/bitops.h b/include/common/arch/loongarch64/asm/bitops.h new file mode 100644 index 000000000..170e4f736 --- /dev/null +++ b/include/common/arch/loongarch64/asm/bitops.h @@ -0,0 +1,24 @@ +#ifndef _LINUX_BITOPS_H +#define _LINUX_BITOPS_H +#include "common/asm-generic/bitops.h" + +/** + * test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ + +#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) +#define BIT_WORD(nr) ((1UL << ((nr) / BITS_PER_LONG)) - 1) +static inline int test_and_set_bit(unsigned long nr, volatile unsigned long *addr) +{ + unsigned long res, mask; + mask = BIT_MASK(nr); + asm volatile("amor_db.d %0, %2, %1" : "=&r"(res), "+ZB"(addr[BIT_WORD(nr)]) : "r"(mask) : "memory"); + return (res & mask) != 0; +} + +#endif diff --git a/include/common/arch/loongarch64/asm/bitsperlong.h b/include/common/arch/loongarch64/asm/bitsperlong.h new file mode 100644 index 000000000..13d06a384 --- /dev/null +++ b/include/common/arch/loongarch64/asm/bitsperlong.h @@ -0,0 +1,6 @@ +#ifndef __CR_BITSPERLONG_H__ +#define __CR_BITSPERLONG_H__ + +#define BITS_PER_LONG _LOONGARCH_SZLONG + +#endif /* __CR_BITSPERLONG_H__ */ diff --git a/include/common/arch/loongarch64/asm/linkage.h b/include/common/arch/loongarch64/asm/linkage.h new file mode 100644 index 000000000..448acc29f --- /dev/null +++ b/include/common/arch/loongarch64/asm/linkage.h @@ -0,0 +1,19 @@ +#ifndef __CR_LINKAGE_H__ +#define __CR_LINKAGE_H__ + +#define __ALIGN .align 2 +#define __ALIGN_STR ".align 2" + +#define GLOBAL(name) \ + .globl name; \ +name: + +#define ENTRY(name) \ + .globl name; \ + __ALIGN; \ + .type name, @function; \ +name: + +#define END(sym) .size sym, .- sym + +#endif /* __CR_LINKAGE_H__ */ diff --git a/include/common/arch/loongarch64/asm/page.h b/include/common/arch/loongarch64/asm/page.h new file mode 100644 index 000000000..4fcdb64dc --- /dev/null +++ b/include/common/arch/loongarch64/asm/page.h @@ -0,0 +1,39 @@ +#ifndef __CR_ASM_PAGE_H__ +#define __CR_ASM_PAGE_H__ + +#define ARCH_HAS_LONG_PAGES + +#ifndef CR_NOGLIBC +#include /* ffsl() */ +#include /* _SC_PAGESIZE */ + +static unsigned __page_size; +static unsigned __page_shift; + +static inline unsigned long page_size(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned page_shift(void) +{ + if (!__page_shift) + __page_shift = (ffsl(page_size()) - 1); + return __page_shift; +} + +#define PAGE_SIZE page_size() +#define PAGE_SHIFT page_shift() +#define PAGE_MASK (~(PAGE_SIZE - 1)) + +#define PAGE_PFN(addr) ((addr) / PAGE_SIZE) +#else /* CR_NOGLIBC */ + +extern unsigned long page_size(void); +#define PAGE_SIZE page_size() + +#endif /* CR_NOGLIBC */ + +#endif /* __CR_ASM_PAGE_H__ */ diff --git a/include/common/arch/mips/asm/page.h b/include/common/arch/mips/asm/page.h index 25bdbc141..4fcdb64dc 100644 --- a/include/common/arch/mips/asm/page.h +++ b/include/common/arch/mips/asm/page.h @@ -10,7 +10,7 @@ static unsigned __page_size; static unsigned __page_shift; -static inline unsigned page_size(void) +static inline unsigned long page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -31,7 +31,7 @@ static inline unsigned page_shift(void) #define PAGE_PFN(addr) ((addr) / PAGE_SIZE) #else /* CR_NOGLIBC */ -extern unsigned page_size(void); +extern unsigned long page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/arch/ppc64/asm/bitops.h b/include/common/arch/ppc64/asm/bitops.h index dbfa6be7f..54d55da16 100644 --- a/include/common/arch/ppc64/asm/bitops.h +++ b/include/common/arch/ppc64/asm/bitops.h @@ -46,6 +46,7 @@ #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) #define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] +#define BITMAP_SIZE(name) (sizeof(name) * CHAR_BIT) #define __stringify_in_c(...) #__VA_ARGS__ #define stringify_in_c(...) __stringify_in_c(__VA_ARGS__) " " @@ -202,8 +203,8 @@ found_middle: return result + __ffs(tmp); } -#define for_each_bit(i, bitmask) \ - for (i = find_next_bit(bitmask, sizeof(bitmask), 0); i < sizeof(bitmask); \ - i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), 0); i < BITMAP_SIZE(bitmask); \ + i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), i + 1)) #endif /* __CR_BITOPS_H__ */ diff --git a/include/common/arch/ppc64/asm/page.h b/include/common/arch/ppc64/asm/page.h index a1ff6718a..2b0c0e504 100644 --- a/include/common/arch/ppc64/asm/page.h +++ b/include/common/arch/ppc64/asm/page.h @@ -10,7 +10,7 @@ extern unsigned __page_size; extern unsigned __page_shift; -static inline unsigned page_size(void) +static inline unsigned long page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -37,7 +37,7 @@ static inline unsigned page_shift(void) #else /* CR_NOGLIBC */ -extern unsigned page_size(void); +extern unsigned long page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/arch/riscv64/asm/atomic.h b/include/common/arch/riscv64/asm/atomic.h new file mode 100644 index 000000000..4b08bd9fd --- /dev/null +++ b/include/common/arch/riscv64/asm/atomic.h @@ -0,0 +1,109 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +typedef struct { + int counter; +} atomic_t; + +/* Copied from the Linux header arch/riscv/include/asm/barrier.h */ + +#define nop() __asm__ __volatile__("nop") + +#define RISCV_FENCE(p, s) __asm__ __volatile__("fence " #p "," #s : : : "memory") + +/* These barriers need to enforce ordering on both devices or memory. */ +#define mb() RISCV_FENCE(iorw, iorw) +#define rmb() RISCV_FENCE(ir, ir) +#define wmb() RISCV_FENCE(ow, ow) + +/* These barriers do not need to enforce ordering on devices, just memory. */ +#define __smp_mb() RISCV_FENCE(rw, rw) +#define __smp_rmb() RISCV_FENCE(r, r) +#define __smp_wmb() RISCV_FENCE(w, w) + +#define __smp_store_release(p, v) \ + do { \ + compiletime_assert_atomic_type(*p); \ + RISCV_FENCE(rw, w); \ + WRITE_ONCE(*p, v); \ + } while (0) + +#define __smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + RISCV_FENCE(r, rw); \ + ___p1; \ + }) + +/* Copied from the Linux kernel header arch/riscv/include/asm/atomic.h */ + +static inline int atomic_read(const atomic_t *v) +{ + return (*(volatile int *)&(v)->counter); +} + +static inline void atomic_set(atomic_t *v, int i) +{ + v->counter = i; +} + +#define atomic_get atomic_read + +static inline int atomic_add_return(int i, atomic_t *v) +{ + int result; + + asm volatile("amoadd.w.aqrl %1, %2, %0" : "+A"(v->counter), "=r"(result) : "r"(i) : "memory"); + __smp_mb(); + return result + i; +} + +static inline int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i, v); +} + +static inline int atomic_inc(atomic_t *v) +{ + return atomic_add_return(1, v) - 1; +} + +static inline int atomic_add(int val, atomic_t *v) +{ + return atomic_add_return(val, v) - val; +} + +static inline int atomic_dec(atomic_t *v) +{ + return atomic_sub_return(1, v) + 1; +} + +/* true if the result is 0, or false for all other cases. */ +#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +#define atomic_inc_return(v) (atomic_add_return(1, v)) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + unsigned long tmp; + int oldval; + + __smp_mb(); + + asm volatile("1:\n" + " lr.w %1, %2\n" + " bne %1, %3, 2f\n" + " sc.w %0, %4, %2\n" + " bnez %0, 1b\n" + "2:" + : "=&r"(tmp), "=&r"(oldval), "+A"(ptr->counter) + : "r"(old), "r"(new) + : "memory"); + + __smp_mb(); + return oldval; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/include/common/arch/riscv64/asm/bitops.h b/include/common/arch/riscv64/asm/bitops.h new file mode 100644 index 000000000..eabab27c7 --- /dev/null +++ b/include/common/arch/riscv64/asm/bitops.h @@ -0,0 +1,159 @@ +#ifndef __CR_ASM_BITOPS_H__ +#define __CR_ASM_BITOPS_H__ + +#include "common/compiler.h" +#include "common/asm/bitsperlong.h" + +#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) + +#define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] +#define BITMAP_SIZE(name) (sizeof(name) * CHAR_BIT) + +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) +/* Technically wrong, but this avoids compilation errors on some gcc + versions. */ +#define BITOP_ADDR(x) "=m"(*(volatile long *)(x)) +#else +#define BITOP_ADDR(x) "+m"(*(volatile long *)(x)) +#endif + +#define ADDR BITOP_ADDR(addr) + +static inline void set_bit(int nr, volatile unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + *addr |= (1UL << (nr % BITS_PER_LONG)); +} + +static inline void change_bit(int nr, volatile unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + *addr ^= (1UL << (nr % BITS_PER_LONG)); +} + +static inline int test_bit(int nr, volatile const unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + return (*addr & (1UL << (nr % BITS_PER_LONG))) ? -1 : 0; +} + +static inline void clear_bit(int nr, volatile unsigned long *addr) +{ + addr += nr / BITS_PER_LONG; + *addr &= ~(1UL << (nr % BITS_PER_LONG)); +} + +/** + * __ffs - find first set bit in word + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static inline unsigned long __ffs(unsigned long word) +{ + int p = 0; + + for (; p < 8*sizeof(word); ++p) { + if (word & 1) { + break; + } + + word >>= 1; + } + + return p; +} + +#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) + +/* + * Find the next set bit in a memory region. + */ +static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BITOP_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset %= BITS_PER_LONG; + if (offset) { + tmp = *(p++); + tmp &= (~0UL << offset); + if (size < BITS_PER_LONG) + goto found_first; + if (tmp) + goto found_middle; + size -= BITS_PER_LONG; + result += BITS_PER_LONG; + } + while (size & ~(BITS_PER_LONG - 1)) { + if ((tmp = *(p++))) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp &= (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __ffs(tmp); +} + +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), 0); i < BITMAP_SIZE(bitmask); \ + i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), i + 1)) + + +#define BITS_PER_LONG 64 + +#define BIT_MASK(nr) ((1##UL) << ((nr) % BITS_PER_LONG)) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) + +#define __AMO(op) "amo" #op ".d" + +#define __test_and_op_bit_ord(op, mod, nr, addr, ord) \ + ({ \ + unsigned long __res, __mask; \ + __mask = BIT_MASK(nr); \ + __asm__ __volatile__(__AMO(op) #ord " %0, %2, %1" \ + : "=r"(__res), "+A"(addr[BIT_WORD(nr)]) \ + : "r"(mod(__mask)) \ + : "memory"); \ + ((__res & __mask) != 0); \ + }) + +#define __op_bit_ord(op, mod, nr, addr, ord) \ + __asm__ __volatile__(__AMO(op) #ord " zero, %1, %0" \ + : "+A"(addr[BIT_WORD(nr)]) \ + : "r"(mod(BIT_MASK(nr))) \ + : "memory"); + +#define __test_and_op_bit(op, mod, nr, addr) __test_and_op_bit_ord(op, mod, nr, addr, .aqrl) +#define __op_bit(op, mod, nr, addr) __op_bit_ord(op, mod, nr, addr, ) + +/* Bitmask modifiers */ +#define __NOP(x) (x) +#define __NOT(x) (~(x)) + +/** + * test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation may be reordered on other architectures than x86. + */ +static inline int test_and_set_bit(int nr, volatile unsigned long *addr) +{ + return __test_and_op_bit(or, __NOP, nr, addr); +} + +#endif /* __CR_ASM_BITOPS_H__ */ diff --git a/include/common/arch/riscv64/asm/bitsperlong.h b/include/common/arch/riscv64/asm/bitsperlong.h new file mode 100644 index 000000000..d95727d19 --- /dev/null +++ b/include/common/arch/riscv64/asm/bitsperlong.h @@ -0,0 +1,6 @@ +#ifndef __CR_BITSPERLONG_H__ +#define __CR_BITSPERLONG_H__ + +#define BITS_PER_LONG 64 + +#endif /* __CR_BITSPERLONG_H__ */ diff --git a/include/common/arch/riscv64/asm/linkage.h b/include/common/arch/riscv64/asm/linkage.h new file mode 100644 index 000000000..c6d40f750 --- /dev/null +++ b/include/common/arch/riscv64/asm/linkage.h @@ -0,0 +1,23 @@ +#ifndef __CR_LINKAGE_H__ +#define __CR_LINKAGE_H__ + +#ifdef __ASSEMBLY__ + +#define __ALIGN .align 4, 0x00 +#define __ALIGN_STR ".align 4, 0x00" + +#define GLOBAL(name) \ + .globl name; \ +name: + +#define ENTRY(name) \ + .globl name; \ + .type name, @function; \ + __ALIGN; \ +name: + +#define END(sym) .size sym, .- sym + +#endif /* __ASSEMBLY__ */ + +#endif /* __CR_LINKAGE_H__ */ diff --git a/include/common/arch/riscv64/asm/page.h b/include/common/arch/riscv64/asm/page.h new file mode 100644 index 000000000..5113cb6db --- /dev/null +++ b/include/common/arch/riscv64/asm/page.h @@ -0,0 +1,44 @@ +#ifndef __CR_ASM_PAGE_H__ +#define __CR_ASM_PAGE_H__ + +#define ARCH_HAS_LONG_PAGES + +#ifndef CR_NOGLIBC +#include /* ffsl() */ +#include /* _SC_PAGESIZE */ + +extern unsigned __page_size; +extern unsigned __page_shift; + +static inline unsigned page_size(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned page_shift(void) +{ + if (!__page_shift) + __page_shift = (ffsl(page_size()) - 1); + return __page_shift; +} + +/* + * Don't add ifdefs for PAGE_SIZE: if any header defines it as a constant + * on aarch64, then we need refrain using PAGE_SIZE in criu and use + * page_size() across sources (as it may differ on aarch64). + */ +#define PAGE_SIZE page_size() +#define PAGE_MASK (~(PAGE_SIZE - 1)) +#define PAGE_SHIFT page_shift() + +#define PAGE_PFN(addr) ((addr) / PAGE_SIZE) + +#else /* CR_NOGLIBC */ + +extern unsigned long page_size(void); +#define PAGE_SIZE page_size() + +#endif /* CR_NOGLIBC */ +#endif /* __CR_ASM_PAGE_H__ */ diff --git a/include/common/arch/s390/asm/bitops.h b/include/common/arch/s390/asm/bitops.h index f396721e9..22547c544 100644 --- a/include/common/arch/s390/asm/bitops.h +++ b/include/common/arch/s390/asm/bitops.h @@ -10,6 +10,7 @@ #define __BITOPS_WORDS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG) #define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] +#define BITMAP_SIZE(name) (sizeof(name) * CHAR_BIT) static inline unsigned long *__bitops_word(unsigned long nr, volatile unsigned long *ptr) { @@ -143,8 +144,8 @@ static inline unsigned long find_next_bit(const unsigned long *addr, unsigned lo return _find_next_bit(addr, size, offset, 0UL); } -#define for_each_bit(i, bitmask) \ - for (i = find_next_bit(bitmask, sizeof(bitmask), 0); i < sizeof(bitmask); \ - i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), 0); i < BITMAP_SIZE(bitmask); \ + i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), i + 1)) #endif /* _S390_BITOPS_H */ diff --git a/include/common/arch/x86/asm/bitops.h b/include/common/arch/x86/asm/bitops.h index c13c1eb45..f3c7dbbdf 100644 --- a/include/common/arch/x86/asm/bitops.h +++ b/include/common/arch/x86/asm/bitops.h @@ -10,6 +10,7 @@ #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) #define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] +#define BITMAP_SIZE(name) (sizeof(name) * CHAR_BIT) #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) /* Technically wrong, but this avoids compilation errors on some gcc @@ -119,8 +120,8 @@ found_middle: return result + __ffs(tmp); } -#define for_each_bit(i, bitmask) \ - for (i = find_next_bit(bitmask, sizeof(bitmask), 0); i < sizeof(bitmask); \ - i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), 0); i < BITMAP_SIZE(bitmask); \ + i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), i + 1)) #endif /* __CR_BITOPS_H__ */ diff --git a/include/common/asm-generic/bitops.h b/include/common/asm-generic/bitops.h index 004da4c4e..d8f38091d 100644 --- a/include/common/asm-generic/bitops.h +++ b/include/common/asm-generic/bitops.h @@ -14,6 +14,7 @@ #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) #define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] +#define BITMAP_SIZE(name) (sizeof(name) * CHAR_BIT) #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) /* Technically wrong, but this avoids compilation errors on some gcc @@ -103,8 +104,8 @@ found_middle: return result + __ffs(tmp); } -#define for_each_bit(i, bitmask) \ - for (i = find_next_bit(bitmask, sizeof(bitmask), 0); i < sizeof(bitmask); \ - i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), 0); i < BITMAP_SIZE(bitmask); \ + i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), i + 1)) #endif /* __CR_GENERIC_BITOPS_H__ */ diff --git a/include/common/compiler.h b/include/common/compiler.h index bd3de01df..3e66709f9 100644 --- a/include/common/compiler.h +++ b/include/common/compiler.h @@ -30,6 +30,17 @@ #define __always_unused __attribute__((unused)) #define __must_check __attribute__((__warn_unused_result__)) +#ifndef __has_attribute +#define __has_attribute(x) 0 +#endif + +/* Not supported by clang */ +#if __has_attribute(__externally_visible__) +#define __visible __attribute__((__externally_visible__)) +#else +#define __visible +#endif + #define __section(S) __attribute__((__section__(#S))) #ifndef __always_inline @@ -47,7 +58,9 @@ #define noinline __attribute__((noinline)) #endif +#ifndef __aligned #define __aligned(x) __attribute__((aligned(x))) +#endif /* * Macro to define stack alignment. @@ -76,6 +89,7 @@ #define round_down(x, y) ((x) & ~__round_mask(x, y)) #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) #define ALIGN(x, a) (((x) + (a)-1) & ~((a)-1)) +#define ALIGN_DOWN(x, a) ALIGN((x) - ((a) - 1), (a)) #define min(x, y) \ ({ \ diff --git a/include/common/lock.h b/include/common/lock.h index ccfa468b8..4733d7287 100644 --- a/include/common/lock.h +++ b/include/common/lock.h @@ -2,6 +2,7 @@ #define __CR_COMMON_LOCK_H__ #include +#include #include #include #include @@ -162,6 +163,11 @@ static inline void mutex_lock(mutex_t *m) } } +static inline bool mutex_trylock(mutex_t *m) +{ + return atomic_inc_return(&m->raw) == 1; +} + static inline void mutex_unlock(mutex_t *m) { uint32_t c = 0; diff --git a/include/common/scm.h b/include/common/scm.h index bcb198882..5b6f78a8b 100644 --- a/include/common/scm.h +++ b/include/common/scm.h @@ -11,7 +11,7 @@ * Because of kernel doing kmalloc for user data passed * in SCM messages, and there is kernel's SCM_MAX_FD as a limit * for descriptors passed at once we're trying to reduce - * the pressue on kernel memory manager and use predefined + * the pressure on kernel memory manager and use predefined * known to work well size of the message buffer. */ #define CR_SCM_MSG_SIZE (1024) diff --git a/lib/.gitignore b/lib/.gitignore new file mode 100644 index 000000000..a10181b80 --- /dev/null +++ b/lib/.gitignore @@ -0,0 +1 @@ +pycriu.egg-info/ diff --git a/lib/Makefile b/lib/Makefile index 575a7bad3..4b8a6cbb8 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -2,10 +2,6 @@ CRIU_SO := libcriu.so CRIU_A := libcriu.a UAPI_HEADERS := lib/c/criu.h images/rpc.proto images/rpc.pb-c.h criu/include/version.h -# -# File to keep track of files installed by setup.py -CRIT_SETUP_FILES := lib/.crit-setup.files - all-y += lib-c lib-a lib-py # @@ -29,23 +25,23 @@ lib-a: lib/c/$(CRIU_A) # # Python bindings. -lib/py/Makefile: ; -lib/py/%: .FORCE +lib/pycriu/Makefile: ; +lib/pycriu/%: .FORCE $(call msg-gen, $@) - $(Q) $(MAKE) $(build)=lib/py $@ + $(Q) $(MAKE) $(build)=lib/pycriu $@ lib-py: - $(Q) $(MAKE) $(build)=lib/py all + $(Q) $(MAKE) $(build)=lib/pycriu all .PHONY: lib-py clean-lib: $(Q) $(MAKE) $(build)=lib/c clean - $(Q) $(MAKE) $(build)=lib/py clean + $(Q) $(MAKE) $(build)=lib/pycriu clean .PHONY: clean-lib clean: clean-lib cleanup-y += lib/c/$(CRIU_SO) lib/c/$(CRIU_A) lib/c/criu.pc mrproper: clean -install: lib-c lib-a lib-py crit/crit lib/c/criu.pc.in +install: lib-c lib-a lib-py lib/c/criu.pc.in $(E) " INSTALL " lib $(Q) mkdir -p $(DESTDIR)$(LIBDIR) $(Q) install -m 755 lib/c/$(CRIU_SO) $(DESTDIR)$(LIBDIR)/$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR).$(CRIU_SO_VERSION_MINOR) @@ -58,8 +54,12 @@ install: lib-c lib-a lib-py crit/crit lib/c/criu.pc.in $(Q) mkdir -p $(DESTDIR)$(LIBDIR)/pkgconfig $(Q) sed -e 's,@version@,$(CRIU_VERSION),' -e 's,@libdir@,$(LIBDIR),' -e 's,@includedir@,$(dir $(INCLUDEDIR)/criu/),' lib/c/criu.pc.in > lib/c/criu.pc $(Q) install -m 644 lib/c/criu.pc $(DESTDIR)$(LIBDIR)/pkgconfig - $(E) " INSTALL " crit - $(Q) $(PYTHON) scripts/crit-setup.py install --prefix=$(DESTDIR)$(PREFIX) --record $(CRIT_SETUP_FILES) +ifeq ($(SKIP_PIP_INSTALL),0) + $(E) " INSTALL " pycriu + $(Q) $(PYTHON) -m pip install $(PIPFLAGS) --prefix=$(DESTDIR)$(PREFIX) ./lib +else + $(E) " SKIP INSTALL pycriu" +endif .PHONY: install uninstall: @@ -71,6 +71,10 @@ uninstall: $(Q) $(RM) $(addprefix $(DESTDIR)$(INCLUDEDIR)/criu/,$(notdir $(UAPI_HEADERS))) $(E) " UNINSTALL" pkgconfig/criu.pc $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/pkgconfig/,criu.pc) - $(E) " UNINSTALL" crit - $(Q) while read -r file; do $(RM) "$$file"; done < $(CRIT_SETUP_FILES) +ifeq ($(SKIP_PIP_INSTALL),0) + $(E) " UNINSTALL" pycriu + $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) pycriu +else + $(E) " SKIP UNINSTALL pycriu" +endif .PHONY: uninstall diff --git a/lib/c/criu.c b/lib/c/criu.c index 7807d7bc5..485c8b178 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -352,8 +352,8 @@ int criu_set_parent_images(const char *path) int criu_local_set_pre_dump_mode(criu_opts *opts, enum criu_pre_dump_mode mode) { - opts->rpc->has_pre_dump_mode = true; if (mode == CRIU_PRE_DUMP_SPLICE || mode == CRIU_PRE_DUMP_READ) { + opts->rpc->has_pre_dump_mode = true; opts->rpc->pre_dump_mode = (CriuPreDumpMode)mode; return 0; } @@ -555,6 +555,28 @@ void criu_set_shell_job(bool shell_job) criu_local_set_shell_job(global_opts, shell_job); } +void criu_local_set_skip_file_rwx_check(criu_opts *opts, bool skip_file_rwx_check) +{ + opts->rpc->has_skip_file_rwx_check = true; + opts->rpc->skip_file_rwx_check = skip_file_rwx_check; +} + +void criu_set_skip_file_rwx_check(bool skip_file_rwx_check) +{ + criu_local_set_skip_file_rwx_check(global_opts, skip_file_rwx_check); +} + +void criu_local_set_unprivileged(criu_opts *opts, bool unprivileged) +{ + opts->rpc->has_unprivileged = true; + opts->rpc->unprivileged = unprivileged; +} + +void criu_set_unprivileged(bool unprivileged) +{ + criu_local_set_unprivileged(global_opts, unprivileged); +} + void criu_local_set_orphan_pts_master(criu_opts *opts, bool orphan_pts_master) { opts->rpc->has_orphan_pts_master = true; @@ -1845,8 +1867,8 @@ void criu_set_pidfd_store_sk(int sk) int criu_local_set_network_lock(criu_opts *opts, enum criu_network_lock_method method) { - opts->rpc->has_network_lock = true; - if (method == CRIU_NETWORK_LOCK_IPTABLES || method == CRIU_NETWORK_LOCK_NFTABLES) { + if (method == CRIU_NETWORK_LOCK_IPTABLES || method == CRIU_NETWORK_LOCK_NFTABLES || method == CRIU_NETWORK_LOCK_SKIP) { + opts->rpc->has_network_lock = true; opts->rpc->network_lock = (CriuNetworkLockMethod)method; return 0; } @@ -2008,3 +2030,33 @@ int criu_feature_check(struct criu_feature_check *features, size_t size) { return criu_local_feature_check(global_opts, features, size); } + +void criu_local_set_empty_ns(criu_opts *opts, int namespaces) +{ + opts->rpc->has_empty_ns = true; + opts->rpc->empty_ns = namespaces; +} + +void criu_set_empty_ns(int namespaces) +{ + criu_local_set_empty_ns(global_opts, namespaces); +} + +int criu_local_set_config_file(criu_opts *opts, const char *path) +{ + char *new; + + new = strdup(path); + if (!new) + return -ENOMEM; + + free(opts->rpc->config_file); + opts->rpc->config_file = new; + + return 0; +} + +int criu_set_config_file(const char *path) +{ + return criu_local_set_config_file(global_opts, path); +} diff --git a/lib/c/criu.h b/lib/c/criu.h index 7cc6a199c..44446f664 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -50,6 +50,7 @@ enum criu_cg_mode { enum criu_network_lock_method { CRIU_NETWORK_LOCK_IPTABLES = 1, CRIU_NETWORK_LOCK_NFTABLES = 2, + CRIU_NETWORK_LOCK_SKIP = 3, }; enum criu_pre_dump_mode { CRIU_PRE_DUMP_SPLICE = 1, CRIU_PRE_DUMP_READ = 2 }; @@ -78,6 +79,8 @@ void criu_set_tcp_close(bool tcp_close); void criu_set_weak_sysctls(bool val); void criu_set_evasive_devices(bool evasive_devices); void criu_set_shell_job(bool shell_job); +void criu_set_skip_file_rwx_check(bool skip_file_rwx_check); +void criu_set_unprivileged(bool unprivileged); void criu_set_orphan_pts_master(bool orphan_pts_master); void criu_set_file_locks(bool file_locks); void criu_set_track_mem(bool track_mem); @@ -113,6 +116,7 @@ void criu_set_pidfd_store_sk(int sk); int criu_set_network_lock(enum criu_network_lock_method method); int criu_join_ns_add(const char *ns, const char *ns_file, const char *extra_opt); void criu_set_mntns_compat_mode(bool val); +int criu_set_config_file(const char *path); /* * The criu_notify_arg_t na argument is an opaque @@ -238,6 +242,7 @@ void criu_local_set_tcp_close(criu_opts *opts, bool tcp_close); void criu_local_set_weak_sysctls(criu_opts *opts, bool val); void criu_local_set_evasive_devices(criu_opts *opts, bool evasive_devices); void criu_local_set_shell_job(criu_opts *opts, bool shell_job); +void criu_local_set_skip_file_rwx_check(criu_opts *opts, bool skip_file_rwx_check); void criu_local_set_orphan_pts_master(criu_opts *opts, bool orphan_pts_master); void criu_local_set_file_locks(criu_opts *opts, bool file_locks); void criu_local_set_track_mem(criu_opts *opts, bool track_mem); @@ -277,6 +282,7 @@ void criu_local_set_pidfd_store_sk(criu_opts *opts, int sk); int criu_local_set_network_lock(criu_opts *opts, enum criu_network_lock_method method); int criu_local_join_ns_add(criu_opts *opts, const char *ns, const char *ns_file, const char *extra_opt); void criu_local_set_mntns_compat_mode(criu_opts *opts, bool val); +int criu_local_set_config_file(criu_opts *opts, const char *path); void criu_local_set_notify_cb(criu_opts *opts, int (*cb)(char *action, criu_notify_arg_t na)); @@ -319,6 +325,9 @@ struct criu_feature_check { int criu_feature_check(struct criu_feature_check *features, size_t size); int criu_local_feature_check(criu_opts *opts, struct criu_feature_check *features, size_t size); +void criu_local_set_empty_ns(criu_opts *opts, int namespaces); +void criu_set_empty_ns(int namespaces); + #ifdef __GNUG__ } #endif diff --git a/lib/py/.gitignore b/lib/py/.gitignore deleted file mode 100644 index d3090fca3..000000000 --- a/lib/py/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*_pb2.py -*.pyc diff --git a/lib/py/__init__.py b/lib/py/__init__.py deleted file mode 100644 index 96b3e9526..000000000 --- a/lib/py/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from . import rpc_pb2 as rpc -from . import images -from .criu import * diff --git a/lib/pycriu/.gitignore b/lib/pycriu/.gitignore new file mode 100644 index 000000000..111642787 --- /dev/null +++ b/lib/pycriu/.gitignore @@ -0,0 +1,4 @@ +__pycache__ +*_pb2.py +*.pyc +version.py diff --git a/lib/py/Makefile b/lib/pycriu/Makefile similarity index 66% rename from lib/py/Makefile rename to lib/pycriu/Makefile index 691b6bdd3..5ce9bc8f7 100644 --- a/lib/py/Makefile +++ b/lib/pycriu/Makefile @@ -1,4 +1,4 @@ -all-y += libpy-images rpc_pb2.py +all-y += libpy-images rpc_pb2.py version.py $(obj)/images/Makefile: ; $(obj)/images/%: .FORCE @@ -11,7 +11,10 @@ libpy-images: rpc_pb2.py: $(Q) protoc -I=images/ --python_out=$(obj) images/$(@:_pb2.py=.proto) -cleanup-y += $(addprefix $(obj)/,rpc_pb2.py *.pyc) +version.py: + $(Q) echo "__version__ = '${CRIU_VERSION}'" > $(obj)/$@ + +cleanup-y += $(addprefix $(obj)/,rpc_pb2.py *.pyc version.py) clean-lib-py: $(Q) $(MAKE) $(build)=$(obj)/images clean diff --git a/lib/pycriu/__init__.py b/lib/pycriu/__init__.py new file mode 100644 index 000000000..28f1e9424 --- /dev/null +++ b/lib/pycriu/__init__.py @@ -0,0 +1,15 @@ +from . import rpc_pb2 as rpc +from . import images +from .criu import criu, CRIUExceptionExternal, CRIUException +from .criu import CR_DEFAULT_SERVICE_ADDRESS +from .version import __version__ + +__all__ = ( + "rpc", + "images", + "criu", + "CRIUExceptionExternal", + "CRIUException", + "CR_DEFAULT_SERVICE_ADDRESS", + "__version__", +) \ No newline at end of file diff --git a/lib/py/criu.py b/lib/pycriu/criu.py similarity index 89% rename from lib/py/criu.py rename to lib/pycriu/criu.py index f3e018095..51a5c2902 100644 --- a/lib/py/criu.py +++ b/lib/pycriu/criu.py @@ -8,6 +8,7 @@ import struct import pycriu.rpc_pb2 as rpc +CR_DEFAULT_SERVICE_ADDRESS = "./criu_service.socket" class _criu_comm: """ @@ -45,7 +46,14 @@ class _criu_comm_sk(_criu_comm): def connect(self, daemon): self.sk = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) - self.sk.connect(self.comm) + try: + self.sk.connect(self.comm) + + except FileNotFoundError: + raise FileNotFoundError("Socket file not found.") + + except ConnectionRefusedError: + raise ConnectionRefusedError("Service not running.") return self.sk @@ -103,7 +111,7 @@ class _criu_comm_bin(_criu_comm): os.close(2) css[0].send(struct.pack('i', os.getpid())) - os.execv(self.comm, + os.execvp(self.comm, [self.comm, 'swrk', "%d" % css[0].fileno()]) os._exit(1) @@ -181,15 +189,14 @@ class CRIUExceptionExternal(CRIUException): if self.errno == errno.EBADRQC: s += "Bad options" - if self.typ == rpc.DUMP: - if self.errno == errno.ESRCH: - s += "No process with such pid" + elif self.typ == rpc.DUMP and self.errno == errno.ESRCH: + s += "No process with such pid" - if self.typ == rpc.RESTORE: - if self.errno == errno.EEXIST: - s += "Process with requested pid already exists" + elif self.typ == rpc.RESTORE and self.errno == errno.EEXIST: + s += "Process with requested pid already exists" - s += "Unknown" + else: + s += "Unknown" return s @@ -204,10 +211,11 @@ class criu: def __init__(self): self.use_binary('criu') - self.opts = rpc.criu_opts() + # images_dir_fd is required field with default value of -1 + self.opts = rpc.criu_opts(images_dir_fd=-1) self.sk = None - def use_sk(self, sk_name): + def use_sk(self, sk_name=CR_DEFAULT_SERVICE_ADDRESS): """ Access criu using unix socket which that belongs to criu service daemon. """ @@ -234,7 +242,7 @@ class criu: # process resources from its own if criu is located in a same # process tree it is trying to dump. daemon = False - if req.type == rpc.DUMP and not req.opts.HasField('pid'): + if req.type == rpc.DUMP and (not req.opts.HasField('pid') or req.opts.pid == os.getpid()): daemon = True try: @@ -266,6 +274,7 @@ class criu: """ req = rpc.criu_req() req.type = rpc.CHECK + req.opts.MergeFrom(self.opts) resp = self._send_req_and_recv_resp(req) diff --git a/lib/py/images/.gitignore b/lib/pycriu/images/.gitignore similarity index 100% rename from lib/py/images/.gitignore rename to lib/pycriu/images/.gitignore diff --git a/lib/py/images/Makefile b/lib/pycriu/images/Makefile similarity index 100% rename from lib/py/images/Makefile rename to lib/pycriu/images/Makefile diff --git a/lib/py/images/__init__.py b/lib/pycriu/images/__init__.py similarity index 100% rename from lib/py/images/__init__.py rename to lib/pycriu/images/__init__.py diff --git a/lib/py/images/images.py b/lib/pycriu/images/images.py similarity index 89% rename from lib/py/images/images.py rename to lib/pycriu/images/images.py index eda030a5c..9db506e1e 100644 --- a/lib/py/images/images.py +++ b/lib/pycriu/images/images.py @@ -42,7 +42,6 @@ import base64 import struct import os import array -import sys from . import magic from . import pb @@ -69,6 +68,16 @@ class MagicException(Exception): self.magic = magic +def decode_base64_data(data): + """A helper function to decode base64 data.""" + return base64.decodebytes(str.encode(data)) + + +def write_base64_data(f, data): + """A helper function to write base64 encoded data to a file.""" + f.write(base64.decodebytes(str.encode(data))) + + # Generic class to handle loading/dumping criu images entries from/to bin # format to/from dict(json). class entry_handler: @@ -98,7 +107,7 @@ class entry_handler: # Read payload pbuff = self.payload() buf = f.read(4) - if buf == b'': + if len(buf) == 0: break size, = struct.unpack('i', buf) pbuff.ParseFromString(f.read(size)) @@ -172,7 +181,7 @@ class entry_handler: while True: buf = f.read(4) - if buf == '': + if len(buf) == 0: break size, = struct.unpack('i', buf) f.seek(size, 1) @@ -195,7 +204,7 @@ class pagemap_handler: pbuff = pb.pagemap_head() while True: buf = f.read(4) - if buf == b'': + if len(buf) == 0: break size, = struct.unpack('i', buf) pbuff.ParseFromString(f.read(size)) @@ -285,15 +294,9 @@ class ghost_file_handler: size = len(pb_str) f.write(struct.pack('i', size)) f.write(pb_str) - if (sys.version_info > (3, 0)): - f.write(base64.decodebytes(str.encode(item['extra']))) - else: - f.write(base64.decodebytes(item['extra'])) + write_base64_data(f, item['extra']) else: - if (sys.version_info > (3, 0)): - f.write(base64.decodebytes(str.encode(item['extra']))) - else: - f.write(base64.decodebytes(item['extra'])) + write_base64_data(f, item['extra']) def dumps(self, entries): f = io.BytesIO('') @@ -314,10 +317,7 @@ class pipes_data_extra_handler: return base64.encodebytes(data).decode('utf-8') def dump(self, extra, f, pload): - if (sys.version_info > (3, 0)): - data = base64.decodebytes(str.encode(extra)) - else: - data = base64.decodebytes(extra) + data = decode_base64_data(extra) f.write(data) def skip(self, f, pload): @@ -332,10 +332,7 @@ class sk_queues_extra_handler: return base64.encodebytes(data).decode('utf-8') def dump(self, extra, f, _unused): - if (sys.version_info > (3, 0)): - data = base64.decodebytes(str.encode(extra)) - else: - data = base64.decodebytes(extra) + data = decode_base64_data(extra) f.write(data) def skip(self, f, pload): @@ -356,12 +353,8 @@ class tcp_stream_extra_handler: return d def dump(self, extra, f, _unused): - if (sys.version_info > (3, 0)): - inq = base64.decodebytes(str.encode(extra['inq'])) - outq = base64.decodebytes(str.encode(extra['outq'])) - else: - inq = base64.decodebytes(extra['inq']) - outq = base64.decodebytes(extra['outq']) + inq = decode_base64_data(extra['inq']) + outq = decode_base64_data(extra['outq']) f.write(inq) f.write(outq) @@ -370,6 +363,7 @@ class tcp_stream_extra_handler: f.seek(0, os.SEEK_END) return pbuff.inq_len + pbuff.outq_len + class bpfmap_data_extra_handler: def load(self, f, pload): size = pload.keys_bytes + pload.values_bytes @@ -384,14 +378,13 @@ class bpfmap_data_extra_handler: f.seek(pload.bytes, os.SEEK_CUR) return pload.bytes + class ipc_sem_set_handler: def load(self, f, pbuff): entry = pb2dict.pb2dict(pbuff) size = sizeof_u16 * entry['nsems'] rounded = round_up(size, sizeof_u64) - s = array.array('H') - if s.itemsize != sizeof_u16: - raise Exception("Array size mismatch") + s = self._get_sem_array() s.frombytes(f.read(size)) f.seek(rounded - size, 1) return s.tolist() @@ -400,9 +393,7 @@ class ipc_sem_set_handler: entry = pb2dict.pb2dict(pbuff) size = sizeof_u16 * entry['nsems'] rounded = round_up(size, sizeof_u64) - s = array.array('H') - if s.itemsize != sizeof_u16: - raise Exception("Array size mismatch") + s = self._get_sem_array() s.fromlist(extra) if len(s) != entry['nsems']: raise Exception("Number of semaphores mismatch") @@ -415,23 +406,16 @@ class ipc_sem_set_handler: f.seek(round_up(size, sizeof_u64), os.SEEK_CUR) return size + def _get_sem_array(self): + s = array.array('H') + if s.itemsize != sizeof_u16: + raise Exception("Array size mismatch") + return s + class ipc_msg_queue_handler: def load(self, f, pbuff): - entry = pb2dict.pb2dict(pbuff) - messages = [] - for x in range(0, entry['qnum']): - buf = f.read(4) - if buf == '': - break - size, = struct.unpack('i', buf) - msg = pb.ipc_msg() - msg.ParseFromString(f.read(size)) - rounded = round_up(msg.msize, sizeof_u64) - data = f.read(msg.msize) - f.seek(rounded - msg.msize, 1) - messages.append(pb2dict.pb2dict(msg)) - messages.append(base64.encodebytes(data).decode('utf-8')) + messages, _ = self._read_messages(f, pbuff) return messages def dump(self, extra, f, pbuff): @@ -443,28 +427,37 @@ class ipc_msg_queue_handler: f.write(struct.pack('i', size)) f.write(msg_str) rounded = round_up(msg.msize, sizeof_u64) - if (sys.version_info > (3, 0)): - data = base64.decodebytes(str.encode(extra[i + 1])) - else: - data = base64.decodebytes(extra[i + 1]) + data = decode_base64_data(extra[i + 1]) f.write(data[:msg.msize]) f.write(b'\0' * (rounded - msg.msize)) def skip(self, f, pbuff): + _, pl_len = self._read_messages(f, pbuff, skip_data=True) + return pl_len + + def _read_messages(self, f, pbuff, skip_data=False): entry = pb2dict.pb2dict(pbuff) + messages = [] pl_len = 0 for x in range(0, entry['qnum']): buf = f.read(4) - if buf == '': + if len(buf) == 0: break size, = struct.unpack('i', buf) msg = pb.ipc_msg() msg.ParseFromString(f.read(size)) rounded = round_up(msg.msize, sizeof_u64) - f.seek(rounded, os.SEEK_CUR) pl_len += size + msg.msize - return pl_len + if skip_data: + f.seek(rounded, os.SEEK_CUR) + else: + data = f.read(msg.msize) + f.seek(rounded - msg.msize, 1) + messages.append(pb2dict.pb2dict(msg)) + messages.append(base64.encodebytes(data).decode('utf-8')) + + return messages, pl_len class ipc_shm_handler: @@ -560,7 +553,7 @@ handlers = { 'MEMFD_INODE': entry_handler(pb.memfd_inode_entry), 'BPFMAP_FILE': entry_handler(pb.bpfmap_file_entry), 'BPFMAP_DATA': entry_handler(pb.bpfmap_data_entry, - bpfmap_data_extra_handler()), + bpfmap_data_extra_handler()), 'APPARMOR': entry_handler(pb.apparmor_entry), } @@ -574,12 +567,12 @@ def __rhandler(f): try: m = magic.by_val[img_magic] - except: + except Exception: raise MagicException(img_magic) try: handler = handlers[m] - except: + except Exception: raise Exception("No handler found for image with magic " + m) return m, handler @@ -641,7 +634,7 @@ def dump(img, f): try: handler = handlers[m] - except: + except Exception: raise Exception("No handler found for image with such magic") handler.dump(img['entries'], f) diff --git a/lib/py/images/pb2dict.py b/lib/pycriu/images/pb2dict.py similarity index 95% rename from lib/py/images/pb2dict.py rename to lib/pycriu/images/pb2dict.py index 9d581c375..f22887a52 100644 --- a/lib/py/images/pb2dict.py +++ b/lib/pycriu/images/pb2dict.py @@ -3,7 +3,6 @@ import collections import os import quopri import socket -import sys from ipaddress import IPv4Address, IPv6Address, ip_address from google.protobuf.descriptor import FieldDescriptor as FD @@ -84,6 +83,7 @@ mmap_prot_map = [ mmap_flags_map = [ ('MAP_SHARED', 0x1), ('MAP_PRIVATE', 0x2), + ('MAP_DROPPABLE', 0x08), ('MAP_ANON', 0x20), ('MAP_GROWSDOWN', 0x0100), ] @@ -103,6 +103,9 @@ mmap_status_map = [ ('VMA_AREA_SOCKET', 1 << 11), ('VMA_AREA_VVAR', 1 << 12), ('VMA_AREA_AIORING', 1 << 13), + ('VMA_AREA_MEMFD', 1 << 14), + ('VMA_AREA_SHSTK', 1 << 15), + ('VMA_AREA_UPROBES', 1 << 17), ('VMA_UNSUPP', 1 << 31), ] @@ -151,8 +154,9 @@ flags_maps = { gen_maps = { 'task_state': { 1: 'Alive', - 3: 'Zombie', - 6: 'Stopped' + 2: 'Dead', + 3: 'Stopped', + 6: 'Zombie', }, } @@ -247,17 +251,11 @@ def encode_dev(field, value): def encode_base64(value): - if (sys.version_info > (3, 0)): - return base64.encodebytes(value).decode() - else: - return base64.encodebytes(value) + return base64.encodebytes(value).decode() def decode_base64(value): - if (sys.version_info > (3, 0)): - return base64.decodebytes(str.encode(value)) - else: - return base64.decodebytes(value) + return base64.decodebytes(str.encode(value)) def encode_unix(value): @@ -309,7 +307,7 @@ def _pb2dict_cast(field, value, pretty=False, is_hex=False): return field.enum_type.values_by_number.get(value, None).name elif field.type in _basic_cast: cast = _basic_cast[field.type] - if pretty and (cast == int): + if pretty and cast is int: if is_hex: # Fields that have (criu).hex = true option set # should be stored in hex string format. @@ -364,21 +362,24 @@ def pb2dict(pb, pretty=False, is_hex=False): else: d_val = _pb2dict_cast(field, value, pretty, is_hex) - d[field.name] = d_val.decode() if type(d_val) == bytes else d_val + try: + d[field.name] = d_val.decode() + except (UnicodeDecodeError, AttributeError): + d[field.name] = d_val return d def _dict2pb_cast(field, value): # Not considering TYPE_MESSAGE here, as repeated # and non-repeated messages need special treatment - # in this case, and are hadled separately. + # in this case, and are handled separately. if field.type == FD.TYPE_BYTES: return get_bytes_dec(field)(value) elif field.type == FD.TYPE_ENUM: return field.enum_type.values_by_name.get(value, None).number elif field.type in _basic_cast: cast = _basic_cast[field.type] - if (cast == int) and is_string(value): + if cast is int and is_string(value): if _marked_as_dev(field): return encode_dev(field, value) diff --git a/lib/pyproject.toml b/lib/pyproject.toml new file mode 100644 index 000000000..ea9f88dcc --- /dev/null +++ b/lib/pyproject.toml @@ -0,0 +1,20 @@ +[build-system] +requires = ["setuptools", "protobuf<4.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "pycriu" +description = "Python bindings for CRIU" +authors = [ + {name = "CRIU team", email = "criu@lists.linux.dev"}, +] +license = {text = "LGPLv2.1"} +dynamic = ["version"] +requires-python = ">=3.6" +dependencies = ["protobuf"] + +[tool.setuptools] +packages = ["pycriu", "pycriu.images"] + +[tool.setuptools.dynamic] +version = {attr = "pycriu.__version__"} diff --git a/lib/setup.cfg b/lib/setup.cfg new file mode 100644 index 000000000..28c9e49c3 --- /dev/null +++ b/lib/setup.cfg @@ -0,0 +1,18 @@ +# Configuring setuptools using pyproject.toml files was introduced in setuptools 61.0.0 +# https://setuptools.pypa.io/en/latest/history.html#v61-0-0 +# For older versions of setuptools, we need to use the setup.cfg file +# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html#declarative-config + +[metadata] +name = pycriu +description = Python bindings for CRIU +author = CRIU team +author_email = criu@lists.linux.dev +license = LGPLv2.1 +version = attr: pycriu.__version__ + +[options] +packages = find: +python_requires = >=3.6 +install_requires = + protobuf diff --git a/crit/crit-python3 b/lib/setup.py old mode 100755 new mode 100644 similarity index 55% rename from crit/crit-python3 rename to lib/setup.py index 80467cba7..618ac1de4 --- a/crit/crit-python3 +++ b/lib/setup.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 +import setuptools -from pycriu import cli if __name__ == '__main__': - cli.main() + setuptools.setup() diff --git a/plugins/amdgpu/.gitignore b/plugins/amdgpu/.gitignore new file mode 100644 index 000000000..4e5c8f58e --- /dev/null +++ b/plugins/amdgpu/.gitignore @@ -0,0 +1,3 @@ +*.pb-c.c +*.pb-c.h +test_topology_remap diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 367a52c99..250e7b0e7 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -12,10 +12,10 @@ LIBDRM_INC := -I/usr/include/libdrm DEPS_OK := amdgpu_plugin.so amdgpu_plugin_test DEPS_NOK := ; +__nmk_dir ?= ../../scripts/nmk/scripts/ include $(__nmk_dir)msg.mk -CC := gcc -PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC -DCR_PLUGIN_DEFAULT="$(PLUGINDIR)" +PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC PLUGIN_LDFLAGS := -lpthread -lrt -ldrm -ldrm_amdgpu ifeq ($(CONFIG_AMDGPU),y) @@ -25,10 +25,10 @@ else endif criu-amdgpu.pb-c.c: criu-amdgpu.proto - protoc-c --proto_path=. --c_out=. criu-amdgpu.proto + protoc --proto_path=. --c_out=. criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_topology.c criu-amdgpu.pb-c.c - $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_dmabuf.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c + $(CC) $(PLUGIN_CFLAGS) $(DEFINES) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) amdgpu_plugin_clean: $(call msg-clean, $@) @@ -53,7 +53,7 @@ install: ifeq ($(CONFIG_AMDGPU),y) $(Q) mkdir -p $(DESTDIR)$(PLUGINDIR) $(E) " INSTALL " $(PLUGIN_NAME) - $(Q) install -m 644 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) + $(Q) install -m 755 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) endif .PHONY: install diff --git a/plugins/amdgpu/README.md b/plugins/amdgpu/README.md index 6809ec8b9..b808fbc4f 100644 --- a/plugins/amdgpu/README.md +++ b/plugins/amdgpu/README.md @@ -3,7 +3,8 @@ Supporting ROCm with CRIU _Felix Kuehling _
_Rajneesh Bardwaj _
-_David Yat Sin _ +_David Yat Sin _
+_Yanning Yang _ # Introduction @@ -224,6 +225,26 @@ to resume execution on the GPUs. *This new plugin is enabled by the new hook `__RESUME_DEVICES_LATE` in our RFC patch series.* +## Restoring BO content in parallel + +Restoring the BO content is an important part in the restore of GPU state and +usually takes a significant amount of time. A possible location for this +procedure is the `cr_plugin_restore_file` hook. However, restoring in this hook +blocks the target process from performing other restore operations, which +hinders further optimization of the restore process. + +Therefore, a new plugin hook that runs in the master restore process is +introduced, and it interacts with the `cr_plugin_restore_file` hook to complete +the restore of BO content. Specifically, the target process only needs to send +the relevant BOs to the master restore process, while this new hook handles all +the restore of buffer objects. Through this method, during the restore of the BO +content, the target process can perform other restore operations, thus +accelerating the restore procedure. This is an implementation of the gCROP +method proposed in the ACM SoCC'24 paper: [On-demand and Parallel +Checkpoint/Restore for GPU Applications](https://dl.acm.org/doi/10.1145/3698038.3698510). + +*This optimization technique is enabled by the `__POST_FORKING` hook.* + ## Other CRIU changes In addition to the new plugins, we need to make some changes to CRIU itself to @@ -263,7 +284,7 @@ ROCm | Radeon Open Compute Platform Thunk | User-mode API interface to interact with amdgpu.ko KFD | AMD Kernel Fusion Driver Mesa | Open source OpenGL implementation -GTT | Graphis Translation Table, also used to denote kernel-managed system memory for GPU access +GTT | Graphics Translation Table, also used to denote kernel-managed system memory for GPU access VRAM | Video RAM BO | Buffer Object HMM | Heterogeneous Memory Management diff --git a/plugins/amdgpu/amdgpu_drm.h b/plugins/amdgpu/amdgpu_drm.h new file mode 100644 index 000000000..69227a12b --- /dev/null +++ b/plugins/amdgpu/amdgpu_drm.h @@ -0,0 +1,1801 @@ +/* amdgpu_drm.h -- Public header for the amdgpu driver -*- linux-c -*- + * + * Copyright 2000 Precision Insight, Inc., Cedar Park, Texas. + * Copyright 2000 VA Linux Systems, Inc., Fremont, California. + * Copyright 2002 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Kevin E. Martin + * Gareth Hughes + * Keith Whitwell + */ + +#ifndef __AMDGPU_DRM_H__ +#define __AMDGPU_DRM_H__ + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_AMDGPU_GEM_CREATE 0x00 +#define DRM_AMDGPU_GEM_MMAP 0x01 +#define DRM_AMDGPU_CTX 0x02 +#define DRM_AMDGPU_BO_LIST 0x03 +#define DRM_AMDGPU_CS 0x04 +#define DRM_AMDGPU_INFO 0x05 +#define DRM_AMDGPU_GEM_METADATA 0x06 +#define DRM_AMDGPU_GEM_WAIT_IDLE 0x07 +#define DRM_AMDGPU_GEM_VA 0x08 +#define DRM_AMDGPU_WAIT_CS 0x09 +#define DRM_AMDGPU_GEM_OP 0x10 +#define DRM_AMDGPU_GEM_USERPTR 0x11 +#define DRM_AMDGPU_WAIT_FENCES 0x12 +#define DRM_AMDGPU_VM 0x13 +#define DRM_AMDGPU_FENCE_TO_HANDLE 0x14 +#define DRM_AMDGPU_SCHED 0x15 +#define DRM_AMDGPU_USERQ 0x16 +#define DRM_AMDGPU_USERQ_SIGNAL 0x17 +#define DRM_AMDGPU_USERQ_WAIT 0x18 +#define DRM_AMDGPU_GEM_LIST_HANDLES 0x19 +/* not upstream */ +#define DRM_AMDGPU_GEM_DGMA 0x5c + +/* hybrid specific ioctls */ +#define DRM_AMDGPU_SEM 0x5b + +#define DRM_IOCTL_AMDGPU_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create) +#define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap) +#define DRM_IOCTL_AMDGPU_CTX DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_CTX, union drm_amdgpu_ctx) +#define DRM_IOCTL_AMDGPU_BO_LIST DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_BO_LIST, union drm_amdgpu_bo_list) +#define DRM_IOCTL_AMDGPU_CS DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_CS, union drm_amdgpu_cs) +#define DRM_IOCTL_AMDGPU_INFO DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_INFO, struct drm_amdgpu_info) +#define DRM_IOCTL_AMDGPU_GEM_METADATA DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_METADATA, struct drm_amdgpu_gem_metadata) +#define DRM_IOCTL_AMDGPU_GEM_WAIT_IDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_WAIT_IDLE, union drm_amdgpu_gem_wait_idle) +#define DRM_IOCTL_AMDGPU_GEM_VA DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_VA, struct drm_amdgpu_gem_va) +#define DRM_IOCTL_AMDGPU_WAIT_CS DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_WAIT_CS, union drm_amdgpu_wait_cs) +#define DRM_IOCTL_AMDGPU_GEM_OP DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_OP, struct drm_amdgpu_gem_op) +#define DRM_IOCTL_AMDGPU_GEM_USERPTR DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_USERPTR, struct drm_amdgpu_gem_userptr) +#define DRM_IOCTL_AMDGPU_WAIT_FENCES DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_WAIT_FENCES, union drm_amdgpu_wait_fences) +#define DRM_IOCTL_AMDGPU_VM DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_VM, union drm_amdgpu_vm) +#define DRM_IOCTL_AMDGPU_FENCE_TO_HANDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_FENCE_TO_HANDLE, union drm_amdgpu_fence_to_handle) +#define DRM_IOCTL_AMDGPU_SCHED DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_SCHED, union drm_amdgpu_sched) +#define DRM_IOCTL_AMDGPU_USERQ DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ, union drm_amdgpu_userq) +#define DRM_IOCTL_AMDGPU_USERQ_SIGNAL DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal) +#define DRM_IOCTL_AMDGPU_USERQ_WAIT DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait) +#define DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_LIST_HANDLES, struct drm_amdgpu_gem_list_handles) + +#define DRM_IOCTL_AMDGPU_GEM_DGMA DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_DGMA, struct drm_amdgpu_gem_dgma) + +/** + * DOC: memory domains + * + * %AMDGPU_GEM_DOMAIN_CPU System memory that is not GPU accessible. + * Memory in this pool could be swapped out to disk if there is pressure. + * + * %AMDGPU_GEM_DOMAIN_GTT GPU accessible system memory, mapped into the + * GPU's virtual address space via gart. Gart memory linearizes non-contiguous + * pages of system memory, allows GPU access system memory in a linearized + * fashion. + * + * %AMDGPU_GEM_DOMAIN_VRAM Local video memory. For APUs, it is memory + * carved out by the BIOS. + * + * %AMDGPU_GEM_DOMAIN_GDS Global on-chip data storage used to share data + * across shader threads. + * + * %AMDGPU_GEM_DOMAIN_GWS Global wave sync, used to synchronize the + * execution of all the waves on a device. + * + * %AMDGPU_GEM_DOMAIN_OA Ordered append, used by 3D or Compute engines + * for appending data. + * + * %AMDGPU_GEM_DOMAIN_DOORBELL Doorbell. It is an MMIO region for + * signalling user mode queues. + * + * %AMDGPU_GEM_DOMAIN_MMIO_REMAP MMIO remap page (special mapping for HDP flushing). + */ +/* hybrid specific ioctls */ +#define DRM_IOCTL_AMDGPU_SEM DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_SEM, union drm_amdgpu_sem) + +#define AMDGPU_GEM_DOMAIN_CPU 0x1 +#define AMDGPU_GEM_DOMAIN_GTT 0x2 +#define AMDGPU_GEM_DOMAIN_VRAM 0x4 +#define AMDGPU_GEM_DOMAIN_GDS 0x8 +#define AMDGPU_GEM_DOMAIN_GWS 0x10 +#define AMDGPU_GEM_DOMAIN_OA 0x20 +#define AMDGPU_GEM_DOMAIN_DOORBELL 0x40 +#define AMDGPU_GEM_DOMAIN_MMIO_REMAP 0x80 +#define AMDGPU_GEM_DOMAIN_DGMA 0x400 +#define AMDGPU_GEM_DOMAIN_DGMA_IMPORT 0x800 + +#define AMDGPU_GEM_DOMAIN_MASK (AMDGPU_GEM_DOMAIN_CPU | \ + AMDGPU_GEM_DOMAIN_GTT | \ + AMDGPU_GEM_DOMAIN_VRAM | \ + AMDGPU_GEM_DOMAIN_GDS | \ + AMDGPU_GEM_DOMAIN_GWS | \ + AMDGPU_GEM_DOMAIN_OA |\ + AMDGPU_GEM_DOMAIN_DOORBELL |\ + AMDGPU_GEM_DOMAIN_MMIO_REMAP |\ + AMDGPU_GEM_DOMAIN_DGMA |\ + AMDGPU_GEM_DOMAIN_DGMA_IMPORT) + +/* Flag that CPU access will be required for the case of VRAM domain */ +#define AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED (1 << 0) +/* Flag that CPU access will not work, this VRAM domain is invisible */ +#define AMDGPU_GEM_CREATE_NO_CPU_ACCESS (1 << 1) +/* Flag that USWC attributes should be used for GTT */ +#define AMDGPU_GEM_CREATE_CPU_GTT_USWC (1 << 2) +/* Flag that the memory should be in VRAM and cleared */ +#define AMDGPU_GEM_CREATE_VRAM_CLEARED (1 << 3) +/* Flag that allocating the BO should use linear VRAM */ +#define AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS (1 << 5) +/* Flag that BO is always valid in this VM */ +#define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID (1 << 6) +/* Flag that BO sharing will be explicitly synchronized */ +#define AMDGPU_GEM_CREATE_EXPLICIT_SYNC (1 << 7) +/* Flag that indicates allocating MQD gart on GFX9, where the mtype + * for the second page onward should be set to NC. It should never + * be used by user space applications. + */ +#define AMDGPU_GEM_CREATE_CP_MQD_GFX9 (1 << 8) +/* Flag that BO may contain sensitive data that must be wiped before + * releasing the memory + */ +#define AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE (1 << 9) +/* Flag that BO will be encrypted and that the TMZ bit should be + * set in the PTEs when mapping this buffer via GPUVM or + * accessing it with various hw blocks + */ +#define AMDGPU_GEM_CREATE_ENCRYPTED (1 << 10) +/* Flag that BO will be used only in preemptible context, which does + * not require GTT memory accounting + */ +#define AMDGPU_GEM_CREATE_PREEMPTIBLE (1 << 11) +/* Flag that BO can be discarded under memory pressure without keeping the + * content. + */ +#define AMDGPU_GEM_CREATE_DISCARDABLE (1 << 12) +/* Flag that BO is shared coherently between multiple devices or CPU threads. + * May depend on GPU instructions to flush caches to system scope explicitly. + * + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. + */ +#define AMDGPU_GEM_CREATE_COHERENT (1 << 13) +/* Flag that BO should not be cached by GPU. Coherent without having to flush + * GPU caches explicitly + * + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. + */ +#define AMDGPU_GEM_CREATE_UNCACHED (1 << 14) +/* Flag that BO should be coherent across devices when using device-level + * atomics. May depend on GPU instructions to flush caches to device scope + * explicitly, promoting them to system scope automatically. + * + * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and + * may override the MTYPE selected in AMDGPU_VA_OP_MAP. + */ +#define AMDGPU_GEM_CREATE_EXT_COHERENT (1 << 15) +/* Set PTE.D and recompress during GTT->VRAM moves according to TILING flags. */ +#define AMDGPU_GEM_CREATE_GFX12_DCC (1 << 16) + +/* hybrid specific */ +/* Flag that the memory should be in SPARSE resource */ +#define AMDGPU_GEM_CREATE_SPARSE (1ULL << 29) +/* Flag that the memory allocation should be from top of domain */ +#define AMDGPU_GEM_CREATE_TOP_DOWN (1ULL << 30) +/* Flag that the memory allocation should be pinned */ +#define AMDGPU_GEM_CREATE_NO_EVICT (1ULL << 31) + +struct drm_amdgpu_gem_create_in { + /** the requested memory size */ + __u64 bo_size; + /** physical start_addr alignment in bytes for some HW requirements */ + __u64 alignment; + /** the requested memory domains */ + __u64 domains; + /** allocation flags */ + __u64 domain_flags; +}; + +struct drm_amdgpu_gem_create_out { + /** returned GEM object handle */ + __u32 handle; + __u32 _pad; +}; + +union drm_amdgpu_gem_create { + struct drm_amdgpu_gem_create_in in; + struct drm_amdgpu_gem_create_out out; +}; + +/** Opcode to create new residency list. */ +#define AMDGPU_BO_LIST_OP_CREATE 0 +/** Opcode to destroy previously created residency list */ +#define AMDGPU_BO_LIST_OP_DESTROY 1 +/** Opcode to update resource information in the list */ +#define AMDGPU_BO_LIST_OP_UPDATE 2 + +struct drm_amdgpu_bo_list_in { + /** Type of operation */ + __u32 operation; + /** Handle of list or 0 if we want to create one */ + __u32 list_handle; + /** Number of BOs in list */ + __u32 bo_number; + /** Size of each element describing BO */ + __u32 bo_info_size; + /** Pointer to array describing BOs */ + __u64 bo_info_ptr; +}; + +struct drm_amdgpu_bo_list_entry { + /** Handle of BO */ + __u32 bo_handle; + /** New (if specified) BO priority to be used during migration */ + __u32 bo_priority; +}; + +struct drm_amdgpu_bo_list_out { + /** Handle of resource list */ + __u32 list_handle; + __u32 _pad; +}; + +union drm_amdgpu_bo_list { + struct drm_amdgpu_bo_list_in in; + struct drm_amdgpu_bo_list_out out; +}; + +/* context related */ +#define AMDGPU_CTX_OP_ALLOC_CTX 1 +#define AMDGPU_CTX_OP_FREE_CTX 2 +#define AMDGPU_CTX_OP_QUERY_STATE 3 +#define AMDGPU_CTX_OP_QUERY_STATE2 4 +#define AMDGPU_CTX_OP_GET_STABLE_PSTATE 5 +#define AMDGPU_CTX_OP_SET_STABLE_PSTATE 6 + +/* GPU reset status */ +#define AMDGPU_CTX_NO_RESET 0 +/* this the context caused it */ +#define AMDGPU_CTX_GUILTY_RESET 1 +/* some other context caused it */ +#define AMDGPU_CTX_INNOCENT_RESET 2 +/* unknown cause */ +#define AMDGPU_CTX_UNKNOWN_RESET 3 + +/* indicate gpu reset occurred after ctx created */ +#define AMDGPU_CTX_QUERY2_FLAGS_RESET (1<<0) +/* indicate vram lost occurred after ctx created */ +#define AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST (1<<1) +/* indicate some job from this context once cause gpu hang */ +#define AMDGPU_CTX_QUERY2_FLAGS_GUILTY (1<<2) +/* indicate some errors are detected by RAS */ +#define AMDGPU_CTX_QUERY2_FLAGS_RAS_CE (1<<3) +#define AMDGPU_CTX_QUERY2_FLAGS_RAS_UE (1<<4) +/* indicate that the reset hasn't completed yet */ +#define AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS (1<<5) + +/* Context priority level */ +#define AMDGPU_CTX_PRIORITY_UNSET -2048 +#define AMDGPU_CTX_PRIORITY_VERY_LOW -1023 +#define AMDGPU_CTX_PRIORITY_LOW -512 +#define AMDGPU_CTX_PRIORITY_NORMAL 0 +/* + * When used in struct drm_amdgpu_ctx_in, a priority above NORMAL requires + * CAP_SYS_NICE or DRM_MASTER +*/ +#define AMDGPU_CTX_PRIORITY_HIGH 512 +#define AMDGPU_CTX_PRIORITY_VERY_HIGH 1023 + +/* select a stable profiling pstate for perfmon tools */ +#define AMDGPU_CTX_STABLE_PSTATE_FLAGS_MASK 0xf +#define AMDGPU_CTX_STABLE_PSTATE_NONE 0 +#define AMDGPU_CTX_STABLE_PSTATE_STANDARD 1 +#define AMDGPU_CTX_STABLE_PSTATE_MIN_SCLK 2 +#define AMDGPU_CTX_STABLE_PSTATE_MIN_MCLK 3 +#define AMDGPU_CTX_STABLE_PSTATE_PEAK 4 + +struct drm_amdgpu_ctx_in { + /** AMDGPU_CTX_OP_* */ + __u32 op; + /** Flags */ + __u32 flags; + __u32 ctx_id; + /** AMDGPU_CTX_PRIORITY_* */ + __s32 priority; +}; + +union drm_amdgpu_ctx_out { + struct { + __u32 ctx_id; + __u32 _pad; + } alloc; + + struct { + /** For future use, no flags defined so far */ + __u64 flags; + /** Number of resets caused by this context so far. */ + __u32 hangs; + /** Reset status since the last call of the ioctl. */ + __u32 reset_status; + } state; + + struct { + __u32 flags; + __u32 _pad; + } pstate; +}; + +union drm_amdgpu_ctx { + struct drm_amdgpu_ctx_in in; + union drm_amdgpu_ctx_out out; +}; + +/* user queue IOCTL operations */ +#define AMDGPU_USERQ_OP_CREATE 1 +#define AMDGPU_USERQ_OP_FREE 2 + +/* queue priority levels */ +/* low < normal low < normal high < high */ +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK 0x3 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT 0 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_NORMAL_LOW 0 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_LOW 1 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_NORMAL_HIGH 2 +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_HIGH 3 /* admin only */ +/* for queues that need access to protected content */ +#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE (1 << 2) + +/* + * This structure is a container to pass input configuration + * info for all supported userqueue related operations. + * For operation AMDGPU_USERQ_OP_CREATE: user is expected + * to set all fields, excep the parameter 'queue_id'. + * For operation AMDGPU_USERQ_OP_FREE: the only input parameter expected + * to be set is 'queue_id', eveything else is ignored. + */ +struct drm_amdgpu_userq_in { + /** AMDGPU_USERQ_OP_* */ + __u32 op; + /** Queue id passed for operation USERQ_OP_FREE */ + __u32 queue_id; + /** the target GPU engine to execute workload (AMDGPU_HW_IP_*) */ + __u32 ip_type; + /** + * @doorbell_handle: the handle of doorbell GEM object + * associated with this userqueue client. + */ + __u32 doorbell_handle; + /** + * @doorbell_offset: 32-bit offset of the doorbell in the doorbell bo. + * Kernel will generate absolute doorbell offset using doorbell_handle + * and doorbell_offset in the doorbell bo. + */ + __u32 doorbell_offset; + /** + * @flags: flags used for queue parameters + */ + __u32 flags; + /** + * @queue_va: Virtual address of the GPU memory which holds the queue + * object. The queue holds the workload packets. + */ + __u64 queue_va; + /** + * @queue_size: Size of the queue in bytes, this needs to be 256-byte + * aligned. + */ + __u64 queue_size; + /** + * @rptr_va : Virtual address of the GPU memory which holds the ring RPTR. + * This object must be at least 8 byte in size and aligned to 8-byte offset. + */ + __u64 rptr_va; + /** + * @wptr_va : Virtual address of the GPU memory which holds the ring WPTR. + * This object must be at least 8 byte in size and aligned to 8-byte offset. + * + * Queue, RPTR and WPTR can come from the same object, as long as the size + * and alignment related requirements are met. + */ + __u64 wptr_va; + /** + * @mqd: MQD (memory queue descriptor) is a set of parameters which allow + * the GPU to uniquely define and identify a usermode queue. + * + * MQD data can be of different size for different GPU IP/engine and + * their respective versions/revisions, so this points to a __u64 * + * which holds IP specific MQD of this usermode queue. + */ + __u64 mqd; + /** + * @size: size of MQD data in bytes, it must match the MQD structure + * size of the respective engine/revision defined in UAPI for ex, for + * gfx11 workloads, size = sizeof(drm_amdgpu_userq_mqd_gfx11). + */ + __u64 mqd_size; +}; + +/* The structure to carry output of userqueue ops */ +struct drm_amdgpu_userq_out { + /** + * For operation AMDGPU_USERQ_OP_CREATE: This field contains a unique + * queue ID to represent the newly created userqueue in the system, otherwise + * it should be ignored. + */ + __u32 queue_id; + __u32 _pad; +}; + +union drm_amdgpu_userq { + struct drm_amdgpu_userq_in in; + struct drm_amdgpu_userq_out out; +}; + +/* GFX V11 IP specific MQD parameters */ +struct drm_amdgpu_userq_mqd_gfx11 { + /** + * @shadow_va: Virtual address of the GPU memory to hold the shadow buffer. + * Use AMDGPU_INFO_IOCTL to find the exact size of the object. + */ + __u64 shadow_va; + /** + * @csa_va: Virtual address of the GPU memory to hold the CSA buffer. + * Use AMDGPU_INFO_IOCTL to find the exact size of the object. + */ + __u64 csa_va; +}; + +/* GFX V11 SDMA IP specific MQD parameters */ +struct drm_amdgpu_userq_mqd_sdma_gfx11 { + /** + * @csa_va: Virtual address of the GPU memory to hold the CSA buffer. + * This must be a from a separate GPU object, and use AMDGPU_INFO IOCTL + * to get the size. + */ + __u64 csa_va; +}; + +/* GFX V11 Compute IP specific MQD parameters */ +struct drm_amdgpu_userq_mqd_compute_gfx11 { + /** + * @eop_va: Virtual address of the GPU memory to hold the EOP buffer. + * This must be a from a separate GPU object, and use AMDGPU_INFO IOCTL + * to get the size. + */ + __u64 eop_va; +}; + +/* userq signal/wait ioctl */ +struct drm_amdgpu_userq_signal { + /** + * @queue_id: Queue handle used by the userq fence creation function + * to retrieve the WPTR. + */ + __u32 queue_id; + __u32 pad; + /** + * @syncobj_handles: The list of syncobj handles submitted by the user queue + * job to be signaled. + */ + __u64 syncobj_handles; + /** + * @num_syncobj_handles: A count that represents the number of syncobj handles in + * @syncobj_handles. + */ + __u64 num_syncobj_handles; + /** + * @bo_read_handles: The list of BO handles that the submitted user queue job + * is using for read only. This will update BO fences in the kernel. + */ + __u64 bo_read_handles; + /** + * @bo_write_handles: The list of BO handles that the submitted user queue job + * is using for write only. This will update BO fences in the kernel. + */ + __u64 bo_write_handles; + /** + * @num_bo_read_handles: A count that represents the number of read BO handles in + * @bo_read_handles. + */ + __u32 num_bo_read_handles; + /** + * @num_bo_write_handles: A count that represents the number of write BO handles in + * @bo_write_handles. + */ + __u32 num_bo_write_handles; +}; + +struct drm_amdgpu_userq_fence_info { + /** + * @va: A gpu address allocated for each queue which stores the + * read pointer (RPTR) value. + */ + __u64 va; + /** + * @value: A 64 bit value represents the write pointer (WPTR) of the + * queue commands which compared with the RPTR value to signal the + * fences. + */ + __u64 value; +}; + +struct drm_amdgpu_userq_wait { + /** + * @waitq_id: Queue handle used by the userq wait IOCTL to retrieve the + * wait queue and maintain the fence driver references in it. + */ + __u32 waitq_id; + __u32 pad; + /** + * @syncobj_handles: The list of syncobj handles submitted by the user queue + * job to get the va/value pairs. + */ + __u64 syncobj_handles; + /** + * @syncobj_timeline_handles: The list of timeline syncobj handles submitted by + * the user queue job to get the va/value pairs at given @syncobj_timeline_points. + */ + __u64 syncobj_timeline_handles; + /** + * @syncobj_timeline_points: The list of timeline syncobj points submitted by the + * user queue job for the corresponding @syncobj_timeline_handles. + */ + __u64 syncobj_timeline_points; + /** + * @bo_read_handles: The list of read BO handles submitted by the user queue + * job to get the va/value pairs. + */ + __u64 bo_read_handles; + /** + * @bo_write_handles: The list of write BO handles submitted by the user queue + * job to get the va/value pairs. + */ + __u64 bo_write_handles; + /** + * @num_syncobj_timeline_handles: A count that represents the number of timeline + * syncobj handles in @syncobj_timeline_handles. + */ + __u16 num_syncobj_timeline_handles; + /** + * @num_fences: This field can be used both as input and output. As input it defines + * the maximum number of fences that can be returned and as output it will specify + * how many fences were actually returned from the ioctl. + */ + __u16 num_fences; + /** + * @num_syncobj_handles: A count that represents the number of syncobj handles in + * @syncobj_handles. + */ + __u32 num_syncobj_handles; + /** + * @num_bo_read_handles: A count that represents the number of read BO handles in + * @bo_read_handles. + */ + __u32 num_bo_read_handles; + /** + * @num_bo_write_handles: A count that represents the number of write BO handles in + * @bo_write_handles. + */ + __u32 num_bo_write_handles; + /** + * @out_fences: The field is a return value from the ioctl containing the list of + * address/value pairs to wait for. + */ + __u64 out_fences; +}; + +/* sem related */ +#define AMDGPU_SEM_OP_CREATE_SEM 1 +#define AMDGPU_SEM_OP_WAIT_SEM 2 +#define AMDGPU_SEM_OP_SIGNAL_SEM 3 +#define AMDGPU_SEM_OP_DESTROY_SEM 4 +#define AMDGPU_SEM_OP_IMPORT_SEM 5 +#define AMDGPU_SEM_OP_EXPORT_SEM 6 + +struct drm_amdgpu_sem_in { + /** AMDGPU_SEM_OP_* */ + uint32_t op; + uint32_t handle; + uint32_t ctx_id; + uint32_t ip_type; + uint32_t ip_instance; + uint32_t ring; + uint64_t seq; +}; + +union drm_amdgpu_sem_out { + int32_t fd; + uint32_t handle; +}; + +union drm_amdgpu_sem { + struct drm_amdgpu_sem_in in; + union drm_amdgpu_sem_out out; +}; + +/* vm ioctl */ +#define AMDGPU_VM_OP_RESERVE_VMID 1 +#define AMDGPU_VM_OP_UNRESERVE_VMID 2 + +struct drm_amdgpu_vm_in { + /** AMDGPU_VM_OP_* */ + __u32 op; + __u32 flags; +}; + +struct drm_amdgpu_vm_out { + /** For future use, no flags defined so far */ + __u64 flags; +}; + +union drm_amdgpu_vm { + struct drm_amdgpu_vm_in in; + struct drm_amdgpu_vm_out out; +}; + +/* sched ioctl */ +#define AMDGPU_SCHED_OP_PROCESS_PRIORITY_OVERRIDE 1 +#define AMDGPU_SCHED_OP_CONTEXT_PRIORITY_OVERRIDE 2 + +struct drm_amdgpu_sched_in { + /* AMDGPU_SCHED_OP_* */ + __u32 op; + __u32 fd; + /** AMDGPU_CTX_PRIORITY_* */ + __s32 priority; + __u32 ctx_id; +}; + +union drm_amdgpu_sched { + struct drm_amdgpu_sched_in in; +}; + +/* + * This is not a reliable API and you should expect it to fail for any + * number of reasons and have fallback path that do not use userptr to + * perform any operation. + */ +#define AMDGPU_GEM_USERPTR_READONLY (1 << 0) +#define AMDGPU_GEM_USERPTR_ANONONLY (1 << 1) +#define AMDGPU_GEM_USERPTR_VALIDATE (1 << 2) +#define AMDGPU_GEM_USERPTR_REGISTER (1 << 3) + +struct drm_amdgpu_gem_userptr { + __u64 addr; + __u64 size; + /* AMDGPU_GEM_USERPTR_* */ + __u32 flags; + /* Resulting GEM handle */ + __u32 handle; +}; + +#define AMDGPU_GEM_DGMA_IMPORT 0 +#define AMDGPU_GEM_DGMA_QUERY_PHYS_ADDR 1 +struct drm_amdgpu_gem_dgma { + __u64 addr; + __u64 size; + __u32 op; + __u32 handle; +}; + +/* SI-CI-VI: */ +/* same meaning as the GB_TILE_MODE and GL_MACRO_TILE_MODE fields */ +#define AMDGPU_TILING_ARRAY_MODE_SHIFT 0 +#define AMDGPU_TILING_ARRAY_MODE_MASK 0xf +#define AMDGPU_TILING_PIPE_CONFIG_SHIFT 4 +#define AMDGPU_TILING_PIPE_CONFIG_MASK 0x1f +#define AMDGPU_TILING_TILE_SPLIT_SHIFT 9 +#define AMDGPU_TILING_TILE_SPLIT_MASK 0x7 +#define AMDGPU_TILING_MICRO_TILE_MODE_SHIFT 12 +#define AMDGPU_TILING_MICRO_TILE_MODE_MASK 0x7 +#define AMDGPU_TILING_BANK_WIDTH_SHIFT 15 +#define AMDGPU_TILING_BANK_WIDTH_MASK 0x3 +#define AMDGPU_TILING_BANK_HEIGHT_SHIFT 17 +#define AMDGPU_TILING_BANK_HEIGHT_MASK 0x3 +#define AMDGPU_TILING_MACRO_TILE_ASPECT_SHIFT 19 +#define AMDGPU_TILING_MACRO_TILE_ASPECT_MASK 0x3 +#define AMDGPU_TILING_NUM_BANKS_SHIFT 21 +#define AMDGPU_TILING_NUM_BANKS_MASK 0x3 + +/* GFX9 - GFX11: */ +#define AMDGPU_TILING_SWIZZLE_MODE_SHIFT 0 +#define AMDGPU_TILING_SWIZZLE_MODE_MASK 0x1f +#define AMDGPU_TILING_DCC_OFFSET_256B_SHIFT 5 +#define AMDGPU_TILING_DCC_OFFSET_256B_MASK 0xFFFFFF +#define AMDGPU_TILING_DCC_PITCH_MAX_SHIFT 29 +#define AMDGPU_TILING_DCC_PITCH_MAX_MASK 0x3FFF +#define AMDGPU_TILING_DCC_INDEPENDENT_64B_SHIFT 43 +#define AMDGPU_TILING_DCC_INDEPENDENT_64B_MASK 0x1 +#define AMDGPU_TILING_DCC_INDEPENDENT_128B_SHIFT 44 +#define AMDGPU_TILING_DCC_INDEPENDENT_128B_MASK 0x1 +#define AMDGPU_TILING_SCANOUT_SHIFT 63 +#define AMDGPU_TILING_SCANOUT_MASK 0x1 + +/* GFX12 and later: */ +#define AMDGPU_TILING_GFX12_SWIZZLE_MODE_SHIFT 0 +#define AMDGPU_TILING_GFX12_SWIZZLE_MODE_MASK 0x7 +/* These are DCC recompression settings for memory management: */ +#define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_SHIFT 3 +#define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_MASK 0x3 /* 0:64B, 1:128B, 2:256B */ +#define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_SHIFT 5 +#define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_MASK 0x7 /* CB_COLOR0_INFO.NUMBER_TYPE */ +#define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_SHIFT 8 +#define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_MASK 0x3f /* [0:4]:CB_COLOR0_INFO.FORMAT, [5]:MM */ +/* When clearing the buffer or moving it from VRAM to GTT, don't compress and set DCC metadata + * to uncompressed. Set when parts of an allocation bypass DCC and read raw data. */ +#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_SHIFT 14 +#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_MASK 0x1 +/* bit gap */ +#define AMDGPU_TILING_GFX12_SCANOUT_SHIFT 63 +#define AMDGPU_TILING_GFX12_SCANOUT_MASK 0x1 + +/* Set/Get helpers for tiling flags. */ +#define AMDGPU_TILING_SET(field, value) \ + (((__u64)(value) & AMDGPU_TILING_##field##_MASK) << AMDGPU_TILING_##field##_SHIFT) +#define AMDGPU_TILING_GET(value, field) \ + (((__u64)(value) >> AMDGPU_TILING_##field##_SHIFT) & AMDGPU_TILING_##field##_MASK) + +#define AMDGPU_GEM_METADATA_OP_SET_METADATA 1 +#define AMDGPU_GEM_METADATA_OP_GET_METADATA 2 + +/** The same structure is shared for input/output */ +struct drm_amdgpu_gem_metadata { + /** GEM Object handle */ + __u32 handle; + /** Do we want get or set metadata */ + __u32 op; + struct { + /** For future use, no flags defined so far */ + __u64 flags; + /** family specific tiling info */ + __u64 tiling_info; + __u32 data_size_bytes; + __u32 data[64]; + } data; +}; + +struct drm_amdgpu_gem_mmap_in { + /** the GEM object handle */ + __u32 handle; + __u32 _pad; +}; + +struct drm_amdgpu_gem_mmap_out { + /** mmap offset from the vma offset manager */ + __u64 addr_ptr; +}; + +union drm_amdgpu_gem_mmap { + struct drm_amdgpu_gem_mmap_in in; + struct drm_amdgpu_gem_mmap_out out; +}; + +struct drm_amdgpu_gem_wait_idle_in { + /** GEM object handle */ + __u32 handle; + /** For future use, no flags defined so far */ + __u32 flags; + /** Absolute timeout to wait */ + __u64 timeout; +}; + +struct drm_amdgpu_gem_wait_idle_out { + /** BO status: 0 - BO is idle, 1 - BO is busy */ + __u32 status; + /** Returned current memory domain */ + __u32 domain; +}; + +union drm_amdgpu_gem_wait_idle { + struct drm_amdgpu_gem_wait_idle_in in; + struct drm_amdgpu_gem_wait_idle_out out; +}; + +struct drm_amdgpu_wait_cs_in { + /* Command submission handle + * handle equals 0 means none to wait for + * handle equals ~0ull means wait for the latest sequence number + */ + __u64 handle; + /** Absolute timeout to wait */ + __u64 timeout; + __u32 ip_type; + __u32 ip_instance; + __u32 ring; + __u32 ctx_id; +}; + +struct drm_amdgpu_wait_cs_out { + /** CS status: 0 - CS completed, 1 - CS still busy */ + __u64 status; +}; + +union drm_amdgpu_wait_cs { + struct drm_amdgpu_wait_cs_in in; + struct drm_amdgpu_wait_cs_out out; +}; + +struct drm_amdgpu_fence { + __u32 ctx_id; + __u32 ip_type; + __u32 ip_instance; + __u32 ring; + __u64 seq_no; +}; + +struct drm_amdgpu_wait_fences_in { + /** This points to uint64_t * which points to fences */ + __u64 fences; + __u32 fence_count; + __u32 wait_all; + __u64 timeout_ns; +}; + +struct drm_amdgpu_wait_fences_out { + __u32 status; + __u32 first_signaled; +}; + +union drm_amdgpu_wait_fences { + struct drm_amdgpu_wait_fences_in in; + struct drm_amdgpu_wait_fences_out out; +}; + +#define AMDGPU_GEM_OP_GET_GEM_CREATE_INFO 0 +#define AMDGPU_GEM_OP_SET_PLACEMENT 1 +#define AMDGPU_GEM_OP_GET_MAPPING_INFO 2 + +struct drm_amdgpu_gem_vm_entry { + /* Start of mapping (in bytes) */ + __u64 addr; + + /* Size of mapping (in bytes) */ + __u64 size; + + /* Mapping offset */ + __u64 offset; + + /* flags needed to recreate mapping */ + __u64 flags; +}; + +/* Sets or returns a value associated with a buffer. */ +struct drm_amdgpu_gem_op { + /** GEM object handle */ + __u32 handle; + /** AMDGPU_GEM_OP_* */ + __u32 op; + /** Input or return value. For MAPPING_INFO op: pointer to array of struct drm_amdgpu_gem_vm_entry */ + __u64 value; + /** For MAPPING_INFO op: number of mappings (in/out) */ + __u32 num_entries; + + __u32 padding; +}; + +#define AMDGPU_GEM_LIST_HANDLES_FLAG_IS_IMPORT (1 << 0) + +struct drm_amdgpu_gem_list_handles { + /* User pointer to array of drm_amdgpu_gem_bo_info_entry */ + __u64 entries; + + /* Size of entries buffer / Number of handles in process (if larger than size of buffer, must retry) */ + __u32 num_entries; + + __u32 padding; +}; + +struct drm_amdgpu_gem_list_handles_entry { + /* gem handle of buffer object */ + __u32 gem_handle; + + /* Currently just one flag: IS_IMPORT */ + __u32 flags; + + /* Size of bo */ + __u64 size; + + /* Preferred domains for GEM_CREATE */ + __u64 preferred_domains; + + /* GEM_CREATE flags for re-creation of buffer */ + __u64 alloc_flags; + + /* physical start_addr alignment in bytes for some HW requirements */ + __u64 alignment; +}; + +#define AMDGPU_VA_OP_MAP 1 +#define AMDGPU_VA_OP_UNMAP 2 +#define AMDGPU_VA_OP_CLEAR 3 +#define AMDGPU_VA_OP_REPLACE 4 + +/* Delay the page table update till the next CS */ +#define AMDGPU_VM_DELAY_UPDATE (1 << 0) + +/* Mapping flags */ +/* readable mapping */ +#define AMDGPU_VM_PAGE_READABLE (1 << 1) +/* writable mapping */ +#define AMDGPU_VM_PAGE_WRITEABLE (1 << 2) +/* executable mapping, new for VI */ +#define AMDGPU_VM_PAGE_EXECUTABLE (1 << 3) +/* partially resident texture */ +#define AMDGPU_VM_PAGE_PRT (1 << 4) +/* MTYPE flags use bit 5 to 8 */ +#define AMDGPU_VM_MTYPE_MASK (0xf << 5) +/* Default MTYPE. Pre-AI must use this. Recommended for newer ASICs. */ +#define AMDGPU_VM_MTYPE_DEFAULT (0 << 5) +/* Use Non Coherent MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_NC (1 << 5) +/* Use Write Combine MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_WC (2 << 5) +/* Use Cache Coherent MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_CC (3 << 5) +/* Use UnCached MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_UC (4 << 5) +/* Use Read Write MTYPE instead of default MTYPE */ +#define AMDGPU_VM_MTYPE_RW (5 << 5) +/* don't allocate MALL */ +#define AMDGPU_VM_PAGE_NOALLOC (1 << 9) + +struct drm_amdgpu_gem_va { + /** GEM object handle */ + __u32 handle; + __u32 _pad; + /** AMDGPU_VA_OP_* */ + __u32 operation; + /** AMDGPU_VM_PAGE_* */ + __u32 flags; + /** va address to assign . Must be correctly aligned.*/ + __u64 va_address; + /** Specify offset inside of BO to assign. Must be correctly aligned.*/ + __u64 offset_in_bo; + /** Specify mapping size. Must be correctly aligned. */ + __u64 map_size; + /** + * vm_timeline_point is a sequence number used to add new timeline point. + */ + __u64 vm_timeline_point; + /** + * The vm page table update fence is installed in given vm_timeline_syncobj_out + * at vm_timeline_point. + */ + __u32 vm_timeline_syncobj_out; + /** the number of syncobj handles in @input_fence_syncobj_handles */ + __u32 num_syncobj_handles; + /** Array of sync object handle to wait for given input fences */ + __u64 input_fence_syncobj_handles; +}; + +#define AMDGPU_HW_IP_GFX 0 +#define AMDGPU_HW_IP_COMPUTE 1 +#define AMDGPU_HW_IP_DMA 2 +#define AMDGPU_HW_IP_UVD 3 +#define AMDGPU_HW_IP_VCE 4 +#define AMDGPU_HW_IP_UVD_ENC 5 +#define AMDGPU_HW_IP_VCN_DEC 6 +/* + * From VCN4, AMDGPU_HW_IP_VCN_ENC is re-used to support + * both encoding and decoding jobs. + */ +#define AMDGPU_HW_IP_VCN_ENC 7 +#define AMDGPU_HW_IP_VCN_JPEG 8 +#define AMDGPU_HW_IP_VPE 9 +#define AMDGPU_HW_IP_NUM 10 + +#define AMDGPU_HW_IP_INSTANCE_MAX_COUNT 1 + +#define AMDGPU_CHUNK_ID_IB 0x01 +#define AMDGPU_CHUNK_ID_FENCE 0x02 +#define AMDGPU_CHUNK_ID_DEPENDENCIES 0x03 +#define AMDGPU_CHUNK_ID_SYNCOBJ_IN 0x04 +#define AMDGPU_CHUNK_ID_SYNCOBJ_OUT 0x05 +#define AMDGPU_CHUNK_ID_BO_HANDLES 0x06 +#define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES 0x07 +#define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT 0x08 +#define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL 0x09 +#define AMDGPU_CHUNK_ID_CP_GFX_SHADOW 0x0a + +struct drm_amdgpu_cs_chunk { + __u32 chunk_id; + __u32 length_dw; + __u64 chunk_data; +}; + +struct drm_amdgpu_cs_in { + /** Rendering context id */ + __u32 ctx_id; + /** Handle of resource list associated with CS */ + __u32 bo_list_handle; + __u32 num_chunks; + __u32 flags; + /** this points to __u64 * which point to cs chunks */ + __u64 chunks; +}; + +struct drm_amdgpu_cs_out { + __u64 handle; +}; + +union drm_amdgpu_cs { + struct drm_amdgpu_cs_in in; + struct drm_amdgpu_cs_out out; +}; + +/* Specify flags to be used for IB */ + +/* This IB should be submitted to CE */ +#define AMDGPU_IB_FLAG_CE (1<<0) + +/* Preamble flag, which means the IB could be dropped if no context switch */ +#define AMDGPU_IB_FLAG_PREAMBLE (1<<1) + +/* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */ +#define AMDGPU_IB_FLAG_PREEMPT (1<<2) + +/* The IB fence should do the L2 writeback but not invalidate any shader + * caches (L2/vL1/sL1/I$). */ +#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3) + +/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER. + * This will reset wave ID counters for the IB. + */ +#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4) + +/* Flag the IB as secure (TMZ) + */ +#define AMDGPU_IB_FLAGS_SECURE (1 << 5) + +/* Tell KMD to flush and invalidate caches + */ +#define AMDGPU_IB_FLAG_EMIT_MEM_SYNC (1 << 6) + +struct drm_amdgpu_cs_chunk_ib { + __u32 _pad; + /** AMDGPU_IB_FLAG_* */ + __u32 flags; + /** Virtual address to begin IB execution */ + __u64 va_start; + /** Size of submission */ + __u32 ib_bytes; + /** HW IP to submit to */ + __u32 ip_type; + /** HW IP index of the same type to submit to */ + __u32 ip_instance; + /** Ring index to submit to */ + __u32 ring; +}; + +struct drm_amdgpu_cs_chunk_dep { + __u32 ip_type; + __u32 ip_instance; + __u32 ring; + __u32 ctx_id; + __u64 handle; +}; + +struct drm_amdgpu_cs_chunk_fence { + __u32 handle; + __u32 offset; +}; + +struct drm_amdgpu_cs_chunk_sem { + __u32 handle; +}; + +struct drm_amdgpu_cs_chunk_syncobj { + __u32 handle; + __u32 flags; + __u64 point; +}; + +#define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ 0 +#define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD 1 +#define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD 2 + +union drm_amdgpu_fence_to_handle { + struct { + struct drm_amdgpu_fence fence; + __u32 what; + __u32 pad; + } in; + struct { + __u32 handle; + } out; +}; + +struct drm_amdgpu_cs_chunk_data { + union { + struct drm_amdgpu_cs_chunk_ib ib_data; + struct drm_amdgpu_cs_chunk_fence fence_data; + }; +}; + +#define AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW 0x1 + +struct drm_amdgpu_cs_chunk_cp_gfx_shadow { + __u64 shadow_va; + __u64 csa_va; + __u64 gds_va; + __u64 flags; +}; + +/* + * Query h/w info: Flag that this is integrated (a.h.a. fusion) GPU + * + */ +#define AMDGPU_IDS_FLAGS_FUSION 0x01 +#define AMDGPU_IDS_FLAGS_PREEMPTION 0x02 +#define AMDGPU_IDS_FLAGS_TMZ 0x04 +#define AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD 0x08 +#define AMDGPU_IDS_FLAGS_GANG_SUBMIT 0x10 + +/* + * Query h/w info: Flag identifying VF/PF/PT mode + * + */ +#define AMDGPU_IDS_FLAGS_MODE_MASK 0x300 +#define AMDGPU_IDS_FLAGS_MODE_SHIFT 0x8 +#define AMDGPU_IDS_FLAGS_MODE_PF 0x0 +#define AMDGPU_IDS_FLAGS_MODE_VF 0x1 +#define AMDGPU_IDS_FLAGS_MODE_PT 0x2 + +/* indicate if acceleration can be working */ +#define AMDGPU_INFO_ACCEL_WORKING 0x00 +/* get the crtc_id from the mode object id? */ +#define AMDGPU_INFO_CRTC_FROM_ID 0x01 +/* query hw IP info */ +#define AMDGPU_INFO_HW_IP_INFO 0x02 +/* query hw IP instance count for the specified type */ +#define AMDGPU_INFO_HW_IP_COUNT 0x03 +/* timestamp for GL_ARB_timer_query */ +#define AMDGPU_INFO_TIMESTAMP 0x05 +/* Query the firmware version */ +#define AMDGPU_INFO_FW_VERSION 0x0e + /* Subquery id: Query VCE firmware version */ + #define AMDGPU_INFO_FW_VCE 0x1 + /* Subquery id: Query UVD firmware version */ + #define AMDGPU_INFO_FW_UVD 0x2 + /* Subquery id: Query GMC firmware version */ + #define AMDGPU_INFO_FW_GMC 0x03 + /* Subquery id: Query GFX ME firmware version */ + #define AMDGPU_INFO_FW_GFX_ME 0x04 + /* Subquery id: Query GFX PFP firmware version */ + #define AMDGPU_INFO_FW_GFX_PFP 0x05 + /* Subquery id: Query GFX CE firmware version */ + #define AMDGPU_INFO_FW_GFX_CE 0x06 + /* Subquery id: Query GFX RLC firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC 0x07 + /* Subquery id: Query GFX MEC firmware version */ + #define AMDGPU_INFO_FW_GFX_MEC 0x08 + /* Subquery id: Query SMC firmware version */ + #define AMDGPU_INFO_FW_SMC 0x0a + /* Subquery id: Query SDMA firmware version */ + #define AMDGPU_INFO_FW_SDMA 0x0b + /* Subquery id: Query PSP SOS firmware version */ + #define AMDGPU_INFO_FW_SOS 0x0c + /* Subquery id: Query PSP ASD firmware version */ + #define AMDGPU_INFO_FW_ASD 0x0d + /* Subquery id: Query VCN firmware version */ + #define AMDGPU_INFO_FW_VCN 0x0e + /* Subquery id: Query GFX RLC SRLC firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_CNTL 0x0f + /* Subquery id: Query GFX RLC SRLG firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_GPM_MEM 0x10 + /* Subquery id: Query GFX RLC SRLS firmware version */ + #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_SRM_MEM 0x11 + /* Subquery id: Query DMCU firmware version */ + #define AMDGPU_INFO_FW_DMCU 0x12 + #define AMDGPU_INFO_FW_TA 0x13 + /* Subquery id: Query DMCUB firmware version */ + #define AMDGPU_INFO_FW_DMCUB 0x14 + /* Subquery id: Query TOC firmware version */ + #define AMDGPU_INFO_FW_TOC 0x15 + /* Subquery id: Query CAP firmware version */ + #define AMDGPU_INFO_FW_CAP 0x16 + /* Subquery id: Query GFX RLCP firmware version */ + #define AMDGPU_INFO_FW_GFX_RLCP 0x17 + /* Subquery id: Query GFX RLCV firmware version */ + #define AMDGPU_INFO_FW_GFX_RLCV 0x18 + /* Subquery id: Query MES_KIQ firmware version */ + #define AMDGPU_INFO_FW_MES_KIQ 0x19 + /* Subquery id: Query MES firmware version */ + #define AMDGPU_INFO_FW_MES 0x1a + /* Subquery id: Query IMU firmware version */ + #define AMDGPU_INFO_FW_IMU 0x1b + /* Subquery id: Query VPE firmware version */ + #define AMDGPU_INFO_FW_VPE 0x1c + +/* number of bytes moved for TTM migration */ +#define AMDGPU_INFO_NUM_BYTES_MOVED 0x0f +/* the used VRAM size */ +#define AMDGPU_INFO_VRAM_USAGE 0x10 +/* the used GTT size */ +#define AMDGPU_INFO_GTT_USAGE 0x11 +/* Information about GDS, etc. resource configuration */ +#define AMDGPU_INFO_GDS_CONFIG 0x13 +/* Query information about VRAM and GTT domains */ +#define AMDGPU_INFO_VRAM_GTT 0x14 +/* Query information about register in MMR address space*/ +#define AMDGPU_INFO_READ_MMR_REG 0x15 +/* Query information about device: rev id, family, etc. */ +#define AMDGPU_INFO_DEV_INFO 0x16 +/* visible vram usage */ +#define AMDGPU_INFO_VIS_VRAM_USAGE 0x17 +/* number of TTM buffer evictions */ +#define AMDGPU_INFO_NUM_EVICTIONS 0x18 +/* Query memory about VRAM and GTT domains */ +#define AMDGPU_INFO_MEMORY 0x19 +/* Query vce clock table */ +#define AMDGPU_INFO_VCE_CLOCK_TABLE 0x1A +/* Query vbios related information */ +#define AMDGPU_INFO_VBIOS 0x1B + /* Subquery id: Query vbios size */ + #define AMDGPU_INFO_VBIOS_SIZE 0x1 + /* Subquery id: Query vbios image */ + #define AMDGPU_INFO_VBIOS_IMAGE 0x2 + /* Subquery id: Query vbios info */ + #define AMDGPU_INFO_VBIOS_INFO 0x3 +/* Query UVD handles */ +#define AMDGPU_INFO_NUM_HANDLES 0x1C +/* Query sensor related information */ +#define AMDGPU_INFO_SENSOR 0x1D + /* Subquery id: Query GPU shader clock */ + #define AMDGPU_INFO_SENSOR_GFX_SCLK 0x1 + /* Subquery id: Query GPU memory clock */ + #define AMDGPU_INFO_SENSOR_GFX_MCLK 0x2 + /* Subquery id: Query GPU temperature */ + #define AMDGPU_INFO_SENSOR_GPU_TEMP 0x3 + /* Subquery id: Query GPU load */ + #define AMDGPU_INFO_SENSOR_GPU_LOAD 0x4 + /* Subquery id: Query average GPU power */ + #define AMDGPU_INFO_SENSOR_GPU_AVG_POWER 0x5 + /* Subquery id: Query northbridge voltage */ + #define AMDGPU_INFO_SENSOR_VDDNB 0x6 + /* Subquery id: Query graphics voltage */ + #define AMDGPU_INFO_SENSOR_VDDGFX 0x7 + /* Subquery id: Query GPU stable pstate shader clock */ + #define AMDGPU_INFO_SENSOR_STABLE_PSTATE_GFX_SCLK 0x8 + /* Subquery id: Query GPU stable pstate memory clock */ + #define AMDGPU_INFO_SENSOR_STABLE_PSTATE_GFX_MCLK 0x9 + /* Subquery id: Query GPU peak pstate shader clock */ + #define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_SCLK 0xa + /* Subquery id: Query GPU peak pstate memory clock */ + #define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_MCLK 0xb + /* Subquery id: Query input GPU power */ + #define AMDGPU_INFO_SENSOR_GPU_INPUT_POWER 0xc +/* Number of VRAM page faults on CPU access. */ +#define AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS 0x1E +#define AMDGPU_INFO_VRAM_LOST_COUNTER 0x1F +/* query ras mask of enabled features*/ +#define AMDGPU_INFO_RAS_ENABLED_FEATURES 0x20 +/* RAS MASK: UMC (VRAM) */ +#define AMDGPU_INFO_RAS_ENABLED_UMC (1 << 0) +/* RAS MASK: SDMA */ +#define AMDGPU_INFO_RAS_ENABLED_SDMA (1 << 1) +/* RAS MASK: GFX */ +#define AMDGPU_INFO_RAS_ENABLED_GFX (1 << 2) +/* RAS MASK: MMHUB */ +#define AMDGPU_INFO_RAS_ENABLED_MMHUB (1 << 3) +/* RAS MASK: ATHUB */ +#define AMDGPU_INFO_RAS_ENABLED_ATHUB (1 << 4) +/* RAS MASK: PCIE */ +#define AMDGPU_INFO_RAS_ENABLED_PCIE (1 << 5) +/* RAS MASK: HDP */ +#define AMDGPU_INFO_RAS_ENABLED_HDP (1 << 6) +/* RAS MASK: XGMI */ +#define AMDGPU_INFO_RAS_ENABLED_XGMI (1 << 7) +/* RAS MASK: DF */ +#define AMDGPU_INFO_RAS_ENABLED_DF (1 << 8) +/* RAS MASK: SMN */ +#define AMDGPU_INFO_RAS_ENABLED_SMN (1 << 9) +/* RAS MASK: SEM */ +#define AMDGPU_INFO_RAS_ENABLED_SEM (1 << 10) +/* RAS MASK: MP0 */ +#define AMDGPU_INFO_RAS_ENABLED_MP0 (1 << 11) +/* RAS MASK: MP1 */ +#define AMDGPU_INFO_RAS_ENABLED_MP1 (1 << 12) +/* RAS MASK: FUSE */ +#define AMDGPU_INFO_RAS_ENABLED_FUSE (1 << 13) +/* query video encode/decode caps */ +#define AMDGPU_INFO_VIDEO_CAPS 0x21 + /* Subquery id: Decode */ + #define AMDGPU_INFO_VIDEO_CAPS_DECODE 0 + /* Subquery id: Encode */ + #define AMDGPU_INFO_VIDEO_CAPS_ENCODE 1 +/* Query the max number of IBs per gang per submission */ +#define AMDGPU_INFO_MAX_IBS 0x22 +/* query last page fault info */ +#define AMDGPU_INFO_GPUVM_FAULT 0x23 +/* query FW object size and alignment */ +#define AMDGPU_INFO_UQ_FW_AREAS 0x24 + +/* Hybrid Stack Specific Defs*/ +/* gpu capability */ +#define AMDGPU_INFO_CAPABILITY 0x50 +/* virtual range */ +#define AMDGPU_INFO_VIRTUAL_RANGE 0x51 +/* query pin memory capability */ +#define AMDGPU_CAPABILITY_PIN_MEM_FLAG (1 << 0) +/* query direct gma capability */ +#define AMDGPU_CAPABILITY_DIRECT_GMA_FLAG (1 << 1) + +#define AMDGPU_INFO_MMR_SE_INDEX_SHIFT 0 +#define AMDGPU_INFO_MMR_SE_INDEX_MASK 0xff +#define AMDGPU_INFO_MMR_SH_INDEX_SHIFT 8 +#define AMDGPU_INFO_MMR_SH_INDEX_MASK 0xff + +struct drm_amdgpu_query_fw { + /** AMDGPU_INFO_FW_* */ + __u32 fw_type; + /** + * Index of the IP if there are more IPs of + * the same type. + */ + __u32 ip_instance; + /** + * Index of the engine. Whether this is used depends + * on the firmware type. (e.g. MEC, SDMA) + */ + __u32 index; + __u32 _pad; +}; + +/* Input structure for the INFO ioctl */ +struct drm_amdgpu_info { + /* Where the return value will be stored */ + __u64 return_pointer; + /* The size of the return value. Just like "size" in "snprintf", + * it limits how many bytes the kernel can write. */ + __u32 return_size; + /* The query request id. */ + __u32 query; + + union { + struct { + __u32 id; + __u32 _pad; + } mode_crtc; + + struct { + /** AMDGPU_HW_IP_* */ + __u32 type; + /** + * Index of the IP if there are more IPs of the same + * type. Ignored by AMDGPU_INFO_HW_IP_COUNT. + */ + __u32 ip_instance; + } query_hw_ip; + + struct { + __u32 dword_offset; + /** number of registers to read */ + __u32 count; + __u32 instance; + /** For future use, no flags defined so far */ + __u32 flags; + } read_mmr_reg; + + struct { + uint32_t aperture; + uint32_t _pad; + } virtual_range; + + struct drm_amdgpu_query_fw query_fw; + + struct { + __u32 type; + __u32 offset; + } vbios_info; + + struct { + __u32 type; + } sensor_info; + + struct { + __u32 type; + } video_cap; + }; +}; + +struct drm_amdgpu_info_gds { + /** GDS GFX partition size */ + __u32 gds_gfx_partition_size; + /** GDS compute partition size */ + __u32 compute_partition_size; + /** total GDS memory size */ + __u32 gds_total_size; + /** GWS size per GFX partition */ + __u32 gws_per_gfx_partition; + /** GSW size per compute partition */ + __u32 gws_per_compute_partition; + /** OA size per GFX partition */ + __u32 oa_per_gfx_partition; + /** OA size per compute partition */ + __u32 oa_per_compute_partition; + __u32 _pad; +}; + +struct drm_amdgpu_info_vram_gtt { + __u64 vram_size; + __u64 vram_cpu_accessible_size; + __u64 gtt_size; +}; + +struct drm_amdgpu_heap_info { + /** max. physical memory */ + __u64 total_heap_size; + + /** Theoretical max. available memory in the given heap */ + __u64 usable_heap_size; + + /** + * Number of bytes allocated in the heap. This includes all processes + * and private allocations in the kernel. It changes when new buffers + * are allocated, freed, and moved. It cannot be larger than + * heap_size. + */ + __u64 heap_usage; + + /** + * Theoretical possible max. size of buffer which + * could be allocated in the given heap + */ + __u64 max_allocation; +}; + +struct drm_amdgpu_memory_info { + struct drm_amdgpu_heap_info vram; + struct drm_amdgpu_heap_info cpu_accessible_vram; + struct drm_amdgpu_heap_info gtt; +}; + +struct drm_amdgpu_info_firmware { + __u32 ver; + __u32 feature; +}; + +struct drm_amdgpu_info_vbios { + __u8 name[64]; + __u8 vbios_pn[64]; + __u32 version; + __u32 pad; + __u8 vbios_ver_str[32]; + __u8 date[32]; +}; + +#define AMDGPU_VRAM_TYPE_UNKNOWN 0 +#define AMDGPU_VRAM_TYPE_GDDR1 1 +#define AMDGPU_VRAM_TYPE_DDR2 2 +#define AMDGPU_VRAM_TYPE_GDDR3 3 +#define AMDGPU_VRAM_TYPE_GDDR4 4 +#define AMDGPU_VRAM_TYPE_GDDR5 5 +#define AMDGPU_VRAM_TYPE_HBM 6 +#define AMDGPU_VRAM_TYPE_DDR3 7 +#define AMDGPU_VRAM_TYPE_DDR4 8 +#define AMDGPU_VRAM_TYPE_GDDR6 9 +#define AMDGPU_VRAM_TYPE_DDR5 10 +#define AMDGPU_VRAM_TYPE_LPDDR4 11 +#define AMDGPU_VRAM_TYPE_LPDDR5 12 +#define AMDGPU_VRAM_TYPE_HBM3E 13 + +#define AMDGPU_VRAM_TYPE_HBM_WIDTH 4096 + +struct drm_amdgpu_info_device { + /** PCI Device ID */ + __u32 device_id; + /** Internal chip revision: A0, A1, etc.) */ + __u32 chip_rev; + __u32 external_rev; + /** Revision id in PCI Config space */ + __u32 pci_rev; + __u32 family; + __u32 num_shader_engines; + __u32 num_shader_arrays_per_engine; + /* in KHz */ + __u32 gpu_counter_freq; + __u64 max_engine_clock; + __u64 max_memory_clock; + /* cu information */ + __u32 cu_active_number; + /* NOTE: cu_ao_mask is INVALID, DON'T use it */ + __u32 cu_ao_mask; + __u32 cu_bitmap[4][4]; + /** Render backend pipe mask. One render backend is CB+DB. */ + __u32 enabled_rb_pipes_mask; + __u32 num_rb_pipes; + __u32 num_hw_gfx_contexts; + /* PCIe version (the smaller of the GPU and the CPU/motherboard) */ + __u32 pcie_gen; + __u64 ids_flags; + /** Starting virtual address for UMDs. */ + __u64 virtual_address_offset; + /** The maximum virtual address */ + __u64 virtual_address_max; + /** Required alignment of virtual addresses. */ + __u32 virtual_address_alignment; + /** Page table entry - fragment size */ + __u32 pte_fragment_size; + __u32 gart_page_size; + /** constant engine ram size*/ + __u32 ce_ram_size; + /** video memory type info*/ + __u32 vram_type; + /** video memory bit width*/ + __u32 vram_bit_width; + /* vce harvesting instance */ + __u32 vce_harvest_config; + /* gfx double offchip LDS buffers */ + __u32 gc_double_offchip_lds_buf; + /* NGG Primitive Buffer */ + __u64 prim_buf_gpu_addr; + /* NGG Position Buffer */ + __u64 pos_buf_gpu_addr; + /* NGG Control Sideband */ + __u64 cntl_sb_buf_gpu_addr; + /* NGG Parameter Cache */ + __u64 param_buf_gpu_addr; + __u32 prim_buf_size; + __u32 pos_buf_size; + __u32 cntl_sb_buf_size; + __u32 param_buf_size; + /* wavefront size*/ + __u32 wave_front_size; + /* shader visible vgprs*/ + __u32 num_shader_visible_vgprs; + /* CU per shader array*/ + __u32 num_cu_per_sh; + /* number of tcc blocks*/ + __u32 num_tcc_blocks; + /* gs vgt table depth*/ + __u32 gs_vgt_table_depth; + /* gs primitive buffer depth*/ + __u32 gs_prim_buffer_depth; + /* max gs wavefront per vgt*/ + __u32 max_gs_waves_per_vgt; + /* PCIe number of lanes (the smaller of the GPU and the CPU/motherboard) */ + __u32 pcie_num_lanes; + /* always on cu bitmap */ + __u32 cu_ao_bitmap[4][4]; + /** Starting high virtual address for UMDs. */ + __u64 high_va_offset; + /** The maximum high virtual address */ + __u64 high_va_max; + /* gfx10 pa_sc_tile_steering_override */ + __u32 pa_sc_tile_steering_override; + /* disabled TCCs */ + __u64 tcc_disabled_mask; + __u64 min_engine_clock; + __u64 min_memory_clock; + /* The following fields are only set on gfx11+, older chips set 0. */ + __u32 tcp_cache_size; /* AKA GL0, VMEM cache */ + __u32 num_sqc_per_wgp; + __u32 sqc_data_cache_size; /* AKA SMEM cache */ + __u32 sqc_inst_cache_size; + __u32 gl1c_cache_size; + __u32 gl2c_cache_size; + __u64 mall_size; /* AKA infinity cache */ + /* high 32 bits of the rb pipes mask */ + __u32 enabled_rb_pipes_mask_hi; + /* shadow area size for gfx11 */ + __u32 shadow_size; + /* shadow area base virtual alignment for gfx11 */ + __u32 shadow_alignment; + /* context save area size for gfx11 */ + __u32 csa_size; + /* context save area base virtual alignment for gfx11 */ + __u32 csa_alignment; + /* Userq IP mask (1 << AMDGPU_HW_IP_*) */ + __u32 userq_ip_mask; + __u32 pad; +}; + +struct drm_amdgpu_info_hw_ip { + /** Version of h/w IP */ + __u32 hw_ip_version_major; + __u32 hw_ip_version_minor; + /** Capabilities */ + __u64 capabilities_flags; + /** command buffer address start alignment*/ + __u32 ib_start_alignment; + /** command buffer size alignment*/ + __u32 ib_size_alignment; + /** Bitmask of available rings. Bit 0 means ring 0, etc. */ + __u32 available_rings; + /** version info: bits 23:16 major, 15:8 minor, 7:0 revision */ + __u32 ip_discovery_version; + /* Userq available slots */ + __u32 userq_num_slots; +}; + +/* GFX metadata BO sizes and alignment info (in bytes) */ +struct drm_amdgpu_info_uq_fw_areas_gfx { + /* shadow area size */ + __u32 shadow_size; + /* shadow area base virtual mem alignment */ + __u32 shadow_alignment; + /* context save area size */ + __u32 csa_size; + /* context save area base virtual mem alignment */ + __u32 csa_alignment; +}; + +/* IP specific fw related information used in the + * subquery AMDGPU_INFO_UQ_FW_AREAS + */ +struct drm_amdgpu_info_uq_fw_areas { + union { + struct drm_amdgpu_info_uq_fw_areas_gfx gfx; + }; +}; + +struct drm_amdgpu_info_num_handles { + /** Max handles as supported by firmware for UVD */ + __u32 uvd_max_handles; + /** Handles currently in use for UVD */ + __u32 uvd_used_handles; +}; + +#define AMDGPU_VCE_CLOCK_TABLE_ENTRIES 6 + +struct drm_amdgpu_info_vce_clock_table_entry { + /** System clock */ + __u32 sclk; + /** Memory clock */ + __u32 mclk; + /** VCE clock */ + __u32 eclk; + __u32 pad; +}; + +struct drm_amdgpu_info_vce_clock_table { + struct drm_amdgpu_info_vce_clock_table_entry entries[AMDGPU_VCE_CLOCK_TABLE_ENTRIES]; + __u32 num_valid_entries; + __u32 pad; +}; + +/* query video encode/decode caps */ +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2 0 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4 1 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1 2 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC 3 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC 4 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG 5 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9 6 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1 7 +#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_COUNT 8 + +struct drm_amdgpu_info_video_codec_info { + __u32 valid; + __u32 max_width; + __u32 max_height; + __u32 max_pixels_per_frame; + __u32 max_level; + __u32 pad; +}; + +struct drm_amdgpu_info_video_caps { + struct drm_amdgpu_info_video_codec_info codec_info[AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_COUNT]; +}; + +#define AMDGPU_VMHUB_TYPE_MASK 0xff +#define AMDGPU_VMHUB_TYPE_SHIFT 0 +#define AMDGPU_VMHUB_TYPE_GFX 0 +#define AMDGPU_VMHUB_TYPE_MM0 1 +#define AMDGPU_VMHUB_TYPE_MM1 2 +#define AMDGPU_VMHUB_IDX_MASK 0xff00 +#define AMDGPU_VMHUB_IDX_SHIFT 8 + +struct drm_amdgpu_info_gpuvm_fault { + __u64 addr; + __u32 status; + __u32 vmhub; +}; + +struct drm_amdgpu_info_uq_metadata_gfx { + /* shadow area size for gfx11 */ + __u32 shadow_size; + /* shadow area base virtual alignment for gfx11 */ + __u32 shadow_alignment; + /* context save area size for gfx11 */ + __u32 csa_size; + /* context save area base virtual alignment for gfx11 */ + __u32 csa_alignment; +}; + +struct drm_amdgpu_info_uq_metadata { + union { + struct drm_amdgpu_info_uq_metadata_gfx gfx; + }; +}; + +/* + * Supported GPU families + */ +#define AMDGPU_FAMILY_UNKNOWN 0 +#define AMDGPU_FAMILY_SI 110 /* Hainan, Oland, Verde, Pitcairn, Tahiti */ +#define AMDGPU_FAMILY_CI 120 /* Bonaire, Hawaii */ +#define AMDGPU_FAMILY_KV 125 /* Kaveri, Kabini, Mullins */ +#define AMDGPU_FAMILY_VI 130 /* Iceland, Tonga */ +#define AMDGPU_FAMILY_CZ 135 /* Carrizo, Stoney */ +#define AMDGPU_FAMILY_AI 141 /* Vega10 */ +#define AMDGPU_FAMILY_RV 142 /* Raven */ +#define AMDGPU_FAMILY_NV 143 /* Navi10 */ +#define AMDGPU_FAMILY_VGH 144 /* Van Gogh */ +#define AMDGPU_FAMILY_GC_11_0_0 145 /* GC 11.0.0 */ +#define AMDGPU_FAMILY_YC 146 /* Yellow Carp */ +#define AMDGPU_FAMILY_GC_11_0_1 148 /* GC 11.0.1 */ +#define AMDGPU_FAMILY_GC_10_3_6 149 /* GC 10.3.6 */ +#define AMDGPU_FAMILY_GC_10_3_7 151 /* GC 10.3.7 */ +#define AMDGPU_FAMILY_GC_11_5_0 150 /* GC 11.5.0 */ +#define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 */ + +#ifndef HAVE_DRM_COLOR_CTM_3X4 +/* FIXME wrong namespace! */ +struct drm_color_ctm_3x4 { + /* + * Conversion matrix with 3x4 dimensions in S31.32 sign-magnitude + * (not two's complement!) format. + */ + __u64 matrix[12]; +}; +#endif + +/** + * Definition of System Unified Address (SUA) apertures + */ +#define AMDGPU_SUA_APERTURE_PRIVATE 1 +#define AMDGPU_SUA_APERTURE_SHARED 2 +struct drm_amdgpu_virtual_range { + uint64_t start; + uint64_t end; +}; + +struct drm_amdgpu_capability { + __u32 flag; + __u32 direct_gma_size; +}; + +/* + * Definition of free sync enter and exit signals + * We may have more options in the future + */ +#define AMDGPU_FREESYNC_FULLSCREEN_ENTER 1 +#define AMDGPU_FREESYNC_FULLSCREEN_EXIT 2 + +struct drm_amdgpu_freesync { + __u32 op; /* AMDGPU_FREESYNC_FULLSCREEN_ENTER or */ + /* AMDGPU_FREESYNC_FULLSCREEN_ENTER */ + __u32 spare[7]; +}; + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 0a55e34a2..ee55bde0a 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -12,72 +12,42 @@ #include #include #include +#include +#include #include #include #include #include #include -#include #include "criu-plugin.h" #include "plugin.h" #include "criu-amdgpu.pb-c.h" +#include "util.h" +#include "util-pie.h" +#include "fdstore.h" #include "kfd_ioctl.h" #include "xmalloc.h" #include "criu-log.h" #include "files.h" +#include "pstree.h" +#include "sockets.h" +#include "rst-malloc.h" #include "common/list.h" +#include "amdgpu_drm.h" +#include "amdgpu_plugin_dmabuf.h" +#include "amdgpu_plugin_drm.h" +#include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" +#include "amdgpu_socket_utils.h" #include "img-streamer.h" #include "image.h" #include "cr_options.h" - -#define AMDGPU_KFD_DEVICE "/dev/kfd" -#define PROCPIDMEM "/proc/%d/mem" -#define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem" -#define HSAKMT_SHM "/hsakmt_shared_mem" -#define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore" -#define HSAKMT_SEM "hsakmt_semaphore" - -#define KFD_IOCTL_MAJOR_VERSION 1 -#define MIN_KFD_IOCTL_MINOR_VERSION 8 - -#define IMG_KFD_FILE "amdgpu-kfd-%d.img" -#define IMG_RENDERD_FILE "amdgpu-renderD-%d.img" -#define IMG_PAGES_FILE "amdgpu-pages-%d-%04x.img" - -#ifndef _GNU_SOURCE -#define _GNU_SOURCE 1 -#endif - -#ifdef LOG_PREFIX -#undef LOG_PREFIX -#endif -#define LOG_PREFIX "amdgpu_plugin: " - -#ifdef DEBUG -#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__) -#else -#define plugin_log_msg(fmt, ...) \ - { \ - } -#endif - -#define SDMA_PACKET(op, sub_op, e) ((((e)&0xFFFF) << 16) | (((sub_op)&0xFF) << 8) | (((op)&0xFF) << 0)) - -#define SDMA_OPCODE_COPY 1 -#define SDMA_COPY_SUB_OPCODE_LINEAR 0 -#define SDMA_NOP 0 -#define SDMA_LINEAR_COPY_MAX_SIZE (1ULL << 21) - -enum sdma_op_type { - SDMA_OP_VRAM_READ, - SDMA_OP_VRAM_WRITE, -}; +#include "util.h" struct vma_metadata { struct list_head list; @@ -89,140 +59,42 @@ struct vma_metadata { }; /************************************ Global Variables ********************************************/ -struct tp_system src_topology; -struct tp_system dest_topology; - -struct device_maps checkpoint_maps; -struct device_maps restore_maps; - -extern int fd_next; static LIST_HEAD(update_vma_info_list); -extern bool kfd_fw_version_check; -extern bool kfd_sdma_fw_version_check; -extern bool kfd_caches_count_check; -extern bool kfd_num_gws_check; -extern bool kfd_vram_size_check; -extern bool kfd_numa_check; -extern bool kfd_capability_check; +size_t kfd_max_buffer_size; +bool plugin_added_to_inventory = false; + +bool plugin_disabled = false; + +struct handle_id { + int handle; + int fdstore_id; +}; +struct shared_handle_ids { + int num_handles; + struct handle_id *handles; +}; +struct shared_handle_ids *shared_memory = NULL; + +static mutex_t *shared_memory_mutex; + +int current_pid; +/* + * In the case of a single process (common case), this optimization can effectively + * reduce the restore latency with parallel restore. In the case of multiple processes, + * states are already restored in parallel within different processes. Therefore, this + * optimization does not introduce further improvement and will be disabled by default + * in this case. The flag, parallel_disabled, is used to control whether the + * optimization is enabled or disabled. + */ +bool parallel_disabled = false; + +pthread_t parallel_thread = 0; +int parallel_thread_result = 0; /**************************************************************************************************/ -int write_fp(FILE *fp, const void *buf, const size_t buf_len) -{ - size_t len_write; - - len_write = fwrite(buf, 1, buf_len, fp); - if (len_write != buf_len) { - pr_perror("Unable to write file (wrote:%ld buf_len:%ld)", len_write, buf_len); - return -EIO; - } - return 0; -} - -int read_fp(FILE *fp, void *buf, const size_t buf_len) -{ - size_t len_read; - - len_read = fread(buf, 1, buf_len, fp); - if (len_read != buf_len) { - pr_perror("Unable to read file (read:%ld buf_len:%ld)", len_read, buf_len); - return -EIO; - } - return 0; -} - -/** - * @brief Open an image file - * - * We store the size of the actual contents in the first 8-bytes of the file. This allows us to - * determine the file size when using criu_image_streamer when fseek and fstat are not available. - * The FILE * returned is already at the location of the first actual contents. - * - * @param path The file path - * @param write False for read, true for write - * @param size Size of actual contents - * @return FILE *if successful, NULL if failed - */ -FILE *open_img_file(char *path, bool write, size_t *size) -{ - FILE *fp = NULL; - int fd, ret; - - if (opts.stream) - fd = img_streamer_open(path, write ? O_DUMP : O_RSTR); - else - fd = openat(criu_get_image_dir(), path, write ? (O_WRONLY | O_CREAT) : O_RDONLY, 0600); - - if (fd < 0) { - pr_perror("%s: Failed to open for %s", path, write ? "write" : "read"); - return NULL; - } - - fp = fdopen(fd, write ? "w" : "r"); - if (!fp) { - pr_perror("%s: Failed get pointer for %s", path, write ? "write" : "read"); - return NULL; - } - - if (write) - ret = write_fp(fp, size, sizeof(*size)); - else - ret = read_fp(fp, size, sizeof(*size)); - - if (ret) { - pr_perror("%s:Failed to access file size", path); - fclose(fp); - return NULL; - } - - pr_debug("%s:Opened file for %s with size:%ld\n", path, write ? "write" : "read", *size); - return fp; -} - -/** - * @brief Write an image file - * - * We store the size of the actual contents in the first 8-bytes of the file. This allows us to - * determine the file size when using criu_image_streamer when fseek and fstat are not available. - * - * @param path The file path - * @param buf pointer to data to be written - * @param buf_len size of buf - * @return 0 if successful. -errno on failure - */ -int write_img_file(char *path, const void *buf, const size_t buf_len) -{ - int ret; - FILE *fp; - size_t len = buf_len; - - fp = open_img_file(path, true, &len); - if (!fp) - return -errno; - - ret = write_fp(fp, buf, buf_len); - fclose(fp); /* this will also close fd */ - return ret; -} - -int read_file(const char *file_path, void *buf, const size_t buf_len) -{ - int ret; - FILE *fp; - - fp = fopen(file_path, "r"); - if (!fp) { - pr_perror("Cannot fopen %s", file_path); - return -errno; - } - - ret = read_fp(fp, buf, buf_len); - fclose(fp); /* this will also close fd */ - return ret; -} - /* Call ioctl, restarting if it is interrupted */ int kmtIoctl(int fd, unsigned long request, void *arg) { @@ -260,21 +132,21 @@ static void free_e(CriuKfd *e) static int allocate_device_entries(CriuKfd *e, int num_of_devices) { - e->device_entries = xmalloc(sizeof(DeviceEntry *) * num_of_devices); + e->device_entries = xmalloc(sizeof(KfdDeviceEntry *) * num_of_devices); if (!e->device_entries) { pr_err("Failed to allocate device_entries\n"); return -ENOMEM; } for (int i = 0; i < num_of_devices; i++) { - DeviceEntry *entry = xzalloc(sizeof(*entry)); + KfdDeviceEntry *entry = xzalloc(sizeof(*entry)); if (!entry) { pr_err("Failed to allocate entry\n"); return -ENOMEM; } - device_entry__init(entry); + kfd_device_entry__init(entry); e->device_entries[i] = entry; e->n_device_entries++; @@ -284,21 +156,21 @@ static int allocate_device_entries(CriuKfd *e, int num_of_devices) static int allocate_bo_entries(CriuKfd *e, int num_bos, struct kfd_criu_bo_bucket *bo_bucket_ptr) { - e->bo_entries = xmalloc(sizeof(BoEntry *) * num_bos); + e->bo_entries = xmalloc(sizeof(KfdBoEntry *) * num_bos); if (!e->bo_entries) { pr_err("Failed to allocate bo_info\n"); return -ENOMEM; } for (int i = 0; i < num_bos; i++) { - BoEntry *entry = xzalloc(sizeof(*entry)); + KfdBoEntry *entry = xzalloc(sizeof(*entry)); if (!entry) { pr_err("Failed to allocate botest\n"); return -ENOMEM; } - bo_entry__init(entry); + kfd_bo_entry__init(entry); e->bo_entries[i] = entry; e->n_bo_entries++; @@ -306,13 +178,13 @@ static int allocate_bo_entries(CriuKfd *e, int num_bos, struct kfd_criu_bo_bucke return 0; } -int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, DeviceEntry **deviceEntries) +int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, KfdDeviceEntry **deviceEntries) { uint32_t devinfo_index = 0; struct tp_node *node; list_for_each_entry(node, &sys->nodes, listm_system) { - DeviceEntry *devinfo = deviceEntries[devinfo_index++]; + KfdDeviceEntry *devinfo = deviceEntries[devinfo_index++]; devinfo->node_id = node->id; @@ -380,11 +252,11 @@ int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, DeviceE return 0; } -int devinfo_to_topology(DeviceEntry *devinfos[], uint32_t num_devices, struct tp_system *sys) +int devinfo_to_topology(KfdDeviceEntry *devinfos[], uint32_t num_devices, struct tp_system *sys) { for (int i = 0; i < num_devices; i++) { struct tp_node *node; - DeviceEntry *devinfo = devinfos[i]; + KfdDeviceEntry *devinfo = devinfos[i]; node = sys_add_node(sys, devinfo->node_id, devinfo->gpu_id); if (!node) @@ -449,9 +321,56 @@ void getenv_bool(const char *var, bool *value) pr_info("param: %s:%s\n", var, *value ? "Y" : "N"); } +void getenv_size_t(const char *var, size_t *value) +{ + char *value_str = getenv(var); + char *endp = value_str; + int sh = 0; + size_t size; + + if (value_str) { + size = (size_t)strtoul(value_str, &endp, 0); + if (errno || value_str == endp) { + pr_err("Ignoring invalid value for %s=%s, expecting a positive integer\n", var, value_str); + return; + } + switch (*endp) { + case 'k': + case 'K': + sh = 10; + break; + case 'M': + sh = 20; + break; + case 'G': + sh = 30; + break; + case '\0': + sh = 0; + break; + default: + pr_err("Ignoring invalid size suffix for %s=%s, expecting 'K'/k', 'M', or 'G'\n", var, value_str); + return; + } + if (SIZE_MAX >> sh < size) { + pr_err("Ignoring invalid value for %s=%s, exceeds SIZE_MAX\n", var, value_str); + return; + } + *value = size << sh; + } + pr_info("param: %s:0x%lx\n", var, *value); +} + int amdgpu_plugin_init(int stage) { - pr_info("amdgpu_plugin: initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); + if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) { + plugin_disabled = true; + return 0; + } + } + + pr_info("initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); topology_init(&src_topology); topology_init(&dest_topology); @@ -459,6 +378,15 @@ int amdgpu_plugin_init(int stage) maps_init(&restore_maps); if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (has_children(root_item)) { + pr_info("Parallel restore disabled\n"); + parallel_disabled = true; + } else { + if (install_parallel_sock() < 0) { + pr_err("Failed to install parallel socket\n"); + return -1; + } + } /* Default Values */ kfd_fw_version_check = true; kfd_sdma_fw_version_check = true; @@ -476,12 +404,18 @@ int amdgpu_plugin_init(int stage) getenv_bool("KFD_NUMA_CHECK", &kfd_numa_check); getenv_bool("KFD_CAPABILITY_CHECK", &kfd_capability_check); } + kfd_max_buffer_size = 0; + getenv_size_t("KFD_MAX_BUFFER_SIZE", &kfd_max_buffer_size); + return 0; } void amdgpu_plugin_fini(int stage, int ret) { - pr_info("amdgpu_plugin: finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); + if (plugin_disabled) + return; + + pr_info("finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); if (stage == CR_PLUGIN_STAGE__RESTORE) sys_close_drm_render_devices(&dest_topology); @@ -501,7 +435,7 @@ struct thread_data { uint32_t gpu_id; pid_t pid; struct kfd_criu_bo_bucket *bo_buckets; - BoEntry **bo_entries; + KfdBoEntry **bo_entries; int drm_fd; int ret; int id; /* File ID used by CRIU to identify KFD image for this process */ @@ -509,38 +443,36 @@ struct thread_data { int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) { - struct stat st_kfd, st_dri_min; - char img_path[128]; + struct stat st_kfd; int ret = 0; - pr_debug("amdgpu_plugin: Enter %s\n", __func__); + pr_debug("Enter %s\n", __func__); ret = stat(AMDGPU_KFD_DEVICE, &st_kfd); if (ret == -1) { pr_perror("stat error for /dev/kfd"); return ret; } - snprintf(img_path, sizeof(img_path), "/dev/dri/renderD%d", DRM_FIRST_RENDER_NODE); - - ret = stat(img_path, &st_dri_min); - if (ret == -1) { - pr_perror("stat error for %s", img_path); - return ret; - } - - if (major(st_buf->st_rdev) == major(st_kfd.st_rdev) || ((major(st_buf->st_rdev) == major(st_dri_min.st_rdev)) && - (minor(st_buf->st_rdev) >= minor(st_dri_min.st_rdev) && - minor(st_buf->st_rdev) >= DRM_FIRST_RENDER_NODE))) { + /* If input device is KFD return device as supported */ + if (major(st_buf->st_rdev) == major(st_kfd.st_rdev)) { pr_debug("Known non-regular mapping, kfd-renderD%d -> OK\n", minor(st_buf->st_rdev)); - pr_debug("AMD KFD(maj) = %d, DRI(maj,min) = %d:%d VMA Device fd(maj,min) = %d:%d\n", - major(st_kfd.st_rdev), major(st_dri_min.st_rdev), minor(st_dri_min.st_rdev), - major(st_buf->st_rdev), minor(st_buf->st_rdev)); - /* VMA belongs to kfd */ return 0; } - pr_perror("amdgpu_plugin: Can't handle the VMA mapping"); - return -ENOTSUP; + /* Determine if input is a DRM device and therefore is supported */ + ret = amdgpu_plugin_drm_handle_device_vma(fd, st_buf); + if (ret) + pr_perror("%s(), Can't handle VMAs of input device", __func__); + + if (!ret && !plugin_added_to_inventory) { + ret = add_inventory_plugin(CR_PLUGIN_DESC.name); + if (ret) + pr_err("Failed to add AMDGPU plugin to inventory image\n"); + else + plugin_added_to_inventory = true; + } + + return ret; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma) @@ -607,16 +539,15 @@ void free_and_unmap(uint64_t size, amdgpu_bo_handle h_bo, amdgpu_va_handle h_va, amdgpu_bo_free(h_bo); } -int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, amdgpu_device_handle h_dev, - uint64_t max_copy_size, enum sdma_op_type type) +int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp, + void *buffer, size_t buffer_size, amdgpu_device_handle h_dev, + uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free) { - uint64_t size, gpu_addr_src, gpu_addr_dest, gpu_addr_ib; - uint64_t gpu_addr_src_orig, gpu_addr_dest_orig; - amdgpu_va_handle h_va_src, h_va_dest, h_va_ib; - amdgpu_bo_handle h_bo_src, h_bo_dest, h_bo_ib; + uint64_t src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain; + uint64_t gpu_addr_src, gpu_addr_dst, gpu_addr_ib, copy_src, copy_dst, copy_size; + amdgpu_va_handle h_va_src, h_va_dst, h_va_ib; + amdgpu_bo_handle h_bo_src, h_bo_dst, h_bo_ib; struct amdgpu_bo_import_result res = { 0 }; - uint64_t copy_size, bytes_remain, j = 0; - uint64_t n_packets; struct amdgpu_cs_ib_info ib_info; amdgpu_bo_list_handle h_bo_list; struct amdgpu_cs_request cs_req; @@ -625,102 +556,98 @@ int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, am uint32_t expired; amdgpu_context_handle h_ctx; uint32_t *ib = NULL; - int err, shared_fd; + int j, err, packets_per_buffer; - shared_fd = bo_buckets[i].dmabuf_fd; - size = bo_buckets[i].size; + buffer_bo_size = min(size, buffer_size); + packets_per_buffer = ((buffer_bo_size - 1) / max_copy_size) + 1; + src_bo_size = (type == SDMA_OP_VRAM_WRITE) ? buffer_bo_size : size; + dst_bo_size = (type == SDMA_OP_VRAM_READ) ? buffer_bo_size : size; plugin_log_msg("Enter %s\n", __func__); /* prepare src buffer */ switch (type) { case SDMA_OP_VRAM_WRITE: - err = amdgpu_create_bo_from_user_mem(h_dev, userptr, size, &h_bo_src); + err = amdgpu_create_bo_from_user_mem(h_dev, buffer, src_bo_size, &h_bo_src); if (err) { pr_perror("failed to create userptr for sdma"); return -EFAULT; } - break; - case SDMA_OP_VRAM_READ: err = amdgpu_bo_import(h_dev, amdgpu_bo_handle_type_dma_buf_fd, shared_fd, &res); if (err) { pr_perror("failed to import dmabuf handle from libdrm"); return -EFAULT; } - h_bo_src = res.buf_handle; break; - default: pr_perror("Invalid sdma operation"); return -EINVAL; } - err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, size, 0x1000, 0, &gpu_addr_src, &h_va_src, 0); + err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, src_bo_size, 0x1000, 0, &gpu_addr_src, + &h_va_src, 0); if (err) { pr_perror("failed to alloc VA for src bo"); goto err_src_va; } - err = amdgpu_bo_va_op(h_bo_src, 0, size, gpu_addr_src, 0, AMDGPU_VA_OP_MAP); + err = amdgpu_bo_va_op(h_bo_src, 0, src_bo_size, gpu_addr_src, 0, AMDGPU_VA_OP_MAP); if (err) { pr_perror("failed to GPU map the src BO"); goto err_src_bo_map; } - plugin_log_msg("Source BO: GPU VA: %lx, size: %lx\n", gpu_addr_src, size); + plugin_log_msg("Source BO: GPU VA: %lx, size: %lx\n", gpu_addr_src, src_bo_size); + /* prepare dest buffer */ switch (type) { case SDMA_OP_VRAM_WRITE: err = amdgpu_bo_import(h_dev, amdgpu_bo_handle_type_dma_buf_fd, shared_fd, &res); if (err) { pr_perror("failed to import dmabuf handle from libdrm"); - goto err_dest_bo_prep; + goto err_dst_bo_prep; } - - h_bo_dest = res.buf_handle; + h_bo_dst = res.buf_handle; break; - case SDMA_OP_VRAM_READ: - err = amdgpu_create_bo_from_user_mem(h_dev, userptr, size, &h_bo_dest); + err = amdgpu_create_bo_from_user_mem(h_dev, buffer, dst_bo_size, &h_bo_dst); if (err) { pr_perror("failed to create userptr for sdma"); - goto err_dest_bo_prep; + goto err_dst_bo_prep; } break; - default: pr_perror("Invalid sdma operation"); - goto err_dest_bo_prep; + goto err_dst_bo_prep; } - err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, size, 0x1000, 0, &gpu_addr_dest, &h_va_dest, 0); + err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, dst_bo_size, 0x1000, 0, &gpu_addr_dst, + &h_va_dst, 0); if (err) { pr_perror("failed to alloc VA for dest bo"); - goto err_dest_va; + goto err_dst_va; } - err = amdgpu_bo_va_op(h_bo_dest, 0, size, gpu_addr_dest, 0, AMDGPU_VA_OP_MAP); + err = amdgpu_bo_va_op(h_bo_dst, 0, dst_bo_size, gpu_addr_dst, 0, AMDGPU_VA_OP_MAP); if (err) { pr_perror("failed to GPU map the dest BO"); - goto err_dest_bo_map; + goto err_dst_bo_map; } - plugin_log_msg("Dest BO: GPU VA: %lx, size: %lx\n", gpu_addr_dest, size); + plugin_log_msg("Dest BO: GPU VA: %lx, size: %lx\n", gpu_addr_dst, dst_bo_size); - n_packets = (size + max_copy_size) / max_copy_size; /* prepare ring buffer/indirect buffer for command submission * each copy packet is 7 dwords so we need to alloc 28x size for ib */ - err = alloc_and_map(h_dev, n_packets * 28, AMDGPU_GEM_DOMAIN_GTT, &h_bo_ib, &h_va_ib, &gpu_addr_ib, + err = alloc_and_map(h_dev, packets_per_buffer * 28, AMDGPU_GEM_DOMAIN_GTT, &h_bo_ib, &h_va_ib, &gpu_addr_ib, (void **)&ib); if (err) { pr_perror("failed to allocate and map ib/rb"); goto err_ib_gpu_alloc; } - - plugin_log_msg("Indirect BO: GPU VA: %lx, size: %lx\n", gpu_addr_ib, n_packets * 28); + plugin_log_msg("Indirect BO: GPU VA: %lx, size: %lx\n", gpu_addr_ib, packets_per_buffer * 28); resources[0] = h_bo_src; - resources[1] = h_bo_dest; + resources[1] = h_bo_dst; resources[2] = h_bo_ib; err = amdgpu_bo_list_create(h_dev, 3, resources, NULL, &h_bo_list); if (err) { @@ -728,103 +655,124 @@ int sdma_copy_bo(struct kfd_criu_bo_bucket *bo_buckets, void *userptr, int i, am goto err_bo_list; } - memset(&cs_req, 0, sizeof(cs_req)); - memset(&fence, 0, sizeof(fence)); - memset(ib, 0, n_packets * 28); - - plugin_log_msg("setting up sdma packets for command submission\n"); bytes_remain = size; - gpu_addr_src_orig = gpu_addr_src; - gpu_addr_dest_orig = gpu_addr_dest; + if (type == SDMA_OP_VRAM_WRITE) + copy_dst = gpu_addr_dst; + else + copy_src = gpu_addr_src; + while (bytes_remain > 0) { - copy_size = min(bytes_remain, max_copy_size); + memset(&cs_req, 0, sizeof(cs_req)); + memset(&fence, 0, sizeof(fence)); + memset(&ib_info, 0, sizeof(ib_info)); + memset(ib, 0, packets_per_buffer * 28); - ib[j++] = SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0); - ib[j++] = copy_size; - ib[j++] = 0; - ib[j++] = 0xffffffff & gpu_addr_src; - ib[j++] = (0xffffffff00000000 & gpu_addr_src) >> 32; - ib[j++] = 0xffffffff & gpu_addr_dest; - ib[j++] = (0xffffffff00000000 & gpu_addr_dest) >> 32; + if (type == SDMA_OP_VRAM_WRITE) { + err = read_fp(storage_fp, buffer, min(bytes_remain, buffer_bo_size)); + if (err) { + pr_perror("failed to read from storage"); + goto err_bo_list; + } + } - gpu_addr_src += copy_size; - gpu_addr_dest += copy_size; - bytes_remain -= copy_size; - } + buffer_space_remain = buffer_bo_size; + if (type == SDMA_OP_VRAM_WRITE) + copy_src = gpu_addr_src; + else + copy_dst = gpu_addr_dst; + j = 0; - gpu_addr_src = gpu_addr_src_orig; - gpu_addr_dest = gpu_addr_dest_orig; - plugin_log_msg("pad the IB to align on 8 dw boundary\n"); - /* pad the IB to the required number of dw with SDMA_NOP */ - while (j & 7) - ib[j++] = SDMA_NOP; + while (bytes_remain > 0 && buffer_space_remain > 0) { + copy_size = min(min(bytes_remain, max_copy_size), buffer_space_remain); - ib_info.ib_mc_address = gpu_addr_ib; - ib_info.size = j; + ib[j++] = SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0); + ib[j++] = copy_size; + ib[j++] = 0; + ib[j++] = 0xffffffff & copy_src; + ib[j++] = (0xffffffff00000000 & copy_src) >> 32; + ib[j++] = 0xffffffff & copy_dst; + ib[j++] = (0xffffffff00000000 & copy_dst) >> 32; - cs_req.ip_type = AMDGPU_HW_IP_DMA; - /* possible future optimization: may use other rings, info available in - * amdgpu_query_hw_ip_info() - */ - cs_req.ring = 0; - cs_req.number_of_ibs = 1; - cs_req.ibs = &ib_info; - cs_req.resources = h_bo_list; - cs_req.fence_info.handle = NULL; + copy_src += copy_size; + copy_dst += copy_size; + bytes_remain -= copy_size; + buffer_space_remain -= copy_size; + } + /* pad the IB to the required number of dw with SDMA_NOP */ + while (j & 7) + ib[j++] = SDMA_NOP; - plugin_log_msg("create the context\n"); - err = amdgpu_cs_ctx_create(h_dev, &h_ctx); - if (err) { - pr_perror("failed to create context for SDMA command submission"); - goto err_ctx; - } + ib_info.ib_mc_address = gpu_addr_ib; + ib_info.size = j; - plugin_log_msg("initiate sdma command submission\n"); - err = amdgpu_cs_submit(h_ctx, 0, &cs_req, 1); - if (err) { - pr_perror("failed to submit command for SDMA IB"); - goto err_cs_submit_ib; - } + cs_req.ip_type = AMDGPU_HW_IP_DMA; + /* possible future optimization: may use other rings, info available in + * amdgpu_query_hw_ip_info() + */ + cs_req.ring = 0; + cs_req.number_of_ibs = 1; + cs_req.ibs = &ib_info; + cs_req.resources = h_bo_list; + cs_req.fence_info.handle = NULL; - fence.context = h_ctx; - fence.ip_type = AMDGPU_HW_IP_DMA; - fence.ip_instance = 0; - fence.ring = 0; - fence.fence = cs_req.seq_no; - err = amdgpu_cs_query_fence_status(&fence, AMDGPU_TIMEOUT_INFINITE, 0, &expired); - if (err) { - pr_perror("failed to query fence status"); - goto err_cs_submit_ib; - } + err = amdgpu_cs_ctx_create(h_dev, &h_ctx); + if (err) { + pr_perror("failed to create context for SDMA command submission"); + goto err_ctx; + } + err = amdgpu_cs_submit(h_ctx, 0, &cs_req, 1); + if (err) { + pr_perror("failed to submit command for SDMA IB"); + goto err_cs_submit_ib; + } - if (!expired) { - pr_err("IB execution did not complete\n"); - err = -EBUSY; - goto err_cs_submit_ib; - } + fence.context = h_ctx; + fence.ip_type = AMDGPU_HW_IP_DMA; + fence.ip_instance = 0; + fence.ring = 0; + fence.fence = cs_req.seq_no; + err = amdgpu_cs_query_fence_status(&fence, AMDGPU_TIMEOUT_INFINITE, 0, &expired); + if (err) { + pr_perror("failed to query fence status"); + goto err_cs_submit_ib; + } + if (!expired) { + pr_err("IB execution did not complete\n"); + err = -EBUSY; + goto err_cs_submit_ib; + } - plugin_log_msg("done querying fence status\n"); + if (type == SDMA_OP_VRAM_READ) { + err = write_fp(storage_fp, buffer, buffer_bo_size - buffer_space_remain); + if (err) { + pr_perror("failed to write out to storage"); + goto err_cs_submit_ib; + } + } err_cs_submit_ib: - amdgpu_cs_ctx_free(h_ctx); + amdgpu_cs_ctx_free(h_ctx); + if (err) + break; + } err_ctx: amdgpu_bo_list_destroy(h_bo_list); err_bo_list: - free_and_unmap(n_packets * 28, h_bo_ib, h_va_ib, gpu_addr_ib, ib); + free_and_unmap(packets_per_buffer * 28, h_bo_ib, h_va_ib, gpu_addr_ib, ib); err_ib_gpu_alloc: - err = amdgpu_bo_va_op(h_bo_dest, 0, size, gpu_addr_dest, 0, AMDGPU_VA_OP_UNMAP); + err = amdgpu_bo_va_op(h_bo_dst, 0, size, gpu_addr_dst, 0, AMDGPU_VA_OP_UNMAP); if (err) - pr_perror("failed to GPU unmap the dest BO %lx, size = %lx", gpu_addr_dest, size); -err_dest_bo_map: - err = amdgpu_va_range_free(h_va_dest); + pr_perror("failed to GPU unmap the dest BO %lx, size = %lx", gpu_addr_dst, size); +err_dst_bo_map: + err = amdgpu_va_range_free(h_va_dst); if (err) pr_perror("dest range free failed"); -err_dest_va: - err = amdgpu_bo_free(h_bo_dest); +err_dst_va: + if (!do_not_free) + err = amdgpu_bo_free(h_bo_dst); if (err) pr_perror("dest bo free failed"); - -err_dest_bo_prep: +err_dst_bo_prep: err = amdgpu_bo_va_op(h_bo_src, 0, size, gpu_addr_src, 0, AMDGPU_VA_OP_UNMAP); if (err) pr_perror("failed to GPU unmap the src BO %lx, size = %lx", gpu_addr_src, size); @@ -836,7 +784,6 @@ err_src_va: err = amdgpu_bo_free(h_bo_src); if (err) pr_perror("src bo free failed"); - plugin_log_msg("Leaving sdma_copy_bo, err = %d\n", err); return err; } @@ -845,19 +792,18 @@ void *dump_bo_contents(void *_thread_data) { struct thread_data *thread_data = (struct thread_data *)_thread_data; struct kfd_criu_bo_bucket *bo_buckets = thread_data->bo_buckets; - BoEntry **bo_info = thread_data->bo_entries; struct amdgpu_gpu_info gpu_info = { 0 }; amdgpu_device_handle h_dev; - size_t max_bo_size = 0, image_size = 0; + size_t max_bo_size = 0, image_size = 0, buffer_size; uint64_t max_copy_size; uint32_t major, minor; int num_bos = 0; int i, ret = 0; FILE *bo_contents_fp = NULL; - void *buffer; + void *buffer = NULL; char img_path[40]; - pr_info("amdgpu_plugin: Thread[0x%x] started\n", thread_data->gpu_id); + pr_info("Thread[0x%x] started\n", thread_data->gpu_id); ret = amdgpu_device_initialize(thread_data->drm_fd, &major, &minor, &h_dev); if (ret) { @@ -884,15 +830,16 @@ void *dump_bo_contents(void *_thread_data) } } - /* Allocate buffer to fit biggest BO */ - posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), max_bo_size); + buffer_size = kfd_max_buffer_size > 0 ? min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); if (!buffer) { - pr_perror("Failed to alloc aligned memory"); + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); ret = -ENOMEM; goto exit; } - snprintf(img_path, sizeof(img_path), IMG_PAGES_FILE, thread_data->id, thread_data->gpu_id); + snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, thread_data->id, thread_data->gpu_id); bo_contents_fp = open_img_file(img_path, true, &image_size); if (!bo_contents_fp) { pr_perror("Cannot fopen %s", img_path); @@ -910,19 +857,17 @@ void *dump_bo_contents(void *_thread_data) num_bos++; /* perform sDMA based vram copy */ - ret = sdma_copy_bo(bo_buckets, buffer, i, h_dev, max_copy_size, SDMA_OP_VRAM_READ); + ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_READ, false); + if (ret) { pr_err("Failed to drain the BO using sDMA: bo_buckets[%d]\n", i); break; } - plugin_log_msg("** Successfully drained the BO using sDMA: bo_buckets[%d] **\n", i); - ret = write_fp(bo_contents_fp, buffer, bo_info[i]->size); - if (ret) - break; } exit: - pr_info("amdgpu_plugin: Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); + pr_info("Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); if (bo_contents_fp) fclose(bo_contents_fp); @@ -939,19 +884,18 @@ void *restore_bo_contents(void *_thread_data) { struct thread_data *thread_data = (struct thread_data *)_thread_data; struct kfd_criu_bo_bucket *bo_buckets = thread_data->bo_buckets; - size_t image_size = 0, total_bo_size = 0, max_bo_size = 0; - BoEntry **bo_info = thread_data->bo_entries; + size_t image_size = 0, total_bo_size = 0, max_bo_size = 0, buffer_size; struct amdgpu_gpu_info gpu_info = { 0 }; amdgpu_device_handle h_dev; uint64_t max_copy_size; uint32_t major, minor; FILE *bo_contents_fp = NULL; - void *buffer; + void *buffer = NULL; char img_path[40]; int num_bos = 0; int i, ret = 0; - pr_info("amdgpu_plugin: Thread[0x%x] started\n", thread_data->gpu_id); + pr_info("Thread[0x%x] started\n", thread_data->gpu_id); ret = amdgpu_device_initialize(thread_data->drm_fd, &major, &minor, &h_dev); if (ret) { @@ -969,7 +913,7 @@ void *restore_bo_contents(void *_thread_data) max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE : SDMA_LINEAR_COPY_MAX_SIZE - 1; - snprintf(img_path, sizeof(img_path), IMG_PAGES_FILE, thread_data->id, thread_data->gpu_id); + snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, thread_data->id, thread_data->gpu_id); bo_contents_fp = open_img_file(img_path, false, &image_size); if (!bo_contents_fp) { pr_perror("Cannot fopen %s", img_path); @@ -977,7 +921,6 @@ void *restore_bo_contents(void *_thread_data) goto exit; } - /* Allocate buffer to fit biggest BO */ for (i = 0; i < thread_data->num_of_bos; i++) { if (bo_buckets[i].gpu_id == thread_data->gpu_id && (bo_buckets[i].alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))) { @@ -989,17 +932,17 @@ void *restore_bo_contents(void *_thread_data) } if (total_bo_size != image_size) { - pr_err("amdgpu_plugin: %s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, - total_bo_size); + pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, total_bo_size); ret = -EINVAL; goto exit; } - /* Allocate buffer to fit biggest BO */ - posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), max_bo_size); + buffer_size = kfd_max_buffer_size > 0 ? min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); if (!buffer) { - pr_perror("Failed to alloc aligned memory"); + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); ret = -ENOMEM; goto exit; } @@ -1013,11 +956,8 @@ void *restore_bo_contents(void *_thread_data) num_bos++; - ret = read_fp(bo_contents_fp, buffer, bo_info[i]->size); - if (ret) - goto exit; - - ret = sdma_copy_bo(bo_buckets, buffer, i, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE); + ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_WRITE, false); if (ret) { pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); break; @@ -1026,7 +966,7 @@ void *restore_bo_contents(void *_thread_data) } exit: - pr_info("amdgpu_plugin: Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); + pr_info("Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); if (bo_contents_fp) fclose(bo_contents_fp); @@ -1054,9 +994,9 @@ int check_hsakmt_shared_mem(uint64_t *shared_mem_size, uint32_t *shared_mem_magi /* First 4 bytes of shared file is the magic */ ret = read_file(HSAKMT_SHM_PATH, shared_mem_magic, sizeof(*shared_mem_magic)); if (ret) - pr_perror("amdgpu_plugin: Failed to read shared mem magic"); + pr_perror("Failed to read shared mem magic"); else - plugin_log_msg("amdgpu_plugin: Shared mem magic:0x%x\n", *shared_mem_magic); + plugin_log_msg("Shared mem magic:0x%x\n", *shared_mem_magic); return 0; } @@ -1071,7 +1011,7 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha return 0; if (!stat(HSAKMT_SHM_PATH, &st)) { - pr_debug("amdgpu_plugin: %s already exists\n", HSAKMT_SHM_PATH); + pr_debug("%s already exists\n", HSAKMT_SHM_PATH); } else { pr_info("Warning:%s was missing. Re-creating new file but we may lose perf counters\n", HSAKMT_SHM_PATH); @@ -1079,14 +1019,14 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha ret = ftruncate(fd, shared_mem_size); if (ret < 0) { - pr_err("amdgpu_plugin: Failed to truncate shared mem %s\n", HSAKMT_SHM); + pr_err("Failed to truncate shared mem %s\n", HSAKMT_SHM); close(fd); return -errno; } ret = write(fd, &shared_mem_magic, sizeof(shared_mem_magic)); if (ret != sizeof(shared_mem_magic)) { - pr_perror("amdgpu_plugin: Failed to restore shared mem magic"); + pr_perror("Failed to restore shared mem magic"); close(fd); return -errno; } @@ -1103,24 +1043,163 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha return 0; } -static int unpause_process(int fd) +int amdgpu_unpause_processes(int pid) { int ret = 0; struct kfd_ioctl_criu_args args = { 0 }; + struct list_head *l = get_dumped_fds(); + struct dumped_fd *st; - args.op = KFD_CRIU_OP_UNPAUSE; + list_for_each_entry(st, l, l) { + if (st->is_drm) { + close(st->fd); + } else { + args.op = KFD_CRIU_OP_UNPAUSE; - ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args); - if (ret) { - pr_perror("amdgpu_plugin: Failed to unpause process"); - goto exit; + ret = kmtIoctl(st->fd, AMDKFD_IOC_CRIU_OP, &args); + if (ret) { + pr_perror("Failed to unpause process"); + goto exit; + } + } } + if (post_dump_dmabuf_check() < 0) + ret = -1; + exit: pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret); + clear_dumped_fds(); return ret; } +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, amdgpu_unpause_processes) + +int store_dmabuf_fd(int handle, int fd) +{ + int id; + + id = fdstore_add(fd); + mutex_lock(shared_memory_mutex); + for (int i = 0; i < shared_memory->num_handles; i++) { + if (shared_memory->handles[i].handle == handle) { + mutex_unlock(shared_memory_mutex); + return 0; + } + if (shared_memory->handles[i].handle == -1) { + shared_memory->handles[i].handle = handle; + shared_memory->handles[i].fdstore_id = id; + mutex_unlock(shared_memory_mutex); + return 0; + } + } + mutex_unlock(shared_memory_mutex); + + return -1; +} + +int amdgpu_id_for_handle(int handle) +{ + mutex_lock(shared_memory_mutex); + for (int i = 0; i < shared_memory->num_handles; i++) { + if (shared_memory->handles[i].handle == handle) { + mutex_unlock(shared_memory_mutex); + return shared_memory->handles[i].fdstore_id; + } + } + mutex_unlock(shared_memory_mutex); + return -1; +} + +int amdgpu_restore_init(void) +{ + if (!shared_memory) { + int protection = PROT_READ | PROT_WRITE; + int visibility = MAP_SHARED | MAP_ANONYMOUS; + size_t img_size; + FILE *img_fp = NULL; + int ret; + unsigned char *buf; + int num_handles = 0; + char img_path[PATH_MAX]; + CriuRenderNode *rd = NULL; + CriuKfd *e = NULL; + + DIR *d; + struct dirent *dir; + d = opendir("."); + if (d) { + while ((dir = readdir(d)) != NULL) { + if (strncmp("amdgpu-kfd-", dir->d_name, strlen("amdgpu-kfd-")) == 0) { + img_fp = open_img_file(dir->d_name, false, &img_size); + buf = xmalloc(img_size); + if (!buf) { + fclose(img_fp); + return -ENOMEM; + } + + ret = read_fp(img_fp, buf, img_size); + if (ret) { + pr_perror("Unable to read from %s", img_path); + fclose(img_fp); + xfree(buf); + return ret; + } + + fclose(img_fp); + e = criu_kfd__unpack(NULL, img_size, buf); + num_handles += e->num_of_bos; + criu_kfd__free_unpacked(e, NULL); + xfree(buf); + } + if (strncmp("amdgpu-renderD-", dir->d_name, strlen("amdgpu-renderD-")) == 0) { + img_fp = open_img_file(dir->d_name, false, &img_size); + buf = xmalloc(img_size); + if (!buf) { + fclose(img_fp); + return -ENOMEM; + } + + ret = read_fp(img_fp, buf, img_size); + if (ret) { + pr_perror("Unable to read from %s", img_path); + fclose(img_fp); + xfree(buf); + return ret; + } + + fclose(img_fp); + rd = criu_render_node__unpack(NULL, img_size, buf); + num_handles += rd->num_of_bos; + criu_render_node__free_unpacked(rd, NULL); + xfree(buf); + } + } + closedir(d); + } + + if (num_handles > 0) { + shared_memory = mmap(NULL, sizeof(shared_memory), protection, visibility, -1, 0); + shared_memory->num_handles = num_handles; + shared_memory->handles = mmap(NULL, sizeof(struct handle_id) * num_handles, protection, visibility, -1, 0); + + for (int i = 0; i < num_handles; i++) { + shared_memory->handles[i].handle = -1; + shared_memory->handles[i].fdstore_id = -1; + } + + shared_memory_mutex = shmalloc(sizeof(*shared_memory_mutex)); + if (!shared_memory_mutex) { + pr_err("Can't create amdgpu mutex\n"); + return -1; + } + mutex_init(shared_memory_mutex); + } + } + + return 0; +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESTORE_INIT, amdgpu_restore_init) static int save_devices(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_device_bucket *device_buckets, CriuKfd *e) @@ -1164,6 +1243,8 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd { struct thread_data *thread_datas; int ret = 0, i; + amdgpu_device_handle h_dev; + uint32_t major, minor; pr_debug("Dumping %d BOs\n", args->num_bos); @@ -1180,13 +1261,26 @@ static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd for (i = 0; i < e->num_of_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; - BoEntry *boinfo = e->bo_entries[i]; + KfdBoEntry *boinfo = e->bo_entries[i]; boinfo->gpu_id = bo_bucket->gpu_id; boinfo->addr = bo_bucket->addr; boinfo->size = bo_bucket->size; boinfo->offset = bo_bucket->offset; boinfo->alloc_flags = bo_bucket->alloc_flags; + + ret = amdgpu_device_initialize(node_get_drm_render_device(sys_get_node_by_gpu_id(&src_topology, bo_bucket->gpu_id)), &major, &minor, &h_dev); + + boinfo->handle = get_gem_handle(h_dev, bo_bucket->dmabuf_fd); + + amdgpu_device_deinitialize(h_dev); + } + for (i = 0; i < e->num_of_bos; i++) { + KfdBoEntry *boinfo = e->bo_entries[i]; + + ret = record_shared_bo(boinfo->handle, false); + if (ret) + goto exit; } for (int i = 0; i < e->num_of_gpus; i++) { @@ -1254,7 +1348,7 @@ bool kernel_supports_criu(int fd) } if (kmtIoctl(fd, AMDKFD_IOC_GET_VERSION, &args) == -1) { - pr_perror("amdgpu_plugin: Failed to call get version ioctl"); + pr_perror("Failed to call get version ioctl"); ret = false; goto exit; } @@ -1262,8 +1356,8 @@ bool kernel_supports_criu(int fd) pr_debug("Kernel IOCTL version:%d.%02d\n", args.major_version, args.minor_version); if (args.major_version != KFD_IOCTL_MAJOR_VERSION || args.minor_version < MIN_KFD_IOCTL_MINOR_VERSION) { - pr_err("amdgpu_plugin: CR not supported on current kernel (current:%02d.%02d min:%02d.%02d)\n", - args.major_version, args.minor_version, KFD_IOCTL_MAJOR_VERSION, MIN_KFD_IOCTL_MINOR_VERSION); + pr_err("CR not supported on current kernel (current:%02d.%02d min:%02d.%02d)\n", args.major_version, + args.minor_version, KFD_IOCTL_MAJOR_VERSION, MIN_KFD_IOCTL_MINOR_VERSION); ret = false; goto exit; } @@ -1286,13 +1380,13 @@ int amdgpu_plugin_dump_file(int fd, int id) size_t len; if (fstat(fd, &st) == -1) { - pr_perror("amdgpu_plugin: fstat error"); + pr_perror("fstat error"); return -1; } ret = stat(AMDGPU_KFD_DEVICE, &st_kfd); if (ret == -1) { - pr_perror("amdgpu_plugin: fstat error for /dev/kfd"); + pr_perror("fstat error for /dev/kfd"); return -1; } @@ -1307,50 +1401,36 @@ int amdgpu_plugin_dump_file(int fd, int id) return -1; } - /* Check whether this plugin was called for kfd or render nodes */ + /* Check whether this plugin was called for kfd, dmabuf or render nodes */ + ret = get_dmabuf_info(fd, &st); + if (ret < 0) { + pr_perror("Failed to get dmabuf info"); + return -1; + } + if (ret == 0) { + pr_info("Dumping dmabuf fd = %d\n", fd); + return amdgpu_plugin_dmabuf_dump(fd, id); + } + if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) { + /* This is RenderD dumper plugin, for now just save renderD * minor number to be used during restore. In later phases this * needs to save more data for video decode etc. */ - - CriuRenderNode rd = CRIU_RENDER_NODE__INIT; - struct tp_node *tp_node; - - pr_info("amdgpu_plugin: Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev), - fd, id); - - tp_node = sys_get_node_by_render_minor(&src_topology, minor(st.st_rdev)); - if (!tp_node) { - pr_err("amdgpu_plugin: Failed to find a device with minor number = %d\n", minor(st.st_rdev)); - - return -ENODEV; - } - - rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id); - if (!rd.gpu_id) - return -ENODEV; - - len = criu_render_node__get_packed_size(&rd); - buf = xmalloc(len); - if (!buf) - return -ENOMEM; - - criu_render_node__pack(&rd, buf); - - snprintf(img_path, sizeof(img_path), IMG_RENDERD_FILE, id); - ret = write_img_file(img_path, buf, len); - if (ret) { - xfree(buf); + ret = amdgpu_plugin_drm_dump_file(fd, id, &st); + if (ret) + return ret; + + ret = record_dumped_fd(fd, true); + if (ret) return ret; - } - xfree(buf); /* Need to return success here so that criu can call plugins for renderD nodes */ - return ret; + return try_dump_dmabuf_list(); } - pr_info("amdgpu_plugin: %s : %s() called for fd = %d\n", CR_PLUGIN_DESC.name, __func__, major(st.st_rdev)); + pr_info("%s() called for fd = %d\n", __func__, major(st.st_rdev)); /* KFD only allows ioctl calls from the same process that opened the KFD file descriptor. * The existing /dev/kfd file descriptor that is passed in is only allowed to do IOCTL calls with @@ -1362,13 +1442,13 @@ int amdgpu_plugin_dump_file(int fd, int id) args.op = KFD_CRIU_OP_PROCESS_INFO; if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { - pr_perror("amdgpu_plugin: Failed to call process info ioctl"); + pr_perror("Failed to call process info ioctl"); ret = -1; goto exit; } - pr_info("amdgpu_plugin: devices:%d bos:%d objects:%d priv_data:%lld\n", args.num_devices, args.num_bos, - args.num_objects, args.priv_data_size); + pr_info("devices:%" PRIu32 " bos:%" PRIu32 " objects:%" PRIu32 " priv_data:%" PRIu64 "\n", + args.num_devices, args.num_bos, args.num_objects, args.priv_data_size); e = xmalloc(sizeof(*e)); if (!e) { @@ -1401,7 +1481,7 @@ int amdgpu_plugin_dump_file(int fd, int id) args.op = KFD_CRIU_OP_CHECKPOINT; ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args); if (ret) { - pr_perror("amdgpu_plugin: Failed to call dumper (process) ioctl"); + pr_perror("Failed to call dumper (process) ioctl"); goto exit; } @@ -1423,11 +1503,11 @@ int amdgpu_plugin_dump_file(int fd, int id) goto exit; snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id); - pr_info("amdgpu_plugin: img_path = %s\n", img_path); + pr_info("img_path = %s\n", img_path); len = criu_kfd__get_packed_size(e); - pr_info("amdgpu_plugin: Len = %ld\n", len); + pr_info("Len = %ld\n", len); buf = xmalloc(len); if (!buf) { @@ -1441,11 +1521,12 @@ int amdgpu_plugin_dump_file(int fd, int id) ret = write_img_file(img_path, buf, len); xfree(buf); -exit: - /* Restore all queues */ - unpause_process(fd); - sys_close_drm_render_devices(&src_topology); + ret = record_dumped_fd(fd, false); + if (ret) + goto exit; + +exit: xfree((void *)args.devices); xfree((void *)args.bos); xfree((void *)args.priv_data); @@ -1453,9 +1534,9 @@ exit: free_e(e); if (ret) - pr_err("amdgpu_plugin: Failed to dump (ret:%d)\n", ret); + pr_err("Failed to dump (ret:%d)\n", ret); else - pr_info("amdgpu_plugin: Dump successful\n"); + pr_info("Dump successful\n"); return ret; } @@ -1468,7 +1549,6 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e) int ret = 0, bucket_index = 0; pr_debug("Restoring %d devices\n", e->num_of_gpus); - args->num_devices = e->num_of_gpus; device_buckets = xzalloc(sizeof(*device_buckets) * args->num_devices); if (!device_buckets) @@ -1478,7 +1558,7 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e) for (int entries_i = 0; entries_i < e->num_of_cpus + e->num_of_gpus; entries_i++) { struct kfd_criu_device_bucket *device_bucket; - DeviceEntry *devinfo = e->device_entries[entries_i]; + KfdDeviceEntry *devinfo = e->device_entries[entries_i]; struct tp_node *tp_node; if (!devinfo->gpu_id) @@ -1501,10 +1581,10 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e) device_bucket->drm_fd = node_get_drm_render_device(tp_node); if (device_bucket->drm_fd < 0) { - pr_perror("amdgpu_plugin: Can't pass NULL drm render fd to driver"); + pr_perror("Can't pass NULL drm render fd to driver"); goto exit; } else { - pr_info("amdgpu_plugin: passing drm render fd = %d to driver\n", device_bucket->drm_fd); + pr_info("passing drm render fd = %d to driver\n", device_bucket->drm_fd); } } @@ -1528,7 +1608,7 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) for (int i = 0; i < args->num_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; - BoEntry *bo_entry = e->bo_entries[i]; + KfdBoEntry *bo_entry = e->bo_entries[i]; bo_bucket->gpu_id = bo_entry->gpu_id; bo_bucket->addr = bo_entry->addr; @@ -1541,19 +1621,37 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) } pr_info("Restore BOs Ok\n"); + + return 0; +} + +int save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int fd) +{ + struct vma_metadata *vma_md; + + vma_md = xmalloc(sizeof(*vma_md)); + if (!vma_md) { + return -ENOMEM; + } + + memset(vma_md, 0, sizeof(*vma_md)); + + vma_md->old_pgoff = offset; + vma_md->vma_entry = addr; + + vma_md->new_pgoff = restored_offset; + vma_md->fd = fd; + + list_add_tail(&vma_md->list, &update_vma_info_list); + return 0; } static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e) { - struct thread_data *thread_datas; + struct thread_data *thread_datas = NULL; int thread_i, ret = 0; - - thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); - if (!thread_datas) { - ret = -ENOMEM; - goto exit; - } + uint64_t offset = 0; for (int i = 0; i < e->num_of_bos; i++) { struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; @@ -1588,7 +1686,7 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf vma_md->new_pgoff = bo_bucket->restored_offset; vma_md->fd = node_get_drm_render_device(tp_node); - plugin_log_msg("amdgpu_plugin: adding vma_entry:addr:0x%lx old-off:0x%lx " + plugin_log_msg("adding vma_entry:addr:0x%lx old-off:0x%lx " "new_off:0x%lx new_minor:%d\n", vma_md->vma_entry, vma_md->old_pgoff, vma_md->new_pgoff, vma_md->new_minor); @@ -1596,56 +1694,101 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf } } - thread_i = 0; - for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { - struct tp_node *dev; - int ret_thread = 0; - uint32_t target_gpu_id; + if (!parallel_disabled) { + parallel_restore_cmd restore_cmd; + pr_info("Begin to send parallel restore cmd\n"); + ret = init_parallel_restore_cmd(e->num_of_bos, id, e->num_of_gpus, &restore_cmd); + if (ret) + goto exit_parallel; - if (!e->device_entries[i]->gpu_id) - continue; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + uint32_t target_gpu_id; + struct tp_node *dev; - /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ - target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + if (!e->device_entries[i]->gpu_id) + continue; - /* We need the fd for actual_gpu_id */ - dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); - if (!dev) { - pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); - ret = -ENODEV; + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit_parallel; + } + parallel_restore_gpu_id_add(e->device_entries[i]->gpu_id, dev->drm_render_minor, &restore_cmd); + + for (int j = 0; j < e->num_of_bos; j++) { + if (bo_buckets[j].gpu_id != e->device_entries[i]->gpu_id) + continue; + if (bo_buckets[j].alloc_flags & + (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { + parallel_restore_bo_add(bo_buckets[j].dmabuf_fd, bo_buckets[j].gpu_id, + bo_buckets[j].size, offset, &restore_cmd); + offset += bo_buckets[j].size; + } + } + } + ret = send_parallel_restore_cmd(&restore_cmd); +exit_parallel: + free_parallel_restore_cmd(&restore_cmd); + } else { + thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); + if (!thread_datas) { + ret = -ENOMEM; goto exit; } - thread_datas[thread_i].id = id; - thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; - thread_datas[thread_i].bo_buckets = bo_buckets; - thread_datas[thread_i].bo_entries = e->bo_entries; - thread_datas[thread_i].pid = e->pid; - thread_datas[thread_i].num_of_bos = e->num_of_bos; + thread_i = 0; + for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { + struct tp_node *dev; + int ret_thread = 0; + uint32_t target_gpu_id; - thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); - if (thread_datas[thread_i].drm_fd < 0) { - ret = -thread_datas[thread_i].drm_fd; - goto exit; + if (!e->device_entries[i]->gpu_id) + continue; + + /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ + target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); + + /* We need the fd for actual_gpu_id */ + dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); + if (!dev) { + pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); + ret = -ENODEV; + goto exit; + } + + thread_datas[thread_i].id = id; + thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; + thread_datas[thread_i].bo_buckets = bo_buckets; + thread_datas[thread_i].bo_entries = e->bo_entries; + thread_datas[thread_i].pid = e->pid; + thread_datas[thread_i].num_of_bos = e->num_of_bos; + + thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); + if (thread_datas[thread_i].drm_fd < 0) { + ret = -thread_datas[thread_i].drm_fd; + goto exit; + } + + ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, + (void *)&thread_datas[thread_i]); + if (ret_thread) { + pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); + ret = -ret_thread; + goto exit; + } + thread_i++; } - ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, - (void *)&thread_datas[thread_i]); - if (ret_thread) { - pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); - ret = -ret_thread; - goto exit; - } - thread_i++; - } + for (int i = 0; i < e->num_of_gpus; i++) { + pthread_join(thread_datas[i].thread, NULL); + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - for (int i = 0; i < e->num_of_gpus; i++) { - pthread_join(thread_datas[i].thread, NULL); - pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - - if (thread_datas[i].ret) { - ret = thread_datas[i].ret; - goto exit; + if (thread_datas[i].ret) { + ret = thread_datas[i].ret; + goto exit; + } } } exit: @@ -1653,12 +1796,12 @@ exit: if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD) close(bo_buckets[i].dmabuf_fd); } - - xfree(thread_datas); + if (thread_datas) + xfree(thread_datas); return ret; } -int amdgpu_plugin_restore_file(int id) +int amdgpu_plugin_restore_file(int id, bool *retry_needed) { int ret = 0, fd; char img_path[PATH_MAX]; @@ -1669,7 +1812,12 @@ int amdgpu_plugin_restore_file(int id) size_t img_size; FILE *img_fp = NULL; - pr_info("amdgpu_plugin: Initialized kfd plugin restorer with ID = %d\n", id); + *retry_needed = false; + + if (plugin_disabled) + return -ENOTSUP; + + pr_info("Initialized kfd plugin restorer with ID = %d\n", id); snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id); @@ -1683,13 +1831,22 @@ int amdgpu_plugin_restore_file(int id) * TODO: Currently, this code will only work if this function is called for /dev/kfd * first as we assume restore_maps is already filled. Need to fix this later. */ - snprintf(img_path, sizeof(img_path), IMG_RENDERD_FILE, id); - pr_info("Restoring RenderD %s\n", img_path); + snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id); img_fp = open_img_file(img_path, false, &img_size); - if (!img_fp) - return -EINVAL; - + if (!img_fp) { + ret = amdgpu_plugin_dmabuf_restore(id); + if (ret == 1) { + /* This is a dmabuf fd, but the corresponding buffer object that was + * exported to make it has not yet been restored. Need to try again + * later when the buffer object exists, so it can be re-exported. + */ + *retry_needed = true; + return 0; + } + return ret; + } + pr_info("Restoring RenderD %s\n", img_path); pr_debug("RenderD Image file size:%ld\n", img_size); buf = xmalloc(img_size); if (!buf) { @@ -1713,7 +1870,7 @@ int amdgpu_plugin_restore_file(int id) } fclose(img_fp); - pr_info("amdgpu_plugin: render node gpu_id = 0x%04x\n", rd->gpu_id); + pr_info("render node gpu_id = 0x%04x\n", rd->gpu_id); target_gpu_id = maps_get_dest_gpu(&restore_maps, rd->gpu_id); if (!target_gpu_id) { @@ -1727,11 +1884,21 @@ int amdgpu_plugin_restore_file(int id) goto fail; } - pr_info("amdgpu_plugin: render node destination gpu_id = 0x%04x\n", tp_node->gpu_id); + pr_info("render node destination gpu_id = 0x%04x\n", tp_node->gpu_id); fd = node_get_drm_render_device(tp_node); - if (fd < 0) - pr_err("amdgpu_plugin: Failed to open render device (minor:%d)\n", tp_node->drm_render_minor); + if (fd < 0) { + pr_err("Failed to open render device (minor:%d)\n", tp_node->drm_render_minor); + return -1; + } + + ret = amdgpu_plugin_drm_restore_file(fd, rd); + if (ret == 1) + *retry_needed = true; + if (ret < 0) { + fd = ret; + goto fail; + } fail: criu_render_node__free_unpacked(rd, NULL); xfree(buf); @@ -1743,7 +1910,20 @@ int amdgpu_plugin_restore_file(int id) * copy of the fd. CRIU core owns the duplicated returned fd, and amdgpu_plugin owns the fd stored in * tp_node. */ - return dup(fd); + + if (fd < 0) + return fd; + + if (!(*retry_needed)) { + fd = dup(fd); + if (fd == -1) { + pr_perror("unable to duplicate the render fd"); + return -1; + } + return fd; + } + + return 0; } fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); @@ -1752,7 +1932,7 @@ int amdgpu_plugin_restore_file(int id) return -1; } - pr_info("amdgpu_plugin: Opened kfd, fd = %d\n", fd); + pr_info("Opened kfd, fd = %d\n", fd); if (!kernel_supports_criu(fd)) return -ENOTSUP; @@ -1780,18 +1960,20 @@ int amdgpu_plugin_restore_file(int id) return -1; } - plugin_log_msg("amdgpu_plugin: read image file data\n"); + plugin_log_msg("read image file data\n"); /* * Initialize fd_next to be 1 greater than the biggest file descriptor in use by the target restore process. * This way, we know that the file descriptors we store will not conflict with file descriptors inside core * CRIU. */ - fd_next = find_unused_fd_pid(e->pid); - if (fd_next <= 0) { - pr_err("Failed to find unused fd (fd:%d)\n", fd_next); - ret = -EINVAL; - goto exit; + if (fd_next == -1) { + fd_next = find_unused_fd_pid(e->pid); + if (fd_next <= 0) { + pr_err("Failed to find unused fd (fd:%d)\n", fd_next); + ret = -EINVAL; + goto exit; + } } ret = devinfo_to_topology(e->device_entries, e->num_of_gpus + e->num_of_cpus, &src_topology); @@ -1824,14 +2006,26 @@ int amdgpu_plugin_restore_file(int id) args.num_objects = e->num_of_objects; args.priv_data_size = e->priv_data.len; args.priv_data = (uintptr_t)e->priv_data.data; - args.op = KFD_CRIU_OP_RESTORE; + if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { pr_perror("Restore ioctl failed"); ret = -1; goto exit; } + if (ret < 0) + goto exit; + + for (int i = 0; i < args.num_bos; i++) { + struct kfd_criu_bo_bucket *bo_bucket = &((struct kfd_criu_bo_bucket *)args.bos)[i]; + KfdBoEntry *bo_entry = e->bo_entries[i]; + + if (bo_entry->handle != -1) { + store_dmabuf_fd(bo_entry->handle, bo_bucket->dmabuf_fd); + } + } + ret = restore_bo_data(id, (struct kfd_criu_bo_bucket *)args.bos, e); if (ret) goto exit; @@ -1847,10 +2041,10 @@ exit: xfree(buf); if (ret) { - pr_err("amdgpu_plugin: Failed to restore (ret:%d)\n", ret); + pr_err("Failed to restore (ret:%d)\n", ret); fd = ret; } else { - pr_info("amdgpu_plugin: Restore successful (fd:%d)\n", fd); + pr_info("Restore successful (fd:%d)\n", fd); } return fd; @@ -1870,7 +2064,10 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const char *p_end; bool is_kfd = false, is_renderD = false; - plugin_log_msg("amdgpu_plugin: Enter %s\n", __func__); + if (plugin_disabled) + return -ENOTSUP; + + plugin_log_msg("Enter %s\n", __func__); strncpy(path, in_path, sizeof(path)); @@ -1903,13 +2100,18 @@ int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const if (addr == vma_md->vma_entry && old_offset == vma_md->old_pgoff) { *new_offset = vma_md->new_pgoff; - if (is_renderD) - *updated_fd = vma_md->fd; - else - *updated_fd = -1; + *updated_fd = -1; + if (is_renderD) { + int fd = dup(vma_md->fd); + if (fd == -1) { + pr_perror("unable to duplicate the render fd"); + return -1; + } + *updated_fd = fd; + } - plugin_log_msg("amdgpu_plugin: old_pgoff=0x%lx new_pgoff=0x%lx fd=%d\n", vma_md->old_pgoff, - vma_md->new_pgoff, *updated_fd); + plugin_log_msg("old_pgoff=0x%lx new_pgoff=0x%lx fd=%d\n", vma_md->old_pgoff, vma_md->new_pgoff, + *updated_fd); return 1; } @@ -1922,26 +2124,290 @@ CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, amdgpu_plugin_update_vma int amdgpu_plugin_resume_devices_late(int target_pid) { struct kfd_ioctl_criu_args args = { 0 }; - int fd, ret = 0; + int fd, exit_code = 0; - pr_info("amdgpu_plugin: Inside %s for target pid = %d\n", __func__, target_pid); + if (plugin_disabled) + return -ENOTSUP; + + if (!parallel_disabled) { + pr_info("Close parallel restore server\n"); + if (close_parallel_restore_server()) { + pr_err("Close parallel restore server fail\n"); + return -1; + } + + exit_code = pthread_join(parallel_thread, NULL); + if (exit_code) { + pr_err("Failed to join parallel thread ret:%d\n", exit_code); + return -1; + } + if (parallel_thread_result) { + pr_err("Parallel restore fail\n"); + return parallel_thread_result; + } + } + + pr_info("Inside %s for target pid = %d\n", __func__, target_pid); fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); if (fd < 0) { pr_perror("failed to open kfd in plugin"); - return -1; + return -ENOTSUP; } args.pid = target_pid; args.op = KFD_CRIU_OP_RESUME; - pr_info("amdgpu_plugin: Calling IOCTL to start notifiers and queues\n"); + pr_info("Calling IOCTL to start notifiers and queues\n"); if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { - pr_perror("restore late ioctl failed"); - ret = -1; + if (errno == ESRCH) { + pr_info("Pid %d has no kfd process info\n", target_pid); + exit_code = -ENOTSUP; + } else { + pr_perror("restore late ioctl failed"); + exit_code = -1; + } } + clear_restore_state(); + close(fd); - return ret; + return exit_code; } CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) + +int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size) +{ + int ret = 0; + int drm_fd = -1; + uint32_t major, minor; + + struct amdgpu_gpu_info gpu_info = { 0 }; + + drm_fd = open_drm_render_device(dev_minor); + if (drm_fd < 0) { + return drm_fd; + } + + ret = amdgpu_device_initialize(drm_fd, &major, &minor, h_dev); + if (ret) { + pr_perror("Failed to initialize device"); + goto err; + } + + ret = amdgpu_query_gpu_info(*h_dev, &gpu_info); + if (ret) { + pr_perror("failed to query gpuinfo via libdrm"); + goto err; + } + *max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE : + SDMA_LINEAR_COPY_MAX_SIZE - 1; + return 0; +err: + amdgpu_device_deinitialize(*h_dev); + return ret; +} + +FILE *get_bo_contents_fp(int id, int gpu_id, size_t tot_size) +{ + char img_path[PATH_MAX]; + size_t image_size = 0; + FILE *bo_contents_fp = NULL; + + snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, id, gpu_id); + bo_contents_fp = open_img_file(img_path, false, &image_size); + if (!bo_contents_fp) { + pr_perror("Cannot fopen %s", img_path); + return NULL; + } + + if (tot_size != image_size) { + pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, tot_size); + fclose(bo_contents_fp); + return NULL; + } + return bo_contents_fp; +} + +struct parallel_thread_data { + pthread_t thread; + uint32_t gpu_id; + int minor; + parallel_restore_cmd *restore_cmd; + int ret; +}; + +void *parallel_restore_bo_contents(void *_thread_data) +{ + struct parallel_thread_data *thread_data = (struct parallel_thread_data *)_thread_data; + amdgpu_device_handle h_dev; + uint64_t max_copy_size; + size_t total_bo_size = 0, max_bo_size = 0, buffer_size = 0; + FILE *bo_contents_fp = NULL; + parallel_restore_entry *entry; + parallel_restore_cmd *restore_cmd = thread_data->restore_cmd; + int ret = 0; + int offset = 0; + void *buffer = NULL; + + ret = init_dev(thread_data->minor, &h_dev, &max_copy_size); + if (ret) { + goto err; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id == thread_data->gpu_id) { + total_bo_size += restore_cmd->entries[i].size; + max_bo_size = max(restore_cmd->entries[i].size, max_bo_size); + } + } + + buffer_size = kfd_max_buffer_size > 0 ? min(kfd_max_buffer_size, max_bo_size) : max_bo_size; + + bo_contents_fp = get_bo_contents_fp(restore_cmd->cmd_head.id, thread_data->gpu_id, total_bo_size); + if (bo_contents_fp == NULL) { + ret = -1; + goto err_sdma; + } + offset = ftell(bo_contents_fp); + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); + if (!buffer) { + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); + ret = -ENOMEM; + goto err_sdma; + } + + for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { + if (restore_cmd->entries[i].gpu_id != thread_data->gpu_id) + continue; + + entry = &restore_cmd->entries[i]; + fseeko(bo_contents_fp, entry->read_offset + offset, SEEK_SET); + ret = sdma_copy_bo(restore_cmd->fds_write[entry->write_id], entry->size, bo_contents_fp, + buffer, buffer_size, h_dev, + max_copy_size, SDMA_OP_VRAM_WRITE, false); + + if (ret) { + pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); + goto err_sdma; + } + } + +err_sdma: + if (bo_contents_fp) + fclose(bo_contents_fp); + if (buffer) + xfree(buffer); + amdgpu_device_deinitialize(h_dev); +err: + thread_data->ret = ret; + return NULL; +} + +void *restore_device_parallel_worker(void *arg) +{ + while (1) { + parallel_restore_cmd restore_cmd = { 0 }; + struct parallel_thread_data *thread_datas = NULL; + int ret; + int error_occurred = 0, join_ret = 0, created_threads = 0; + + ret = recv_parallel_restore_cmd(&restore_cmd); + if (ret) { + if (ret == 1) { + *(int *)arg = 0; + goto exit; + } + goto err; + } + + thread_datas = xzalloc(sizeof(*thread_datas) * restore_cmd.cmd_head.gpu_num); + if (!thread_datas) { + ret = -ENOMEM; + goto err; + } + + for (; created_threads < restore_cmd.cmd_head.gpu_num; created_threads++) { + thread_datas[created_threads].gpu_id = restore_cmd.gpu_ids[created_threads].gpu_id; + thread_datas[created_threads].minor = restore_cmd.gpu_ids[created_threads].minor; + thread_datas[created_threads].restore_cmd = &restore_cmd; + + ret = pthread_create(&thread_datas[created_threads].thread, NULL, parallel_restore_bo_contents, + (void *)&thread_datas[created_threads]); + if (ret) { + pr_err("Failed to create thread[0x%x] ret:%d\n", thread_datas[created_threads].gpu_id, ret); + error_occurred = 1; + break; + } + } + + for (int i = 0; i < created_threads; i++) { + join_ret = pthread_join(thread_datas[i].thread, NULL); + if (join_ret != 0) { + pr_err("pthread_join failed for Thread[0x%x] ret:%d\n", + thread_datas[i].gpu_id, join_ret); + if (!error_occurred) { + ret = join_ret; + error_occurred = 1; + } + } + + pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); + + /* Check thread return value */ + if (thread_datas[i].ret && !error_occurred) { + ret = thread_datas[i].ret; + error_occurred = 1; + } + } + + if (thread_datas) + xfree(thread_datas); +err: + free_parallel_restore_cmd(&restore_cmd); + + if (ret) { + *(int *)arg = ret; + return NULL; + } + } +exit: + return NULL; +} + +/* + * While the background thread is running, some processing functions (e.g., stop_cgroupd) + * in the main thread need to block SIGCHLD. To prevent interference from this background + * thread, SIGCHLD is blocked in this thread. + */ +static int back_thread_create(pthread_t *newthread, void *(*f)(void *), void *arg) +{ + int ret = 0; + sigset_t blockmask, oldmask; + + sigemptyset(&blockmask); + sigaddset(&blockmask, SIGCHLD); + sigprocmask(SIG_BLOCK, &blockmask, &oldmask); + + ret = pthread_create(newthread, NULL, f, arg); + if (ret) { + pr_err("Create worker thread fail: %d\n", ret); + return -1; + } + + sigprocmask(SIG_SETMASK, &oldmask, NULL); + return 0; +} + +int amdgpu_plugin_post_forking(void) +{ + if (plugin_disabled) + return -ENOTSUP; + + if (parallel_disabled) + return 0; + + return back_thread_create(¶llel_thread, restore_device_parallel_worker, ¶llel_thread_result); +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking) diff --git a/plugins/amdgpu/amdgpu_plugin_dmabuf.c b/plugins/amdgpu/amdgpu_plugin_dmabuf.c new file mode 100644 index 000000000..11c9792e3 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_dmabuf.c @@ -0,0 +1,197 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "common/list.h" +#include "criu-amdgpu.pb-c.h" + +#include "xmalloc.h" +#include "criu-log.h" +#include "amdgpu_plugin_drm.h" +#include "amdgpu_plugin_util.h" +#include "amdgpu_plugin_dmabuf.h" +#include "fdstore.h" + +#include "util.h" +#include "common/scm.h" + +struct dmabuf { + int id; + int dmabuf_fd; + struct list_head node; +}; + +static LIST_HEAD(dmabuf_list); + +/* Return < 0 for error, > 0 for "not a dmabuf" and 0 "is a dmabuf" */ +int get_dmabuf_info(int fd, struct stat *st) +{ + char path[PATH_MAX]; + + if (read_fd_link(fd, path, sizeof(path)) < 0) + return -1; + + if (strncmp(path, DMABUF_LINK, strlen(DMABUF_LINK)) != 0) + return 1; + + return 0; +} + +int __amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id) +{ + int ret = 0; + char path[PATH_MAX]; + size_t len = 0; + unsigned char *buf = NULL; + int gem_handle; + + gem_handle = handle_for_shared_bo_fd(dmabuf_fd); + if (gem_handle < 0) { + pr_err("Failed to get handle for dmabuf_fd = %d\n", dmabuf_fd); + return -EAGAIN; /* Retry needed */ + } + + CriuDmabufNode *node = xmalloc(sizeof(*node)); + if (!node) { + pr_err("Failed to allocate memory for dmabuf node\n"); + return -ENOMEM; + } + criu_dmabuf_node__init(node); + + node->gem_handle = gem_handle; + + if (node->gem_handle < 0) { + pr_err("Failed to get handle for dmabuf_fd\n"); + xfree(node); + return -EINVAL; + } + + /* Serialize metadata to a file */ + snprintf(path, sizeof(path), IMG_DMABUF_FILE, id); + len = criu_dmabuf_node__get_packed_size(node); + buf = xmalloc(len); + if (!buf) { + pr_err("Failed to allocate buffer for dmabuf metadata\n"); + xfree(node); + return -ENOMEM; + } + criu_dmabuf_node__pack(node, buf); + ret = write_img_file(path, buf, len); + + xfree(buf); + xfree(node); + return ret; +} + +int amdgpu_plugin_dmabuf_restore(int id) +{ + char path[PATH_MAX]; + size_t img_size; + FILE *img_fp = NULL; + int ret = 0; + CriuDmabufNode *rd = NULL; + unsigned char *buf = NULL; + int fd_id; + + snprintf(path, sizeof(path), IMG_DMABUF_FILE, id); + + /* Read serialized metadata */ + img_fp = open_img_file(path, false, &img_size); + if (!img_fp) { + pr_err("Failed to open dmabuf metadata file: %s\n", path); + return -EINVAL; + } + + pr_debug("dmabuf Image file size:%ld\n", img_size); + buf = xmalloc(img_size); + if (!buf) { + pr_perror("Failed to allocate memory"); + return -ENOMEM; + } + + ret = read_fp(img_fp, buf, img_size); + if (ret) { + pr_perror("Unable to read from %s", path); + xfree(buf); + return ret; + } + + rd = criu_dmabuf_node__unpack(NULL, img_size, buf); + if (rd == NULL) { + pr_perror("Unable to parse the dmabuf message %d", id); + xfree(buf); + fclose(img_fp); + return -1; + } + fclose(img_fp); + + /* Match GEM handle with shared_dmabuf list */ + fd_id = amdgpu_id_for_handle(rd->gem_handle); + if (fd_id == -1) { + pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", rd->gem_handle); + return 1; + } + + int dmabuf_fd = fdstore_get(fd_id); + if (dmabuf_fd == -1) { + pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", rd->gem_handle); + return 1; /* Retry needed */ + } + + pr_info("Restored dmabuf_fd = %d for GEM handle = %d\n", dmabuf_fd, rd->gem_handle); + ret = dmabuf_fd; + + pr_info("Successfully restored dmabuf_fd %d\n", dmabuf_fd); + criu_dmabuf_node__free_unpacked(rd, NULL); + xfree(buf); + return ret; +} + +int amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id) +{ + int ret; + + ret = __amdgpu_plugin_dmabuf_dump(dmabuf_fd, id); + if (ret == -EAGAIN) { + struct dmabuf *b = xmalloc(sizeof(*b)); + b->id = id; + b->dmabuf_fd = dmabuf_fd; + list_add(&b->node, &dmabuf_list); + return 0; + } + return ret; +} + +int try_dump_dmabuf_list() +{ + struct dmabuf *b, *t; + list_for_each_entry_safe(b, t, &dmabuf_list, node) { + int ret = __amdgpu_plugin_dmabuf_dump(b->dmabuf_fd, b->id); + if (ret == -EAGAIN) + continue; + if (ret) + return ret; + list_del(&b->node); + xfree(b); + } + return 0; +} + +int post_dump_dmabuf_check() +{ + if (!list_empty(&dmabuf_list)) { + pr_err("Not all dma buffers have been dumped\n"); + return -1; + } + return 0; +} diff --git a/plugins/amdgpu/amdgpu_plugin_dmabuf.h b/plugins/amdgpu/amdgpu_plugin_dmabuf.h new file mode 100644 index 000000000..f07af7ee0 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_dmabuf.h @@ -0,0 +1,16 @@ + +#ifndef __AMDGPU_PLUGIN_DMABUF_H__ +#define __AMDGPU_PLUGIN_DMABUF_H__ + +#include "amdgpu_plugin_util.h" +#include "criu-amdgpu.pb-c.h" + +int amdgpu_plugin_dmabuf_dump(int fd, int id); +int amdgpu_plugin_dmabuf_restore(int id); + +int try_dump_dmabuf_list(); +int post_dump_dmabuf_check(); + +int get_dmabuf_info(int fd, struct stat *st); + +#endif /* __AMDGPU_PLUGIN_DMABUF_H__ */ \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c new file mode 100644 index 000000000..3520bca7a --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_drm.c @@ -0,0 +1,569 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "common/list.h" +#include "files.h" +#include "fdstore.h" + +#include "criu-amdgpu.pb-c.h" + +/* Define __user as empty for kernel headers in user-space */ +#define __user +#include "drm.h" + +#include +#include + +#include "xmalloc.h" +#include "amdgpu_drm.h" +#include "amdgpu_plugin_drm.h" +#include "amdgpu_plugin_util.h" +#include "amdgpu_plugin_topology.h" + +#include "util.h" +#include "common/scm.h" + +int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd) +{ + uint32_t handle; + int fd = amdgpu_device_get_fd(h_dev); + + if (dmabuf_fd == -1) { + return -1; + } + + if (drmPrimeFDToHandle(fd, dmabuf_fd, &handle)) + return -1; + + return handle; +} + +int drmIoctl(int fd, unsigned long request, void *arg) +{ + int ret, max_retries = 200; + + do { + ret = ioctl(fd, request, arg); + } while (ret == -1 && max_retries-- > 0 && (errno == EINTR || errno == EAGAIN)); + + if (ret == -1 && errno == EBADF) + /* In case pthread_atfork didn't catch it, this will + * make any subsequent hsaKmt calls fail in CHECK_KFD_OPEN. + */ + pr_perror("KFD file descriptor not valid in this process"); + return ret; +} + +static int allocate_bo_entries(CriuRenderNode *e, int num_bos) +{ + e->bo_entries = xmalloc(sizeof(DrmBoEntry *) * num_bos); + if (!e->bo_entries) { + pr_err("Failed to allocate bo_info\n"); + return -ENOMEM; + } + + for (int i = 0; i < num_bos; i++) { + DrmBoEntry *entry = xzalloc(sizeof(*entry)); + + if (!entry) { + pr_err("Failed to allocate botest\n"); + return -ENOMEM; + } + + drm_bo_entry__init(entry); + + e->bo_entries[i] = entry; + e->n_bo_entries++; + } + return 0; +} + +static int allocate_vm_entries(DrmBoEntry *e, int num_vms) +{ + e->vm_entries = xmalloc(sizeof(DrmVmEntry *) * num_vms); + if (!e->vm_entries) { + pr_err("Failed to allocate bo_info\n"); + return -ENOMEM; + } + + for (int i = 0; i < num_vms; i++) { + DrmVmEntry *entry = xzalloc(sizeof(*entry)); + + if (!entry) { + pr_err("Failed to allocate botest\n"); + return -ENOMEM; + } + + drm_vm_entry__init(entry); + + e->vm_entries[i] = entry; + e->n_vm_entries++; + } + return 0; +} + +static void free_e(CriuRenderNode *e) +{ + for (int i = 0; i < e->n_bo_entries; i++) { + if (e->bo_entries[i]) + xfree(e->bo_entries[i]); + } + + xfree(e); +} + +int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st) +{ + char path[PATH_MAX]; + struct stat drm; + int ret = 0; + + snprintf(path, sizeof(path), AMDGPU_DRM_DEVICE, DRM_FIRST_RENDER_NODE); + ret = stat(path, &drm); + if (ret == -1) { + pr_err("Error in getting stat for: %s\n", path); + return ret; + } + + if ((major(st->st_rdev) != major(drm.st_rdev)) || + (minor(st->st_rdev) < minor(drm.st_rdev)) || + (minor(st->st_rdev) > DRM_LAST_RENDER_NODE)) { + pr_err("Can't handle VMA mapping of input device\n"); + return -ENOTSUP; + } + + pr_debug("AMD DRI(maj,min) = %d:%d VMA Device FD(maj,min) = %d:%d\n", + major(drm.st_rdev), minor(drm.st_rdev), + major(st->st_rdev), minor(st->st_rdev)); + + return 0; +} + +static int restore_bo_contents_drm(int drm_render_minor, CriuRenderNode *rd, int drm_fd, int *dmabufs) +{ + size_t image_size = 0, max_bo_size = 0, buffer_size; + struct amdgpu_gpu_info gpu_info = { 0 }; + amdgpu_device_handle h_dev; + uint64_t max_copy_size; + uint32_t major, minor; + FILE *bo_contents_fp = NULL; + void *buffer = NULL; + char img_path[40]; + int i, ret = 0; + + ret = amdgpu_device_initialize(drm_fd, &major, &minor, &h_dev); + if (ret) { + pr_perror("failed to initialize device"); + goto exit; + } + plugin_log_msg("libdrm initialized successfully\n"); + + ret = amdgpu_query_gpu_info(h_dev, &gpu_info); + if (ret) { + pr_perror("failed to query gpuinfo via libdrm"); + goto exit; + } + + max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE : + SDMA_LINEAR_COPY_MAX_SIZE - 1; + + for (i = 0; i < rd->num_of_bos; i++) { + if (rd->bo_entries[i]->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT)) { + if (rd->bo_entries[i]->size > max_bo_size) + max_bo_size = rd->bo_entries[i]->size; + } + } + + buffer_size = max_bo_size; + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); + if (!buffer) { + pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); + ret = -ENOMEM; + goto exit; + } + + for (i = 0; i < rd->num_of_bos; i++) { + if (!(rd->bo_entries[i]->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT))) + continue; + + if (rd->bo_entries[i]->num_of_vms == 0) + continue; + + snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, drm_render_minor, i); + + bo_contents_fp = open_img_file(img_path, false, &image_size); + + ret = sdma_copy_bo(dmabufs[i], rd->bo_entries[i]->size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, + SDMA_OP_VRAM_WRITE, true); + if (ret) { + pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); + break; + } + plugin_log_msg("** Successfully filled the BO using sDMA: bo_buckets[%d] **\n", i); + + if (bo_contents_fp) + fclose(bo_contents_fp); + } + +exit: + for (int i = 0; i < rd->num_of_bos; i++) { + if (dmabufs[i] != KFD_INVALID_FD) + close(dmabufs[i]); + } + + xfree(buffer); + + amdgpu_device_deinitialize(h_dev); + return ret; +} + +int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm) +{ + CriuRenderNode *rd = NULL; + char path[PATH_MAX]; + unsigned char *buf; + int minor; + int len; + int ret; + size_t image_size; + struct tp_node *tp_node; + struct drm_amdgpu_gem_list_handles list_handles_args = { 0 }; + struct drm_amdgpu_gem_list_handles_entry *list_handles_entries; + int num_bos; + + rd = xmalloc(sizeof(*rd)); + if (!rd) { + ret = -ENOMEM; + goto exit; + } + criu_render_node__init(rd); + + /* Get the topology node of the DRM device */ + minor = minor(drm->st_rdev); + rd->drm_render_minor = minor; + rd->id = id; + + num_bos = 8; + list_handles_entries = xzalloc(sizeof(struct drm_amdgpu_gem_list_handles_entry) * num_bos); + list_handles_args.num_entries = num_bos; + list_handles_args.entries = (uintptr_t)list_handles_entries; + + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES, &list_handles_args); + if (ret && errno == EINVAL) { + pr_info("This kernel appears not to have AMDGPU_GEM_LIST_HANDLES ioctl. Consider disabling Dmabuf IPC or updating your kernel.\n"); + list_handles_args.num_entries = 0; + } else if (ret) { + pr_perror("Failed to call bo info ioctl"); + goto exit; + } + + if (list_handles_args.num_entries > num_bos) { + num_bos = list_handles_args.num_entries; + xfree(list_handles_entries); + list_handles_entries = xzalloc(sizeof(struct drm_amdgpu_gem_list_handles_entry) * num_bos); + list_handles_args.num_entries = num_bos; + list_handles_args.entries = (uintptr_t)list_handles_entries; + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES, &list_handles_args); + if (ret) { + pr_perror("Failed to call bo info ioctl"); + goto exit; + } + } else { + num_bos = list_handles_args.num_entries; + } + + rd->num_of_bos = num_bos; + ret = allocate_bo_entries(rd, num_bos); + if (ret) + goto exit; + + for (int i = 0; i < num_bos; i++) { + int num_vm_entries = 8; + struct drm_amdgpu_gem_vm_entry *vm_info_entries; + struct drm_amdgpu_gem_op vm_info_args = { 0 }; + DrmBoEntry *boinfo = rd->bo_entries[i]; + struct drm_amdgpu_gem_list_handles_entry handle_entry = list_handles_entries[i]; + union drm_amdgpu_gem_mmap mmap_args = { 0 }; + int dmabuf_fd; + uint32_t major, minor; + amdgpu_device_handle h_dev; + void *buffer = NULL; + char img_path[40]; + FILE *bo_contents_fp = NULL; + int device_fd; + + boinfo->size = handle_entry.size; + + boinfo->alloc_flags = handle_entry.alloc_flags; + boinfo->preferred_domains = handle_entry.preferred_domains; + boinfo->alignment = handle_entry.alignment; + boinfo->handle = handle_entry.gem_handle; + boinfo->is_import = (handle_entry.flags & AMDGPU_GEM_LIST_HANDLES_FLAG_IS_IMPORT) || shared_bo_has_exporter(boinfo->handle); + + mmap_args.in.handle = boinfo->handle; + + if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &mmap_args) == -1) { + pr_perror("Error Failed to call mmap ioctl"); + ret = -1; + goto exit; + } + + boinfo->offset = mmap_args.out.addr_ptr; + + vm_info_entries = xzalloc(sizeof(struct drm_amdgpu_gem_vm_entry) * num_vm_entries); + vm_info_args.handle = handle_entry.gem_handle; + vm_info_args.num_entries = num_vm_entries; + vm_info_args.value = (uintptr_t)vm_info_entries; + vm_info_args.op = AMDGPU_GEM_OP_GET_MAPPING_INFO; + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_OP, &vm_info_args); + if (ret) { + pr_perror("Failed to call vm info ioctl"); + goto exit; + } + + if (vm_info_args.num_entries > num_vm_entries) { + num_vm_entries = vm_info_args.num_entries; + xfree(vm_info_entries); + vm_info_entries = xzalloc(sizeof(struct drm_amdgpu_gem_vm_entry) * num_vm_entries); + vm_info_args.handle = handle_entry.gem_handle; + vm_info_args.num_entries = num_vm_entries; + vm_info_args.value = (uintptr_t)vm_info_entries; + vm_info_args.op = AMDGPU_GEM_OP_GET_MAPPING_INFO; + ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_OP, &vm_info_args); + if (ret) { + pr_perror("Failed to call vm info ioctl"); + goto exit; + } + } else { + num_vm_entries = vm_info_args.num_entries; + } + + boinfo->num_of_vms = num_vm_entries; + ret = allocate_vm_entries(boinfo, num_vm_entries); + if (ret) + goto exit; + + for (int j = 0; j < num_vm_entries; j++) { + DrmVmEntry *vminfo = boinfo->vm_entries[j]; + + boinfo->addr = vm_info_entries[j].addr; + vminfo->addr = vm_info_entries[j].addr; + vminfo->size = vm_info_entries[j].size; + vminfo->offset = vm_info_entries[j].offset; + vminfo->flags = vm_info_entries[j].flags; + } + + ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev); + + device_fd = amdgpu_device_get_fd(h_dev); + + drmPrimeHandleToFD(device_fd, boinfo->handle, 0, &dmabuf_fd); + + snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, rd->drm_render_minor, i); + bo_contents_fp = open_img_file(img_path, true, &image_size); + + posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), handle_entry.size); + + ret = sdma_copy_bo(dmabuf_fd, handle_entry.size, bo_contents_fp, buffer, handle_entry.size, h_dev, 0x1000, + SDMA_OP_VRAM_READ, false); + + if (dmabuf_fd != KFD_INVALID_FD) + close(dmabuf_fd); + + if (bo_contents_fp) + fclose(bo_contents_fp); + + ret = amdgpu_device_deinitialize(h_dev); + if (ret) + goto exit; + + xfree(vm_info_entries); + } + xfree(list_handles_entries); + + for (int i = 0; i < num_bos; i++) { + DrmBoEntry *boinfo = rd->bo_entries[i]; + + ret = record_shared_bo(boinfo->handle, boinfo->is_import); + if (ret) + goto exit; + } + + tp_node = sys_get_node_by_render_minor(&src_topology, minor); + if (!tp_node) { + pr_err("Failed to find a device with minor number = %d\n", minor); + return -ENODEV; + } + + /* Get the GPU_ID of the DRM device */ + rd->gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id); + if (!rd->gpu_id) { + pr_err("Failed to find valid gpu_id for the device = %d\n", rd->gpu_id); + return -ENODEV; + } + + len = criu_render_node__get_packed_size(rd); + buf = xmalloc(len); + if (!buf) + return -ENOMEM; + + criu_render_node__pack(rd, buf); + + snprintf(path, sizeof(path), IMG_DRM_FILE, id); + ret = write_img_file(path, buf, len); + + xfree(buf); +exit: + free_e(rd); + return ret; +} + +int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) +{ + int ret = 0; + bool retry_needed = false; + uint32_t major, minor; + amdgpu_device_handle h_dev; + int device_fd; + int *dmabufs = xzalloc(sizeof(int) * rd->num_of_bos); + + ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev); + if (ret) { + pr_info("Error in init amdgpu device\n"); + goto exit; + } + + device_fd = amdgpu_device_get_fd(h_dev); + + for (int i = 0; i < rd->num_of_bos; i++) { + DrmBoEntry *boinfo = rd->bo_entries[i]; + int dmabuf_fd = -1; + uint32_t handle; + struct drm_gem_change_handle change_args = { 0 }; + union drm_amdgpu_gem_mmap mmap_args = { 0 }; + struct drm_amdgpu_gem_va va_args = { 0 }; + int fd_id; + + if (work_already_completed(boinfo->handle, rd->drm_render_minor)) { + continue; + } else if (boinfo->handle != -1) { + if (boinfo->is_import) { + fd_id = amdgpu_id_for_handle(boinfo->handle); + if (fd_id == -1) { + retry_needed = true; + continue; + } + dmabuf_fd = fdstore_get(fd_id); + } + } + + if (boinfo->is_import) { + drmPrimeFDToHandle(device_fd, dmabuf_fd, &handle); + } else { + union drm_amdgpu_gem_create create_args = { 0 }; + + create_args.in.bo_size = boinfo->size; + create_args.in.alignment = boinfo->alignment; + create_args.in.domains = boinfo->preferred_domains; + create_args.in.domain_flags = boinfo->alloc_flags; + + if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &create_args) == -1) { + pr_perror("Error Failed to call create ioctl"); + ret = -1; + goto exit; + } + handle = create_args.out.handle; + + drmPrimeHandleToFD(device_fd, handle, 0, &dmabuf_fd); + } + + change_args.handle = handle; + change_args.new_handle = boinfo->handle; + + if (drmIoctl(fd, DRM_IOCTL_GEM_CHANGE_HANDLE, &change_args) == -1) { + pr_perror("Error Failed to call change ioctl; check if the kernel has DRM_IOCTL_GEM_CHANGE_HANDLE support"); + ret = -1; + goto exit; + } + + if (!boinfo->is_import) + store_dmabuf_fd(boinfo->handle, dmabuf_fd); + + dmabufs[i] = dmabuf_fd; + + ret = record_completed_work(boinfo->handle, rd->drm_render_minor); + if (ret) + goto exit; + + mmap_args.in.handle = boinfo->handle; + + if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &mmap_args) == -1) { + pr_perror("Error Failed to call mmap ioctl"); + ret = -1; + goto exit; + } + + for (int j = 0; j < boinfo->num_of_vms; j++) { + DrmVmEntry *vminfo = boinfo->vm_entries[j]; + + va_args.handle = boinfo->handle; + va_args.operation = AMDGPU_VA_OP_MAP; + va_args.flags = vminfo->flags; + va_args.va_address = vminfo->addr; + va_args.offset_in_bo = vminfo->offset; + va_args.map_size = vminfo->size; + + if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_VA, &va_args) == -1) { + pr_perror("Error Failed to call gem va ioctl"); + ret = -1; + goto exit; + } + } + + ret = save_vma_updates(boinfo->offset, boinfo->addr, mmap_args.out.addr_ptr, fd); + if (ret < 0) + goto exit; + } + + if (ret) { + pr_info("Error in deinit amdgpu device\n"); + goto exit; + } + + ret = record_completed_work(-1, rd->drm_render_minor); + if (ret) + goto exit; + + ret = amdgpu_device_deinitialize(h_dev); + + if (rd->num_of_bos > 0) { + ret = restore_bo_contents_drm(rd->drm_render_minor, rd, fd, dmabufs); + if (ret) + goto exit; + } + +exit: + if (ret < 0) + return ret; + xfree(dmabufs); + + return retry_needed; +} diff --git a/plugins/amdgpu/amdgpu_plugin_drm.h b/plugins/amdgpu/amdgpu_plugin_drm.h new file mode 100644 index 000000000..c766def56 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_drm.h @@ -0,0 +1,40 @@ +#ifndef __AMDGPU_PLUGIN_DRM_H__ +#define __AMDGPU_PLUGIN_DRM_H__ + +#include +#include "common/list.h" + +#include "xmalloc.h" +#include "criu-log.h" +#include "kfd_ioctl.h" +#include "amdgpu_plugin_util.h" +#include "amdgpu_plugin_topology.h" + + +/** + * Determines if VMA's of input file descriptor belong to amdgpu's + * DRM device and are therefore supported + */ +int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm); + +/** + * Serialize meta-data about a particular DRM device, its number of BOs, + * etc into a file. The serialized filename has in it the value ID that + * is passed in as a parameter + */ +int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm); + +int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd); + +int amdgpu_plugin_drm_unpause_file(int fd); + +int amdgpu_id_for_handle(int handle); + +int store_dmabuf_fd(int handle, int fd); + +int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd); + +int save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int gpu_id); + +#endif /* __AMDGPU_PLUGIN_DRM_H__ */ + diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c index 42689933e..730f2e028 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ b/plugins/amdgpu/amdgpu_plugin_topology.c @@ -16,34 +16,11 @@ #include "xmalloc.h" #include "kfd_ioctl.h" +#include "amdgpu_plugin_util.h" #include "amdgpu_plugin_topology.h" #define TOPOLOGY_PATH "/sys/class/kfd/kfd/topology/nodes/" - -#ifndef _GNU_SOURCE -#define _GNU_SOURCE 1 -#endif - -#ifdef COMPILE_TESTS -#undef pr_err -#define pr_err(format, arg...) fprintf(stdout, "%s:%d ERROR:" format, __FILE__, __LINE__, ##arg) -#undef pr_info -#define pr_info(format, arg...) fprintf(stdout, "%s:%d INFO:" format, __FILE__, __LINE__, ##arg) -#undef pr_debug -#define pr_debug(format, arg...) fprintf(stdout, "%s:%d DBG:" format, __FILE__, __LINE__, ##arg) - -#undef pr_perror -#define pr_perror(format, arg...) \ - fprintf(stdout, "%s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) -#endif - -#ifdef DEBUG -#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__) -#else -#define plugin_log_msg(fmt, ...) \ - { \ - } -#endif +#define MAX_PARAMETER_LEN 64 /* User override options */ /* Skip firmware version check */ @@ -68,7 +45,7 @@ bool kfd_capability_check = true; */ int fd_next = -1; -static int open_drm_render_device(int minor) +int open_drm_render_device(int minor) { char path[128]; int fd, ret_fd; @@ -441,7 +418,9 @@ struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id static bool get_prop(char *line, char *name, uint64_t *value) { - if (sscanf(line, " %29s %lu", name, value) != 2) + char format[16]; + sprintf(format, " %%%ds %%lu", MAX_PARAMETER_LEN); + if (sscanf(line, format, name, value) != 2) return false; return true; } @@ -461,7 +440,7 @@ static int parse_topo_node_properties(struct tp_node *dev, const char *dir_path) } while (fgets(line, sizeof(line), file)) { - char name[30]; + char name[MAX_PARAMETER_LEN + 1]; uint64_t value; memset(name, 0, sizeof(name)); @@ -589,7 +568,7 @@ static int parse_topo_node_mem_banks(struct tp_node *node, const char *dir_path) } while (fgets(line, sizeof(line), file)) { - char name[30]; + char name[MAX_PARAMETER_LEN + 1]; uint64_t value; memset(name, 0, sizeof(name)); @@ -678,7 +657,7 @@ static int parse_topo_node_iolinks(struct tp_node *node, const char *dir_path) } while (fgets(line, sizeof(line), file)) { - char name[30]; + char name[MAX_PARAMETER_LEN + 1]; uint64_t value; memset(name, 0, sizeof(name)); @@ -840,6 +819,9 @@ void topology_free(struct tp_system *sys) list_del(&p2pgroup->listm_system); xfree(p2pgroup); } + + /* Update Topology as being freed */ + sys->parsed = false; } /** @@ -1063,7 +1045,7 @@ static bool iolink_match(struct tp_iolink *src, struct tp_iolink *dest) * * Nodes compatibility are determined by: * 1. Comparing the node properties - * 2. Making sure iolink mappings to CPUs would be compabitle with existing iolink mappings in maps + * 2. Making sure iolink mappings to CPUs would be compatible with existing iolink mappings in maps * * If src_node and dest_node are mappable, then map_device will push the new mapping * for src_node -> dest_node into new_maps. @@ -1241,7 +1223,7 @@ static bool map_devices(struct tp_system *src_sys, struct tp_system *dest_sys, s return true; } else { /* We could not map remaining nodes in the list. Add dest node back - * to list and try to map next dest ndoe in list to current src + * to list and try to map next dest node in list to current src * node. */ pr_debug("Nodes after [0x%04X -> 0x%04X] did not match, " @@ -1461,3 +1443,15 @@ int set_restore_gpu_maps(struct tp_system *src_sys, struct tp_system *dest_sys, return ret; } + +int topology_gpu_count(struct tp_system *sys) +{ + struct tp_node *node; + int count = 0; + + list_for_each_entry(node, &sys->nodes, listm_system) + if (NODE_IS_GPU(node)) + count++; + return count; +} + diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h index 9d99cda1c..e19f8e7ce 100644 --- a/plugins/amdgpu/amdgpu_plugin_topology.h +++ b/plugins/amdgpu/amdgpu_plugin_topology.h @@ -107,6 +107,8 @@ int topology_parse(struct tp_system *topology, const char *msg); int topology_determine_iolinks(struct tp_system *sys); void topology_print(const struct tp_system *sys, const char *msg); +int topology_gpu_count(struct tp_system *topology); + struct id_map *maps_add_gpu_entry(struct device_maps *maps, const uint32_t src_id, const uint32_t dest_id); struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id); @@ -116,6 +118,7 @@ struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32 struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor); struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index); +int open_drm_render_device(int minor); int node_get_drm_render_device(struct tp_node *node); void sys_close_drm_render_devices(struct tp_system *sys); diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c new file mode 100644 index 000000000..592562474 --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_util.c @@ -0,0 +1,330 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "common/list.h" + +#include +#include + +#include "criu-plugin.h" +#include "plugin.h" +#include "criu-amdgpu.pb-c.h" + +#include "img-streamer.h" +#include "image.h" +#include "cr_options.h" + +#include "xmalloc.h" +#include "criu-log.h" +#include "kfd_ioctl.h" +#include "amdgpu_drm.h" +#include "amdgpu_plugin_util.h" +#include "amdgpu_plugin_topology.h" +#include "amdgpu_plugin_drm.h" + +static LIST_HEAD(dumped_fds); +static LIST_HEAD(shared_bos); +static LIST_HEAD(completed_work); + +/* Helper structures to encode device topology of SRC and DEST platforms */ +struct tp_system src_topology; +struct tp_system dest_topology; + +/* Helper structures to encode device maps during Checkpoint and Restore operations */ +struct device_maps checkpoint_maps; +struct device_maps restore_maps; + +int record_dumped_fd(int fd, bool is_drm) +{ + int newfd = dup(fd); + + if (newfd < 0) + return newfd; + struct dumped_fd *st = malloc(sizeof(struct dumped_fd)); + if (!st) + return -1; + st->fd = newfd; + st->is_drm = is_drm; + list_add(&st->l, &dumped_fds); + + return 0; +} + +struct list_head *get_dumped_fds() +{ + return &dumped_fds; +} + +bool shared_bo_has_exporter(int handle) +{ + struct shared_bo *bo; + + if (handle == -1) + return false; + + list_for_each_entry(bo, &shared_bos, l) { + if (bo->handle == handle) { + return bo->has_exporter; + } + } + + return false; +} + +int record_shared_bo(int handle, bool is_imported) +{ + struct shared_bo *bo; + + if (handle == -1) + return 0; + + list_for_each_entry(bo, &shared_bos, l) { + if (bo->handle == handle) { + return 0; + } + } + bo = malloc(sizeof(struct shared_bo)); + if (!bo) + return -1; + bo->handle = handle; + bo->has_exporter = !is_imported; + list_add(&bo->l, &shared_bos); + + return 0; +} + +int handle_for_shared_bo_fd(int fd) +{ + struct dumped_fd *df; + int trial_handle; + amdgpu_device_handle h_dev; + uint32_t major, minor; + struct shared_bo *bo; + + list_for_each_entry(df, &dumped_fds, l) { + /* see if the gem handle for fd using the hdev for df->fd is the + same as bo->handle. */ + + if (!df->is_drm) { + continue; + } + + if (amdgpu_device_initialize(df->fd, &major, &minor, &h_dev)) { + pr_err("Failed to initialize amdgpu device\n"); + continue; + } + + trial_handle = get_gem_handle(h_dev, fd); + if (trial_handle < 0) + continue; + + list_for_each_entry(bo, &shared_bos, l) { + if (bo->handle == trial_handle) + return trial_handle; + } + + amdgpu_device_deinitialize(h_dev); + } + + return -1; +} + +int record_completed_work(int handle, int id) +{ + struct restore_completed_work *work; + + work = malloc(sizeof(struct restore_completed_work)); + if (!work) + return -1; + work->handle = handle; + work->id = id; + list_add(&work->l, &completed_work); + + return 0; +} + +bool work_already_completed(int handle, int id) +{ + struct restore_completed_work *work; + + list_for_each_entry(work, &completed_work, l) { + if (work->handle == handle && work->id == id) { + return true; + } + } + + return false; +} + +void clear_restore_state() +{ + while (!list_empty(&completed_work)) { + struct restore_completed_work *st = list_first_entry(&completed_work, struct restore_completed_work, l); + list_del(&st->l); + free(st); + } +} + +void clear_dumped_fds() +{ + while (!list_empty(&dumped_fds)) { + struct dumped_fd *st = list_first_entry(&dumped_fds, struct dumped_fd, l); + list_del(&st->l); + close(st->fd); + free(st); + } +} + +int read_fp(FILE *fp, void *buf, const size_t buf_len) +{ + size_t len_read; + + len_read = fread(buf, 1, buf_len, fp); + if (len_read != buf_len) { + pr_err("Unable to read file (read:%ld buf_len:%ld)\n", len_read, buf_len); + return -EIO; + } + return 0; +} + +int write_fp(FILE *fp, const void *buf, const size_t buf_len) +{ + size_t len_write; + + len_write = fwrite(buf, 1, buf_len, fp); + if (len_write != buf_len) { + pr_err("Unable to write file (wrote:%ld buf_len:%ld)\n", len_write, buf_len); + return -EIO; + } + return 0; +} + +/** + * @brief Open an image file + * + * We store the size of the actual contents in the first 8-bytes of + * the file. This allows us to determine the file size when using + * criu_image_streamer when fseek and fstat are not available. The + * FILE * returned is already at the location of the first actual + * contents. + * + * @param path The file path + * @param write False for read, true for write + * @param size Size of actual contents + * @return FILE *if successful, NULL if failed + */ +FILE *open_img_file(char *path, bool write, size_t *size) +{ + FILE *fp = NULL; + int fd, ret; + + if (opts.stream) + fd = img_streamer_open(path, write ? O_DUMP : O_RSTR); + else + fd = openat(criu_get_image_dir(), path, write ? (O_WRONLY | O_CREAT) : O_RDONLY, 0600); + + if (fd < 0) { + pr_err("%s: Failed to open for %s\n", path, write ? "write" : "read"); + return NULL; + } + + fp = fdopen(fd, write ? "w" : "r"); + if (!fp) { + pr_err("%s: Failed get pointer for %s\n", path, write ? "write" : "read"); + return NULL; + } + + if (write) + ret = write_fp(fp, size, sizeof(*size)); + else + ret = read_fp(fp, size, sizeof(*size)); + + if (ret) { + pr_err("%s:Failed to access file size\n", path); + fclose(fp); + return NULL; + } + + pr_debug("%s:Opened file for %s with size:%ld\n", path, write ? "write" : "read", *size); + return fp; +} + +int read_file(const char *file_path, void *buf, const size_t buf_len) +{ + int ret; + FILE *fp; + + fp = fopen(file_path, "r"); + if (!fp) { + pr_err("Cannot fopen %s\n", file_path); + return -errno; + } + + ret = read_fp(fp, buf, buf_len); + fclose(fp); /* this will also close fd */ + return ret; +} + + +/** + * @brief Write an image file + * + * We store the size of the actual contents in the first 8-bytes of the file. This allows us to + * determine the file size when using criu_image_streamer when fseek and fstat are not available. + * + * @param path The file path + * @param buf pointer to data to be written + * @param buf_len size of buf + * @return 0 if successful. -errno on failure + */ +int write_img_file(char *path, const void *buf, const size_t buf_len) +{ + int ret; + FILE *fp; + size_t len = buf_len; + + fp = open_img_file(path, true, &len); + if (!fp) + return -errno; + + ret = write_fp(fp, buf, buf_len); + fclose(fp); /* this will also close fd */ + return ret; +} + +void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list) +{ + struct kfd_criu_bo_bucket *bo; + + pr_info("\n"); + for (int idx = 0; idx < bo_cnt; idx++) { + bo = &bo_list[idx]; + pr_info("\n"); + pr_info("%s(), %d. KFD BO Addr: %" PRIx64 " \n", __func__, idx, bo->addr); + pr_info("%s(), %d. KFD BO Size: %" PRIx64 " \n", __func__, idx, bo->size); + pr_info("%s(), %d. KFD BO Offset: %" PRIx64 " \n", __func__, idx, bo->offset); + pr_info("%s(), %d. KFD BO Restored Offset: %" PRIx64 " \n", __func__, idx, bo->restored_offset); + pr_info("%s(), %d. KFD BO Alloc Flags: %x \n", __func__, idx, bo->alloc_flags); + pr_info("%s(), %d. KFD BO Gpu ID: %x \n", __func__, idx, bo->gpu_id); + pr_info("%s(), %d. KFD BO Dmabuf FD: %x \n", __func__, idx, bo->dmabuf_fd); + pr_info("\n"); + } + pr_info("\n"); +} diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h new file mode 100644 index 000000000..f5f752d0b --- /dev/null +++ b/plugins/amdgpu/amdgpu_plugin_util.h @@ -0,0 +1,145 @@ +#ifndef __AMDGPU_PLUGIN_UTIL_H__ +#define __AMDGPU_PLUGIN_UTIL_H__ + +#include + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif + +#ifdef COMPILE_TESTS +#undef pr_err +#define pr_err(format, arg...) fprintf(stdout, "%s:%d ERROR:" format, __FILE__, __LINE__, ##arg) +#undef pr_info +#define pr_info(format, arg...) fprintf(stdout, "%s:%d INFO:" format, __FILE__, __LINE__, ##arg) +#undef pr_debug +#define pr_debug(format, arg...) fprintf(stdout, "%s:%d DBG:" format, __FILE__, __LINE__, ##arg) + +#undef pr_perror +#define pr_perror(format, arg...) \ + fprintf(stdout, "%s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) +#endif + +#ifdef LOG_PREFIX +#undef LOG_PREFIX +#endif +#define LOG_PREFIX "amdgpu_plugin: " + +#ifdef DEBUG +#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__) +#else +#define plugin_log_msg(fmt, ...) \ + { \ + } +#endif + + +/* Path where KFD device is surfaced */ +#define AMDGPU_KFD_DEVICE "/dev/kfd" + +/* Path where DRM devices are surfaced */ +#define AMDGPU_DRM_DEVICE "/dev/dri/renderD%d" + +/* Minimum version of KFD IOCTL's that supports C&R */ +#define KFD_IOCTL_MAJOR_VERSION 1 +#define MIN_KFD_IOCTL_MINOR_VERSION 8 + +/* Name of file having serialized data of KFD device */ +#define IMG_KFD_FILE "amdgpu-kfd-%d.img" + +/* Name of file having serialized data of KFD buffer objects (BOs) */ +#define IMG_KFD_PAGES_FILE "amdgpu-pages-%d-%04x.img" + +/* Name of file having serialized data of DRM device */ +#define IMG_DRM_FILE "amdgpu-renderD-%d.img" + +/* Name of file having serialized data of dmabuf meta */ +#define IMG_DMABUF_FILE "amdgpu-dmabuf_%d.img" + +/* Name of file having serialized data of DRM device buffer objects (BOs) */ +#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%d-%04x.img" + +/* Helper macros to Checkpoint and Restore a ROCm file */ +#define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem" +#define HSAKMT_SHM "/hsakmt_shared_mem" +#define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore" +#define HSAKMT_SEM "hsakmt_semaphore" +#define DMABUF_LINK "/dmabuf" + +/* Help macros to build sDMA command packets */ +#define SDMA_PACKET(op, sub_op, e) ((((e)&0xFFFF) << 16) | (((sub_op)&0xFF) << 8) | (((op)&0xFF) << 0)) + +#define SDMA_OPCODE_COPY 1 +#define SDMA_COPY_SUB_OPCODE_LINEAR 0 +#define SDMA_NOP 0 +#define SDMA_LINEAR_COPY_MAX_SIZE (1ULL << 21) + +enum sdma_op_type { + SDMA_OP_VRAM_READ, + SDMA_OP_VRAM_WRITE, +}; + +struct dumped_fd { + struct list_head l; + int fd; + bool is_drm; +}; + +struct shared_bo { + struct list_head l; + int handle; + bool has_exporter; +}; + +struct restore_completed_work { + struct list_head l; + int handle; + int id; +}; + +/* Helper structures to encode device topology of SRC and DEST platforms */ +extern struct tp_system src_topology; +extern struct tp_system dest_topology; + +/* Helper structures to encode device maps during Checkpoint and Restore operations */ +extern struct device_maps checkpoint_maps; +extern struct device_maps restore_maps; + +extern int fd_next; + +extern bool kfd_fw_version_check; +extern bool kfd_sdma_fw_version_check; +extern bool kfd_caches_count_check; +extern bool kfd_num_gws_check; +extern bool kfd_vram_size_check; +extern bool kfd_numa_check; +extern bool kfd_capability_check; + +int read_fp(FILE *fp, void *buf, const size_t buf_len); +int write_fp(FILE *fp, const void *buf, const size_t buf_len); +int read_file(const char *file_path, void *buf, const size_t buf_len); +int write_img_file(char *path, const void *buf, const size_t buf_len); +FILE *open_img_file(char *path, bool write, size_t *size); + +int record_dumped_fd(int fd, bool is_drm); +struct list_head *get_dumped_fds(); +void clear_dumped_fds(); + +bool shared_bo_has_exporter(int handle); +int record_shared_bo(int handle, bool is_imported); +int handle_for_shared_bo_fd(int dmabuf_fd); + +int record_completed_work(int handle, int id); +bool work_already_completed(int handle, int id); + +void clear_restore_state(); + +void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list); + +int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp, + void *buffer, size_t buffer_size, amdgpu_device_handle h_dev, + uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free); + +int serve_out_dmabuf_fd(int handle, int fd); + +#endif /* __AMDGPU_PLUGIN_UTIL_H__ */ diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c new file mode 100644 index 000000000..c8bf6d1ba --- /dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.c @@ -0,0 +1,320 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "amdgpu_socket_utils.h" +#include "criu-log.h" +#include "common/scm.h" +#include "fdstore.h" +#include "util-pie.h" +#include "util.h" + +int parallel_socket_addr_len; +struct sockaddr_un parallel_socket_addr; +int parallel_socket_id = 0; + +static void amdgpu_socket_name_gen(struct sockaddr_un *addr, int *len) +{ + addr->sun_family = AF_UNIX; + snprintf(addr->sun_path, UNIX_PATH_MAX, "x/criu-amdgpu-parallel-%s", criu_run_id); + *len = SUN_LEN(addr); + *addr->sun_path = '\0'; +} + +int install_parallel_sock(void) +{ + int ret = 0; + int sock_fd; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("socket creation failed"); + return -1; + } + + amdgpu_socket_name_gen(¶llel_socket_addr, ¶llel_socket_addr_len); + ret = bind(sock_fd, (struct sockaddr *)¶llel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("bind failed"); + goto err; + } + + ret = listen(sock_fd, SOMAXCONN); + if (ret < 0) { + pr_perror("listen failed"); + goto err; + } + + parallel_socket_id = fdstore_add(sock_fd); + if (parallel_socket_id < 0) { + ret = -1; + goto err; + } +err: + close(sock_fd); + return ret; +} + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd) +{ + parallel_restore_entry *restore_entry = &restore_cmd->entries[restore_cmd->cmd_head.entry_num]; + restore_entry->gpu_id = gpu_id; + restore_entry->write_id = restore_cmd->cmd_head.fd_write_num; + restore_entry->write_offset = 0; + restore_entry->read_offset = offset; + restore_entry->size = size; + + restore_cmd->fds_write[restore_cmd->cmd_head.fd_write_num] = dmabuf_fd; + + restore_cmd->cmd_head.entry_num += 1; + restore_cmd->cmd_head.fd_write_num += 1; +} + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd) +{ + restore_cmd->gpu_ids[restore_cmd->cmd_head.gpu_num] = (parallel_gpu_info){ gpu_id, minor }; + restore_cmd->cmd_head.gpu_num += 1; +} + +static int send_metadata(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + return -1; + } + return 0; +} + +static int send_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Send GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_cmds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send(sock_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Send parallel restore command fail"); + return -1; + } + return 0; +} + +static int send_dmabuf_fds(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (send_fds(sock_fd, NULL, 0, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Send dmabuf fds fail"); + return -1; + } + return 0; +} + +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd; + int ret = 0; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)¶llel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + ret = send_metadata(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_gpu_ids(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_cmds(sock_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = send_dmabuf_fds(sock_fd, restore_cmd); + +err: + close(sock_fd); + return ret; +} + +int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd) +{ + restore_cmd->cmd_head.id = id; + restore_cmd->cmd_head.fd_write_num = 0; + restore_cmd->cmd_head.entry_num = 0; + restore_cmd->cmd_head.gpu_num = 0; + + restore_cmd->gpu_ids = xzalloc(gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + if (restore_cmd->gpu_ids) + xfree(restore_cmd->gpu_ids); + if (restore_cmd->fds_write) + xfree(restore_cmd->fds_write); + if (restore_cmd->entries) + xfree(restore_cmd->entries); +} + +static int init_parallel_restore_cmd_by_head(parallel_restore_cmd *restore_cmd) +{ + restore_cmd->gpu_ids = xzalloc(restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info)); + if (!restore_cmd->gpu_ids) + return -ENOMEM; + restore_cmd->fds_write = xzalloc(restore_cmd->cmd_head.fd_write_num * sizeof(int)); + if (!restore_cmd->fds_write) + return -ENOMEM; + restore_cmd->entries = xzalloc(restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry)); + if (!restore_cmd->entries) + return -ENOMEM; + return 0; +} + +static int check_quit_cmd(parallel_restore_cmd *restore_cmd) +{ + return restore_cmd->cmd_head.fd_write_num == 0; +} + +static int recv_metadata(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Recv parallel restore command head fail"); + return -1; + } + return 0; +} + +static int recv_cmds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(client_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { + pr_perror("Recv parallel restore command fail"); + return -1; + } + return 0; +} + +static int recv_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { + pr_perror("Send GPU ids of parallel restore command fail"); + return -1; + } + return 0; +} + +static int recv_dmabuf_fds(int client_fd, parallel_restore_cmd *restore_cmd) +{ + if (recv_fds(client_fd, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { + pr_perror("Recv dmabuf fds fail"); + return -1; + } + return 0; +} + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) +{ + int sock_fd, client_fd; + int ret = 0; + + sock_fd = fdstore_get(parallel_socket_id); + if (sock_fd < 0) + return -1; + + client_fd = accept(sock_fd, NULL, NULL); + if (client_fd < 0) { + ret = client_fd; + goto err_accept; + } + + ret = recv_metadata(client_fd, restore_cmd); + if (ret) { + goto err; + } + + // Return 1 to quit + if (check_quit_cmd(restore_cmd)) { + ret = 1; + goto err; + } + + ret = init_parallel_restore_cmd_by_head(restore_cmd); + if (ret) { + goto err; + } + + ret = recv_gpu_ids(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_cmds(client_fd, restore_cmd); + if (ret) { + goto err; + } + + ret = recv_dmabuf_fds(client_fd, restore_cmd); + +err: + close(client_fd); +err_accept: + close(sock_fd); + return ret; +} + +int close_parallel_restore_server(void) +{ + int sock_fd; + int ret = 0; + parallel_restore_cmd_head cmd_head; + + sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); + if (sock_fd < 0) { + pr_perror("Socket creation failed"); + return -1; + } + + ret = connect(sock_fd, (struct sockaddr *)¶llel_socket_addr, parallel_socket_addr_len); + if (ret < 0) { + pr_perror("Connect failed"); + goto err; + } + + memset(&cmd_head, 0, sizeof(parallel_restore_cmd_head)); + if (send(sock_fd, &cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { + pr_perror("Send parallel restore command head fail"); + return -1; + } + +err: + close(sock_fd); + return ret; +} \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h new file mode 100644 index 000000000..d7200c6bd --- /dev/null +++ b/plugins/amdgpu/amdgpu_socket_utils.h @@ -0,0 +1,54 @@ +#ifndef __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ +#define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ + +typedef struct { + int id; + int fd_write_num; /* The number of buffer objects to be restored. */ + int entry_num; /* The number of restore commands.*/ + int gpu_num; +} parallel_restore_cmd_head; + +typedef struct { + int gpu_id; + int minor; +} parallel_gpu_info; + +typedef struct { + int gpu_id; + int write_id; + uint64_t read_offset; + uint64_t write_offset; + uint64_t size; +} parallel_restore_entry; + +typedef struct { + parallel_restore_cmd_head cmd_head; + int *fds_write; + parallel_gpu_info *gpu_ids; + parallel_restore_entry *entries; +} parallel_restore_cmd; + +/* + * For parallel_restore, a background thread in the main CRIU process is used to restore the GPU + * buffer object. However, initially, the ownership of these buffer objects and the metadata for + * restoration are all with the target process. Therefore, we introduce a series of functions to + * help the target process send these tasks to the main CRIU process. + */ +int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd); + +void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +int install_parallel_sock(void); + +int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); + +void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, + parallel_restore_cmd *restore_cmd); + +void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd); + +int close_parallel_restore_server(void); + +#endif \ No newline at end of file diff --git a/plugins/amdgpu/criu-amdgpu.proto b/plugins/amdgpu/criu-amdgpu.proto index 81d00d3ff..7682a8f21 100644 --- a/plugins/amdgpu/criu-amdgpu.proto +++ b/plugins/amdgpu/criu-amdgpu.proto @@ -5,7 +5,7 @@ message dev_iolink { required uint32 node_to_id = 2; } -message device_entry { +message kfd_device_entry { required uint32 node_id = 1; required uint32 gpu_id = 2; required uint32 cpu_cores_count = 3; @@ -40,27 +40,56 @@ message device_entry { repeated dev_iolink iolinks = 32; } -message bo_entry { - required uint64 addr = 1; - required uint64 size = 2; - required uint64 offset = 3; +message kfd_bo_entry { + required uint64 addr = 1; + required uint64 size = 2; + required uint64 offset = 3; required uint32 alloc_flags = 4; required uint32 gpu_id = 5; + required uint32 handle = 6; } message criu_kfd { required uint32 pid = 1; required uint32 num_of_gpus = 2; required uint32 num_of_cpus = 3; - repeated device_entry device_entries = 4; - required uint64 num_of_bos = 5; - repeated bo_entry bo_entries = 6; - required uint32 num_of_objects = 7; + repeated kfd_device_entry device_entries = 4; + required uint64 num_of_bos = 5; + repeated kfd_bo_entry bo_entries = 6; + required uint32 num_of_objects = 7; required uint64 shared_mem_size = 8; required uint32 shared_mem_magic = 9; required bytes priv_data = 10; } +message drm_bo_entry { + required uint64 addr = 1; + required uint64 size = 2; + required uint64 offset = 3; + required uint64 alloc_flags = 4; + required uint64 alignment = 5; + required uint32 preferred_domains = 6; + required uint32 handle = 7; + required uint32 is_import = 8; + required uint32 num_of_vms = 9; + repeated drm_vm_entry vm_entries = 10; +} + +message drm_vm_entry { + required uint64 addr = 1; + required uint64 size = 2; + required uint64 offset = 3; + required uint64 flags = 4; +} + message criu_render_node { required uint32 gpu_id = 1; + required uint32 id = 2; + required uint32 drm_render_minor = 3; + required uint64 num_of_bos = 4; + repeated drm_bo_entry bo_entries = 5; +} + +message criu_dmabuf_node { + required uint32 gem_handle = 1; } diff --git a/plugins/amdgpu/drm.h b/plugins/amdgpu/drm.h new file mode 100644 index 000000000..3cd5cf15e --- /dev/null +++ b/plugins/amdgpu/drm.h @@ -0,0 +1,1476 @@ +/* + * Header for the Direct Rendering Manager + * + * Author: Rickard E. (Rik) Faith + * + * Acknowledgments: + * Dec 1999, Richard Henderson , move to generic cmpxchg. + */ + +/* + * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. + * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. + * All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _DRM_H_ +#define _DRM_H_ + +#if defined(__KERNEL__) + +#include +#include +typedef unsigned int drm_handle_t; + +#elif defined(__linux__) + +#include +#include +typedef unsigned int drm_handle_t; + +#else /* One of the BSDs */ + +#include +#include +#include +typedef int8_t __s8; +typedef uint8_t __u8; +typedef int16_t __s16; +typedef uint16_t __u16; +typedef int32_t __s32; +typedef uint32_t __u32; +typedef int64_t __s64; +typedef uint64_t __u64; +typedef size_t __kernel_size_t; +typedef unsigned long drm_handle_t; + +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_NAME "drm" /**< Name in kernel, /dev, and /proc */ +#define DRM_MIN_ORDER 5 /**< At least 2^5 bytes = 32 bytes */ +#define DRM_MAX_ORDER 22 /**< Up to 2^22 bytes = 4MB */ +#define DRM_RAM_PERCENT 10 /**< How much system ram can we lock? */ + +#define _DRM_LOCK_HELD 0x80000000U /**< Hardware lock is held */ +#define _DRM_LOCK_CONT 0x40000000U /**< Hardware lock is contended */ +#define _DRM_LOCK_IS_HELD(lock) ((lock) & _DRM_LOCK_HELD) +#define _DRM_LOCK_IS_CONT(lock) ((lock) & _DRM_LOCK_CONT) +#define _DRM_LOCKING_CONTEXT(lock) ((lock) & ~(_DRM_LOCK_HELD|_DRM_LOCK_CONT)) + +typedef unsigned int drm_context_t; +typedef unsigned int drm_drawable_t; +typedef unsigned int drm_magic_t; + +/* + * Cliprect. + * + * \warning: If you change this structure, make sure you change + * XF86DRIClipRectRec in the server as well + * + * \note KW: Actually it's illegal to change either for + * backwards-compatibility reasons. + */ +struct drm_clip_rect { + unsigned short x1; + unsigned short y1; + unsigned short x2; + unsigned short y2; +}; + +/* + * Drawable information. + */ +struct drm_drawable_info { + unsigned int num_rects; + struct drm_clip_rect *rects; +}; + +/* + * Texture region, + */ +struct drm_tex_region { + unsigned char next; + unsigned char prev; + unsigned char in_use; + unsigned char padding; + unsigned int age; +}; + +/* + * Hardware lock. + * + * The lock structure is a simple cache-line aligned integer. To avoid + * processor bus contention on a multiprocessor system, there should not be any + * other data stored in the same cache line. + */ +struct drm_hw_lock { + __volatile__ unsigned int lock; /**< lock variable */ + char padding[60]; /**< Pad to cache line */ +}; + +/* + * DRM_IOCTL_VERSION ioctl argument type. + * + * \sa drmGetVersion(). + */ +struct drm_version { + int version_major; /**< Major version */ + int version_minor; /**< Minor version */ + int version_patchlevel; /**< Patch level */ + __kernel_size_t name_len; /**< Length of name buffer */ + char __user *name; /**< Name of driver */ + __kernel_size_t date_len; /**< Length of date buffer */ + char __user *date; /**< User-space buffer to hold date */ + __kernel_size_t desc_len; /**< Length of desc buffer */ + char __user *desc; /**< User-space buffer to hold desc */ +}; + +/* + * DRM_IOCTL_GET_UNIQUE ioctl argument type. + * + * \sa drmGetBusid() and drmSetBusId(). + */ +struct drm_unique { + __kernel_size_t unique_len; /**< Length of unique */ + char __user *unique; /**< Unique name for driver instantiation */ +}; + +struct drm_list { + int count; /**< Length of user-space structures */ + struct drm_version __user *version; +}; + +struct drm_block { + int unused; +}; + +/* + * DRM_IOCTL_CONTROL ioctl argument type. + * + * \sa drmCtlInstHandler() and drmCtlUninstHandler(). + */ +struct drm_control { + enum { + DRM_ADD_COMMAND, + DRM_RM_COMMAND, + DRM_INST_HANDLER, + DRM_UNINST_HANDLER + } func; + int irq; +}; + +/* + * Type of memory to map. + */ +enum drm_map_type { + _DRM_FRAME_BUFFER = 0, /**< WC (no caching), no core dump */ + _DRM_REGISTERS = 1, /**< no caching, no core dump */ + _DRM_SHM = 2, /**< shared, cached */ + _DRM_AGP = 3, /**< AGP/GART */ + _DRM_SCATTER_GATHER = 4, /**< Scatter/gather memory for PCI DMA */ + _DRM_CONSISTENT = 5 /**< Consistent memory for PCI DMA */ +}; + +/* + * Memory mapping flags. + */ +enum drm_map_flags { + _DRM_RESTRICTED = 0x01, /**< Cannot be mapped to user-virtual */ + _DRM_READ_ONLY = 0x02, + _DRM_LOCKED = 0x04, /**< shared, cached, locked */ + _DRM_KERNEL = 0x08, /**< kernel requires access */ + _DRM_WRITE_COMBINING = 0x10, /**< use write-combining if available */ + _DRM_CONTAINS_LOCK = 0x20, /**< SHM page that contains lock */ + _DRM_REMOVABLE = 0x40, /**< Removable mapping */ + _DRM_DRIVER = 0x80 /**< Managed by driver */ +}; + +struct drm_ctx_priv_map { + unsigned int ctx_id; /**< Context requesting private mapping */ + void *handle; /**< Handle of map */ +}; + +/* + * DRM_IOCTL_GET_MAP, DRM_IOCTL_ADD_MAP and DRM_IOCTL_RM_MAP ioctls + * argument type. + * + * \sa drmAddMap(). + */ +struct drm_map { + unsigned long offset; /**< Requested physical address (0 for SAREA)*/ + unsigned long size; /**< Requested physical size (bytes) */ + enum drm_map_type type; /**< Type of memory to map */ + enum drm_map_flags flags; /**< Flags */ + void *handle; /**< User-space: "Handle" to pass to mmap() */ + /**< Kernel-space: kernel-virtual address */ + int mtrr; /**< MTRR slot used */ + /* Private data */ +}; + +/* + * DRM_IOCTL_GET_CLIENT ioctl argument type. + */ +struct drm_client { + int idx; /**< Which client desired? */ + int auth; /**< Is client authenticated? */ + unsigned long pid; /**< Process ID */ + unsigned long uid; /**< User ID */ + unsigned long magic; /**< Magic */ + unsigned long iocs; /**< Ioctl count */ +}; + +enum drm_stat_type { + _DRM_STAT_LOCK, + _DRM_STAT_OPENS, + _DRM_STAT_CLOSES, + _DRM_STAT_IOCTLS, + _DRM_STAT_LOCKS, + _DRM_STAT_UNLOCKS, + _DRM_STAT_VALUE, /**< Generic value */ + _DRM_STAT_BYTE, /**< Generic byte counter (1024bytes/K) */ + _DRM_STAT_COUNT, /**< Generic non-byte counter (1000/k) */ + + _DRM_STAT_IRQ, /**< IRQ */ + _DRM_STAT_PRIMARY, /**< Primary DMA bytes */ + _DRM_STAT_SECONDARY, /**< Secondary DMA bytes */ + _DRM_STAT_DMA, /**< DMA */ + _DRM_STAT_SPECIAL, /**< Special DMA (e.g., priority or polled) */ + _DRM_STAT_MISSED /**< Missed DMA opportunity */ + /* Add to the *END* of the list */ +}; + +/* + * DRM_IOCTL_GET_STATS ioctl argument type. + */ +struct drm_stats { + unsigned long count; + struct { + unsigned long value; + enum drm_stat_type type; + } data[15]; +}; + +/* + * Hardware locking flags. + */ +enum drm_lock_flags { + _DRM_LOCK_READY = 0x01, /**< Wait until hardware is ready for DMA */ + _DRM_LOCK_QUIESCENT = 0x02, /**< Wait until hardware quiescent */ + _DRM_LOCK_FLUSH = 0x04, /**< Flush this context's DMA queue first */ + _DRM_LOCK_FLUSH_ALL = 0x08, /**< Flush all DMA queues first */ + /* These *HALT* flags aren't supported yet + -- they will be used to support the + full-screen DGA-like mode. */ + _DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */ + _DRM_HALT_CUR_QUEUES = 0x20 /**< Halt all current queues */ +}; + +/* + * DRM_IOCTL_LOCK, DRM_IOCTL_UNLOCK and DRM_IOCTL_FINISH ioctl argument type. + * + * \sa drmGetLock() and drmUnlock(). + */ +struct drm_lock { + int context; + enum drm_lock_flags flags; +}; + +/* + * DMA flags + * + * \warning + * These values \e must match xf86drm.h. + * + * \sa drm_dma. + */ +enum drm_dma_flags { + /* Flags for DMA buffer dispatch */ + _DRM_DMA_BLOCK = 0x01, /**< + * Block until buffer dispatched. + * + * \note The buffer may not yet have + * been processed by the hardware -- + * getting a hardware lock with the + * hardware quiescent will ensure + * that the buffer has been + * processed. + */ + _DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */ + _DRM_DMA_PRIORITY = 0x04, /**< High priority dispatch */ + + /* Flags for DMA buffer request */ + _DRM_DMA_WAIT = 0x10, /**< Wait for free buffers */ + _DRM_DMA_SMALLER_OK = 0x20, /**< Smaller-than-requested buffers OK */ + _DRM_DMA_LARGER_OK = 0x40 /**< Larger-than-requested buffers OK */ +}; + +/* + * DRM_IOCTL_ADD_BUFS and DRM_IOCTL_MARK_BUFS ioctl argument type. + * + * \sa drmAddBufs(). + */ +struct drm_buf_desc { + int count; /**< Number of buffers of this size */ + int size; /**< Size in bytes */ + int low_mark; /**< Low water mark */ + int high_mark; /**< High water mark */ + enum { + _DRM_PAGE_ALIGN = 0x01, /**< Align on page boundaries for DMA */ + _DRM_AGP_BUFFER = 0x02, /**< Buffer is in AGP space */ + _DRM_SG_BUFFER = 0x04, /**< Scatter/gather memory buffer */ + _DRM_FB_BUFFER = 0x08, /**< Buffer is in frame buffer */ + _DRM_PCI_BUFFER_RO = 0x10 /**< Map PCI DMA buffer read-only */ + } flags; + unsigned long agp_start; /**< + * Start address of where the AGP buffers are + * in the AGP aperture + */ +}; + +/* + * DRM_IOCTL_INFO_BUFS ioctl argument type. + */ +struct drm_buf_info { + int count; /**< Entries in list */ + struct drm_buf_desc __user *list; +}; + +/* + * DRM_IOCTL_FREE_BUFS ioctl argument type. + */ +struct drm_buf_free { + int count; + int __user *list; +}; + +/* + * Buffer information + * + * \sa drm_buf_map. + */ +struct drm_buf_pub { + int idx; /**< Index into the master buffer list */ + int total; /**< Buffer size */ + int used; /**< Amount of buffer in use (for DMA) */ + void __user *address; /**< Address of buffer */ +}; + +/* + * DRM_IOCTL_MAP_BUFS ioctl argument type. + */ +struct drm_buf_map { + int count; /**< Length of the buffer list */ +#ifdef __cplusplus + void __user *virt; +#else + void __user *virtual; /**< Mmap'd area in user-virtual */ +#endif + struct drm_buf_pub __user *list; /**< Buffer information */ +}; + +/* + * DRM_IOCTL_DMA ioctl argument type. + * + * Indices here refer to the offset into the buffer list in drm_buf_get. + * + * \sa drmDMA(). + */ +struct drm_dma { + int context; /**< Context handle */ + int send_count; /**< Number of buffers to send */ + int __user *send_indices; /**< List of handles to buffers */ + int __user *send_sizes; /**< Lengths of data to send */ + enum drm_dma_flags flags; /**< Flags */ + int request_count; /**< Number of buffers requested */ + int request_size; /**< Desired size for buffers */ + int __user *request_indices; /**< Buffer information */ + int __user *request_sizes; + int granted_count; /**< Number of buffers granted */ +}; + +enum drm_ctx_flags { + _DRM_CONTEXT_PRESERVED = 0x01, + _DRM_CONTEXT_2DONLY = 0x02 +}; + +/* + * DRM_IOCTL_ADD_CTX ioctl argument type. + * + * \sa drmCreateContext() and drmDestroyContext(). + */ +struct drm_ctx { + drm_context_t handle; + enum drm_ctx_flags flags; +}; + +/* + * DRM_IOCTL_RES_CTX ioctl argument type. + */ +struct drm_ctx_res { + int count; + struct drm_ctx __user *contexts; +}; + +/* + * DRM_IOCTL_ADD_DRAW and DRM_IOCTL_RM_DRAW ioctl argument type. + */ +struct drm_draw { + drm_drawable_t handle; +}; + +/* + * DRM_IOCTL_UPDATE_DRAW ioctl argument type. + */ +typedef enum { + DRM_DRAWABLE_CLIPRECTS +} drm_drawable_info_type_t; + +struct drm_update_draw { + drm_drawable_t handle; + unsigned int type; + unsigned int num; + unsigned long long data; +}; + +/* + * DRM_IOCTL_GET_MAGIC and DRM_IOCTL_AUTH_MAGIC ioctl argument type. + */ +struct drm_auth { + drm_magic_t magic; +}; + +/* + * DRM_IOCTL_IRQ_BUSID ioctl argument type. + * + * \sa drmGetInterruptFromBusID(). + */ +struct drm_irq_busid { + int irq; /**< IRQ number */ + int busnum; /**< bus number */ + int devnum; /**< device number */ + int funcnum; /**< function number */ +}; + +enum drm_vblank_seq_type { + _DRM_VBLANK_ABSOLUTE = 0x0, /**< Wait for specific vblank sequence number */ + _DRM_VBLANK_RELATIVE = 0x1, /**< Wait for given number of vblanks */ + /* bits 1-6 are reserved for high crtcs */ + _DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e, + _DRM_VBLANK_EVENT = 0x4000000, /**< Send event instead of blocking */ + _DRM_VBLANK_FLIP = 0x8000000, /**< Scheduled buffer swap should flip */ + _DRM_VBLANK_NEXTONMISS = 0x10000000, /**< If missed, wait for next vblank */ + _DRM_VBLANK_SECONDARY = 0x20000000, /**< Secondary display controller */ + _DRM_VBLANK_SIGNAL = 0x40000000 /**< Send signal instead of blocking, unsupported */ +}; +#define _DRM_VBLANK_HIGH_CRTC_SHIFT 1 + +#define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE) +#define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \ + _DRM_VBLANK_SECONDARY | _DRM_VBLANK_NEXTONMISS) + +struct drm_wait_vblank_request { + enum drm_vblank_seq_type type; + unsigned int sequence; + unsigned long signal; +}; + +struct drm_wait_vblank_reply { + enum drm_vblank_seq_type type; + unsigned int sequence; + long tval_sec; + long tval_usec; +}; + +/* + * DRM_IOCTL_WAIT_VBLANK ioctl argument type. + * + * \sa drmWaitVBlank(). + */ +union drm_wait_vblank { + struct drm_wait_vblank_request request; + struct drm_wait_vblank_reply reply; +}; + +#define _DRM_PRE_MODESET 1 +#define _DRM_POST_MODESET 2 + +/* + * DRM_IOCTL_MODESET_CTL ioctl argument type + * + * \sa drmModesetCtl(). + */ +struct drm_modeset_ctl { + __u32 crtc; + __u32 cmd; +}; + +/* + * DRM_IOCTL_AGP_ENABLE ioctl argument type. + * + * \sa drmAgpEnable(). + */ +struct drm_agp_mode { + unsigned long mode; /**< AGP mode */ +}; + +/* + * DRM_IOCTL_AGP_ALLOC and DRM_IOCTL_AGP_FREE ioctls argument type. + * + * \sa drmAgpAlloc() and drmAgpFree(). + */ +struct drm_agp_buffer { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for binding / unbinding */ + unsigned long type; /**< Type of memory to allocate */ + unsigned long physical; /**< Physical used by i810 */ +}; + +/* + * DRM_IOCTL_AGP_BIND and DRM_IOCTL_AGP_UNBIND ioctls argument type. + * + * \sa drmAgpBind() and drmAgpUnbind(). + */ +struct drm_agp_binding { + unsigned long handle; /**< From drm_agp_buffer */ + unsigned long offset; /**< In bytes -- will round to page boundary */ +}; + +/* + * DRM_IOCTL_AGP_INFO ioctl argument type. + * + * \sa drmAgpVersionMajor(), drmAgpVersionMinor(), drmAgpGetMode(), + * drmAgpBase(), drmAgpSize(), drmAgpMemoryUsed(), drmAgpMemoryAvail(), + * drmAgpVendorId() and drmAgpDeviceId(). + */ +struct drm_agp_info { + int agp_version_major; + int agp_version_minor; + unsigned long mode; + unsigned long aperture_base; /* physical address */ + unsigned long aperture_size; /* bytes */ + unsigned long memory_allowed; /* bytes */ + unsigned long memory_used; + + /* PCI information */ + unsigned short id_vendor; + unsigned short id_device; +}; + +/* + * DRM_IOCTL_SG_ALLOC ioctl argument type. + */ +struct drm_scatter_gather { + unsigned long size; /**< In bytes -- will round to page boundary */ + unsigned long handle; /**< Used for mapping / unmapping */ +}; + +/* + * DRM_IOCTL_SET_VERSION ioctl argument type. + */ +struct drm_set_version { + int drm_di_major; + int drm_di_minor; + int drm_dd_major; + int drm_dd_minor; +}; + +/** + * struct drm_gem_close - Argument for &DRM_IOCTL_GEM_CLOSE ioctl. + * @handle: Handle of the object to be closed. + * @pad: Padding. + * + * Releases the handle to an mm object. + */ +struct drm_gem_close { + __u32 handle; + __u32 pad; +}; + +/** + * struct drm_gem_flink - Argument for &DRM_IOCTL_GEM_FLINK ioctl. + * @handle: Handle for the object being named. + * @name: Returned global name. + * + * Create a global name for an object, returning the name. + * + * Note that the name does not hold a reference; when the object + * is freed, the name goes away. + */ +struct drm_gem_flink { + __u32 handle; + __u32 name; +}; + +/** + * struct drm_gem_open - Argument for &DRM_IOCTL_GEM_OPEN ioctl. + * @name: Name of object being opened. + * @handle: Returned handle for the object. + * @size: Returned size of the object + * + * Open an object using the global name, returning a handle and the size. + * + * This handle (of course) holds a reference to the object, so the object + * will not go away until the handle is deleted. + */ +struct drm_gem_open { + __u32 name; + __u32 handle; + __u64 size; +}; + +/** + * struct drm_gem_change_handle - Argument for &DRM_IOCTL_GEM_CHANGE_HANDLE ioctl. + * @handle: The handle of a gem object. + * @new_handle: An available gem handle. + * + * This ioctl changes the handle of a GEM object to the specified one. + * The new handle must be unused. On success the old handle is closed + * and all further IOCTL should refer to the new handle only. + * Calls to DRM_IOCTL_PRIME_FD_TO_HANDLE will return the new handle. + */ +struct drm_gem_change_handle { + __u32 handle; + __u32 new_handle; +}; + +/** + * DRM_CAP_DUMB_BUFFER + * + * If set to 1, the driver supports creating dumb buffers via the + * &DRM_IOCTL_MODE_CREATE_DUMB ioctl. + */ +#define DRM_CAP_DUMB_BUFFER 0x1 +/** + * DRM_CAP_VBLANK_HIGH_CRTC + * + * If set to 1, the kernel supports specifying a :ref:`CRTC index` + * in the high bits of &drm_wait_vblank_request.type. + * + * Starting kernel version 2.6.39, this capability is always set to 1. + */ +#define DRM_CAP_VBLANK_HIGH_CRTC 0x2 +/** + * DRM_CAP_DUMB_PREFERRED_DEPTH + * + * The preferred bit depth for dumb buffers. + * + * The bit depth is the number of bits used to indicate the color of a single + * pixel excluding any padding. This is different from the number of bits per + * pixel. For instance, XRGB8888 has a bit depth of 24 but has 32 bits per + * pixel. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3 +/** + * DRM_CAP_DUMB_PREFER_SHADOW + * + * If set to 1, the driver prefers userspace to render to a shadow buffer + * instead of directly rendering to a dumb buffer. For best speed, userspace + * should do streaming ordered memory copies into the dumb buffer and never + * read from it. + * + * Note that this preference only applies to dumb buffers, it's irrelevant for + * other types of buffers. + */ +#define DRM_CAP_DUMB_PREFER_SHADOW 0x4 +/** + * DRM_CAP_PRIME + * + * Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT + * and &DRM_PRIME_CAP_EXPORT. + * + * Starting from kernel version 6.6, both &DRM_PRIME_CAP_IMPORT and + * &DRM_PRIME_CAP_EXPORT are always advertised. + * + * PRIME buffers are exposed as dma-buf file descriptors. + * See :ref:`prime_buffer_sharing`. + */ +#define DRM_CAP_PRIME 0x5 +/** + * DRM_PRIME_CAP_IMPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME + * buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. + */ +#define DRM_PRIME_CAP_IMPORT 0x1 +/** + * DRM_PRIME_CAP_EXPORT + * + * If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME + * buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl. + * + * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. + */ +#define DRM_PRIME_CAP_EXPORT 0x2 +/** + * DRM_CAP_TIMESTAMP_MONOTONIC + * + * If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in + * struct drm_event_vblank. If set to 1, the kernel will report timestamps with + * ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these + * clocks. + * + * Starting from kernel version 2.6.39, the default value for this capability + * is 1. Starting kernel version 4.15, this capability is always set to 1. + */ +#define DRM_CAP_TIMESTAMP_MONOTONIC 0x6 +/** + * DRM_CAP_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for legacy + * page-flips. + */ +#define DRM_CAP_ASYNC_PAGE_FLIP 0x7 +/** + * DRM_CAP_CURSOR_WIDTH + * + * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid + * width x height combination for the hardware cursor. The intention is that a + * hardware agnostic userspace can query a cursor plane size to use. + * + * Note that the cross-driver contract is to merely return a valid size; + * drivers are free to attach another meaning on top, eg. i915 returns the + * maximum plane size. + */ +#define DRM_CAP_CURSOR_WIDTH 0x8 +/** + * DRM_CAP_CURSOR_HEIGHT + * + * See &DRM_CAP_CURSOR_WIDTH. + */ +#define DRM_CAP_CURSOR_HEIGHT 0x9 +/** + * DRM_CAP_ADDFB2_MODIFIERS + * + * If set to 1, the driver supports supplying modifiers in the + * &DRM_IOCTL_MODE_ADDFB2 ioctl. + */ +#define DRM_CAP_ADDFB2_MODIFIERS 0x10 +/** + * DRM_CAP_PAGE_FLIP_TARGET + * + * If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and + * &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in + * &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP + * ioctl. + */ +#define DRM_CAP_PAGE_FLIP_TARGET 0x11 +/** + * DRM_CAP_CRTC_IN_VBLANK_EVENT + * + * If set to 1, the kernel supports reporting the CRTC ID in + * &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and + * &DRM_EVENT_FLIP_COMPLETE events. + * + * Starting kernel version 4.12, this capability is always set to 1. + */ +#define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12 +/** + * DRM_CAP_SYNCOBJ + * + * If set to 1, the driver supports sync objects. See :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ 0x13 +/** + * DRM_CAP_SYNCOBJ_TIMELINE + * + * If set to 1, the driver supports timeline operations on sync objects. See + * :ref:`drm_sync_objects`. + */ +#define DRM_CAP_SYNCOBJ_TIMELINE 0x14 +/** + * DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP + * + * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for atomic + * commits. + */ +#define DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP 0x15 + +/* DRM_IOCTL_GET_CAP ioctl argument type */ +struct drm_get_cap { + __u64 capability; + __u64 value; +}; + +/** + * DRM_CLIENT_CAP_STEREO_3D + * + * If set to 1, the DRM core will expose the stereo 3D capabilities of the + * monitor by advertising the supported 3D layouts in the flags of struct + * drm_mode_modeinfo. See ``DRM_MODE_FLAG_3D_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 3.13. + */ +#define DRM_CLIENT_CAP_STEREO_3D 1 + +/** + * DRM_CLIENT_CAP_UNIVERSAL_PLANES + * + * If set to 1, the DRM core will expose all planes (overlay, primary, and + * cursor) to userspace. + * + * This capability has been introduced in kernel version 3.15. Starting from + * kernel version 3.17, this capability is always supported for all drivers. + */ +#define DRM_CLIENT_CAP_UNIVERSAL_PLANES 2 + +/** + * DRM_CLIENT_CAP_ATOMIC + * + * If set to 1, the DRM core will expose atomic properties to userspace. This + * implicitly enables &DRM_CLIENT_CAP_UNIVERSAL_PLANES and + * &DRM_CLIENT_CAP_ASPECT_RATIO. + * + * If the driver doesn't support atomic mode-setting, enabling this capability + * will fail with -EOPNOTSUPP. + * + * This capability has been introduced in kernel version 4.0. Starting from + * kernel version 4.2, this capability is always supported for atomic-capable + * drivers. + */ +#define DRM_CLIENT_CAP_ATOMIC 3 + +/** + * DRM_CLIENT_CAP_ASPECT_RATIO + * + * If set to 1, the DRM core will provide aspect ratio information in modes. + * See ``DRM_MODE_FLAG_PIC_AR_*``. + * + * This capability is always supported for all drivers starting from kernel + * version 4.18. + */ +#define DRM_CLIENT_CAP_ASPECT_RATIO 4 + +/** + * DRM_CLIENT_CAP_WRITEBACK_CONNECTORS + * + * If set to 1, the DRM core will expose special connectors to be used for + * writing back to memory the scene setup in the commit. The client must enable + * &DRM_CLIENT_CAP_ATOMIC first. + * + * This capability is always supported for atomic-capable drivers starting from + * kernel version 4.19. + */ +#define DRM_CLIENT_CAP_WRITEBACK_CONNECTORS 5 + +/** + * DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT + * + * Drivers for para-virtualized hardware (e.g. vmwgfx, qxl, virtio and + * virtualbox) have additional restrictions for cursor planes (thus + * making cursor planes on those drivers not truly universal,) e.g. + * they need cursor planes to act like one would expect from a mouse + * cursor and have correctly set hotspot properties. + * If this client cap is not set the DRM core will hide cursor plane on + * those virtualized drivers because not setting it implies that the + * client is not capable of dealing with those extra restictions. + * Clients which do set cursor hotspot and treat the cursor plane + * like a mouse cursor should set this property. + * The client must enable &DRM_CLIENT_CAP_ATOMIC first. + * + * Setting this property on drivers which do not special case + * cursor planes (i.e. non-virtualized drivers) will return + * EOPNOTSUPP, which can be used by userspace to gauge + * requirements of the hardware/drivers they're running on. + * + * This capability is always supported for atomic-capable virtualized + * drivers starting from kernel version 6.6. + */ +#define DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT 6 + +/* DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */ +struct drm_set_client_cap { + __u64 capability; + __u64 value; +}; + +#define DRM_RDWR O_RDWR +#define DRM_CLOEXEC O_CLOEXEC +struct drm_prime_handle { + __u32 handle; + + /** Flags.. only applicable for handle->fd */ + __u32 flags; + + /** Returned dmabuf file descriptor */ + __s32 fd; +}; + +struct drm_syncobj_create { + __u32 handle; +#define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0) + __u32 flags; +}; + +struct drm_syncobj_destroy { + __u32 handle; + __u32 pad; +}; + +#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_TIMELINE (1 << 1) +#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_TIMELINE (1 << 1) +struct drm_syncobj_handle { + __u32 handle; + __u32 flags; + + __s32 fd; + __u32 pad; + + __u64 point; +}; + +struct drm_syncobj_transfer { + __u32 src_handle; + __u32 dst_handle; + __u64 src_point; + __u64 dst_point; + __u32 flags; + __u32 pad; +}; + +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL (1 << 0) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (1 << 1) +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE (1 << 2) /* wait for time point to become available */ +#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE (1 << 3) /* set fence deadline to deadline_nsec */ +struct drm_syncobj_wait { + __u64 handles; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. + */ + __u64 deadline_nsec; +}; + +struct drm_syncobj_timeline_wait { + __u64 handles; + /* wait on specific timeline point for every handles*/ + __u64 points; + /* absolute timeout */ + __s64 timeout_nsec; + __u32 count_handles; + __u32 flags; + __u32 first_signaled; /* only valid when not waiting all */ + __u32 pad; + /** + * @deadline_nsec - fence deadline hint + * + * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing + * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is + * set. + */ + __u64 deadline_nsec; +}; + +/** + * struct drm_syncobj_eventfd + * @handle: syncobj handle. + * @flags: Zero to wait for the point to be signalled, or + * &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE to wait for a fence to be + * available for the point. + * @point: syncobj timeline point (set to zero for binary syncobjs). + * @fd: Existing eventfd to sent events to. + * @pad: Must be zero. + * + * Register an eventfd to be signalled by a syncobj. The eventfd counter will + * be incremented by one. + */ +struct drm_syncobj_eventfd { + __u32 handle; + __u32 flags; + __u64 point; + __s32 fd; + __u32 pad; +}; + + +struct drm_syncobj_array { + __u64 handles; + __u32 count_handles; + __u32 pad; +}; + +#define DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED (1 << 0) /* last available point on timeline syncobj */ +struct drm_syncobj_timeline_array { + __u64 handles; + __u64 points; + __u32 count_handles; + __u32 flags; +}; + + +/* Query current scanout sequence number */ +struct drm_crtc_get_sequence { + __u32 crtc_id; /* requested crtc_id */ + __u32 active; /* return: crtc output is active */ + __u64 sequence; /* return: most recent vblank sequence */ + __s64 sequence_ns; /* return: most recent time of first pixel out */ +}; + +/* Queue event to be delivered at specified sequence. Time stamp marks + * when the first pixel of the refresh cycle leaves the display engine + * for the display + */ +#define DRM_CRTC_SEQUENCE_RELATIVE 0x00000001 /* sequence is relative to current */ +#define DRM_CRTC_SEQUENCE_NEXT_ON_MISS 0x00000002 /* Use next sequence if we've missed */ + +struct drm_crtc_queue_sequence { + __u32 crtc_id; + __u32 flags; + __u64 sequence; /* on input, target sequence. on output, actual sequence */ + __u64 user_data; /* user data passed to event */ +}; + +#define DRM_CLIENT_NAME_MAX_LEN 64 +struct drm_set_client_name { + __u64 name_len; + __u64 name; +}; + + +#if defined(__cplusplus) +} +#endif + +#include "drm_mode.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define DRM_IOCTL_BASE 'd' +#define DRM_IO(nr) _IO(DRM_IOCTL_BASE,nr) +#define DRM_IOR(nr,type) _IOR(DRM_IOCTL_BASE,nr,type) +#define DRM_IOW(nr,type) _IOW(DRM_IOCTL_BASE,nr,type) +#define DRM_IOWR(nr,type) _IOWR(DRM_IOCTL_BASE,nr,type) + +#define DRM_IOCTL_VERSION DRM_IOWR(0x00, struct drm_version) +#define DRM_IOCTL_GET_UNIQUE DRM_IOWR(0x01, struct drm_unique) +#define DRM_IOCTL_GET_MAGIC DRM_IOR( 0x02, struct drm_auth) +#define DRM_IOCTL_IRQ_BUSID DRM_IOWR(0x03, struct drm_irq_busid) +#define DRM_IOCTL_GET_MAP DRM_IOWR(0x04, struct drm_map) +#define DRM_IOCTL_GET_CLIENT DRM_IOWR(0x05, struct drm_client) +#define DRM_IOCTL_GET_STATS DRM_IOR( 0x06, struct drm_stats) +#define DRM_IOCTL_SET_VERSION DRM_IOWR(0x07, struct drm_set_version) +#define DRM_IOCTL_MODESET_CTL DRM_IOW(0x08, struct drm_modeset_ctl) +/** + * DRM_IOCTL_GEM_CLOSE - Close a GEM handle. + * + * GEM handles are not reference-counted by the kernel. User-space is + * responsible for managing their lifetime. For example, if user-space imports + * the same memory object twice on the same DRM file description, the same GEM + * handle is returned by both imports, and user-space needs to ensure + * &DRM_IOCTL_GEM_CLOSE is performed once only. The same situation can happen + * when a memory object is allocated, then exported and imported again on the + * same DRM file description. The &DRM_IOCTL_MODE_GETFB2 IOCTL is an exception + * and always returns fresh new GEM handles even if an existing GEM handle + * already refers to the same memory object before the IOCTL is performed. + */ +#define DRM_IOCTL_GEM_CLOSE DRM_IOW (0x09, struct drm_gem_close) +#define DRM_IOCTL_GEM_FLINK DRM_IOWR(0x0a, struct drm_gem_flink) +#define DRM_IOCTL_GEM_OPEN DRM_IOWR(0x0b, struct drm_gem_open) +#define DRM_IOCTL_GET_CAP DRM_IOWR(0x0c, struct drm_get_cap) +#define DRM_IOCTL_SET_CLIENT_CAP DRM_IOW( 0x0d, struct drm_set_client_cap) + +#define DRM_IOCTL_SET_UNIQUE DRM_IOW( 0x10, struct drm_unique) +#define DRM_IOCTL_AUTH_MAGIC DRM_IOW( 0x11, struct drm_auth) +#define DRM_IOCTL_BLOCK DRM_IOWR(0x12, struct drm_block) +#define DRM_IOCTL_UNBLOCK DRM_IOWR(0x13, struct drm_block) +#define DRM_IOCTL_CONTROL DRM_IOW( 0x14, struct drm_control) +#define DRM_IOCTL_ADD_MAP DRM_IOWR(0x15, struct drm_map) +#define DRM_IOCTL_ADD_BUFS DRM_IOWR(0x16, struct drm_buf_desc) +#define DRM_IOCTL_MARK_BUFS DRM_IOW( 0x17, struct drm_buf_desc) +#define DRM_IOCTL_INFO_BUFS DRM_IOWR(0x18, struct drm_buf_info) +#define DRM_IOCTL_MAP_BUFS DRM_IOWR(0x19, struct drm_buf_map) +#define DRM_IOCTL_FREE_BUFS DRM_IOW( 0x1a, struct drm_buf_free) + +#define DRM_IOCTL_RM_MAP DRM_IOW( 0x1b, struct drm_map) + +#define DRM_IOCTL_SET_SAREA_CTX DRM_IOW( 0x1c, struct drm_ctx_priv_map) +#define DRM_IOCTL_GET_SAREA_CTX DRM_IOWR(0x1d, struct drm_ctx_priv_map) + +#define DRM_IOCTL_SET_MASTER DRM_IO(0x1e) +#define DRM_IOCTL_DROP_MASTER DRM_IO(0x1f) + +#define DRM_IOCTL_ADD_CTX DRM_IOWR(0x20, struct drm_ctx) +#define DRM_IOCTL_RM_CTX DRM_IOWR(0x21, struct drm_ctx) +#define DRM_IOCTL_MOD_CTX DRM_IOW( 0x22, struct drm_ctx) +#define DRM_IOCTL_GET_CTX DRM_IOWR(0x23, struct drm_ctx) +#define DRM_IOCTL_SWITCH_CTX DRM_IOW( 0x24, struct drm_ctx) +#define DRM_IOCTL_NEW_CTX DRM_IOW( 0x25, struct drm_ctx) +#define DRM_IOCTL_RES_CTX DRM_IOWR(0x26, struct drm_ctx_res) +#define DRM_IOCTL_ADD_DRAW DRM_IOWR(0x27, struct drm_draw) +#define DRM_IOCTL_RM_DRAW DRM_IOWR(0x28, struct drm_draw) +#define DRM_IOCTL_DMA DRM_IOWR(0x29, struct drm_dma) +#define DRM_IOCTL_LOCK DRM_IOW( 0x2a, struct drm_lock) +#define DRM_IOCTL_UNLOCK DRM_IOW( 0x2b, struct drm_lock) +#define DRM_IOCTL_FINISH DRM_IOW( 0x2c, struct drm_lock) + +/** + * DRM_IOCTL_PRIME_HANDLE_TO_FD - Convert a GEM handle to a DMA-BUF FD. + * + * User-space sets &drm_prime_handle.handle with the GEM handle to export and + * &drm_prime_handle.flags, and gets back a DMA-BUF file descriptor in + * &drm_prime_handle.fd. + * + * The export can fail for any driver-specific reason, e.g. because export is + * not supported for this specific GEM handle (but might be for others). + * + * Support for exporting DMA-BUFs is advertised via &DRM_PRIME_CAP_EXPORT. + */ +#define DRM_IOCTL_PRIME_HANDLE_TO_FD DRM_IOWR(0x2d, struct drm_prime_handle) +/** + * DRM_IOCTL_PRIME_FD_TO_HANDLE - Convert a DMA-BUF FD to a GEM handle. + * + * User-space sets &drm_prime_handle.fd with a DMA-BUF file descriptor to + * import, and gets back a GEM handle in &drm_prime_handle.handle. + * &drm_prime_handle.flags is unused. + * + * If an existing GEM handle refers to the memory object backing the DMA-BUF, + * that GEM handle is returned. Therefore user-space which needs to handle + * arbitrary DMA-BUFs must have a user-space lookup data structure to manually + * reference-count duplicated GEM handles. For more information see + * &DRM_IOCTL_GEM_CLOSE. + * + * The import can fail for any driver-specific reason, e.g. because import is + * only supported for DMA-BUFs allocated on this DRM device. + * + * Support for importing DMA-BUFs is advertised via &DRM_PRIME_CAP_IMPORT. + */ +#define DRM_IOCTL_PRIME_FD_TO_HANDLE DRM_IOWR(0x2e, struct drm_prime_handle) + +#define DRM_IOCTL_AGP_ACQUIRE DRM_IO( 0x30) +#define DRM_IOCTL_AGP_RELEASE DRM_IO( 0x31) +#define DRM_IOCTL_AGP_ENABLE DRM_IOW( 0x32, struct drm_agp_mode) +#define DRM_IOCTL_AGP_INFO DRM_IOR( 0x33, struct drm_agp_info) +#define DRM_IOCTL_AGP_ALLOC DRM_IOWR(0x34, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_FREE DRM_IOW( 0x35, struct drm_agp_buffer) +#define DRM_IOCTL_AGP_BIND DRM_IOW( 0x36, struct drm_agp_binding) +#define DRM_IOCTL_AGP_UNBIND DRM_IOW( 0x37, struct drm_agp_binding) + +#define DRM_IOCTL_SG_ALLOC DRM_IOWR(0x38, struct drm_scatter_gather) +#define DRM_IOCTL_SG_FREE DRM_IOW( 0x39, struct drm_scatter_gather) + +#define DRM_IOCTL_WAIT_VBLANK DRM_IOWR(0x3a, union drm_wait_vblank) + +#define DRM_IOCTL_CRTC_GET_SEQUENCE DRM_IOWR(0x3b, struct drm_crtc_get_sequence) +#define DRM_IOCTL_CRTC_QUEUE_SEQUENCE DRM_IOWR(0x3c, struct drm_crtc_queue_sequence) + +#define DRM_IOCTL_UPDATE_DRAW DRM_IOW(0x3f, struct drm_update_draw) + +#define DRM_IOCTL_MODE_GETRESOURCES DRM_IOWR(0xA0, struct drm_mode_card_res) +#define DRM_IOCTL_MODE_GETCRTC DRM_IOWR(0xA1, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_SETCRTC DRM_IOWR(0xA2, struct drm_mode_crtc) +#define DRM_IOCTL_MODE_CURSOR DRM_IOWR(0xA3, struct drm_mode_cursor) +#define DRM_IOCTL_MODE_GETGAMMA DRM_IOWR(0xA4, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_SETGAMMA DRM_IOWR(0xA5, struct drm_mode_crtc_lut) +#define DRM_IOCTL_MODE_GETENCODER DRM_IOWR(0xA6, struct drm_mode_get_encoder) +#define DRM_IOCTL_MODE_GETCONNECTOR DRM_IOWR(0xA7, struct drm_mode_get_connector) +#define DRM_IOCTL_MODE_ATTACHMODE DRM_IOWR(0xA8, struct drm_mode_mode_cmd) /* deprecated (never worked) */ +#define DRM_IOCTL_MODE_DETACHMODE DRM_IOWR(0xA9, struct drm_mode_mode_cmd) /* deprecated (never worked) */ + +#define DRM_IOCTL_MODE_GETPROPERTY DRM_IOWR(0xAA, struct drm_mode_get_property) +#define DRM_IOCTL_MODE_SETPROPERTY DRM_IOWR(0xAB, struct drm_mode_connector_set_property) +#define DRM_IOCTL_MODE_GETPROPBLOB DRM_IOWR(0xAC, struct drm_mode_get_blob) +#define DRM_IOCTL_MODE_GETFB DRM_IOWR(0xAD, struct drm_mode_fb_cmd) +#define DRM_IOCTL_MODE_ADDFB DRM_IOWR(0xAE, struct drm_mode_fb_cmd) +/** + * DRM_IOCTL_MODE_RMFB - Remove a framebuffer. + * + * This removes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * Warning: removing a framebuffer currently in-use on an enabled plane will + * disable that plane. The CRTC the plane is linked to may also be disabled + * (depending on driver capabilities). + */ +#define DRM_IOCTL_MODE_RMFB DRM_IOWR(0xAF, unsigned int) +#define DRM_IOCTL_MODE_PAGE_FLIP DRM_IOWR(0xB0, struct drm_mode_crtc_page_flip) +#define DRM_IOCTL_MODE_DIRTYFB DRM_IOWR(0xB1, struct drm_mode_fb_dirty_cmd) + +/** + * DRM_IOCTL_MODE_CREATE_DUMB - Create a new dumb buffer object. + * + * KMS dumb buffers provide a very primitive way to allocate a buffer object + * suitable for scanout and map it for software rendering. KMS dumb buffers are + * not suitable for hardware-accelerated rendering nor video decoding. KMS dumb + * buffers are not suitable to be displayed on any other device than the KMS + * device where they were allocated from. Also see + * :ref:`kms_dumb_buffer_objects`. + * + * The IOCTL argument is a struct drm_mode_create_dumb. + * + * User-space is expected to create a KMS dumb buffer via this IOCTL, then add + * it as a KMS framebuffer via &DRM_IOCTL_MODE_ADDFB and map it via + * &DRM_IOCTL_MODE_MAP_DUMB. + * + * &DRM_CAP_DUMB_BUFFER indicates whether this IOCTL is supported. + * &DRM_CAP_DUMB_PREFERRED_DEPTH and &DRM_CAP_DUMB_PREFER_SHADOW indicate + * driver preferences for dumb buffers. + */ +#define DRM_IOCTL_MODE_CREATE_DUMB DRM_IOWR(0xB2, struct drm_mode_create_dumb) +#define DRM_IOCTL_MODE_MAP_DUMB DRM_IOWR(0xB3, struct drm_mode_map_dumb) +#define DRM_IOCTL_MODE_DESTROY_DUMB DRM_IOWR(0xB4, struct drm_mode_destroy_dumb) +#define DRM_IOCTL_MODE_GETPLANERESOURCES DRM_IOWR(0xB5, struct drm_mode_get_plane_res) +#define DRM_IOCTL_MODE_GETPLANE DRM_IOWR(0xB6, struct drm_mode_get_plane) +#define DRM_IOCTL_MODE_SETPLANE DRM_IOWR(0xB7, struct drm_mode_set_plane) +#define DRM_IOCTL_MODE_ADDFB2 DRM_IOWR(0xB8, struct drm_mode_fb_cmd2) +#define DRM_IOCTL_MODE_OBJ_GETPROPERTIES DRM_IOWR(0xB9, struct drm_mode_obj_get_properties) +#define DRM_IOCTL_MODE_OBJ_SETPROPERTY DRM_IOWR(0xBA, struct drm_mode_obj_set_property) +#define DRM_IOCTL_MODE_CURSOR2 DRM_IOWR(0xBB, struct drm_mode_cursor2) +#define DRM_IOCTL_MODE_ATOMIC DRM_IOWR(0xBC, struct drm_mode_atomic) +#define DRM_IOCTL_MODE_CREATEPROPBLOB DRM_IOWR(0xBD, struct drm_mode_create_blob) +#define DRM_IOCTL_MODE_DESTROYPROPBLOB DRM_IOWR(0xBE, struct drm_mode_destroy_blob) + +#define DRM_IOCTL_SYNCOBJ_CREATE DRM_IOWR(0xBF, struct drm_syncobj_create) +#define DRM_IOCTL_SYNCOBJ_DESTROY DRM_IOWR(0xC0, struct drm_syncobj_destroy) +#define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD DRM_IOWR(0xC1, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE DRM_IOWR(0xC2, struct drm_syncobj_handle) +#define DRM_IOCTL_SYNCOBJ_WAIT DRM_IOWR(0xC3, struct drm_syncobj_wait) +#define DRM_IOCTL_SYNCOBJ_RESET DRM_IOWR(0xC4, struct drm_syncobj_array) +#define DRM_IOCTL_SYNCOBJ_SIGNAL DRM_IOWR(0xC5, struct drm_syncobj_array) + +#define DRM_IOCTL_MODE_CREATE_LEASE DRM_IOWR(0xC6, struct drm_mode_create_lease) +#define DRM_IOCTL_MODE_LIST_LESSEES DRM_IOWR(0xC7, struct drm_mode_list_lessees) +#define DRM_IOCTL_MODE_GET_LEASE DRM_IOWR(0xC8, struct drm_mode_get_lease) +#define DRM_IOCTL_MODE_REVOKE_LEASE DRM_IOWR(0xC9, struct drm_mode_revoke_lease) + +#define DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT DRM_IOWR(0xCA, struct drm_syncobj_timeline_wait) +#define DRM_IOCTL_SYNCOBJ_QUERY DRM_IOWR(0xCB, struct drm_syncobj_timeline_array) +#define DRM_IOCTL_SYNCOBJ_TRANSFER DRM_IOWR(0xCC, struct drm_syncobj_transfer) +#define DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL DRM_IOWR(0xCD, struct drm_syncobj_timeline_array) + +/** + * DRM_IOCTL_MODE_GETFB2 - Get framebuffer metadata. + * + * This queries metadata about a framebuffer. User-space fills + * &drm_mode_fb_cmd2.fb_id as the input, and the kernels fills the rest of the + * struct as the output. + * + * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles + * will be filled with GEM buffer handles. Fresh new GEM handles are always + * returned, even if another GEM handle referring to the same memory object + * already exists on the DRM file description. The caller is responsible for + * removing the new handles, e.g. via the &DRM_IOCTL_GEM_CLOSE IOCTL. The same + * new handle will be returned for multiple planes in case they use the same + * memory object. Planes are valid until one has a zero handle -- this can be + * used to compute the number of planes. + * + * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid + * until one has a zero &drm_mode_fb_cmd2.pitches. + * + * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set + * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the + * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier. + * + * To obtain DMA-BUF FDs for each plane without leaking GEM handles, user-space + * can export each handle via &DRM_IOCTL_PRIME_HANDLE_TO_FD, then immediately + * close each unique handle via &DRM_IOCTL_GEM_CLOSE, making sure to not + * double-close handles which are specified multiple times in the array. + */ +#define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2) + +#define DRM_IOCTL_SYNCOBJ_EVENTFD DRM_IOWR(0xCF, struct drm_syncobj_eventfd) + +/** + * DRM_IOCTL_MODE_CLOSEFB - Close a framebuffer. + * + * This closes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL + * argument is a framebuffer object ID. + * + * This IOCTL is similar to &DRM_IOCTL_MODE_RMFB, except it doesn't disable + * planes and CRTCs. As long as the framebuffer is used by a plane, it's kept + * alive. When the plane no longer uses the framebuffer (because the + * framebuffer is replaced with another one, or the plane is disabled), the + * framebuffer is cleaned up. + * + * This is useful to implement flicker-free transitions between two processes. + * + * Depending on the threat model, user-space may want to ensure that the + * framebuffer doesn't expose any sensitive user information: closed + * framebuffers attached to a plane can be read back by the next DRM master. + */ +#define DRM_IOCTL_MODE_CLOSEFB DRM_IOWR(0xD0, struct drm_mode_closefb) + +/** + * DRM_IOCTL_SET_CLIENT_NAME - Attach a name to a drm_file + * + * Having a name allows for easier tracking and debugging. + * The length of the name (without null ending char) must be + * <= DRM_CLIENT_NAME_MAX_LEN. + * The call will fail if the name contains whitespaces or non-printable chars. + */ +#define DRM_IOCTL_SET_CLIENT_NAME DRM_IOWR(0xD1, struct drm_set_client_name) + +/** + * DRM_IOCTL_GEM_CHANGE_HANDLE - Move an object to a different handle + * + * Some applications (notably CRIU) need objects to have specific gem handles. + * This ioctl changes the object at one gem handle to use a new gem handle. + */ +#define DRM_IOCTL_GEM_CHANGE_HANDLE DRM_IOWR(0xD2, struct drm_gem_change_handle) + +/* + * Device specific ioctls should only be in their respective headers + * The device specific ioctl range is from 0x40 to 0x9f. + * Generic IOCTLS restart at 0xA0. + * + * \sa drmCommandNone(), drmCommandRead(), drmCommandWrite(), and + * drmCommandReadWrite(). + */ +#define DRM_COMMAND_BASE 0x40 +#define DRM_COMMAND_END 0xA0 + +/** + * struct drm_event - Header for DRM events + * @type: event type. + * @length: total number of payload bytes (including header). + * + * This struct is a header for events written back to user-space on the DRM FD. + * A read on the DRM FD will always only return complete events: e.g. if the + * read buffer is 100 bytes large and there are two 64 byte events pending, + * only one will be returned. + * + * Event types 0 - 0x7fffffff are generic DRM events, 0x80000000 and + * up are chipset specific. Generic DRM events include &DRM_EVENT_VBLANK, + * &DRM_EVENT_FLIP_COMPLETE and &DRM_EVENT_CRTC_SEQUENCE. + */ +struct drm_event { + __u32 type; + __u32 length; +}; + +/** + * DRM_EVENT_VBLANK - vertical blanking event + * + * This event is sent in response to &DRM_IOCTL_WAIT_VBLANK with the + * &_DRM_VBLANK_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. + */ +#define DRM_EVENT_VBLANK 0x01 +/** + * DRM_EVENT_FLIP_COMPLETE - page-flip completion event + * + * This event is sent in response to an atomic commit or legacy page-flip with + * the &DRM_MODE_PAGE_FLIP_EVENT flag set. + * + * The event payload is a struct drm_event_vblank. + */ +#define DRM_EVENT_FLIP_COMPLETE 0x02 +/** + * DRM_EVENT_CRTC_SEQUENCE - CRTC sequence event + * + * This event is sent in response to &DRM_IOCTL_CRTC_QUEUE_SEQUENCE. + * + * The event payload is a struct drm_event_crtc_sequence. + */ +#define DRM_EVENT_CRTC_SEQUENCE 0x03 + +struct drm_event_vblank { + struct drm_event base; + __u64 user_data; + __u32 tv_sec; + __u32 tv_usec; + __u32 sequence; + __u32 crtc_id; /* 0 on older kernels that do not support this */ +}; + +/* Event delivered at sequence. Time stamp marks when the first pixel + * of the refresh cycle leaves the display engine for the display + */ +struct drm_event_crtc_sequence { + struct drm_event base; + __u64 user_data; + __s64 time_ns; + __u64 sequence; +}; + +/* typedef area */ +#ifndef __KERNEL__ +typedef struct drm_clip_rect drm_clip_rect_t; +typedef struct drm_drawable_info drm_drawable_info_t; +typedef struct drm_tex_region drm_tex_region_t; +typedef struct drm_hw_lock drm_hw_lock_t; +typedef struct drm_version drm_version_t; +typedef struct drm_unique drm_unique_t; +typedef struct drm_list drm_list_t; +typedef struct drm_block drm_block_t; +typedef struct drm_control drm_control_t; +typedef enum drm_map_type drm_map_type_t; +typedef enum drm_map_flags drm_map_flags_t; +typedef struct drm_ctx_priv_map drm_ctx_priv_map_t; +typedef struct drm_map drm_map_t; +typedef struct drm_client drm_client_t; +typedef enum drm_stat_type drm_stat_type_t; +typedef struct drm_stats drm_stats_t; +typedef enum drm_lock_flags drm_lock_flags_t; +typedef struct drm_lock drm_lock_t; +typedef enum drm_dma_flags drm_dma_flags_t; +typedef struct drm_buf_desc drm_buf_desc_t; +typedef struct drm_buf_info drm_buf_info_t; +typedef struct drm_buf_free drm_buf_free_t; +typedef struct drm_buf_pub drm_buf_pub_t; +typedef struct drm_buf_map drm_buf_map_t; +typedef struct drm_dma drm_dma_t; +typedef union drm_wait_vblank drm_wait_vblank_t; +typedef struct drm_agp_mode drm_agp_mode_t; +typedef enum drm_ctx_flags drm_ctx_flags_t; +typedef struct drm_ctx drm_ctx_t; +typedef struct drm_ctx_res drm_ctx_res_t; +typedef struct drm_draw drm_draw_t; +typedef struct drm_update_draw drm_update_draw_t; +typedef struct drm_auth drm_auth_t; +typedef struct drm_irq_busid drm_irq_busid_t; +typedef enum drm_vblank_seq_type drm_vblank_seq_type_t; + +typedef struct drm_agp_buffer drm_agp_buffer_t; +typedef struct drm_agp_binding drm_agp_binding_t; +typedef struct drm_agp_info drm_agp_info_t; +typedef struct drm_scatter_gather drm_scatter_gather_t; +typedef struct drm_set_version drm_set_version_t; +#endif + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/plugins/amdgpu/drm_mode.h b/plugins/amdgpu/drm_mode.h new file mode 100644 index 000000000..c082810c0 --- /dev/null +++ b/plugins/amdgpu/drm_mode.h @@ -0,0 +1,1362 @@ +/* + * Copyright (c) 2007 Dave Airlie + * Copyright (c) 2007 Jakob Bornecrantz + * Copyright (c) 2008 Red Hat Inc. + * Copyright (c) 2007-2008 Tungsten Graphics, Inc., Cedar Park, TX., USA + * Copyright (c) 2007-2008 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef _DRM_MODE_H +#define _DRM_MODE_H + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/** + * DOC: overview + * + * DRM exposes many UAPI and structure definitions to have a consistent + * and standardized interface with users. + * Userspace can refer to these structure definitions and UAPI formats + * to communicate to drivers. + */ + +#define DRM_CONNECTOR_NAME_LEN 32 +#define DRM_DISPLAY_MODE_LEN 32 +#define DRM_PROP_NAME_LEN 32 + +#define DRM_MODE_TYPE_BUILTIN (1<<0) /* deprecated */ +#define DRM_MODE_TYPE_CLOCK_C ((1<<1) | DRM_MODE_TYPE_BUILTIN) /* deprecated */ +#define DRM_MODE_TYPE_CRTC_C ((1<<2) | DRM_MODE_TYPE_BUILTIN) /* deprecated */ +#define DRM_MODE_TYPE_PREFERRED (1<<3) +#define DRM_MODE_TYPE_DEFAULT (1<<4) /* deprecated */ +#define DRM_MODE_TYPE_USERDEF (1<<5) +#define DRM_MODE_TYPE_DRIVER (1<<6) + +#define DRM_MODE_TYPE_ALL (DRM_MODE_TYPE_PREFERRED | \ + DRM_MODE_TYPE_USERDEF | \ + DRM_MODE_TYPE_DRIVER) + +/* Video mode flags */ +/* bit compatible with the xrandr RR_ definitions (bits 0-13) + * + * ABI warning: Existing userspace really expects + * the mode flags to match the xrandr definitions. Any + * changes that don't match the xrandr definitions will + * likely need a new client cap or some other mechanism + * to avoid breaking existing userspace. This includes + * allocating new flags in the previously unused bits! + */ +#define DRM_MODE_FLAG_PHSYNC (1<<0) +#define DRM_MODE_FLAG_NHSYNC (1<<1) +#define DRM_MODE_FLAG_PVSYNC (1<<2) +#define DRM_MODE_FLAG_NVSYNC (1<<3) +#define DRM_MODE_FLAG_INTERLACE (1<<4) +#define DRM_MODE_FLAG_DBLSCAN (1<<5) +#define DRM_MODE_FLAG_CSYNC (1<<6) +#define DRM_MODE_FLAG_PCSYNC (1<<7) +#define DRM_MODE_FLAG_NCSYNC (1<<8) +#define DRM_MODE_FLAG_HSKEW (1<<9) /* hskew provided */ +#define DRM_MODE_FLAG_BCAST (1<<10) /* deprecated */ +#define DRM_MODE_FLAG_PIXMUX (1<<11) /* deprecated */ +#define DRM_MODE_FLAG_DBLCLK (1<<12) +#define DRM_MODE_FLAG_CLKDIV2 (1<<13) + /* + * When adding a new stereo mode don't forget to adjust DRM_MODE_FLAGS_3D_MAX + * (define not exposed to user space). + */ +#define DRM_MODE_FLAG_3D_MASK (0x1f<<14) +#define DRM_MODE_FLAG_3D_NONE (0<<14) +#define DRM_MODE_FLAG_3D_FRAME_PACKING (1<<14) +#define DRM_MODE_FLAG_3D_FIELD_ALTERNATIVE (2<<14) +#define DRM_MODE_FLAG_3D_LINE_ALTERNATIVE (3<<14) +#define DRM_MODE_FLAG_3D_SIDE_BY_SIDE_FULL (4<<14) +#define DRM_MODE_FLAG_3D_L_DEPTH (5<<14) +#define DRM_MODE_FLAG_3D_L_DEPTH_GFX_GFX_DEPTH (6<<14) +#define DRM_MODE_FLAG_3D_TOP_AND_BOTTOM (7<<14) +#define DRM_MODE_FLAG_3D_SIDE_BY_SIDE_HALF (8<<14) + +/* Picture aspect ratio options */ +#define DRM_MODE_PICTURE_ASPECT_NONE 0 +#define DRM_MODE_PICTURE_ASPECT_4_3 1 +#define DRM_MODE_PICTURE_ASPECT_16_9 2 +#define DRM_MODE_PICTURE_ASPECT_64_27 3 +#define DRM_MODE_PICTURE_ASPECT_256_135 4 + +/* Content type options */ +#define DRM_MODE_CONTENT_TYPE_NO_DATA 0 +#define DRM_MODE_CONTENT_TYPE_GRAPHICS 1 +#define DRM_MODE_CONTENT_TYPE_PHOTO 2 +#define DRM_MODE_CONTENT_TYPE_CINEMA 3 +#define DRM_MODE_CONTENT_TYPE_GAME 4 + +/* Aspect ratio flag bitmask (4 bits 22:19) */ +#define DRM_MODE_FLAG_PIC_AR_MASK (0x0F<<19) +#define DRM_MODE_FLAG_PIC_AR_NONE \ + (DRM_MODE_PICTURE_ASPECT_NONE<<19) +#define DRM_MODE_FLAG_PIC_AR_4_3 \ + (DRM_MODE_PICTURE_ASPECT_4_3<<19) +#define DRM_MODE_FLAG_PIC_AR_16_9 \ + (DRM_MODE_PICTURE_ASPECT_16_9<<19) +#define DRM_MODE_FLAG_PIC_AR_64_27 \ + (DRM_MODE_PICTURE_ASPECT_64_27<<19) +#define DRM_MODE_FLAG_PIC_AR_256_135 \ + (DRM_MODE_PICTURE_ASPECT_256_135<<19) + +#define DRM_MODE_FLAG_ALL (DRM_MODE_FLAG_PHSYNC | \ + DRM_MODE_FLAG_NHSYNC | \ + DRM_MODE_FLAG_PVSYNC | \ + DRM_MODE_FLAG_NVSYNC | \ + DRM_MODE_FLAG_INTERLACE | \ + DRM_MODE_FLAG_DBLSCAN | \ + DRM_MODE_FLAG_CSYNC | \ + DRM_MODE_FLAG_PCSYNC | \ + DRM_MODE_FLAG_NCSYNC | \ + DRM_MODE_FLAG_HSKEW | \ + DRM_MODE_FLAG_DBLCLK | \ + DRM_MODE_FLAG_CLKDIV2 | \ + DRM_MODE_FLAG_3D_MASK) + +/* DPMS flags */ +/* bit compatible with the xorg definitions. */ +#define DRM_MODE_DPMS_ON 0 +#define DRM_MODE_DPMS_STANDBY 1 +#define DRM_MODE_DPMS_SUSPEND 2 +#define DRM_MODE_DPMS_OFF 3 + +/* Scaling mode options */ +#define DRM_MODE_SCALE_NONE 0 /* Unmodified timing (display or + software can still scale) */ +#define DRM_MODE_SCALE_FULLSCREEN 1 /* Full screen, ignore aspect */ +#define DRM_MODE_SCALE_CENTER 2 /* Centered, no scaling */ +#define DRM_MODE_SCALE_ASPECT 3 /* Full screen, preserve aspect */ + +/* Dithering mode options */ +#define DRM_MODE_DITHERING_OFF 0 +#define DRM_MODE_DITHERING_ON 1 +#define DRM_MODE_DITHERING_AUTO 2 + +/* Dirty info options */ +#define DRM_MODE_DIRTY_OFF 0 +#define DRM_MODE_DIRTY_ON 1 +#define DRM_MODE_DIRTY_ANNOTATE 2 + +/* Link Status options */ +#define DRM_MODE_LINK_STATUS_GOOD 0 +#define DRM_MODE_LINK_STATUS_BAD 1 + +/* + * DRM_MODE_ROTATE_ + * + * Signals that a drm plane is been rotated degrees in counter + * clockwise direction. + * + * This define is provided as a convenience, looking up the property id + * using the name->prop id lookup is the preferred method. + */ +#define DRM_MODE_ROTATE_0 (1<<0) +#define DRM_MODE_ROTATE_90 (1<<1) +#define DRM_MODE_ROTATE_180 (1<<2) +#define DRM_MODE_ROTATE_270 (1<<3) + +/* + * DRM_MODE_ROTATE_MASK + * + * Bitmask used to look for drm plane rotations. + */ +#define DRM_MODE_ROTATE_MASK (\ + DRM_MODE_ROTATE_0 | \ + DRM_MODE_ROTATE_90 | \ + DRM_MODE_ROTATE_180 | \ + DRM_MODE_ROTATE_270) + +/* + * DRM_MODE_REFLECT_ + * + * Signals that the contents of a drm plane is reflected along the axis, + * in the same way as mirroring. + * See kerneldoc chapter "Plane Composition Properties" for more details. + * + * This define is provided as a convenience, looking up the property id + * using the name->prop id lookup is the preferred method. + */ +#define DRM_MODE_REFLECT_X (1<<4) +#define DRM_MODE_REFLECT_Y (1<<5) + +/* + * DRM_MODE_REFLECT_MASK + * + * Bitmask used to look for drm plane reflections. + */ +#define DRM_MODE_REFLECT_MASK (\ + DRM_MODE_REFLECT_X | \ + DRM_MODE_REFLECT_Y) + +/* Content Protection Flags */ +#define DRM_MODE_CONTENT_PROTECTION_UNDESIRED 0 +#define DRM_MODE_CONTENT_PROTECTION_DESIRED 1 +#define DRM_MODE_CONTENT_PROTECTION_ENABLED 2 + +/** + * struct drm_mode_modeinfo - Display mode information. + * @clock: pixel clock in kHz + * @hdisplay: horizontal display size + * @hsync_start: horizontal sync start + * @hsync_end: horizontal sync end + * @htotal: horizontal total size + * @hskew: horizontal skew + * @vdisplay: vertical display size + * @vsync_start: vertical sync start + * @vsync_end: vertical sync end + * @vtotal: vertical total size + * @vscan: vertical scan + * @vrefresh: approximate vertical refresh rate in Hz + * @flags: bitmask of misc. flags, see DRM_MODE_FLAG_* defines + * @type: bitmask of type flags, see DRM_MODE_TYPE_* defines + * @name: string describing the mode resolution + * + * This is the user-space API display mode information structure. For the + * kernel version see struct drm_display_mode. + */ +struct drm_mode_modeinfo { + __u32 clock; + __u16 hdisplay; + __u16 hsync_start; + __u16 hsync_end; + __u16 htotal; + __u16 hskew; + __u16 vdisplay; + __u16 vsync_start; + __u16 vsync_end; + __u16 vtotal; + __u16 vscan; + + __u32 vrefresh; + + __u32 flags; + __u32 type; + char name[DRM_DISPLAY_MODE_LEN]; +}; + +struct drm_mode_card_res { + __u64 fb_id_ptr; + __u64 crtc_id_ptr; + __u64 connector_id_ptr; + __u64 encoder_id_ptr; + __u32 count_fbs; + __u32 count_crtcs; + __u32 count_connectors; + __u32 count_encoders; + __u32 min_width; + __u32 max_width; + __u32 min_height; + __u32 max_height; +}; + +struct drm_mode_crtc { + __u64 set_connectors_ptr; + __u32 count_connectors; + + __u32 crtc_id; /**< Id */ + __u32 fb_id; /**< Id of framebuffer */ + + __u32 x; /**< x Position on the framebuffer */ + __u32 y; /**< y Position on the framebuffer */ + + __u32 gamma_size; + __u32 mode_valid; + struct drm_mode_modeinfo mode; +}; + +#define DRM_MODE_PRESENT_TOP_FIELD (1<<0) +#define DRM_MODE_PRESENT_BOTTOM_FIELD (1<<1) + +/* Planes blend with or override other bits on the CRTC */ +struct drm_mode_set_plane { + __u32 plane_id; + __u32 crtc_id; + __u32 fb_id; /* fb object contains surface format type */ + __u32 flags; /* see above flags */ + + /* Signed dest location allows it to be partially off screen */ + __s32 crtc_x; + __s32 crtc_y; + __u32 crtc_w; + __u32 crtc_h; + + /* Source values are 16.16 fixed point */ + __u32 src_x; + __u32 src_y; + __u32 src_h; + __u32 src_w; +}; + +/** + * struct drm_mode_get_plane - Get plane metadata. + * + * Userspace can perform a GETPLANE ioctl to retrieve information about a + * plane. + * + * To retrieve the number of formats supported, set @count_format_types to zero + * and call the ioctl. @count_format_types will be updated with the value. + * + * To retrieve these formats, allocate an array with the memory needed to store + * @count_format_types formats. Point @format_type_ptr to this array and call + * the ioctl again (with @count_format_types still set to the value returned in + * the first ioctl call). + */ +struct drm_mode_get_plane { + /** + * @plane_id: Object ID of the plane whose information should be + * retrieved. Set by caller. + */ + __u32 plane_id; + + /** @crtc_id: Object ID of the current CRTC. */ + __u32 crtc_id; + /** @fb_id: Object ID of the current fb. */ + __u32 fb_id; + + /** + * @possible_crtcs: Bitmask of CRTC's compatible with the plane. CRTC's + * are created and they receive an index, which corresponds to their + * position in the bitmask. Bit N corresponds to + * :ref:`CRTC index` N. + */ + __u32 possible_crtcs; + /** @gamma_size: Never used. */ + __u32 gamma_size; + + /** @count_format_types: Number of formats. */ + __u32 count_format_types; + /** + * @format_type_ptr: Pointer to ``__u32`` array of formats that are + * supported by the plane. These formats do not require modifiers. + */ + __u64 format_type_ptr; +}; + +struct drm_mode_get_plane_res { + __u64 plane_id_ptr; + __u32 count_planes; +}; + +#define DRM_MODE_ENCODER_NONE 0 +#define DRM_MODE_ENCODER_DAC 1 +#define DRM_MODE_ENCODER_TMDS 2 +#define DRM_MODE_ENCODER_LVDS 3 +#define DRM_MODE_ENCODER_TVDAC 4 +#define DRM_MODE_ENCODER_VIRTUAL 5 +#define DRM_MODE_ENCODER_DSI 6 +#define DRM_MODE_ENCODER_DPMST 7 +#define DRM_MODE_ENCODER_DPI 8 + +struct drm_mode_get_encoder { + __u32 encoder_id; + __u32 encoder_type; + + __u32 crtc_id; /**< Id of crtc */ + + __u32 possible_crtcs; + __u32 possible_clones; +}; + +/* This is for connectors with multiple signal types. */ +/* Try to match DRM_MODE_CONNECTOR_X as closely as possible. */ +enum drm_mode_subconnector { + DRM_MODE_SUBCONNECTOR_Automatic = 0, /* DVI-I, TV */ + DRM_MODE_SUBCONNECTOR_Unknown = 0, /* DVI-I, TV, DP */ + DRM_MODE_SUBCONNECTOR_VGA = 1, /* DP */ + DRM_MODE_SUBCONNECTOR_DVID = 3, /* DVI-I DP */ + DRM_MODE_SUBCONNECTOR_DVIA = 4, /* DVI-I */ + DRM_MODE_SUBCONNECTOR_Composite = 5, /* TV */ + DRM_MODE_SUBCONNECTOR_SVIDEO = 6, /* TV */ + DRM_MODE_SUBCONNECTOR_Component = 8, /* TV */ + DRM_MODE_SUBCONNECTOR_SCART = 9, /* TV */ + DRM_MODE_SUBCONNECTOR_DisplayPort = 10, /* DP */ + DRM_MODE_SUBCONNECTOR_HDMIA = 11, /* DP */ + DRM_MODE_SUBCONNECTOR_Native = 15, /* DP */ + DRM_MODE_SUBCONNECTOR_Wireless = 18, /* DP */ +}; + +#define DRM_MODE_CONNECTOR_Unknown 0 +#define DRM_MODE_CONNECTOR_VGA 1 +#define DRM_MODE_CONNECTOR_DVII 2 +#define DRM_MODE_CONNECTOR_DVID 3 +#define DRM_MODE_CONNECTOR_DVIA 4 +#define DRM_MODE_CONNECTOR_Composite 5 +#define DRM_MODE_CONNECTOR_SVIDEO 6 +#define DRM_MODE_CONNECTOR_LVDS 7 +#define DRM_MODE_CONNECTOR_Component 8 +#define DRM_MODE_CONNECTOR_9PinDIN 9 +#define DRM_MODE_CONNECTOR_DisplayPort 10 +#define DRM_MODE_CONNECTOR_HDMIA 11 +#define DRM_MODE_CONNECTOR_HDMIB 12 +#define DRM_MODE_CONNECTOR_TV 13 +#define DRM_MODE_CONNECTOR_eDP 14 +#define DRM_MODE_CONNECTOR_VIRTUAL 15 +#define DRM_MODE_CONNECTOR_DSI 16 +#define DRM_MODE_CONNECTOR_DPI 17 +#define DRM_MODE_CONNECTOR_WRITEBACK 18 +#define DRM_MODE_CONNECTOR_SPI 19 +#define DRM_MODE_CONNECTOR_USB 20 + +/** + * struct drm_mode_get_connector - Get connector metadata. + * + * User-space can perform a GETCONNECTOR ioctl to retrieve information about a + * connector. User-space is expected to retrieve encoders, modes and properties + * by performing this ioctl at least twice: the first time to retrieve the + * number of elements, the second time to retrieve the elements themselves. + * + * To retrieve the number of elements, set @count_props and @count_encoders to + * zero, set @count_modes to 1, and set @modes_ptr to a temporary struct + * drm_mode_modeinfo element. + * + * To retrieve the elements, allocate arrays for @encoders_ptr, @modes_ptr, + * @props_ptr and @prop_values_ptr, then set @count_modes, @count_props and + * @count_encoders to their capacity. + * + * Performing the ioctl only twice may be racy: the number of elements may have + * changed with a hotplug event in-between the two ioctls. User-space is + * expected to retry the last ioctl until the number of elements stabilizes. + * The kernel won't fill any array which doesn't have the expected length. + * + * **Force-probing a connector** + * + * If the @count_modes field is set to zero and the DRM client is the current + * DRM master, the kernel will perform a forced probe on the connector to + * refresh the connector status, modes and EDID. A forced-probe can be slow, + * might cause flickering and the ioctl will block. + * + * User-space needs to force-probe connectors to ensure their metadata is + * up-to-date at startup and after receiving a hot-plug event. User-space + * may perform a forced-probe when the user explicitly requests it. User-space + * shouldn't perform a forced-probe in other situations. + */ +struct drm_mode_get_connector { + /** @encoders_ptr: Pointer to ``__u32`` array of object IDs. */ + __u64 encoders_ptr; + /** @modes_ptr: Pointer to struct drm_mode_modeinfo array. */ + __u64 modes_ptr; + /** @props_ptr: Pointer to ``__u32`` array of property IDs. */ + __u64 props_ptr; + /** @prop_values_ptr: Pointer to ``__u64`` array of property values. */ + __u64 prop_values_ptr; + + /** @count_modes: Number of modes. */ + __u32 count_modes; + /** @count_props: Number of properties. */ + __u32 count_props; + /** @count_encoders: Number of encoders. */ + __u32 count_encoders; + + /** @encoder_id: Object ID of the current encoder. */ + __u32 encoder_id; + /** @connector_id: Object ID of the connector. */ + __u32 connector_id; + /** + * @connector_type: Type of the connector. + * + * See DRM_MODE_CONNECTOR_* defines. + */ + __u32 connector_type; + /** + * @connector_type_id: Type-specific connector number. + * + * This is not an object ID. This is a per-type connector number. Each + * (type, type_id) combination is unique across all connectors of a DRM + * device. + * + * The (type, type_id) combination is not a stable identifier: the + * type_id can change depending on the driver probe order. + */ + __u32 connector_type_id; + + /** + * @connection: Status of the connector. + * + * See enum drm_connector_status. + */ + __u32 connection; + /** @mm_width: Width of the connected sink in millimeters. */ + __u32 mm_width; + /** @mm_height: Height of the connected sink in millimeters. */ + __u32 mm_height; + /** + * @subpixel: Subpixel order of the connected sink. + * + * See enum subpixel_order. + */ + __u32 subpixel; + + /** @pad: Padding, must be zero. */ + __u32 pad; +}; + +#define DRM_MODE_PROP_PENDING (1<<0) /* deprecated, do not use */ +#define DRM_MODE_PROP_RANGE (1<<1) +#define DRM_MODE_PROP_IMMUTABLE (1<<2) +#define DRM_MODE_PROP_ENUM (1<<3) /* enumerated type with text strings */ +#define DRM_MODE_PROP_BLOB (1<<4) +#define DRM_MODE_PROP_BITMASK (1<<5) /* bitmask of enumerated types */ + +/* non-extended types: legacy bitmask, one bit per type: */ +#define DRM_MODE_PROP_LEGACY_TYPE ( \ + DRM_MODE_PROP_RANGE | \ + DRM_MODE_PROP_ENUM | \ + DRM_MODE_PROP_BLOB | \ + DRM_MODE_PROP_BITMASK) + +/* extended-types: rather than continue to consume a bit per type, + * grab a chunk of the bits to use as integer type id. + */ +#define DRM_MODE_PROP_EXTENDED_TYPE 0x0000ffc0 +#define DRM_MODE_PROP_TYPE(n) ((n) << 6) +#define DRM_MODE_PROP_OBJECT DRM_MODE_PROP_TYPE(1) +#define DRM_MODE_PROP_SIGNED_RANGE DRM_MODE_PROP_TYPE(2) + +/* the PROP_ATOMIC flag is used to hide properties from userspace that + * is not aware of atomic properties. This is mostly to work around + * older userspace (DDX drivers) that read/write each prop they find, + * without being aware that this could be triggering a lengthy modeset. + */ +#define DRM_MODE_PROP_ATOMIC 0x80000000 + +/** + * struct drm_mode_property_enum - Description for an enum/bitfield entry. + * @value: numeric value for this enum entry. + * @name: symbolic name for this enum entry. + * + * See struct drm_property_enum for details. + */ +struct drm_mode_property_enum { + __u64 value; + char name[DRM_PROP_NAME_LEN]; +}; + +/** + * struct drm_mode_get_property - Get property metadata. + * + * User-space can perform a GETPROPERTY ioctl to retrieve information about a + * property. The same property may be attached to multiple objects, see + * "Modeset Base Object Abstraction". + * + * The meaning of the @values_ptr field changes depending on the property type. + * See &drm_property.flags for more details. + * + * The @enum_blob_ptr and @count_enum_blobs fields are only meaningful when the + * property has the type &DRM_MODE_PROP_ENUM or &DRM_MODE_PROP_BITMASK. For + * backwards compatibility, the kernel will always set @count_enum_blobs to + * zero when the property has the type &DRM_MODE_PROP_BLOB. User-space must + * ignore these two fields if the property has a different type. + * + * User-space is expected to retrieve values and enums by performing this ioctl + * at least twice: the first time to retrieve the number of elements, the + * second time to retrieve the elements themselves. + * + * To retrieve the number of elements, set @count_values and @count_enum_blobs + * to zero, then call the ioctl. @count_values will be updated with the number + * of elements. If the property has the type &DRM_MODE_PROP_ENUM or + * &DRM_MODE_PROP_BITMASK, @count_enum_blobs will be updated as well. + * + * To retrieve the elements themselves, allocate an array for @values_ptr and + * set @count_values to its capacity. If the property has the type + * &DRM_MODE_PROP_ENUM or &DRM_MODE_PROP_BITMASK, allocate an array for + * @enum_blob_ptr and set @count_enum_blobs to its capacity. Calling the ioctl + * again will fill the arrays. + */ +struct drm_mode_get_property { + /** @values_ptr: Pointer to a ``__u64`` array. */ + __u64 values_ptr; + /** @enum_blob_ptr: Pointer to a struct drm_mode_property_enum array. */ + __u64 enum_blob_ptr; + + /** + * @prop_id: Object ID of the property which should be retrieved. Set + * by the caller. + */ + __u32 prop_id; + /** + * @flags: ``DRM_MODE_PROP_*`` bitfield. See &drm_property.flags for + * a definition of the flags. + */ + __u32 flags; + /** + * @name: Symbolic property name. User-space should use this field to + * recognize properties. + */ + char name[DRM_PROP_NAME_LEN]; + + /** @count_values: Number of elements in @values_ptr. */ + __u32 count_values; + /** @count_enum_blobs: Number of elements in @enum_blob_ptr. */ + __u32 count_enum_blobs; +}; + +struct drm_mode_connector_set_property { + __u64 value; + __u32 prop_id; + __u32 connector_id; +}; + +#define DRM_MODE_OBJECT_CRTC 0xcccccccc +#define DRM_MODE_OBJECT_CONNECTOR 0xc0c0c0c0 +#define DRM_MODE_OBJECT_ENCODER 0xe0e0e0e0 +#define DRM_MODE_OBJECT_MODE 0xdededede +#define DRM_MODE_OBJECT_PROPERTY 0xb0b0b0b0 +#define DRM_MODE_OBJECT_FB 0xfbfbfbfb +#define DRM_MODE_OBJECT_BLOB 0xbbbbbbbb +#define DRM_MODE_OBJECT_PLANE 0xeeeeeeee +#define DRM_MODE_OBJECT_ANY 0 + +struct drm_mode_obj_get_properties { + __u64 props_ptr; + __u64 prop_values_ptr; + __u32 count_props; + __u32 obj_id; + __u32 obj_type; +}; + +struct drm_mode_obj_set_property { + __u64 value; + __u32 prop_id; + __u32 obj_id; + __u32 obj_type; +}; + +struct drm_mode_get_blob { + __u32 blob_id; + __u32 length; + __u64 data; +}; + +struct drm_mode_fb_cmd { + __u32 fb_id; + __u32 width; + __u32 height; + __u32 pitch; + __u32 bpp; + __u32 depth; + /* driver specific handle */ + __u32 handle; +}; + +#define DRM_MODE_FB_INTERLACED (1<<0) /* for interlaced framebuffers */ +#define DRM_MODE_FB_MODIFIERS (1<<1) /* enables ->modifier[] */ + +/** + * struct drm_mode_fb_cmd2 - Frame-buffer metadata. + * + * This struct holds frame-buffer metadata. There are two ways to use it: + * + * - User-space can fill this struct and perform a &DRM_IOCTL_MODE_ADDFB2 + * ioctl to register a new frame-buffer. The new frame-buffer object ID will + * be set by the kernel in @fb_id. + * - User-space can set @fb_id and perform a &DRM_IOCTL_MODE_GETFB2 ioctl to + * fetch metadata about an existing frame-buffer. + * + * In case of planar formats, this struct allows up to 4 buffer objects with + * offsets and pitches per plane. The pitch and offset order are dictated by + * the format FourCC as defined by ``drm_fourcc.h``, e.g. NV12 is described as: + * + * YUV 4:2:0 image with a plane of 8-bit Y samples followed by an + * interleaved U/V plane containing 8-bit 2x2 subsampled colour difference + * samples. + * + * So it would consist of a Y plane at ``offsets[0]`` and a UV plane at + * ``offsets[1]``. + * + * To accommodate tiled, compressed, etc formats, a modifier can be specified. + * For more information see the "Format Modifiers" section. Note that even + * though it looks like we have a modifier per-plane, we in fact do not. The + * modifier for each plane must be identical. Thus all combinations of + * different data layouts for multi-plane formats must be enumerated as + * separate modifiers. + * + * All of the entries in @handles, @pitches, @offsets and @modifier must be + * zero when unused. Warning, for @offsets and @modifier zero can't be used to + * figure out whether the entry is used or not since it's a valid value (a zero + * offset is common, and a zero modifier is &DRM_FORMAT_MOD_LINEAR). + */ +struct drm_mode_fb_cmd2 { + /** @fb_id: Object ID of the frame-buffer. */ + __u32 fb_id; + /** @width: Width of the frame-buffer. */ + __u32 width; + /** @height: Height of the frame-buffer. */ + __u32 height; + /** + * @pixel_format: FourCC format code, see ``DRM_FORMAT_*`` constants in + * ``drm_fourcc.h``. + */ + __u32 pixel_format; + /** + * @flags: Frame-buffer flags (see &DRM_MODE_FB_INTERLACED and + * &DRM_MODE_FB_MODIFIERS). + */ + __u32 flags; + + /** + * @handles: GEM buffer handle, one per plane. Set to 0 if the plane is + * unused. The same handle can be used for multiple planes. + */ + __u32 handles[4]; + /** @pitches: Pitch (aka. stride) in bytes, one per plane. */ + __u32 pitches[4]; + /** @offsets: Offset into the buffer in bytes, one per plane. */ + __u32 offsets[4]; + /** + * @modifier: Format modifier, one per plane. See ``DRM_FORMAT_MOD_*`` + * constants in ``drm_fourcc.h``. All planes must use the same + * modifier. Ignored unless &DRM_MODE_FB_MODIFIERS is set in @flags. + */ + __u64 modifier[4]; +}; + +#define DRM_MODE_FB_DIRTY_ANNOTATE_COPY 0x01 +#define DRM_MODE_FB_DIRTY_ANNOTATE_FILL 0x02 +#define DRM_MODE_FB_DIRTY_FLAGS 0x03 + +#define DRM_MODE_FB_DIRTY_MAX_CLIPS 256 + +/* + * Mark a region of a framebuffer as dirty. + * + * Some hardware does not automatically update display contents + * as a hardware or software draw to a framebuffer. This ioctl + * allows userspace to tell the kernel and the hardware what + * regions of the framebuffer have changed. + * + * The kernel or hardware is free to update more then just the + * region specified by the clip rects. The kernel or hardware + * may also delay and/or coalesce several calls to dirty into a + * single update. + * + * Userspace may annotate the updates, the annotates are a + * promise made by the caller that the change is either a copy + * of pixels or a fill of a single color in the region specified. + * + * If the DRM_MODE_FB_DIRTY_ANNOTATE_COPY flag is given then + * the number of updated regions are half of num_clips given, + * where the clip rects are paired in src and dst. The width and + * height of each one of the pairs must match. + * + * If the DRM_MODE_FB_DIRTY_ANNOTATE_FILL flag is given the caller + * promises that the region specified of the clip rects is filled + * completely with a single color as given in the color argument. + */ + +struct drm_mode_fb_dirty_cmd { + __u32 fb_id; + __u32 flags; + __u32 color; + __u32 num_clips; + __u64 clips_ptr; +}; + +struct drm_mode_mode_cmd { + __u32 connector_id; + struct drm_mode_modeinfo mode; +}; + +#define DRM_MODE_CURSOR_BO 0x01 +#define DRM_MODE_CURSOR_MOVE 0x02 +#define DRM_MODE_CURSOR_FLAGS 0x03 + +/* + * depending on the value in flags different members are used. + * + * CURSOR_BO uses + * crtc_id + * width + * height + * handle - if 0 turns the cursor off + * + * CURSOR_MOVE uses + * crtc_id + * x + * y + */ +struct drm_mode_cursor { + __u32 flags; + __u32 crtc_id; + __s32 x; + __s32 y; + __u32 width; + __u32 height; + /* driver specific handle */ + __u32 handle; +}; + +struct drm_mode_cursor2 { + __u32 flags; + __u32 crtc_id; + __s32 x; + __s32 y; + __u32 width; + __u32 height; + /* driver specific handle */ + __u32 handle; + __s32 hot_x; + __s32 hot_y; +}; + +struct drm_mode_crtc_lut { + __u32 crtc_id; + __u32 gamma_size; + + /* pointers to arrays */ + __u64 red; + __u64 green; + __u64 blue; +}; + +struct drm_color_ctm { + /* + * Conversion matrix in S31.32 sign-magnitude + * (not two's complement!) format. + * + * out matrix in + * |R| |0 1 2| |R| + * |G| = |3 4 5| x |G| + * |B| |6 7 8| |B| + */ + __u64 matrix[9]; +}; + +struct drm_color_lut { + /* + * Values are mapped linearly to 0.0 - 1.0 range, with 0x0 == 0.0 and + * 0xffff == 1.0. + */ + __u16 red; + __u16 green; + __u16 blue; + __u16 reserved; +}; + +/** + * struct drm_plane_size_hint - Plane size hints + * @width: The width of the plane in pixel + * @height: The height of the plane in pixel + * + * The plane SIZE_HINTS property blob contains an + * array of struct drm_plane_size_hint. + */ +struct drm_plane_size_hint { + __u16 width; + __u16 height; +}; + +/** + * struct hdr_metadata_infoframe - HDR Metadata Infoframe Data. + * + * HDR Metadata Infoframe as per CTA 861.G spec. This is expected + * to match exactly with the spec. + * + * Userspace is expected to pass the metadata information as per + * the format described in this structure. + */ +struct hdr_metadata_infoframe { + /** + * @eotf: Electro-Optical Transfer Function (EOTF) + * used in the stream. + */ + __u8 eotf; + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ + __u8 metadata_type; + /** + * @display_primaries: Color Primaries of the Data. + * These are coded as unsigned 16-bit values in units of + * 0.00002, where 0x0000 represents zero and 0xC350 + * represents 1.0000. + * @display_primaries.x: X coordinate of color primary. + * @display_primaries.y: Y coordinate of color primary. + */ + struct { + __u16 x, y; + } display_primaries[3]; + /** + * @white_point: White Point of Colorspace Data. + * These are coded as unsigned 16-bit values in units of + * 0.00002, where 0x0000 represents zero and 0xC350 + * represents 1.0000. + * @white_point.x: X coordinate of whitepoint of color primary. + * @white_point.y: Y coordinate of whitepoint of color primary. + */ + struct { + __u16 x, y; + } white_point; + /** + * @max_display_mastering_luminance: Max Mastering Display Luminance. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_display_mastering_luminance; + /** + * @min_display_mastering_luminance: Min Mastering Display Luminance. + * This value is coded as an unsigned 16-bit value in units of + * 0.0001 cd/m2, where 0x0001 represents 0.0001 cd/m2 and 0xFFFF + * represents 6.5535 cd/m2. + */ + __u16 min_display_mastering_luminance; + /** + * @max_cll: Max Content Light Level. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_cll; + /** + * @max_fall: Max Frame Average Light Level. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ + __u16 max_fall; +}; + +/** + * struct hdr_output_metadata - HDR output metadata + * + * Metadata Information to be passed from userspace + */ +struct hdr_output_metadata { + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ + __u32 metadata_type; + /** + * @hdmi_metadata_type1: HDR Metadata Infoframe. + */ + union { + struct hdr_metadata_infoframe hdmi_metadata_type1; + }; +}; + +/** + * DRM_MODE_PAGE_FLIP_EVENT + * + * Request that the kernel sends back a vblank event (see + * struct drm_event_vblank) with the &DRM_EVENT_FLIP_COMPLETE type when the + * page-flip is done. + */ +#define DRM_MODE_PAGE_FLIP_EVENT 0x01 +/** + * DRM_MODE_PAGE_FLIP_ASYNC + * + * Request that the page-flip is performed as soon as possible, ie. with no + * delay due to waiting for vblank. This may cause tearing to be visible on + * the screen. + * + * When used with atomic uAPI, the driver will return an error if the hardware + * doesn't support performing an asynchronous page-flip for this update. + * User-space should handle this, e.g. by falling back to a regular page-flip. + * + * Note, some hardware might need to perform one last synchronous page-flip + * before being able to switch to asynchronous page-flips. As an exception, + * the driver will return success even though that first page-flip is not + * asynchronous. + */ +#define DRM_MODE_PAGE_FLIP_ASYNC 0x02 +#define DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE 0x4 +#define DRM_MODE_PAGE_FLIP_TARGET_RELATIVE 0x8 +#define DRM_MODE_PAGE_FLIP_TARGET (DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE | \ + DRM_MODE_PAGE_FLIP_TARGET_RELATIVE) +/** + * DRM_MODE_PAGE_FLIP_FLAGS + * + * Bitmask of flags suitable for &drm_mode_crtc_page_flip_target.flags. + */ +#define DRM_MODE_PAGE_FLIP_FLAGS (DRM_MODE_PAGE_FLIP_EVENT | \ + DRM_MODE_PAGE_FLIP_ASYNC | \ + DRM_MODE_PAGE_FLIP_TARGET) + +/* + * Request a page flip on the specified crtc. + * + * This ioctl will ask KMS to schedule a page flip for the specified + * crtc. Once any pending rendering targeting the specified fb (as of + * ioctl time) has completed, the crtc will be reprogrammed to display + * that fb after the next vertical refresh. The ioctl returns + * immediately, but subsequent rendering to the current fb will block + * in the execbuffer ioctl until the page flip happens. If a page + * flip is already pending as the ioctl is called, EBUSY will be + * returned. + * + * Flag DRM_MODE_PAGE_FLIP_EVENT requests that drm sends back a vblank + * event (see drm.h: struct drm_event_vblank) when the page flip is + * done. The user_data field passed in with this ioctl will be + * returned as the user_data field in the vblank event struct. + * + * Flag DRM_MODE_PAGE_FLIP_ASYNC requests that the flip happen + * 'as soon as possible', meaning that it not delay waiting for vblank. + * This may cause tearing on the screen. + * + * The reserved field must be zero. + */ + +struct drm_mode_crtc_page_flip { + __u32 crtc_id; + __u32 fb_id; + __u32 flags; + __u32 reserved; + __u64 user_data; +}; + +/* + * Request a page flip on the specified crtc. + * + * Same as struct drm_mode_crtc_page_flip, but supports new flags and + * re-purposes the reserved field: + * + * The sequence field must be zero unless either of the + * DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags is specified. When + * the ABSOLUTE flag is specified, the sequence field denotes the absolute + * vblank sequence when the flip should take effect. When the RELATIVE + * flag is specified, the sequence field denotes the relative (to the + * current one when the ioctl is called) vblank sequence when the flip + * should take effect. NOTE: DRM_IOCTL_WAIT_VBLANK must still be used to + * make sure the vblank sequence before the target one has passed before + * calling this ioctl. The purpose of the + * DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags is merely to clarify + * the target for when code dealing with a page flip runs during a + * vertical blank period. + */ + +struct drm_mode_crtc_page_flip_target { + __u32 crtc_id; + __u32 fb_id; + __u32 flags; + __u32 sequence; + __u64 user_data; +}; + +/** + * struct drm_mode_create_dumb - Create a KMS dumb buffer for scanout. + * @height: buffer height in pixels + * @width: buffer width in pixels + * @bpp: bits per pixel + * @flags: must be zero + * @handle: buffer object handle + * @pitch: number of bytes between two consecutive lines + * @size: size of the whole buffer in bytes + * + * User-space fills @height, @width, @bpp and @flags. If the IOCTL succeeds, + * the kernel fills @handle, @pitch and @size. + */ +struct drm_mode_create_dumb { + __u32 height; + __u32 width; + __u32 bpp; + __u32 flags; + + __u32 handle; + __u32 pitch; + __u64 size; +}; + +/* set up for mmap of a dumb scanout buffer */ +struct drm_mode_map_dumb { + /** Handle for the object being mapped. */ + __u32 handle; + __u32 pad; + /** + * Fake offset to use for subsequent mmap call + * + * This is a fixed-size type for 32/64 compatibility. + */ + __u64 offset; +}; + +struct drm_mode_destroy_dumb { + __u32 handle; +}; + +/** + * DRM_MODE_ATOMIC_TEST_ONLY + * + * Do not apply the atomic commit, instead check whether the hardware supports + * this configuration. + * + * See &drm_mode_config_funcs.atomic_check for more details on test-only + * commits. + */ +#define DRM_MODE_ATOMIC_TEST_ONLY 0x0100 +/** + * DRM_MODE_ATOMIC_NONBLOCK + * + * Do not block while applying the atomic commit. The &DRM_IOCTL_MODE_ATOMIC + * IOCTL returns immediately instead of waiting for the changes to be applied + * in hardware. Note, the driver will still check that the update can be + * applied before retuning. + */ +#define DRM_MODE_ATOMIC_NONBLOCK 0x0200 +/** + * DRM_MODE_ATOMIC_ALLOW_MODESET + * + * Allow the update to result in temporary or transient visible artifacts while + * the update is being applied. Applying the update may also take significantly + * more time than a page flip. All visual artifacts will disappear by the time + * the update is completed, as signalled through the vblank event's timestamp + * (see struct drm_event_vblank). + * + * This flag must be set when the KMS update might cause visible artifacts. + * Without this flag such KMS update will return a EINVAL error. What kind of + * update may cause visible artifacts depends on the driver and the hardware. + * User-space that needs to know beforehand if an update might cause visible + * artifacts can use &DRM_MODE_ATOMIC_TEST_ONLY without + * &DRM_MODE_ATOMIC_ALLOW_MODESET to see if it fails. + * + * To the best of the driver's knowledge, visual artifacts are guaranteed to + * not appear when this flag is not set. Some sinks might display visual + * artifacts outside of the driver's control. + */ +#define DRM_MODE_ATOMIC_ALLOW_MODESET 0x0400 + +/** + * DRM_MODE_ATOMIC_FLAGS + * + * Bitfield of flags accepted by the &DRM_IOCTL_MODE_ATOMIC IOCTL in + * &drm_mode_atomic.flags. + */ +#define DRM_MODE_ATOMIC_FLAGS (\ + DRM_MODE_PAGE_FLIP_EVENT |\ + DRM_MODE_PAGE_FLIP_ASYNC |\ + DRM_MODE_ATOMIC_TEST_ONLY |\ + DRM_MODE_ATOMIC_NONBLOCK |\ + DRM_MODE_ATOMIC_ALLOW_MODESET) + +struct drm_mode_atomic { + __u32 flags; + __u32 count_objs; + __u64 objs_ptr; + __u64 count_props_ptr; + __u64 props_ptr; + __u64 prop_values_ptr; + __u64 reserved; + __u64 user_data; +}; + +struct drm_format_modifier_blob { +#define FORMAT_BLOB_CURRENT 1 + /* Version of this blob format */ + __u32 version; + + /* Flags */ + __u32 flags; + + /* Number of fourcc formats supported */ + __u32 count_formats; + + /* Where in this blob the formats exist (in bytes) */ + __u32 formats_offset; + + /* Number of drm_format_modifiers */ + __u32 count_modifiers; + + /* Where in this blob the modifiers exist (in bytes) */ + __u32 modifiers_offset; + + /* __u32 formats[] */ + /* struct drm_format_modifier modifiers[] */ +}; + +struct drm_format_modifier { + /* Bitmask of formats in get_plane format list this info applies to. The + * offset allows a sliding window of which 64 formats (bits). + * + * Some examples: + * In today's world with < 65 formats, and formats 0, and 2 are + * supported + * 0x0000000000000005 + * ^-offset = 0, formats = 5 + * + * If the number formats grew to 128, and formats 98-102 are + * supported with the modifier: + * + * 0x0000007c00000000 0000000000000000 + * ^ + * |__offset = 64, formats = 0x7c00000000 + * + */ + __u64 formats; + __u32 offset; + __u32 pad; + + /* The modifier that applies to the >get_plane format list bitmask. */ + __u64 modifier; +}; + +/** + * struct drm_mode_create_blob - Create New blob property + * + * Create a new 'blob' data property, copying length bytes from data pointer, + * and returning new blob ID. + */ +struct drm_mode_create_blob { + /** @data: Pointer to data to copy. */ + __u64 data; + /** @length: Length of data to copy. */ + __u32 length; + /** @blob_id: Return: new property ID. */ + __u32 blob_id; +}; + +/** + * struct drm_mode_destroy_blob - Destroy user blob + * @blob_id: blob_id to destroy + * + * Destroy a user-created blob property. + * + * User-space can release blobs as soon as they do not need to refer to them by + * their blob object ID. For instance, if you are using a MODE_ID blob in an + * atomic commit and you will not make another commit re-using the same ID, you + * can destroy the blob as soon as the commit has been issued, without waiting + * for it to complete. + */ +struct drm_mode_destroy_blob { + __u32 blob_id; +}; + +/** + * struct drm_mode_create_lease - Create lease + * + * Lease mode resources, creating another drm_master. + * + * The @object_ids array must reference at least one CRTC, one connector and + * one plane if &DRM_CLIENT_CAP_UNIVERSAL_PLANES is enabled. Alternatively, + * the lease can be completely empty. + */ +struct drm_mode_create_lease { + /** @object_ids: Pointer to array of object ids (__u32) */ + __u64 object_ids; + /** @object_count: Number of object ids */ + __u32 object_count; + /** @flags: flags for new FD (O_CLOEXEC, etc) */ + __u32 flags; + + /** @lessee_id: Return: unique identifier for lessee. */ + __u32 lessee_id; + /** @fd: Return: file descriptor to new drm_master file */ + __u32 fd; +}; + +/** + * struct drm_mode_list_lessees - List lessees + * + * List lesses from a drm_master. + */ +struct drm_mode_list_lessees { + /** + * @count_lessees: Number of lessees. + * + * On input, provides length of the array. + * On output, provides total number. No + * more than the input number will be written + * back, so two calls can be used to get + * the size and then the data. + */ + __u32 count_lessees; + /** @pad: Padding. */ + __u32 pad; + + /** + * @lessees_ptr: Pointer to lessees. + * + * Pointer to __u64 array of lessee ids + */ + __u64 lessees_ptr; +}; + +/** + * struct drm_mode_get_lease - Get Lease + * + * Get leased objects. + */ +struct drm_mode_get_lease { + /** + * @count_objects: Number of leased objects. + * + * On input, provides length of the array. + * On output, provides total number. No + * more than the input number will be written + * back, so two calls can be used to get + * the size and then the data. + */ + __u32 count_objects; + /** @pad: Padding. */ + __u32 pad; + + /** + * @objects_ptr: Pointer to objects. + * + * Pointer to __u32 array of object ids. + */ + __u64 objects_ptr; +}; + +/** + * struct drm_mode_revoke_lease - Revoke lease + */ +struct drm_mode_revoke_lease { + /** @lessee_id: Unique ID of lessee */ + __u32 lessee_id; +}; + +/** + * struct drm_mode_rect - Two dimensional rectangle. + * @x1: Horizontal starting coordinate (inclusive). + * @y1: Vertical starting coordinate (inclusive). + * @x2: Horizontal ending coordinate (exclusive). + * @y2: Vertical ending coordinate (exclusive). + * + * With drm subsystem using struct drm_rect to manage rectangular area this + * export it to user-space. + * + * Currently used by drm_mode_atomic blob property FB_DAMAGE_CLIPS. + */ +struct drm_mode_rect { + __s32 x1; + __s32 y1; + __s32 x2; + __s32 y2; +}; + +/** + * struct drm_mode_closefb + * @fb_id: Framebuffer ID. + * @pad: Must be zero. + */ +struct drm_mode_closefb { + __u32 fb_id; + __u32 pad; +}; + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/plugins/amdgpu/kfd_ioctl.h b/plugins/amdgpu/kfd_ioctl.h index b88fe20cf..a63d453f0 100644 --- a/plugins/amdgpu/kfd_ioctl.h +++ b/plugins/amdgpu/kfd_ioctl.h @@ -23,9 +23,12 @@ #ifndef KFD_IOCTL_H_INCLUDED #define KFD_IOCTL_H_INCLUDED -#include #include +/* Define __user as empty for kernel headers in user-space */ +#define __user +#include "drm.h" + /* * - 1.1 - initial version * - 1.3 - Add SMI events support @@ -39,8 +42,8 @@ #define KFD_IOCTL_MINOR_VERSION 8 struct kfd_ioctl_get_version_args { - __u32 major_version; /* from KFD */ - __u32 minor_version; /* from KFD */ + uint32_t major_version; /* from KFD */ + uint32_t minor_version; /* from KFD */ }; /* For kfd_ioctl_create_queue_args.queue_type. */ @@ -53,51 +56,51 @@ struct kfd_ioctl_get_version_args { #define KFD_MAX_QUEUE_PRIORITY 15 struct kfd_ioctl_create_queue_args { - __u64 ring_base_address; /* to KFD */ - __u64 write_pointer_address; /* from KFD */ - __u64 read_pointer_address; /* from KFD */ - __u64 doorbell_offset; /* from KFD */ + uint64_t ring_base_address; /* to KFD */ + uint64_t write_pointer_address; /* from KFD */ + uint64_t read_pointer_address; /* from KFD */ + uint64_t doorbell_offset; /* from KFD */ - __u32 ring_size; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 queue_type; /* to KFD */ - __u32 queue_percentage; /* to KFD */ - __u32 queue_priority; /* to KFD */ - __u32 queue_id; /* from KFD */ + uint32_t ring_size; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t queue_type; /* to KFD */ + uint32_t queue_percentage; /* to KFD */ + uint32_t queue_priority; /* to KFD */ + uint32_t queue_id; /* from KFD */ - __u64 eop_buffer_address; /* to KFD */ - __u64 eop_buffer_size; /* to KFD */ - __u64 ctx_save_restore_address; /* to KFD */ - __u32 ctx_save_restore_size; /* to KFD */ - __u32 ctl_stack_size; /* to KFD */ + uint64_t eop_buffer_address; /* to KFD */ + uint64_t eop_buffer_size; /* to KFD */ + uint64_t ctx_save_restore_address; /* to KFD */ + uint32_t ctx_save_restore_size; /* to KFD */ + uint32_t ctl_stack_size; /* to KFD */ }; struct kfd_ioctl_destroy_queue_args { - __u32 queue_id; /* to KFD */ - __u32 pad; + uint32_t queue_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_update_queue_args { - __u64 ring_base_address; /* to KFD */ + uint64_t ring_base_address; /* to KFD */ - __u32 queue_id; /* to KFD */ - __u32 ring_size; /* to KFD */ - __u32 queue_percentage; /* to KFD */ - __u32 queue_priority; /* to KFD */ + uint32_t queue_id; /* to KFD */ + uint32_t ring_size; /* to KFD */ + uint32_t queue_percentage; /* to KFD */ + uint32_t queue_priority; /* to KFD */ }; struct kfd_ioctl_set_cu_mask_args { - __u32 queue_id; /* to KFD */ - __u32 num_cu_mask; /* to KFD */ - __u64 cu_mask_ptr; /* to KFD */ + uint32_t queue_id; /* to KFD */ + uint32_t num_cu_mask; /* to KFD */ + uint64_t cu_mask_ptr; /* to KFD */ }; struct kfd_ioctl_get_queue_wave_state_args { - __u64 ctl_stack_address; /* to KFD */ - __u32 ctl_stack_used_size; /* from KFD */ - __u32 save_area_used_size; /* from KFD */ - __u32 queue_id; /* to KFD */ - __u32 pad; + uint64_t ctl_stack_address; /* to KFD */ + uint32_t ctl_stack_used_size; /* from KFD */ + uint32_t save_area_used_size; /* from KFD */ + uint32_t queue_id; /* to KFD */ + uint32_t pad; }; /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ @@ -105,13 +108,13 @@ struct kfd_ioctl_get_queue_wave_state_args { #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 struct kfd_ioctl_set_memory_policy_args { - __u64 alternate_aperture_base; /* to KFD */ - __u64 alternate_aperture_size; /* to KFD */ + uint64_t alternate_aperture_base; /* to KFD */ + uint64_t alternate_aperture_size; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 default_policy; /* to KFD */ - __u32 alternate_policy; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t default_policy; /* to KFD */ + uint32_t alternate_policy; /* to KFD */ + uint32_t pad; }; /* @@ -122,24 +125,24 @@ struct kfd_ioctl_set_memory_policy_args { */ struct kfd_ioctl_get_clock_counters_args { - __u64 gpu_clock_counter; /* from KFD */ - __u64 cpu_clock_counter; /* from KFD */ - __u64 system_clock_counter; /* from KFD */ - __u64 system_clock_freq; /* from KFD */ + uint64_t gpu_clock_counter; /* from KFD */ + uint64_t cpu_clock_counter; /* from KFD */ + uint64_t system_clock_counter; /* from KFD */ + uint64_t system_clock_freq; /* from KFD */ - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_process_device_apertures { - __u64 lds_base; /* from KFD */ - __u64 lds_limit; /* from KFD */ - __u64 scratch_base; /* from KFD */ - __u64 scratch_limit; /* from KFD */ - __u64 gpuvm_base; /* from KFD */ - __u64 gpuvm_limit; /* from KFD */ - __u32 gpu_id; /* from KFD */ - __u32 pad; + uint64_t lds_base; /* from KFD */ + uint64_t lds_limit; /* from KFD */ + uint64_t scratch_base; /* from KFD */ + uint64_t scratch_limit; /* from KFD */ + uint64_t gpuvm_base; /* from KFD */ + uint64_t gpuvm_limit; /* from KFD */ + uint32_t gpu_id; /* from KFD */ + uint32_t pad; }; /* @@ -152,20 +155,20 @@ struct kfd_ioctl_get_process_apertures_args { struct kfd_process_device_apertures process_apertures[NUM_OF_SUPPORTED_GPUS]; /* from KFD */ /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */ - __u32 num_of_nodes; - __u32 pad; + uint32_t num_of_nodes; + uint32_t pad; }; struct kfd_ioctl_get_process_apertures_new_args { /* User allocated. Pointer to struct kfd_process_device_apertures * filled in by Kernel */ - __u64 kfd_process_device_apertures_ptr; + uint64_t kfd_process_device_apertures_ptr; /* to KFD - indicates amount of memory present in kfd_process_device_apertures_ptr * from KFD - Number of entries filled by KFD. */ - __u32 num_of_nodes; - __u32 pad; + uint32_t num_of_nodes; + uint32_t pad; }; #define MAX_ALLOWED_NUM_POINTS 100 @@ -173,25 +176,25 @@ struct kfd_ioctl_get_process_apertures_new_args { #define MAX_ALLOWED_WAC_BUFF_SIZE 128 struct kfd_ioctl_dbg_register_args { - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_dbg_unregister_args { - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_dbg_address_watch_args { - __u64 content_ptr; /* a pointer to the actual content */ - __u32 gpu_id; /* to KFD */ - __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ + uint64_t content_ptr; /* a pointer to the actual content */ + uint32_t gpu_id; /* to KFD */ + uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ }; struct kfd_ioctl_dbg_wave_control_args { - __u64 content_ptr; /* a pointer to the actual content */ - __u32 gpu_id; /* to KFD */ - __u32 buf_size_in_bytes; /*including gpu_id and buf_size */ + uint64_t content_ptr; /* a pointer to the actual content */ + uint32_t gpu_id; /* to KFD */ + uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ }; #define KFD_INVALID_FD 0xffffffff @@ -228,43 +231,43 @@ struct kfd_ioctl_dbg_wave_control_args { #define KFD_MEM_ERR_GPU_HANG 3 struct kfd_ioctl_create_event_args { - __u64 event_page_offset; /* from KFD */ - __u32 event_trigger_data; /* from KFD - signal events only */ - __u32 event_type; /* to KFD */ - __u32 auto_reset; /* to KFD */ - __u32 node_id; /* to KFD - only valid for certain event types */ - __u32 event_id; /* from KFD */ - __u32 event_slot_index; /* from KFD */ + uint64_t event_page_offset; /* from KFD */ + uint32_t event_trigger_data; /* from KFD - signal events only */ + uint32_t event_type; /* to KFD */ + uint32_t auto_reset; /* to KFD */ + uint32_t node_id; /* to KFD - only valid for certain event types */ + uint32_t event_id; /* from KFD */ + uint32_t event_slot_index; /* from KFD */ }; struct kfd_ioctl_destroy_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_set_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_reset_event_args { - __u32 event_id; /* to KFD */ - __u32 pad; + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_memory_exception_failure { - __u32 NotPresent; /* Page not present or supervisor privilege */ - __u32 ReadOnly; /* Write access to a read-only page */ - __u32 NoExecute; /* Execute access to a page marked NX */ - __u32 imprecise; /* Can't determine the exact fault address */ + uint32_t NotPresent; /* Page not present or supervisor privilege */ + uint32_t ReadOnly; /* Write access to a read-only page */ + uint32_t NoExecute; /* Execute access to a page marked NX */ + uint32_t imprecise; /* Can't determine the exact fault address */ }; /* memory exception data */ struct kfd_hsa_memory_exception_data { struct kfd_memory_exception_failure failure; - __u64 va; - __u32 gpu_id; - __u32 ErrorType; /* 0 = no RAS error, + uint64_t va; + uint32_t gpu_id; + uint32_t ErrorType; /* 0 = no RAS error, * 1 = ECC_SRAM, * 2 = Link_SYNFLOOD (poison), * 3 = GPU hang (not attributable to a specific cause), @@ -274,10 +277,10 @@ struct kfd_hsa_memory_exception_data { /* hw exception data */ struct kfd_hsa_hw_exception_data { - __u32 reset_type; - __u32 reset_cause; - __u32 memory_lost; - __u32 gpu_id; + uint32_t reset_type; + uint32_t reset_cause; + uint32_t memory_lost; + uint32_t gpu_id; }; /* Event data */ @@ -286,57 +289,57 @@ struct kfd_event_data { struct kfd_hsa_memory_exception_data memory_exception_data; struct kfd_hsa_hw_exception_data hw_exception_data; }; /* From KFD */ - __u64 kfd_event_data_ext; /* pointer to an extension structure for future exception types */ - __u32 event_id; /* to KFD */ - __u32 pad; + uint64_t kfd_event_data_ext; /* pointer to an extension structure for future exception types */ + uint32_t event_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_wait_events_args { - __u64 events_ptr; /* pointed to struct kfd_event_data array, to KFD */ - __u32 num_events; /* to KFD */ - __u32 wait_for_all; /* to KFD */ - __u32 timeout; /* to KFD */ - __u32 wait_result; /* from KFD */ + uint64_t events_ptr; /* pointed to struct kfd_event_data array, to KFD */ + uint32_t num_events; /* to KFD */ + uint32_t wait_for_all; /* to KFD */ + uint32_t timeout; /* to KFD */ + uint32_t wait_result; /* from KFD */ }; struct kfd_ioctl_set_scratch_backing_va_args { - __u64 va_addr; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint64_t va_addr; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_get_tile_config_args { /* to KFD: pointer to tile array */ - __u64 tile_config_ptr; + uint64_t tile_config_ptr; /* to KFD: pointer to macro tile array */ - __u64 macro_tile_config_ptr; + uint64_t macro_tile_config_ptr; /* to KFD: array size allocated by user mode * from KFD: array size filled by kernel */ - __u32 num_tile_configs; + uint32_t num_tile_configs; /* to KFD: array size allocated by user mode * from KFD: array size filled by kernel */ - __u32 num_macro_tile_configs; + uint32_t num_macro_tile_configs; - __u32 gpu_id; /* to KFD */ - __u32 gb_addr_config; /* from KFD */ - __u32 num_banks; /* from KFD */ - __u32 num_ranks; /* from KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t gb_addr_config; /* from KFD */ + uint32_t num_banks; /* from KFD */ + uint32_t num_ranks; /* from KFD */ /* struct size can be extended later if needed without breaking ABI compatibility */ }; struct kfd_ioctl_set_trap_handler_args { - __u64 tba_addr; /* to KFD */ - __u64 tma_addr; /* to KFD */ - __u32 gpu_id; /* to KFD */ - __u32 pad; + uint64_t tba_addr; /* to KFD */ + uint64_t tma_addr; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t pad; }; struct kfd_ioctl_acquire_vm_args { - __u32 drm_fd; /* to KFD */ - __u32 gpu_id; /* to KFD */ + uint32_t drm_fd; /* to KFD */ + uint32_t gpu_id; /* to KFD */ }; /* Allocation flags: memory types */ @@ -367,12 +370,12 @@ struct kfd_ioctl_acquire_vm_args { * @flags: memory type and attributes. See KFD_IOC_ALLOC_MEM_FLAGS above */ struct kfd_ioctl_alloc_memory_of_gpu_args { - __u64 va_addr; /* to KFD */ - __u64 size; /* to KFD */ - __u64 handle; /* from KFD */ - __u64 mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */ - __u32 gpu_id; /* to KFD */ - __u32 flags; + uint64_t va_addr; /* to KFD */ + uint64_t size; /* to KFD */ + uint64_t handle; /* from KFD */ + uint64_t mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */ + uint32_t gpu_id; /* to KFD */ + uint32_t flags; }; /* Free memory allocated with kfd_ioctl_alloc_memory_of_gpu @@ -380,13 +383,13 @@ struct kfd_ioctl_alloc_memory_of_gpu_args { * @handle: memory handle returned by alloc */ struct kfd_ioctl_free_memory_of_gpu_args { - __u64 handle; /* to KFD */ + uint64_t handle; /* to KFD */ }; /* Map memory to one or more GPUs * * @handle: memory handle returned by alloc - * @device_ids_array_ptr: array of gpu_ids (__u32 per device) + * @device_ids_array_ptr: array of gpu_ids (uint32_t per device) * @n_devices: number of devices in the array * @n_success: number of devices mapped successfully * @@ -399,10 +402,10 @@ struct kfd_ioctl_free_memory_of_gpu_args { * n_devices. */ struct kfd_ioctl_map_memory_to_gpu_args { - __u64 handle; /* to KFD */ - __u64 device_ids_array_ptr; /* to KFD */ - __u32 n_devices; /* to KFD */ - __u32 n_success; /* to/from KFD */ + uint64_t handle; /* to KFD */ + uint64_t device_ids_array_ptr; /* to KFD */ + uint32_t n_devices; /* to KFD */ + uint32_t n_success; /* to/from KFD */ }; /* Unmap memory from one or more GPUs @@ -410,10 +413,10 @@ struct kfd_ioctl_map_memory_to_gpu_args { * same arguments as for mapping */ struct kfd_ioctl_unmap_memory_from_gpu_args { - __u64 handle; /* to KFD */ - __u64 device_ids_array_ptr; /* to KFD */ - __u32 n_devices; /* to KFD */ - __u32 n_success; /* to/from KFD */ + uint64_t handle; /* to KFD */ + uint64_t device_ids_array_ptr; /* to KFD */ + uint32_t n_devices; /* to KFD */ + uint32_t n_success; /* to/from KFD */ }; /* Allocate GWS for specific queue @@ -424,28 +427,28 @@ struct kfd_ioctl_unmap_memory_from_gpu_args { * only support contiguous GWS allocation */ struct kfd_ioctl_alloc_queue_gws_args { - __u32 queue_id; /* to KFD */ - __u32 num_gws; /* to KFD */ - __u32 first_gws; /* from KFD */ - __u32 pad; + uint32_t queue_id; /* to KFD */ + uint32_t num_gws; /* to KFD */ + uint32_t first_gws; /* from KFD */ + uint32_t pad; }; struct kfd_ioctl_get_dmabuf_info_args { - __u64 size; /* from KFD */ - __u64 metadata_ptr; /* to KFD */ - __u32 metadata_size; /* to KFD (space allocated by user) + uint64_t size; /* from KFD */ + uint64_t metadata_ptr; /* to KFD */ + uint32_t metadata_size; /* to KFD (space allocated by user) * from KFD (actual metadata size) */ - __u32 gpu_id; /* from KFD */ - __u32 flags; /* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */ - __u32 dmabuf_fd; /* to KFD */ + uint32_t gpu_id; /* from KFD */ + uint32_t flags; /* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */ + uint32_t dmabuf_fd; /* to KFD */ }; struct kfd_ioctl_import_dmabuf_args { - __u64 va_addr; /* to KFD */ - __u64 handle; /* from KFD */ - __u32 gpu_id; /* to KFD */ - __u32 dmabuf_fd; /* to KFD */ + uint64_t va_addr; /* to KFD */ + uint64_t handle; /* from KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t dmabuf_fd; /* to KFD */ }; /* @@ -463,8 +466,8 @@ enum kfd_smi_event { #define KFD_SMI_EVENT_MSG_SIZE 96 struct kfd_ioctl_smi_events_args { - __u32 gpuid; /* to KFD */ - __u32 anon_fd; /* from KFD */ + uint32_t gpuid; /* to KFD */ + uint32_t anon_fd; /* from KFD */ }; /************************************************************************************************** @@ -510,33 +513,33 @@ enum kfd_criu_op { * Return: 0 on success, -errno on failure */ struct kfd_ioctl_criu_args { - __u64 devices; /* Used during ops: CHECKPOINT, RESTORE */ - __u64 bos; /* Used during ops: CHECKPOINT, RESTORE */ - __u64 priv_data; /* Used during ops: CHECKPOINT, RESTORE */ - __u64 priv_data_size; /* Used during ops: PROCESS_INFO, RESTORE */ - __u32 num_devices; /* Used during ops: PROCESS_INFO, RESTORE */ - __u32 num_bos; /* Used during ops: PROCESS_INFO, RESTORE */ - __u32 num_objects; /* Used during ops: PROCESS_INFO, RESTORE */ - __u32 pid; /* Used during ops: PROCESS_INFO, RESUME */ - __u32 op; + uint64_t devices; /* Used during ops: CHECKPOINT, RESTORE */ + uint64_t bos; /* Used during ops: CHECKPOINT, RESTORE */ + uint64_t priv_data; /* Used during ops: CHECKPOINT, RESTORE */ + uint64_t priv_data_size; /* Used during ops: PROCESS_INFO, RESTORE */ + uint32_t num_devices; /* Used during ops: PROCESS_INFO, RESTORE */ + uint32_t num_bos; /* Used during ops: PROCESS_INFO, RESTORE */ + uint32_t num_objects; /* Used during ops: PROCESS_INFO, RESTORE */ + uint32_t pid; /* Used during ops: PROCESS_INFO, RESUME */ + uint32_t op; }; struct kfd_criu_device_bucket { - __u32 user_gpu_id; - __u32 actual_gpu_id; - __u32 drm_fd; - __u32 pad; + uint32_t user_gpu_id; + uint32_t actual_gpu_id; + uint32_t drm_fd; + uint32_t pad; }; struct kfd_criu_bo_bucket { - __u64 addr; - __u64 size; - __u64 offset; - __u64 restored_offset; /* During restore, updated offset for BO */ - __u32 gpu_id; /* This is the user_gpu_id */ - __u32 alloc_flags; - __u32 dmabuf_fd; - __u32 pad; + uint64_t addr; + uint64_t size; + uint64_t offset; + uint64_t restored_offset; /* During restore, updated offset for BO */ + uint32_t gpu_id; /* This is the user_gpu_id */ + uint32_t alloc_flags; + uint32_t dmabuf_fd; + uint32_t pad; }; /* CRIU IOCTLs - END */ @@ -616,8 +619,8 @@ enum kfd_ioctl_svm_attr_type { * @value: attribute value */ struct kfd_ioctl_svm_attribute { - __u32 type; - __u32 value; + uint32_t type; + uint32_t value; }; /** @@ -659,10 +662,10 @@ struct kfd_ioctl_svm_attribute { * attribute type to indicate the access for the specified GPU. */ struct kfd_ioctl_svm_args { - __u64 start_addr; - __u64 size; - __u32 op; - __u32 nattr; + uint64_t start_addr; + uint64_t size; + uint32_t op; + uint32_t nattr; /* Variable length array of attributes */ struct kfd_ioctl_svm_attribute attrs[0]; }; diff --git a/plugins/cuda/Makefile b/plugins/cuda/Makefile new file mode 100644 index 000000000..2c1944a34 --- /dev/null +++ b/plugins/cuda/Makefile @@ -0,0 +1,40 @@ +PLUGIN_NAME := cuda_plugin +PLUGIN_SOBJ := cuda_plugin.so + +DEPS_CUDA := $(PLUGIN_SOBJ) + +PLUGIN_INCLUDE := -iquote../../include +PLUGIN_INCLUDE += -iquote../../criu/include +PLUGIN_INCLUDE += -iquote../../criu/arch/$(ARCH)/include/ +PLUGIN_INCLUDE += -iquote../../ + +COMPEL := ../../compel/compel-host + +PLUGIN_CFLAGS := -g -Wall -Werror -shared -nostartfiles -fPIC + +__nmk_dir ?= ../../scripts/nmk/scripts/ +include $(__nmk_dir)msg.mk + +all: $(DEPS_CUDA) + +cuda_plugin.so: cuda_plugin.c + $(call msg-gen, $@) + $(Q) $(CC) $(PLUGIN_CFLAGS) $(DEFINES) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) + +clean: + $(call msg-clean, $@) + $(Q) $(RM) $(PLUGIN_SOBJ) +.PHONY: clean + +mrproper: clean + +install: + $(Q) mkdir -p $(DESTDIR)$(PLUGINDIR) + $(E) " INSTALL " $(PLUGIN_NAME) + $(Q) install -m 755 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) +.PHONY: install + +uninstall: + $(E) " UNINSTALL" $(PLUGIN_NAME) + $(Q) $(RM) $(DESTDIR)$(PLUGINDIR)/$(PLUGIN_SOBJ) +.PHONY: uninstall diff --git a/plugins/cuda/README.md b/plugins/cuda/README.md new file mode 100644 index 000000000..7b91f6998 --- /dev/null +++ b/plugins/cuda/README.md @@ -0,0 +1,59 @@ +Checkpoint and Restore for CUDA applications with CRIU +====================================================== + +# Requirements +The cuda-checkpoint utility should be placed somewhere in your $PATH and an r555 +or higher GPU driver is required for CUDA CRIU integration support. + +## cuda-checkpoint +The cuda-checkpoint utility can be found at: +https://github.com/NVIDIA/cuda-checkpoint + +cuda-checkpoint is a binary utility used to issue checkpointing commands to CUDA +applications. Updating the cuda-checkpoint utility between driver releases +should not be necessary as the utility simply exposes some extra driver behavior +so driver updates are all that's needed to get access to newer features. + +# Checkpointing Procedure +cuda-checkpoint exposes 4 actions used in the checkpointing process: lock, +checkpoint, restore, unlock. + +* lock - Used with the PAUSE_DEVICES hook while a process is still running to + quiesce the application into a state where it can be checkpointed +* checkpoint - Used with the CHECKPOINT_DEVICES hook once a process has been + seized/frozen to perform the actual checkpointing operation +* restore/unlock - Used with the RESUME_DEVICES_LATE hook to restore the CUDA + state and release the process back to it's running state + +These actions are facilitated by a CUDA checkpoint+restore thread that the CUDA +plugin will re-wake when needed. + +# Known Limitations +* Currently GPU memory contents are brought into main system memory and CRIU + then checkpoints that as part of the normal procedure. On systems with many + GPU's with high GPU memory usage this can cause memory thrashing. A future + CUDA release will add support for dumping the memory contents to files to + alleviate this as well as support in the CRIU plugin. +* There's currently a small race between when a PAUSE_DEVICES hook is called on + a running process and a process calls cuInit() and finishes initializing CUDA + after the PAUSE is issued but before the process is frozen to checkpoint. This + will cause cuda-checkpoint to report that the process is in an illegal state + for checkpointing and it's recommended to just attempt the CRIU procedure + again, this should be very rare. +* Applications that use NVML will leave some leftover device references as NVML + is not currently supported for checkpointing. There will be support for this + in later drivers. A possible temporary workaround is to have the + {DUMP,RESTORE}_EXT_FILE hook just ignore /dev/nvidiactl and /dev/nvidia{0..N} + remaining references for these applications as in most cases NVML is used to + get info such as gpu count and some capabilities and these values are never + accessed again and unlikely to change. +* CUDA applications that fork() but don't call exec() but also don't issue any + CUDA API calls will have some leftover references to /dev/nvidia* and fail to + checkpoint as a result. This can be worked around in a similar fashion to the + NVML case where the leftover references can be ignored as CUDA is not fork() + safe anyway. +* Restore currently requires that you restore on a system with similar GPU's and + same GPU count. +* NVIDIA UVM Managed Memory, MIG (Multi Instance GPU), and MPS (Multi-Process + Service) are currently not supported for checkpointing. Future CUDA releases + will add support for these. diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c new file mode 100644 index 000000000..9ccb04224 --- /dev/null +++ b/plugins/cuda/cuda_plugin.c @@ -0,0 +1,631 @@ +#include "criu-log.h" +#include "plugin.h" +#include "util.h" +#include "cr_options.h" +#include "pid.h" +#include "proc_parse.h" +#include "seize.h" +#include "fault-injection.h" + +#include +#include + +#include +#include +#include +#include +#include +#include + +/* cuda-checkpoint binary should live in your PATH */ +#define CUDA_CHECKPOINT "cuda-checkpoint" + +/* cuda-checkpoint --action flags */ +#define ACTION_LOCK "lock" +#define ACTION_CHECKPOINT "checkpoint" +#define ACTION_RESTORE "restore" +#define ACTION_UNLOCK "unlock" + +typedef enum { + CUDA_TASK_RUNNING = 0, + CUDA_TASK_LOCKED, + CUDA_TASK_CHECKPOINTED, + CUDA_TASK_UNKNOWN = -1 +} cuda_task_state_t; + +#define CUDA_CKPT_BUF_SIZE (128) + +#ifdef LOG_PREFIX +#undef LOG_PREFIX +#endif +#define LOG_PREFIX "cuda_plugin: " + +/* Disable plugin functionality if cuda-checkpoint is not in $PATH or driver + * version doesn't support --action flag + */ +bool plugin_disabled = false; + +bool plugin_added_to_inventory = false; + +struct pid_info { + int pid; + char checkpointed; + cuda_task_state_t initial_task_state; + struct list_head list; +}; + +/* Used to track which PID's we've paused CUDA operations on so far so we can + * release them after we're done with the DUMP + */ +static LIST_HEAD(cuda_pids); + +static void dealloc_pid_buffer(struct list_head *pid_buf) +{ + struct pid_info *info; + struct pid_info *n; + + list_for_each_entry_safe(info, n, pid_buf, list) { + list_del(&info->list); + xfree(info); + } +} + +static int add_pid_to_buf(struct list_head *pid_buf, int pid, cuda_task_state_t state) +{ + struct pid_info *new = xmalloc(sizeof(*new)); + + if (new == NULL) { + return -1; + } + + new->pid = pid; + new->checkpointed = 0; + new->initial_task_state = state; + list_add_tail(&new->list, pid_buf); + + return 0; +} + +static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) +{ +#define READ 0 +#define WRITE 1 + int fd[2], buf_off; + + if (pipe(fd) != 0) { + pr_perror("Couldn't create pipes for reading cuda-checkpoint output"); + return -1; + } + + buf[0] = '\0'; + + int child_pid = fork(); + if (child_pid == -1) { + pr_perror("Failed to fork to exec cuda-checkpoint"); + close(fd[READ]); + close(fd[WRITE]); + return -1; + } + + if (child_pid == 0) { // child + if (dup2(fd[WRITE], STDOUT_FILENO) == -1) { + pr_perror("unable to clone fd %d->%d", fd[WRITE], STDOUT_FILENO); + _exit(EXIT_FAILURE); + } + if (dup2(fd[WRITE], STDERR_FILENO) == -1) { + pr_perror("unable to clone fd %d->%d", fd[WRITE], STDERR_FILENO); + _exit(EXIT_FAILURE); + } + close(fd[READ]); + + close_fds(STDERR_FILENO + 1); + + execvp(args[0], (char **)args); + + /* We can't use pr_error() as log file fd is closed. */ + fprintf(stderr, "execvp(\"%s\") failed: %s\n", args[0], strerror(errno)); + + _exit(EXIT_FAILURE); + } + + close(fd[WRITE]); + buf_off = 0; + /* Reserve one byte for the null charracter. */ + buf_size--; + while (buf_off < buf_size) { + int bytes_read; + bytes_read = read(fd[READ], buf + buf_off, buf_size - buf_off); + if (bytes_read == -1) { + pr_perror("Unable to read output of cuda-checkpoint"); + goto err; + } + if (bytes_read == 0) + break; + buf_off += bytes_read; + } + buf[buf_off] = '\0'; + + /* Clear out any of the remaining output in the pipe in case the buffer wasn't large enough */ + while (true) { + char scratch[1024]; + int bytes_read; + bytes_read = read(fd[READ], scratch, sizeof(scratch)); + if (bytes_read == -1) { + pr_perror("Unable to read output of cuda-checkpoint"); + goto err; + } + if (bytes_read == 0) + break; + } + close(fd[READ]); + + int status, exit_code = -1; + if (waitpid(child_pid, &status, 0) == -1) { + pr_perror("Unable to wait for the cuda-checkpoint process %d", child_pid); + goto err; + } + if (WIFSIGNALED(status)) { + int sig = WTERMSIG(status); + pr_err("cuda-checkpoint unexpectedly signaled with %d: %s\n", sig, strsignal(sig)); + } else if (WIFEXITED(status)) { + exit_code = WEXITSTATUS(status); + } else { + pr_err("cuda-checkpoint exited improperly: %u\n", status); + } + + if (exit_code != EXIT_SUCCESS) + pr_debug("cuda-checkpoint output ===>\n%s\n" + "<=== cuda-checkpoint output\n", + buf); + + return exit_code; +err: + kill(child_pid, SIGKILL); + waitpid(child_pid, NULL, 0); + return -1; +} + +/** + * Checks if a given flag is supported by the cuda-checkpoint utility + * + * Returns: + * 1 if the flag is supported, + * 0 if the flag is not supported, + * -1 if there was an error launching the cuda-checkpoint utility. + */ +static int cuda_checkpoint_supports_flag(const char *flag) +{ + char msg_buf[2048]; + const char *args[] = { CUDA_CHECKPOINT, "-h", NULL }; + + if (launch_cuda_checkpoint(args, msg_buf, sizeof(msg_buf)) != 0) + return -1; + + if (strstr(msg_buf, flag) == NULL) + return 0; + + return 1; +} + +/* Retrieve the cuda restore thread TID from the root pid */ +static int get_cuda_restore_tid(int root_pid) +{ + char pid_buf[16]; + char pid_out[CUDA_CKPT_BUF_SIZE]; + + snprintf(pid_buf, sizeof(pid_buf), "%d", root_pid); + + const char *args[] = { CUDA_CHECKPOINT, "--get-restore-tid", "--pid", pid_buf, NULL }; + int ret = launch_cuda_checkpoint(args, pid_out, sizeof(pid_out)); + if (ret != 0) { + pr_err("Failed to launch cuda-checkpoint to retrieve restore tid: %s\n", pid_out); + return -1; + } + + return atoi(pid_out); +} + +static cuda_task_state_t get_task_state_enum(const char *state_str) +{ + if (strncmp(state_str, "running", 7) == 0) + return CUDA_TASK_RUNNING; + + if (strncmp(state_str, "locked", 6) == 0) + return CUDA_TASK_LOCKED; + + if (strncmp(state_str, "checkpointed", 12) == 0) + return CUDA_TASK_CHECKPOINTED; + + pr_err("Unknown CUDA state: %s\n", state_str); + return CUDA_TASK_UNKNOWN; +} + +static cuda_task_state_t get_cuda_state(pid_t pid) +{ + char pid_buf[16]; + char state_str[CUDA_CKPT_BUF_SIZE]; + const char *args[] = { CUDA_CHECKPOINT, "--get-state", "--pid", pid_buf, NULL }; + + snprintf(pid_buf, sizeof(pid_buf), "%d", pid); + + if (launch_cuda_checkpoint(args, state_str, sizeof(state_str))) { + pr_err("Failed to launch cuda-checkpoint to retrieve state: %s\n", state_str); + return CUDA_TASK_UNKNOWN; + } + + return get_task_state_enum(state_str); +} + +static int cuda_process_checkpoint_action(int pid, const char *action, unsigned int timeout, char *msg_buf, + int buf_size) +{ + char pid_buf[16]; + char timeout_buf[16]; + + snprintf(pid_buf, sizeof(pid_buf), "%d", pid); + + const char *args[] = { CUDA_CHECKPOINT, "--action", action, "--pid", pid_buf, NULL /* --timeout */, + NULL /* timeout_val */, NULL }; + if (timeout > 0) { + snprintf(timeout_buf, sizeof(timeout_buf), "%d", timeout); + args[5] = "--timeout"; + args[6] = timeout_buf; + } + + return launch_cuda_checkpoint(args, msg_buf, buf_size); +} + +static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigset) +{ + /* Since we resumed a thread that CRIU previously already froze we need to + * INTERRUPT it once again, task was already SEIZE'd so we don't need to do + * a compel_interrupt_task() + */ + if (ptrace(PTRACE_INTERRUPT, restore_tid, NULL, 0)) { + pr_perror("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state", + restore_tid); + return -1; + } + + struct proc_status_creds creds; + if (compel_wait_task(restore_tid, -1, parse_pid_status, NULL, &creds.s, NULL) != COMPEL_TASK_ALIVE) { + pr_err("compel_wait_task failed after interrupt\n"); + return -1; + } + + if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD)) { + pr_perror("Failed to set ptrace options on interrupt for restore tid %d", restore_tid); + return -1; + } + + if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(*restore_sigset), restore_sigset)) { + pr_perror("Unable to restore original sigmask to restore tid %d", restore_tid); + return -1; + } + + return 0; +} + +static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset) +{ + k_rtsigset_t block; + + if (ptrace(PTRACE_GETSIGMASK, restore_tid, sizeof(*save_sigset), save_sigset)) { + pr_perror("Failed to get current sigmask for restore tid %d", restore_tid); + return -1; + } + + ksigfillset(&block); + ksigdelset(&block, SIGTRAP); + + if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(block), &block)) { + pr_perror("Failed to block signals on restore tid %d", restore_tid); + return -1; + } + + // Clear out PTRACE_O_SUSPEND_SECCOMP when we resume the restore thread + if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, 0)) { + pr_perror("Could not clear ptrace options on restore tid %d", restore_tid); + return -1; + } + + if (ptrace(PTRACE_CONT, restore_tid, NULL, 0)) { + pr_perror("Could not resume cuda restore tid %d", restore_tid); + return -1; + } + + return 0; +} + +int cuda_plugin_checkpoint_devices(int pid) +{ + int restore_tid; + char msg_buf[CUDA_CKPT_BUF_SIZE]; + int int_ret; + int status; + k_rtsigset_t save_sigset; + struct pid_info *task_info; + bool pid_found = false; + + if (plugin_disabled) { + return -ENOTSUP; + } + + restore_tid = get_cuda_restore_tid(pid); + + /* We can possibly hit a race with cuInit() where we are past the point of + * locking the process but at lock time cuInit() hadn't completed in which + * case cuda-checkpoint will report that we're in an invalid state to + * checkpoint + */ + if (restore_tid == -1) { + pr_info("No need to checkpoint devices on pid %d\n", pid); + return 0; + } + + /* Check if the process is already in a checkpointed state */ + list_for_each_entry(task_info, &cuda_pids, list) { + if (task_info->pid == pid) { + if (task_info->initial_task_state == CUDA_TASK_CHECKPOINTED) { + pr_info("pid %d already in a checkpointed state\n", pid); + return 0; + } + pid_found = true; + break; + } + } + + if (pid_found == false) { + /* We return an error here. The task should be restored + * to its original state at cuda_plugin_fini(). + */ + pr_err("Failed to track pid %d\n", pid); + return -1; + } + + pr_info("Checkpointing CUDA devices on pid %d restore_tid %d\n", pid, restore_tid); + /* We need to resume the checkpoint thread to prepare the mappings for + * checkpointing + */ + if (resume_restore_thread(restore_tid, &save_sigset)) { + return -1; + } + + task_info->checkpointed = 1; + status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf); + } + + int_ret = interrupt_restore_thread(restore_tid, &save_sigset); + return status != 0 ? -1 : int_ret; +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices); + +int cuda_plugin_pause_devices(int pid) +{ + int restore_tid; + char msg_buf[CUDA_CKPT_BUF_SIZE]; + cuda_task_state_t task_state; + + if (plugin_disabled) { + return -ENOTSUP; + } + + restore_tid = get_cuda_restore_tid(pid); + + if (restore_tid == -1) { + pr_info("no need to pause devices on pid %d\n", pid); + return 0; + } + + task_state = get_cuda_state(restore_tid); + if (task_state == CUDA_TASK_UNKNOWN) { + pr_err("Failed to get CUDA state for PID %d\n", restore_tid); + return -1; + } + + if (!plugin_added_to_inventory) { + if (add_inventory_plugin(CR_PLUGIN_DESC.name)) { + pr_err("Failed to add CUDA plugin to inventory image\n"); + return -1; + } + plugin_added_to_inventory = true; + } + + if (task_state == CUDA_TASK_LOCKED) { + pr_info("pid %d already in a locked state\n", pid); + /* Leave this PID in a "locked" state at resume_device() */ + add_pid_to_buf(&cuda_pids, pid, CUDA_TASK_LOCKED); + return 0; + } + + if (task_state == CUDA_TASK_CHECKPOINTED) { + /* We need to skip this PID in cuda_plugin_checkpoint_devices(), + * and leave it in a "checkpoined" state at resume_device(). */ + add_pid_to_buf(&cuda_pids, pid, CUDA_TASK_CHECKPOINTED); + return 0; + } + + pr_info("pausing devices on pid %d\n", pid); + int status = cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("PAUSE_DEVICES failed with %s\n", msg_buf); + if (alarm_timeouted()) + goto unlock; + return -1; + } + + if (add_pid_to_buf(&cuda_pids, pid, CUDA_TASK_RUNNING)) { + pr_err("unable to track paused pid %d\n", pid); + goto unlock; + } + + return 0; +unlock: + status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid); + } + return -1; +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices) + +int resume_device(int pid, int checkpointed, cuda_task_state_t initial_task_state) +{ + char msg_buf[CUDA_CKPT_BUF_SIZE]; + int status; + int ret = 0; + int int_ret; + k_rtsigset_t save_sigset; + + if (initial_task_state == CUDA_TASK_UNKNOWN) { + pr_info("skip resume for PID %d (unknown state)\n", pid); + return 0; + } + + int restore_tid = get_cuda_restore_tid(pid); + if (restore_tid == -1) { + pr_info("No need to resume devices on pid %d\n", pid); + return 0; + } + + pr_info("resuming devices on pid %d\n", pid); + /* The resuming process has to stay frozen during this time otherwise + * attempting to access a UVM pointer will crash if we haven't restored the + * underlying mappings yet + */ + pr_debug("Restore thread pid %d found for real pid %d\n", restore_tid, pid); + /* wakeup the restore thread so we can handle the restore for this pid, + * rseq_cs has to be restored before execution + */ + if (resume_restore_thread(restore_tid, &save_sigset)) { + return -1; + } + + if (checkpointed && (initial_task_state == CUDA_TASK_RUNNING || initial_task_state == CUDA_TASK_LOCKED)) { + /* If the process was "locked" or "running" before checkpointing it, we need to restore it */ + status = cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("RESUME_DEVICES RESTORE failed with %s\n", msg_buf); + ret = -1; + goto interrupt; + } + } + + if (initial_task_state == CUDA_TASK_RUNNING) { + /* If the process was "running" before we paused it, we need to unlock it */ + status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); + if (status) { + pr_err("RESUME_DEVICES UNLOCK failed with %s\n", msg_buf); + ret = -1; + } + } + +interrupt: + int_ret = interrupt_restore_thread(restore_tid, &save_sigset); + + return ret != 0 ? ret : int_ret; +} + +int cuda_plugin_resume_devices_late(int pid) +{ + if (plugin_disabled) { + return -ENOTSUP; + } + + /* RESUME_DEVICES_LATE is used during `criu restore`. + * Here, we assume that users expect the target process + * to be in a "running" state after restore, even if it was + * in a "locked" or "checkpointed" state during `criu dump`. + */ + return resume_device(pid, 1, CUDA_TASK_RUNNING); +} +CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late) + +/** + * Check if a CUDA device is available on the system + */ +static bool is_cuda_device_available(void) +{ + const char *gpu_path = "/proc/driver/nvidia/gpus/"; + struct stat sb; + + if (stat(gpu_path, &sb) != 0) + return false; + + return S_ISDIR(sb.st_mode); +} + +int cuda_plugin_init(int stage) +{ + int ret; + + /* Disable CUDA checkpointing with pre-dump */ + if (stage == CR_PLUGIN_STAGE__PRE_DUMP) { + plugin_disabled = true; + return 0; + } + + if (stage == CR_PLUGIN_STAGE__RESTORE) { + if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) { + plugin_disabled = true; + return 0; + } + } + + if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && !is_cuda_device_available()) { + pr_info("No GPU device found; CUDA plugin is disabled\n"); + plugin_disabled = true; + return 0; + } + + ret = cuda_checkpoint_supports_flag("--action"); + if (ret == -1) { + pr_warn("check that %s is present in $PATH\n", CUDA_CHECKPOINT); + plugin_disabled = true; + return 0; + } + + if (ret == 0) { + pr_warn("cuda-checkpoint --action flag not supported, an r555 or higher version driver is required. Disabling CUDA plugin\n"); + plugin_disabled = true; + return 0; + } + + pr_info("initialized: %s stage %d\n", CR_PLUGIN_DESC.name, stage); + + /* In the DUMP stage track all the PID's we've paused CUDA operations on to + * release them when we're done if the user requested the leave-running option + */ + if (stage == CR_PLUGIN_STAGE__DUMP) { + INIT_LIST_HEAD(&cuda_pids); + } + + set_compel_interrupt_only_mode(); + + return 0; +} + +void cuda_plugin_fini(int stage, int ret) +{ + if (plugin_disabled) { + return; + } + + pr_info("finished %s stage %d err %d\n", CR_PLUGIN_DESC.name, stage, ret); + + /* Release all the paused PID's at the end of the DUMP stage in case the + * user provides the -R (leave-running) flag or an error occurred + */ + if (stage == CR_PLUGIN_STAGE__DUMP && (opts.final_state == TASK_ALIVE || ret != 0)) { + struct pid_info *info; + list_for_each_entry(info, &cuda_pids, list) { + resume_device(info->pid, info->checkpointed, info->initial_task_state); + } + } + if (stage == CR_PLUGIN_STAGE__DUMP) { + dealloc_pid_buffer(&cuda_pids); + } +} +CR_PLUGIN_REGISTER("cuda_plugin", cuda_plugin_init, cuda_plugin_fini) diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index cab72e8a1..ed883f300 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -1,51 +1,14 @@ FROM alpine ARG CC=gcc -RUN apk update && apk add \ - $CC \ - bash \ - build-base \ - coreutils \ - procps \ - git \ - gnutls-dev \ - libaio-dev \ - libcap-dev \ - libnet-dev \ - libnl3-dev \ - nftables \ - nftables-dev \ - pkgconfig \ - protobuf-c-dev \ - protobuf-dev \ - py3-pip \ - py3-protobuf \ - python3 \ - sudo - COPY . /criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) CC="$CC" && date -RUN apk add \ - ip6tables \ - iptables \ - nftables \ - iproute2 \ - tar \ - bash \ - go \ - e2fsprogs \ - py-yaml \ - py3-flake8 \ - asciidoctor +RUN apk add --no-cache "$CC" && /criu/contrib/dependencies/apk-packages.sh + +RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 -D test -RUN pip3 install junit_xml - -# For zdtm we need an unversioned python binary -RUN ln -s /usr/bin/python3 /usr/bin/python - RUN make -C test/zdtm diff --git a/scripts/build/Dockerfile.amd-rocm b/scripts/build/Dockerfile.amd-rocm index c0d181b03..ed66ae4fe 100644 --- a/scripts/build/Dockerfile.amd-rocm +++ b/scripts/build/Dockerfile.amd-rocm @@ -55,8 +55,8 @@ RUN apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-insta protobuf-compiler \ python-protobuf \ python3-minimal \ - python3-future \ python-ipaddress \ + uuid-dev \ curl \ wget \ vim \ diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index d226244ee..261bd2d79 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -2,39 +2,14 @@ FROM docker.io/library/archlinux:latest ARG CC=gcc -RUN pacman -Syu --noconfirm \ - $CC \ - bash \ - make \ - coreutils \ - git \ - gnutls \ - libaio \ - libcap \ - libnet \ - libnl \ - nftables \ - pkgconfig \ - protobuf-c \ - protobuf \ - python-pip \ - python-protobuf \ - which \ - sudo \ - iptables \ - nftables \ - iproute2 \ - tar \ - bash \ - go \ - python-yaml \ - flake8 \ - asciidoctor \ - python-junit-xml \ - diffutils +# Initialize machine ID +RUN systemd-machine-id-setup COPY . /criu WORKDIR /criu + +RUN pacman -Syu --noconfirm "$CC" && contrib/dependencies/pacman-packages.sh + RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user diff --git a/scripts/build/Dockerfile.centos7 b/scripts/build/Dockerfile.centos7 deleted file mode 100644 index 21e70ff0e..000000000 --- a/scripts/build/Dockerfile.centos7 +++ /dev/null @@ -1,45 +0,0 @@ -FROM centos:7 - -ARG CC=gcc - -RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm -RUN yum install -y \ - findutils \ - gcc \ - git \ - gnutls-devel \ - iproute \ - iptables \ - libaio-devel \ - libasan \ - libcap-devel \ - libnet-devel \ - libnl3-devel \ - make \ - procps-ng \ - protobuf-c-devel \ - protobuf-devel \ - protobuf-python \ - python \ - python-flake8 \ - python-ipaddress \ - python2-future \ - python2-junit_xml \ - python-yaml \ - python-six \ - sudo \ - tar \ - which \ - e2fsprogs \ - python2-pip \ - rubygem-asciidoctor - -COPY . /criu -WORKDIR /criu - -RUN make mrproper && date && make -j $(nproc) CC="$CC" && date - -# The rpc test cases are running as user #1000, let's add the user -RUN adduser -u 1000 test - -RUN make -C test/zdtm -j $(nproc) diff --git a/scripts/build/Dockerfile.centos8 b/scripts/build/Dockerfile.centos8 deleted file mode 100644 index 488f95d65..000000000 --- a/scripts/build/Dockerfile.centos8 +++ /dev/null @@ -1,52 +0,0 @@ -FROM registry.centos.org/centos/centos:8 - -ARG CC=gcc - -RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm dnf-plugins-core -RUN yum config-manager --set-enabled powertools -RUN yum install -y --allowerasing \ - asciidoc \ - coreutils \ - chkconfig \ - diffutils \ - findutils \ - gcc \ - git \ - gnutls-devel \ - iproute \ - iptables \ - libaio-devel \ - libasan \ - libcap-devel \ - libnet-devel \ - libnl3-devel \ - libselinux-devel \ - make \ - procps-ng \ - protobuf-c-devel \ - protobuf-devel \ - python3-devel \ - python3-flake8 \ - python3-PyYAML \ - python3-future \ - python3-protobuf \ - python3-pip \ - sudo \ - tar \ - which \ - xmlto - -RUN alternatives --set python /usr/bin/python3 -ENV PYTHON=python3 - -COPY . /criu -WORKDIR /criu - -RUN make mrproper && date && make -j $(nproc) CC="$CC" && date - -# The rpc test cases are running as user #1000, let's add the user -RUN adduser -u 1000 test - -RUN pip3 install junit_xml - -RUN make -C test/zdtm -j $(nproc) diff --git a/scripts/build/Dockerfile.fedora.tmpl b/scripts/build/Dockerfile.fedora.tmpl index 9d3bb0f87..c26a5fd57 100644 --- a/scripts/build/Dockerfile.fedora.tmpl +++ b/scripts/build/Dockerfile.fedora.tmpl @@ -1,11 +1,10 @@ ARG CC=gcc -COPY scripts/ci/prepare-for-fedora-rawhide.sh /bin/prepare-for-fedora-rawhide.sh -RUN /bin/prepare-for-fedora-rawhide.sh - COPY . /criu WORKDIR /criu +RUN dnf install -y "$CC" && scripts/ci/prepare-for-fedora-rawhide.sh + RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user diff --git a/scripts/build/Dockerfile.hotspot-alpine b/scripts/build/Dockerfile.hotspot-alpine new file mode 100644 index 000000000..cd632dddf --- /dev/null +++ b/scripts/build/Dockerfile.hotspot-alpine @@ -0,0 +1,11 @@ +FROM docker.io/library/eclipse-temurin:11-alpine +ARG CC=gcc + +COPY . /criu +WORKDIR /criu + +RUN apk add --no-cache maven "$CC" && contrib/dependencies/apk-packages.sh + +RUN make mrproper && make -j $(nproc) CC="$CC" + +ENTRYPOINT ["mvn", "-q", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu new file mode 100644 index 000000000..a459e1ec7 --- /dev/null +++ b/scripts/build/Dockerfile.hotspot-ubuntu @@ -0,0 +1,11 @@ +FROM docker.io/library/eclipse-temurin:11-jammy +ARG CC=gcc + +COPY . /criu +WORKDIR /criu + +RUN contrib/apt-install maven "$CC" && contrib/dependencies/apt-packages.sh + +RUN make mrproper && make -j $(nproc) CC="$CC" + +ENTRYPOINT ["mvn", "-q", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.linux32.tmpl b/scripts/build/Dockerfile.linux32.tmpl index a15038631..a37f16e49 100644 --- a/scripts/build/Dockerfile.linux32.tmpl +++ b/scripts/build/Dockerfile.linux32.tmpl @@ -1,32 +1,10 @@ ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -RUN apt-install \ - libnet-dev \ - libnl-route-3-dev \ - $CC \ - bsdmainutils \ - build-essential \ - git-core \ - iptables \ - libaio-dev \ - libcap-dev \ - libgnutls28-dev \ - libgnutls30 \ - libnl-3-dev \ - libprotobuf-c-dev \ - libprotobuf-dev \ - libselinux-dev \ - pkg-config \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-minimal \ - python3-future - COPY . /criu WORKDIR /criu +RUN contrib/apt-install "$CC" && contrib/dependencies/apt-packages.sh + RUN uname -m && setarch linux32 uname -m && setarch --list RUN make mrproper && date && \ diff --git a/scripts/build/Dockerfile.openj9-alpine b/scripts/build/Dockerfile.openj9-alpine deleted file mode 100644 index f92011283..000000000 --- a/scripts/build/Dockerfile.openj9-alpine +++ /dev/null @@ -1,32 +0,0 @@ -# FIXME: Replace with eclipse-temurin once Alpine support has been added. -# https://github.com/adoptium/containers/pull/60 -FROM adoptopenjdk/openjdk8-openj9:alpine -ARG CC=gcc - -RUN apk update && apk add \ - bash \ - build-base \ - coreutils \ - git \ - gnutls-dev \ - libaio-dev \ - libcap-dev \ - libnet-dev \ - libnl3-dev \ - pkgconfig \ - protobuf-c-dev \ - protobuf-dev \ - python3 \ - sudo \ - maven \ - ip6tables \ - iptables \ - bash - -COPY . /criu -WORKDIR /criu - -RUN make mrproper && make -j $(nproc) CC="$CC" - -ENTRYPOINT mvn -q -f test/javaTests/pom.xml test - diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 8936adf81..18664f100 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -1,34 +1,12 @@ -FROM docker.io/library/eclipse-temurin:8-focal +FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-jammy ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -RUN apt-install protobuf-c-compiler \ - libprotobuf-c-dev \ - libaio-dev \ - python3-future \ - libprotobuf-dev \ - protobuf-compiler \ - libcap-dev \ - libnl-3-dev \ - gdb \ - bash \ - python3-protobuf \ - python3-yaml \ - libnet-dev \ - libnl-route-3-dev \ - libbsd-dev \ - make \ - git \ - pkg-config \ - iptables \ - gcc \ - maven - +RUN mkdir -p /etc/criu && echo 'ghost-limit 16777216' > /etc/criu/default.conf COPY . /criu WORKDIR /criu +RUN contrib/apt-install maven "$CC" && contrib/dependencies/apt-packages.sh + RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT mvn -q -f test/javaTests/pom.xml test - +ENTRYPOINT ["mvn", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.hdr b/scripts/build/Dockerfile.riscv64-stable-cross.hdr new file mode 100644 index 000000000..d4c414023 --- /dev/null +++ b/scripts/build/Dockerfile.riscv64-stable-cross.hdr @@ -0,0 +1,5 @@ +FROM ubuntu:jammy + +ENV ARCH=riscv64 +ENV DEBIAN_ARCH=riscv64 +ENV CROSS_TRIPLET=riscv64-linux-gnu diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl new file mode 100644 index 000000000..8933a6c82 --- /dev/null +++ b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl @@ -0,0 +1,31 @@ +# Add the cross compiler sources +RUN apt-get clean -y && apt-get update -y && apt-get install -y --no-install-recommends gnupg2 + +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 871920D1991BC93C 8D69674688B6CB36 B523E5F3FC4E5F2C + +COPY scripts/ci/riscv64-cross/amd64-sources.list /etc/apt/sources.list + +COPY scripts/ci/riscv64-cross/riscv64-sources.list /etc/apt/sources.list.d/ + +RUN dpkg --add-architecture ${DEBIAN_ARCH} && \ + apt-get update -y + +ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ + CROSS_ROOT=/usr/${CROSS_TRIPLET} \ + AS=/usr/bin/${CROSS_TRIPLET}-as \ + AR=/usr/bin/${CROSS_TRIPLET}-ar \ + CC=/usr/bin/${CROSS_TRIPLET}-gcc \ + CPP=/usr/bin/${CROSS_TRIPLET}-cpp \ + CXX=/usr/bin/${CROSS_TRIPLET}-g++ \ + LD=/usr/bin/${CROSS_TRIPLET}-ld \ + FC=/usr/bin/${CROSS_TRIPLET}-gfortran + +ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ + PKG_CONFIG_PATH=/usr/lib/${CROSS_TRIPLET}/pkgconfig + +COPY . /criu +WORKDIR /criu + +RUN contrib/dependencies/apt-cross-packages.sh + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.stable-cross.tmpl b/scripts/build/Dockerfile.stable-cross.tmpl index 6a68cd1ca..56104081f 100644 --- a/scripts/build/Dockerfile.stable-cross.tmpl +++ b/scripts/build/Dockerfile.stable-cross.tmpl @@ -1,28 +1,7 @@ -COPY scripts/ci/apt-install /bin/apt-install - # Add the cross compiler sources RUN echo "deb http://deb.debian.org/debian/ stable main" >> /etc/apt/sources.list && \ dpkg --add-architecture ${DEBIAN_ARCH} -RUN apt-install \ - crossbuild-essential-${DEBIAN_ARCH} \ - libc6-dev-${DEBIAN_ARCH}-cross \ - libc6-${DEBIAN_ARCH}-cross \ - libbz2-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - ncurses-dev:${DEBIAN_ARCH} \ - libssl-dev:${DEBIAN_ARCH} \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf \ - libnl-3-dev:${DEBIAN_ARCH} \ - libprotobuf-dev:${DEBIAN_ARCH} \ - libnet-dev:${DEBIAN_ARCH} \ - libprotobuf-c-dev:${DEBIAN_ARCH} \ - libcap-dev:${DEBIAN_ARCH} \ - libaio-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} - ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ AS=/usr/bin/${CROSS_TRIPLET}-as \ @@ -39,4 +18,12 @@ ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . /criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) zdtm && date +RUN contrib/dependencies/apt-cross-packages.sh + +# amdgpu_plugin with armv7 is not supported +RUN make mrproper && date && \ + make -j $(nproc) && \ + if [ "$SUBARCH" != "armv7" ]; then \ + make -j $(nproc) amdgpu_plugin; \ + fi && \ + make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.tmpl b/scripts/build/Dockerfile.tmpl index e0e72372d..498b99be9 100644 --- a/scripts/build/Dockerfile.tmpl +++ b/scripts/build/Dockerfile.tmpl @@ -1,41 +1,12 @@ ARG CC=gcc -COPY scripts/ci/apt-install /bin/apt-install - -# On Ubuntu, kernel modules such as ip_tables and xt_mark may not be loaded by default -# We need to install kmod to enable iptables to load these modules for us. -RUN apt-install \ - libnet-dev \ - libnl-route-3-dev \ - $CC \ - bsdmainutils \ - build-essential \ - git-core \ - iptables \ - libaio-dev \ - libbsd-dev \ - libcap-dev \ - libgnutls28-dev \ - libgnutls30 \ - libnftables-dev \ - libnl-3-dev \ - libprotobuf-c-dev \ - libprotobuf-dev \ - libselinux-dev \ - iproute2 \ - kmod \ - pkg-config \ - protobuf-c-compiler \ - protobuf-compiler \ - python-is-python3 \ - python3-minimal \ - python3-protobuf \ - python3-yaml \ - python3-future - COPY . /criu WORKDIR /criu +# On Ubuntu, kernel modules such as ip_tables and xt_mark may not be loaded by default +# We need to install kmod to enable iptables to load these modules for us. +RUN contrib/apt-install "$CC" && contrib/dependencies/apt-packages.sh + RUN git clean -dfx && date && \ # Check single object build make -j $(nproc) CC="$CC" criu/parasite-syscall.o && \ diff --git a/scripts/build/Dockerfile.unstable-cross.tmpl b/scripts/build/Dockerfile.unstable-cross.tmpl index dacfd96ef..7edb289b6 100644 --- a/scripts/build/Dockerfile.unstable-cross.tmpl +++ b/scripts/build/Dockerfile.unstable-cross.tmpl @@ -1,28 +1,7 @@ -COPY scripts/ci/apt-install /bin/apt-install - # Add the cross compiler sources RUN echo "deb http://deb.debian.org/debian/ unstable main" >> /etc/apt/sources.list && \ dpkg --add-architecture ${DEBIAN_ARCH} -RUN apt-install \ - crossbuild-essential-${DEBIAN_ARCH} \ - libc6-dev-${DEBIAN_ARCH}-cross \ - libc6-${DEBIAN_ARCH}-cross \ - libbz2-dev:${DEBIAN_ARCH} \ - libexpat1-dev:${DEBIAN_ARCH} \ - ncurses-dev:${DEBIAN_ARCH} \ - libssl-dev:${DEBIAN_ARCH} \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf \ - libnl-3-dev:${DEBIAN_ARCH} \ - libprotobuf-dev:${DEBIAN_ARCH} \ - libnet-dev:${DEBIAN_ARCH} \ - libprotobuf-c-dev:${DEBIAN_ARCH} \ - libcap-dev:${DEBIAN_ARCH} \ - libaio-dev:${DEBIAN_ARCH} \ - libnl-route-3-dev:${DEBIAN_ARCH} - ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ CROSS_ROOT=/usr/${CROSS_TRIPLET} \ AS=/usr/bin/${CROSS_TRIPLET}-as \ @@ -39,4 +18,6 @@ ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ COPY . /criu WORKDIR /criu -RUN make mrproper && date && make -j $(nproc) zdtm && date +RUN contrib/dependencies/apt-cross-packages.sh + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.x86_64.hdr b/scripts/build/Dockerfile.x86_64.hdr index 32fc2978a..a666f6c26 100644 --- a/scripts/build/Dockerfile.x86_64.hdr +++ b/scripts/build/Dockerfile.x86_64.hdr @@ -1,5 +1,5 @@ -FROM ubuntu:focal +FROM ubuntu:24.04 -COPY scripts/ci/apt-install /bin/apt-install +COPY contrib/apt-install /bin/apt-install RUN apt-install gcc-multilib diff --git a/scripts/build/Makefile b/scripts/build/Makefile index 2c006ad87..a420cea94 100644 --- a/scripts/build/Makefile +++ b/scripts/build/Makefile @@ -1,5 +1,5 @@ -ARCHES := x86_64 fedora-asan fedora-rawhide centos7 armv7hf centos8 -STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross +ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf +STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross riscv64-stable-cross UNSTABLE_CROSS_ARCHES := armv7-unstable-cross aarch64-unstable-cross ppc64-unstable-cross mips64el-unstable-cross NON_CLANG := $(UNSTABLE_CROSS_ARCHES) $(STABLE_CROSS_ARCHES) CREATE_DOCKERFILES := $(ARCHES) $(NON_CLANG) diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index 120f561e4..bad8065f2 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -11,7 +11,7 @@ ifdef CLANG target-suffix = -clang endif -TARGETS := alpine fedora-rawhide centos7 centos8 archlinux +TARGETS := alpine fedora-rawhide archlinux ZDTM_OPTS := UNAME := $(shell uname -m) export UNAME @@ -20,14 +20,6 @@ export CONTAINER_RUNTIME alpine: ZDTM_OPTS=-x zdtm/static/binfmt_misc -x zdtm/static/sched_policy00 -define DOCKER_JSON -{ - "storage-driver": "devicemapper" -} -endef - -export DOCKER_JSON - ifeq ($(GITHUB_ACTIONS),true) # GitHub Actions does not give us a real TTY and errors out with # 'the input device is not a TTY' if using '-t' @@ -38,43 +30,29 @@ endif export CONTAINER_TERMINAL +# Here we assume that any CPU architecture besides x86_64 is running in containers +# that may not support running docker with '--privileged'. ifeq ($(UNAME),x86_64) - # On anything besides x86_64 Travis is running unprivileged LXD - # containers which do not support running docker with '--privileged'. CONTAINER_OPTS := --rm $(CONTAINER_TERMINAL) --privileged --userns=host --cgroupns=host -v /lib/modules:/lib/modules --tmpfs /run else CONTAINER_OPTS := --rm -v /lib/modules:/lib/modules --tmpfs /run endif ifeq ($(CONTAINER_RUNTIME),podman) - # Just as Docker needs to use devicemapper Podman needs vfs - # as graphdriver as overlayfs does not support all test cases - STORAGE_DRIVER := vfs # Podman limits the number of processes in a container using cgroups. # Disable it as it breaks the thread-bomb test CONTAINER_OPTS += --pids-limit=0 endif -export STORAGE_DRIVER - -restart-docker: - if [ "$$UNAME" = "x86_64" ] && [ "$$CONTAINER_RUNTIME" = "docker" ]; then \ - echo "$$DOCKER_JSON" > /etc/docker/daemon.json; \ - cat /etc/docker/daemon.json; \ - systemctl status docker; \ - systemctl restart docker; \ - systemctl status docker; \ - fi - export ZDTM_OPTS -$(TARGETS): restart-docker +$(TARGETS): $(MAKE) -C ../build $@$(target-suffix) - $(CONTAINER_RUNTIME) run --env-file docker.env $(if $(ZDTM_OPTS),-e ZDTM_OPTS) $(CONTAINER_OPTS) criu-$@ scripts/ci/run-ci-tests.sh + $(CONTAINER_RUNTIME) run --env-file docker.env -v `pwd`/../../:/criu $(if $(ZDTM_OPTS),-e ZDTM_OPTS) $(CONTAINER_OPTS) criu-$@ scripts/ci/run-ci-tests.sh -fedora-asan: restart-docker +fedora-asan: $(MAKE) -C ../build $@$(target-suffix) - $(CONTAINER_RUNTIME) run $(CONTAINER_OPTS) criu-$@ ./scripts/ci/asan.sh $(ZDTM_OPTS) + $(CONTAINER_RUNTIME) run $(CONTAINER_OPTS) -v `pwd`/../../:/criu criu-$@ ./scripts/ci/asan.sh $(ZDTM_OPTS) docker-test: ./docker-test.sh @@ -82,11 +60,8 @@ docker-test: podman-test: ./podman-test.sh -# overlayfs behaves differently on Ubuntu and breaks CRIU -# https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1857257 -# Switch to devicemapper -openj9-test: restart-docker - ./openj9-test.sh +java-test: + ./java-test.sh setup-vagrant: ./vagrant.sh setup @@ -97,7 +72,23 @@ vagrant-fedora-no-vdso: setup-vagrant vagrant-fedora-rawhide: setup-vagrant ./vagrant.sh fedora-rawhide -.PHONY: setup-vagrant vagrant-fedora-no-vdso vagrant-fedora-rawhide +vagrant-fedora-non-root: setup-vagrant + ./vagrant.sh fedora-non-root + +.PHONY: setup-vagrant vagrant-fedora-no-vdso vagrant-fedora-rawhide vagrant-fedora-non-root + +check-commit: + ($(MAKE) -j $$(nproc) -C ../.. && \ + echo "Commit $$(git rev-parse --short HEAD) built successfully") || \ + (echo "Build failed for $$(git rev-list -n 1 --pretty HEAD)" && \ + exit 1) + +.PHONY: check-commit + +loongarch64-qemu-test: + ./loongarch64-qemu-test.sh + +.PHONY: loongarch64-qemu-test %: $(MAKE) -C ../build $@$(target-suffix) diff --git a/scripts/ci/asan.sh b/scripts/ci/asan.sh index 8113b9b19..8b72fa5f1 100755 --- a/scripts/ci/asan.sh +++ b/scripts/ci/asan.sh @@ -1,11 +1,12 @@ #!/bin/bash -# shellcheck disable=2044 - set -x cat /proc/self/mountinfo +time make ASAN=1 -j 4 V=1 +time make -j4 -C test/zdtm V=1 + chmod 0777 test chmod 0777 test/zdtm/transition/ chmod 0777 test/zdtm/static @@ -13,7 +14,8 @@ chmod 0777 test/zdtm/static ./test/zdtm.py run -a --keep-going -k always --parallel 4 -x zdtm/static/rtc "$@" ret=$? -for i in $(find / -name 'asan.log*'); do +shopt -s globstar nullglob +for i in /**/asan.log*; do echo "$i" echo ======================================== cat "$i" diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index f36b4e458..c1c745544 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -1,46 +1,35 @@ #!/bin/bash -# shellcheck disable=SC1091,SC2015 - set -x -e -o pipefail -./apt-install \ - apt-transport-https \ - ca-certificates \ - curl \ - software-properties-common +# Workaround: Docker 28.x and 29.x has a known regression that breaks the checkpoint and +# restore (C/R) feature. Let's install previous, or next major version. See +# https://github.com/moby/moby/issues/50750 for details on the bug. +export DEBIAN_FRONTEND=noninteractive +apt remove -y docker-ce docker-ce-cli +../../contrib/apt-install -y ca-certificates curl +install -m 0755 -d /etc/apt/keyrings +curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc +chmod a+r /etc/apt/keyrings/docker.asc +# shellcheck disable=SC1091 +echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" > /etc/apt/sources.list.d/docker.list +apt update -y +apt-cache madison docker-ce | awk '{ print $3 }' +verstr="$(apt-cache madison docker-ce | awk '{ print $3 }' | sort | grep -Ev ':(28|29)\.'| tail -n 1)" +../../contrib/apt-install -y "docker-ce=$verstr" "docker-ce-cli=$verstr" -curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - - -add-apt-repository \ - "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) \ - stable test" - -./apt-install docker-ce - -. /etc/lsb-release - -# overlayfs with current Ubuntu kernel breaks CRIU -# https://bugs.launchpad.net/ubuntu/+source/linux-azure/+bug/1967924 -# Use devicemapper storage drive as a work-around -echo '{ "experimental": true, "storage-driver": "devicemapper" }' > /etc/docker/daemon.json +# docker checkpoint and restore is an experimental feature +echo '{ "experimental": true }' > /etc/docker/daemon.json +service docker restart CRIU_LOG='/criu.log' mkdir -p /etc/criu echo "log-file=$CRIU_LOG" > /etc/criu/runc.conf -service docker stop -systemctl stop containerd.service - -# Always use the latest containerd release. -# Restore with containerd versions after v1.2.14 and before v1.5.0-beta.0 are broken. -# https://github.com/checkpoint-restore/criu/issues/1223 -CONTAINERD_DOWNLOAD_URL=$(curl -s https://api.github.com/repos/containerd/containerd/releases/latest | grep '"browser_download_url":.*/containerd-.*-linux-amd64.tar.gz.$' | cut -d\" -f4) -wget -nv "$CONTAINERD_DOWNLOAD_URL" -O - | tar -xz -C /usr/ - -systemctl restart containerd.service -service docker restart +# Test checkpoint/restore with action script +echo "action-script /usr/bin/true" | sudo tee /etc/criu/default.conf export SKIP_CI_TEST=1 @@ -88,17 +77,35 @@ checkpoint_container () { docker wait cr } -restore_container () { - CHECKPOINT_NAME=$1 - - docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log || { +print_logs () { cat "$(grep log 'log file:' | sed 's/log file:\s*//')" || true docker logs cr || true cat $CRIU_LOG || true dmesg docker ps exit 1 - } +} + +declare -i max_restore_container_tries=3 + +restore_container () { + CHECKPOINT_NAME=$1 + + for i in $(seq $max_restore_container_tries); do + docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log && break + + # FIXME: There is a race condition in docker/containerd that causes + # docker to occasionally fail when starting a container from a + # checkpoint immediately after the checkpoint has been created. + # https://github.com/moby/moby/issues/42900 + if grep -Eq '^Error response from daemon: failed to upload checkpoint to containerd: commit failed: content sha256:.*: already exists$' log; then + echo "Retry container restore: $i/$max_restore_container_tries" + sleep 1; + else + print_logs + fi + + done } # Scenario: Create multiple containers and checkpoint and restore them once diff --git a/scripts/ci/java-test.sh b/scripts/ci/java-test.sh new file mode 100755 index 000000000..a5b13a107 --- /dev/null +++ b/scripts/ci/java-test.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +cd ../.. || exit 1 + +sudo modprobe iptable_filter + +failures="" + +docker build -t criu-openj9-ubuntu-test:latest -f scripts/build/Dockerfile.openj9-ubuntu . +if ! docker run --rm --privileged criu-openj9-ubuntu-test:latest; then + failures="$failures openj9-ubuntu" +fi + +docker build -t criu-hotspot-alpine-test:latest -f scripts/build/Dockerfile.hotspot-alpine . +if ! docker run --rm --privileged criu-hotspot-alpine-test:latest; then + failures="$failures hotspot-alpine" +fi + +docker build -t criu-hotspot-ubuntu-test:latest -f scripts/build/Dockerfile.hotspot-ubuntu . +if ! docker run --rm --privileged criu-hotspot-ubuntu-test:latest; then + failures="$failures hotspot-ubuntu" +fi + +if [ -n "$failures" ]; then + echo "Tests failed on $failures" + exit 1 +fi diff --git a/scripts/ci/loongarch64-qemu-test.sh b/scripts/ci/loongarch64-qemu-test.sh new file mode 100755 index 000000000..7e00ab65a --- /dev/null +++ b/scripts/ci/loongarch64-qemu-test.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +set -o nounset +set -o errexit +set -x + +../../contrib/apt-install \ + apt-transport-https \ + ca-certificates \ + curl \ + software-properties-common \ + sshpass \ + openssh-client + +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - + +add-apt-repository \ + "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ + $(lsb_release -cs) \ + stable test" + +../../contrib/apt-install docker-ce + +# shellcheck source=/dev/null +. /etc/lsb-release + +# docker checkpoint and restore is an experimental feature +echo '{ "experimental": true }' > /etc/docker/daemon.json +service docker restart + +docker info + +# run a loongarch64 vm + +PORT='2222' +USER='root' +PASSWORD='loongarch64' +NAME='vm' + +docker run \ + -d \ + --net host \ + --name $NAME \ + merore/archlinux-loongarch64 + +run() { + if [ -z "$1" ]; then + echo "Command cannot be empty." + exit 1 + fi + sshpass -p $PASSWORD ssh -o StrictHostKeyChecking=no -p $PORT $USER@127.0.0.1 "$1" +} + +# wait vm to start +while (! run "uname -a") +do + echo "Wait vm to start..." + sleep 1 +done +echo "The loongarch64 vm is started!" + +# Tar criu and send to vm +tar -cf criu.tar ../../../criu +sshpass -p $PASSWORD scp -o StrictHostKeyChecking=no -P $PORT criu.tar $USER@127.0.0.1:/root + +# build and test +run 'cd /root; tar -xf criu.tar' +run 'cd /root/criu; make -j4 && make -j4 -C test/zdtm' +run "cd /root/criu; ./test/zdtm.py run -t zdtm/static/maps02 -t zdtm/static/maps05 -t zdtm/static/maps06 -t zdtm/static/maps10 -t zdtm/static/maps_file_prot -t zdtm/static/memfd00 -t zdtm/transition/fork -t zdtm/transition/fork2 -t zdtm/transition/shmem -f h" diff --git a/scripts/ci/openj9-test.sh b/scripts/ci/openj9-test.sh deleted file mode 100755 index b8c07f180..000000000 --- a/scripts/ci/openj9-test.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -cd ../.. || exit 1 - -failures="" - -docker build -t criu-openj9-ubuntu-test:latest -f scripts/build/Dockerfile.openj9-ubuntu . -if ! docker run --rm --privileged criu-openj9-ubuntu-test:latest; then - failures="$failures ubuntu" -fi - -docker build -t criu-openj9-alpine-test:latest -f scripts/build/Dockerfile.openj9-alpine . -if ! docker run --rm --privileged criu-openj9-alpine-test:latest; then - failures="$failures alpine" -fi - -if [ -n "$failures" ]; then - echo "Tests failed on $failures" - exit 1 -fi diff --git a/scripts/ci/podman-test.sh b/scripts/ci/podman-test.sh index 414004514..185783011 100755 --- a/scripts/ci/podman-test.sh +++ b/scripts/ci/podman-test.sh @@ -11,29 +11,23 @@ make install PREFIX=/usr criu --version -# Install crun build dependencies -scripts/ci/apt-install libyajl-dev libseccomp-dev libsystemd-dev +# FIXME: Disable checkpoint/restore of cgroups +# https://github.com/checkpoint-restore/criu/issues/2091 +mkdir -p /etc/criu +echo "manage-cgroups ignore" > /etc/criu/runc.conf +sed -i 's/#runtime\s*=\s*.*/runtime = "runc"/' /usr/share/containers/containers.conf -# Install crun from source to test libcriu integration -tmp_dir=$(mktemp -d -t ci-XXXXXXXXXX) -pushd "${tmp_dir}" -git clone --depth=1 https://github.com/containers/crun -cd crun -./autogen.sh && ./configure --prefix=/usr -make -j"$(nproc)" -make install -popd -rm -rf "${tmp_dir}" +# Test checkpoint/restore with action script +echo "action-script /usr/bin/true" | sudo tee /etc/criu/default.conf -# overlayfs with current Ubuntu kernel breaks CRIU -# https://bugs.launchpad.net/ubuntu/+source/linux-azure/+bug/1967924 -# Use VFS storage drive as a work-around -export STORAGE_DRIVER=vfs -podman --storage-driver vfs info +cat /proc/self/mountinfo +podman info -# shellcheck disable=SC2016 podman run --name cr -d docker.io/library/alpine /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done' +# Show criu logs in case of error +trap 'cat /var/lib/containers/storage/overlay-containers/*/userdata/*.log' EXIT + sleep 1 for i in $(seq 20); do echo "Test $i for podman container checkpoint" @@ -74,3 +68,5 @@ for i in $(seq 20); do podman ps -a rm -f /tmp/chkpt.tar.gz done + +trap 'echo PASS' EXIT \ No newline at end of file diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index f4d3155f9..b0b45fcc3 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -1,40 +1,21 @@ #!/bin/bash set -e -x +contrib/dependencies/dnf-packages.sh dnf install -y \ diffutils \ + e2fsprogs \ findutils \ - gcc \ - git \ - gnutls-devel \ + gawk \ gzip \ - iproute \ - iptables \ - nftables \ - nftables-devel \ - libaio-devel \ - libasan \ - libcap-devel \ - libnet-devel \ - libnl3-devel \ - libbsd-devel \ - make \ + kmod \ + libselinux-utils \ procps-ng \ - protobuf-c-devel \ - protobuf-devel \ - python3-flake8 \ - python3-PyYAML \ - python3-future \ - python3-protobuf \ - python3-junit_xml \ + python3-pip \ python-unversioned-command \ redhat-rpm-config \ sudo \ - tar \ - which \ - e2fsprogs \ - rubygem-asciidoctor \ - kmod + tar # /tmp is no longer 755 in the rawhide container image and breaks CI - fix it chmod 1777 /tmp diff --git a/scripts/ci/riscv64-cross/amd64-sources.list b/scripts/ci/riscv64-cross/amd64-sources.list new file mode 100644 index 000000000..72dad920c --- /dev/null +++ b/scripts/ci/riscv64-cross/amd64-sources.list @@ -0,0 +1,10 @@ +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy main restricted +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-updates main restricted +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy universe +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-updates universe +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy multiverse +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-updates multiverse +deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse +deb [arch=amd64] http://security.ubuntu.com/ubuntu/ jammy-security main restricted +deb [arch=amd64] http://security.ubuntu.com/ubuntu/ jammy-security universe +deb [arch=amd64] http://security.ubuntu.com/ubuntu/ jammy-security multiverse \ No newline at end of file diff --git a/scripts/ci/riscv64-cross/riscv64-sources.list b/scripts/ci/riscv64-cross/riscv64-sources.list new file mode 100644 index 000000000..67b8067b6 --- /dev/null +++ b/scripts/ci/riscv64-cross/riscv64-sources.list @@ -0,0 +1,42 @@ +# See http://help.ubuntu.com/community/UpgradeNotes for how to upgrade to +# newer versions of the distribution. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy main restricted +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy main restricted + +## Major bug fix updates produced after the final release of the +## distribution. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-updates main restricted +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-updates main restricted + +## N.B. software from this repository is ENTIRELY UNSUPPORTED by the Ubuntu +## team. Also, please note that software in universe WILL NOT receive any +## review or updates from the Ubuntu security team. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy universe +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy universe +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-updates universe +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-updates universe + +## N.B. software from this repository is ENTIRELY UNSUPPORTED by the Ubuntu +## team, and may not be under a free licence. Please satisfy yourself as to +## your rights to use the software. Also, please note that software in +## multiverse WILL NOT receive any review or updates from the Ubuntu +## security team. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy multiverse +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy multiverse +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-updates multiverse +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-updates multiverse + +## N.B. software from this repository may not have been tested as +## extensively as that contained in the main release, although it includes +## newer versions of some applications which may provide useful features. +## Also, please note that software in backports WILL NOT receive any review +## or updates from the Ubuntu security team. +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-backports main restricted universe multiverse +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-backports main restricted universe multiverse + +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-security main restricted +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-security main restricted +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-security universe +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-security universe +deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-security multiverse +# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-security multiverse \ No newline at end of file diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 8d9de6e55..05a3b71e8 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -1,25 +1,20 @@ #!/bin/bash set -x -e -CI_PKGS="protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev - libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev - libnl-3-dev gdb bash libnet-dev util-linux asciidoctor - libnl-route-3-dev time flake8 libbsd-dev python3-yaml - libperl-dev pkg-config python3-future python3-protobuf - python3-junit.xml" +CI_PKGS=() -X86_64_PKGS="gcc-multilib" +X86_64_PKGS=(gcc-multilib) + +# Convert from string to array. +IFS=" " read -r -a ZDTM_OPTS <<< "$ZDTM_OPTS" UNAME_M=$(uname -m) if [ "$UNAME_M" != "x86_64" ]; then - # For Travis only x86_64 seems to be baremetal. Other - # architectures are running in unprivileged LXD containers. - # That seems to block most of CRIU's interfaces. - - # But with the introduction of baremetal aarch64 systems in - # Travis (arch: arm64-graviton2) we can override this using - # an environment variable + # Some tests rely on kernel features that may not be available + # when running in a container. Here we assume that x86_64 systems + # are baremetal, and skip the tests for all other CPU architectures. + # The RUN_TESTS environment variable can override this, e.g., for aarch64. [ -n "$RUN_TESTS" ] || SKIP_CI_TEST=1 fi @@ -33,9 +28,13 @@ ci_prep () { # not run anymore with 'sudo -u \#1000' if the UID does not exist. adduser -u 1000 --disabled-password --gecos "criutest" criutest || : - # This can fail on aarch64 travis + # This can fail on aarch64 service apport stop || : + # Ubuntu has set up AppArmor in 24.04 so that it blocks use of user + # namespaces by unprivileged users. We need this for some of our tests. + sysctl kernel.apparmor_restrict_unprivileged_userns=0 || : + if [ "$CLANG" = "1" ]; then # clang support CC=clang @@ -46,19 +45,16 @@ ci_prep () { else CC=gcc fi - CI_PKGS="$CI_PKGS $CC" + CI_PKGS+=("$CC") # Do not install x86_64 specific packages on other architectures if [ "$UNAME_M" = "x86_64" ]; then - CI_PKGS="$CI_PKGS $X86_64_PKGS" + CI_PKGS+=("${X86_64_PKGS[@]}") fi - scripts/ci/apt-install "$CI_PKGS" + contrib/dependencies/apt-packages.sh + contrib/apt-install "${CI_PKGS[@]}" chmod a+x "$HOME" - - # zdtm uses an unversioned python binary to run the tests. - # let's point python to python3 - ln -sf /usr/bin/python3 /usr/bin/python } test_stream() { @@ -69,9 +65,8 @@ test_stream() { # restorer and eventually close the page read. However, image-streamer expects the # whole image to be read and the image is not reopened, sent twice. These MAP_HUGETLB # test cases will result in EPIPE error at the moment. - STREAM_TEST_EXCLUDE="-x maps09 -x maps10" - # shellcheck disable=SC2086 - ./test/zdtm.py run --stream -p 2 --keep-going -a $STREAM_TEST_EXCLUDE $ZDTM_OPTS + STREAM_TEST_EXCLUDE=(-x maps09 -x maps10) + ./test/zdtm.py run --stream -p 2 --keep-going -a "${STREAM_TEST_EXCLUDE[@]}" "${ZDTM_OPTS[@]}" } print_header() { @@ -123,8 +118,14 @@ if [ "${CD_TO_TOP}" = "1" ]; then fi export GCOV CC +if [ -z "$COMPILE_FLAGS" ]; then + LOCAL_COMPILE_FLAGS=("V=1") +else + IFS=" " read -r -a LOCAL_COMPILE_FLAGS <<< "$COMPILE_FLAGS" + LOCAL_COMPILE_FLAGS=("V=1" "${LOCAL_COMPILE_FLAGS[@]}") +fi $CC --version -time make CC="$CC" -j4 V=1 +time make CC="$CC" -j4 "${LOCAL_COMPILE_FLAGS[@]}" ./criu/criu -v4 cpuinfo dump || : ./criu/criu -v4 cpuinfo check || : @@ -142,11 +143,17 @@ time make unittest [ -n "$SKIP_CI_TEST" ] && exit 0 +# Umount cpuset in cgroupv1 to make it move to cgroupv2 +if [ -d /sys/fs/cgroup/cpuset ]; then + umount /sys/fs/cgroup/cpuset +fi + ulimit -c unlimited cgid=$$ cleanup_cgroup() { ./test/zdtm_umount_cgroups $cgid + dmesg } trap cleanup_cgroup EXIT ./test/zdtm_mount_cgroups $cgid @@ -160,21 +167,20 @@ if [ "${COMPAT_TEST}x" = "yx" ] ; then # for 32-bit tests. A better way would involve launching docker.. # But it would require making zdtm.py aware of docker and launching # tests inside the CT. - INCOMPATIBLE_LIBS="libaio-dev libcap-dev libnl-3-dev libnl-route-3-dev" - IA32_PKGS="" + INCOMPATIBLE_LIBS=(libaio-dev libcap-dev libnl-3-dev libnl-route-3-dev) + IA32_PKGS=() REFUGE=64-refuge mkdir "$REFUGE" - for i in $INCOMPATIBLE_LIBS ; do + for i in "${INCOMPATIBLE_LIBS[@]}" ; do for j in $(dpkg --listfiles "$i" | grep '\.so$') ; do cp "$j" "$REFUGE/" done - IA32_PKGS="$IA32_PKGS $i:i386" + IA32_PKGS+=("$i:i386") done - # shellcheck disable=SC2086 - apt-get remove $INCOMPATIBLE_LIBS + apt-get remove "${INCOMPATIBLE_LIBS[@]}" dpkg --add-architecture i386 - scripts/ci/apt-install "$IA32_PKGS" + contrib/apt-install "${IA32_PKGS[@]}" mkdir -p /usr/lib/x86_64-linux-gnu/ mv "$REFUGE"/* /usr/lib/x86_64-linux-gnu/ fi @@ -211,15 +217,12 @@ if [ "${STREAM_TEST}" = "1" ]; then exit 0 fi -# shellcheck disable=SC2086 -./test/zdtm.py run -a -p 2 --keep-going $ZDTM_OPTS +./test/zdtm.py run -a -p 2 --keep-going "${ZDTM_OPTS[@]}" if criu/criu check --feature move_mount_set_group; then - # shellcheck disable=SC2086 - ./test/zdtm.py run -a -p 2 --mntns-compat-mode --keep-going $ZDTM_OPTS + ./test/zdtm.py run -a -p 2 --mntns-compat-mode --keep-going "${ZDTM_OPTS[@]}" fi -# shellcheck disable=SC2086 -./test/zdtm.py run -a -p 2 --keep-going --criu-config $ZDTM_OPTS +./test/zdtm.py run -a -p 2 --keep-going --criu-config "${ZDTM_OPTS[@]}" # Newer kernels are blocking access to userfaultfd: # uffd: Set unprivileged_userfaultfd sysctl knob to 1 if kernel faults must be handled without obtaining CAP_SYS_PTRACE capability @@ -227,17 +230,14 @@ if [ -e /proc/sys/vm/unprivileged_userfaultfd ]; then echo 1 > /proc/sys/vm/unprivileged_userfaultfd fi -LAZY_EXCLUDE="-x maps04 -x cmdlinenv00 -x maps007" +LAZY_EXCLUDE=(-x maps04 -x cmdlinenv00 -x maps007) LAZY_TESTS='.*(maps0|uffd-events|lazy-thp|futex|fork).*' -LAZY_OPTS="-p 2 -T $LAZY_TESTS $LAZY_EXCLUDE $ZDTM_OPTS" +LAZY_OPTS=(-p 2 -T "$LAZY_TESTS" "${LAZY_EXCLUDE[@]}" "${ZDTM_OPTS[@]}") -# shellcheck disable=SC2086 -./test/zdtm.py run $LAZY_OPTS --lazy-pages -# shellcheck disable=SC2086 -./test/zdtm.py run $LAZY_OPTS --remote-lazy-pages -# shellcheck disable=SC2086 -./test/zdtm.py run $LAZY_OPTS --remote-lazy-pages --tls +./test/zdtm.py run "${LAZY_OPTS[@]}" --lazy-pages +./test/zdtm.py run "${LAZY_OPTS[@]}" --remote-lazy-pages +./test/zdtm.py run "${LAZY_OPTS[@]}" --remote-lazy-pages --tls bash -x ./test/jenkins/criu-fault.sh if [ "$UNAME_M" == "x86_64" ]; then @@ -255,15 +255,20 @@ if [ -z "$SKIP_EXT_DEV_TEST" ]; then fi make -C test/others/make/ run CC="$CC" -if [ -n "$TRAVIS" ] || [ -n "$CIRCLECI" ]; then +if [ -n "$CIRCLECI" ]; then # GitHub Actions (and Cirrus CI) does not provide a real TTY and CRIU will fail with: # Error (criu/tty.c:1014): tty: Don't have tty to inherit session from, aborting make -C test/others/shell-job/ run fi +make -C test/others/criu-ns/ run +make -C test/others/skip-file-rwx-check/ run make -C test/others/rpc/ run ./test/zdtm.py run -t zdtm/static/env00 --sibling +./test/zdtm.py run -t zdtm/static/maps00 --preload-libfault +./test/zdtm.py run -t zdtm/static/maps02 --preload-libfault + ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --dedup ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --noauto-dedup ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server @@ -288,6 +293,45 @@ ip net add test ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/transition/fork -t zdtm/static/ghost_holes00 -t zdtm/static/socket-tcp -t zdtm/static/msgque -k always ./test/crit-recode.py +# Rootless tests +# Check if cap_checkpoint_restore is supported and also if unshare -c is supported. +# +# Do not run this test in a container (see https://github.com/checkpoint-restore/criu/issues/2312). +# Before v6.8-rc1~215^2~6, the kernel currently did not show correct device and +# inode numbers in /proc/pid/maps for stackable file systems. +skip=0 +findmnt -no FSTYPE / | grep overlay && { + ./criu/criu check --feature overlayfs_maps || skip=1 +} +unshare -c /bin/true || skip=1 +capsh --supports=cap_checkpoint_restore || skip=1 + +if [ "$skip" == 0 ]; then + make -C test/zdtm/ cleanout + rm -rf test/dump + setcap cap_checkpoint_restore,cap_sys_ptrace+eip criu/criu + if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then + # Note: selinux in Enforcing mode prevents us from calling clone3() or writing to ns_last_pid on restore; hence set to Permissive for the test and then set back. + selinuxmode=$(getenforce) + if [ "$selinuxmode" != "Disabled" ]; then + setenforce Permissive + fi + + fi + # Run it as non-root in a user namespace. Since CAP_CHECKPOINT_RESTORE behaves differently in non-user namespaces (e.g. no access to map_files) this tests that we can dump and restore + # under those conditions. Note that the "... && true" part is necessary; we need at least one statement after the tests so that bash can reap zombies in the user namespace, + # otherwise it will exec the last statement and get replaced and nobody will be left to reap our zombies. + sudo --user=#65534 --group=#65534 unshare -Ucfpm --mount-proc -- bash -c "./test/zdtm.py run -t zdtm/static/maps00 -f h --rootless && true" + if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then + if [ "$selinuxmode" != "Disabled" ]; then + setenforce "$selinuxmode" + fi + fi + setcap -r criu/criu +else + echo "Skipping unprivileged mode tests" +fi + # more crit testing make -C test/others/crit run @@ -303,6 +347,9 @@ make -C test/others/ns_ext run # config file parser and parameter testing make -C test/others/config-file run +# action script testing +make -C test/others/action-script run + # Skip all further tests when running with GCOV=1 # The one test which currently cannot handle GCOV testing is compel/test # Probably because the GCOV Makefile infrastructure does not exist in compel @@ -310,3 +357,15 @@ make -C test/others/config-file run # compel testing make -C compel/test + +# amdgpu and cuda plugin testing +make amdgpu_plugin +make -C plugins/amdgpu/ test_topology_remap +./plugins/amdgpu/test_topology_remap + +./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin cuda +./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu +./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu cuda +./test/zdtm.py run -t zdtm/static/busyloop00 --criu-plugin inventory_test_enabled inventory_test_disabled + +./test/zdtm.py run -t zdtm/static/sigpending -t zdtm/static/pthread00 --mocked-cuda-checkpoint --fault 138 diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index af0f7335a..5f2de32b8 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -6,42 +6,42 @@ set -e set -x -VAGRANT_VERSION=2.2.19 -FEDORA_VERSION=35 -FEDORA_BOX_VERSION=35.20211026.0 +VAGRANT_VERSION=2.4.7 +FEDORA_VERSION=42 +FEDORA_BOX_VERSION=1.1.0 setup() { - if [ -n "$TRAVIS" ]; then - # Load the kvm modules for vagrant to use qemu - modprobe kvm kvm_intel - fi - # Tar up the git checkout to have vagrant rsync it to the VM - tar cf criu.tar ../../../criu + tar cf /tmp/criu.tar -C ../../../ criu # Cirrus has problems with the following certificate. - wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}_"$(uname -m)".deb -O /tmp/vagrant.deb && \ + wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb - ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu \ - ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base \ - openssh-client + ../../contrib/apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu-system \ + ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base openssh-client systemctl restart libvirtd vagrant plugin install vagrant-libvirt - vagrant init fedora/${FEDORA_VERSION}-cloud-base --box-version ${FEDORA_BOX_VERSION} + vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} + # The default libvirt Vagrant VM uses 512MB. - # Travis VMs should have around 7.5GB. + # VMs in our CI typically have around 16GB. # Increasing it to 4GB should work. sed -i Vagrantfile -e 's,^end$, config.vm.provider :libvirt do |libvirt|'"\n"' libvirt.memory = 4096;end'"\n"'end,g' + # Sync /tmp/criu.tar into the VM + # We want to use $HOME without expansion + # shellcheck disable=SC2016 + sed -i Vagrantfile -e 's|^end$| config.vm.provision "file", source: "/tmp/criu.tar", destination: "$HOME/criu.tar"'"\n"'end|g' + vagrant up --provider=libvirt --no-tty mkdir -p /root/.ssh vagrant ssh-config >> /root/.ssh/config - ssh default sudo dnf upgrade -y - ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ - libasan libcap-devel libnet-devel libnl3-devel libbsd-devel make protobuf-c-devel \ - protobuf-devel python3-flake8 python3-future python3-protobuf \ - python3-junit_xml rubygem-asciidoctor iptables libselinux-devel libbpf-devel + # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd + + ssh default 'sudo mkdir -p --mode=777 /vagrant && mv $HOME/criu.tar /vagrant && cd /vagrant && tar xf criu.tar' + ssh default sudo dnf upgrade -y + ssh default sudo /vagrant/criu/contrib/dependencies/dnf-packages.sh ssh default cat /proc/cmdline } @@ -49,7 +49,7 @@ fedora-no-vdso() { ssh default sudo grubby --update-kernel ALL --args="vdso=0" vagrant reload ssh default cat /proc/cmdline - ssh default 'cd /vagrant; tar xf criu.tar; cd criu; make -j 4' + ssh default 'cd /vagrant/criu; make -j' ssh default 'cd /vagrant/criu/test; sudo ./zdtm.py run -a --keep-going' # This test (pidfd_store_sk) requires pidfd_getfd syscall which is guaranteed in Fedora 33. # It is also skipped from -a because it runs in RPC mode only @@ -57,6 +57,15 @@ fedora-no-vdso() { } fedora-rawhide() { + # Upgrade the kernel to the latest vanilla one + ssh default sudo dnf -y copr enable @kernel-vanilla/stable + ssh default sudo dnf upgrade -y + + # The 6.2 kernel of Fedora 38 in combination with rawhide userspace breaks + # zdtm/static/socket-tcp-nfconntrack. To activate the new kernel previously + # installed this reboots the VM. + vagrant reload + ssh default uname -a # # Workaround the problem: # error running container: error from /usr/bin/crun creating container for [...]: sd-bus call: Transport endpoint is not connected @@ -65,7 +74,28 @@ fedora-rawhide() { # ssh default 'sudo dnf remove -y crun || true' ssh default sudo dnf install -y podman runc - ssh default 'cd /vagrant; tar xf criu.tar; cd criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' + # Some tests in the container need selinux to be disabled. + # In the container it is not possible to change the state of selinux. + # Let's just disable it for this test run completely. + ssh default 'sudo setenforce Permissive' + ssh default 'cd /vagrant/criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' +} + +fedora-non-root() { + ssh default uname -a + ssh default 'cd /vagrant/criu; make -j' + # Setting the capability should be the only line needed to run as non-root on Fedora + # In other environments either set /proc/sys/kernel/yama/ptrace_scope to 0 or grant cap_sys_ptrace to criu + ssh default 'sudo setcap cap_checkpoint_restore+eip /vagrant/criu/criu/criu' + # Run it once as non-root + ssh default 'cd /vagrant/criu; criu/criu check --unprivileged; ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h --rootless' + # Run it as root with '--rootless' + ssh default 'cd /vagrant/criu; sudo ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h; sudo chmod 777 test/dump/zdtm/static/{env00,pthread00}; sudo ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h --rootless' + # Run it as non-root in a user namespace. Since CAP_CHECKPOINT_RESTORE behaves differently in non-user namespaces (e.g. no access to map_files) this tests that we can dump and restore + # under those conditions. Note that the "... && true" part is necessary; we need at least one statement after the tests so that bash can reap zombies in the user namespace, + # otherwise it will exec the last statement and get replaced and nobody will be left to reap our zombies. + # Note: selinux in Enforcing mode prevents us from calling clone3() or writing to ns_last_pid on restore; hence set to Permissive for the test and then set back. + ssh default 'cd /vagrant/criu; selinuxmode=`getenforce` && sudo setenforce Permissive && unshare -Ucfpm --mount-proc bash -c "./test/zdtm.py run -t zdtm/static/maps00 -f h --rootless && true" && sudo setenforce $selinuxmode' } $1 diff --git a/scripts/crit-setup.py b/scripts/crit-setup.py deleted file mode 100644 index 13df03e3b..000000000 --- a/scripts/crit-setup.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -from distutils.core import setup - -criu_version = "0.0.1" -env = os.environ - -if 'CRIU_VERSION_MAJOR' in env and 'CRIU_VERSION_MINOR' in env: - criu_version = '{}.{}'.format( - env['CRIU_VERSION_MAJOR'], - env['CRIU_VERSION_MINOR'] - ) - - if 'CRIU_VERSION_SUBLEVEL' in env and env['CRIU_VERSION_SUBLEVEL']: - criu_version += '.' + env['CRIU_VERSION_SUBLEVEL'] - -setup(name="crit", - version=criu_version, - description="CRiu Image Tool", - author="CRIU team", - author_email="criu@openvz.org", - license="GPLv2", - url="https://github.com/checkpoint-restore/criu", - package_dir={'pycriu': 'lib/py'}, - packages=["pycriu", "pycriu.images"], - scripts=["crit/crit"]) diff --git a/scripts/criu-ns b/scripts/criu-ns index 9fc58b640..5950d7c50 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -4,6 +4,9 @@ import ctypes.util import errno import sys import os +import fcntl +import termios +import time # constants for unshare CLONE_NEWNS = 0x00020000 @@ -68,7 +71,19 @@ def _wait_for_process_status(criu_pid): try: (pid, status) = os.wait() if pid == criu_pid: - return os.waitstatus_to_exitcode(status) + # The following code block is based on + # os.waitstatus_to_exitcode() introduced in Python 3.9 + # and we implement this for comparability with older + # versions of Python. + if os.WIFSIGNALED(status): + return os.WTERMSIG(status) + elif os.WIFEXITED(status): + return os.WEXITSTATUS(status) + elif os.WIFSTOPPED(status): + return os.WSTOPSIG(status) + else: + raise Exception("CRIU was terminated by an " + "unidentified reason") except OSError: return -251 @@ -78,8 +93,21 @@ def run_criu(args): Spawn CRIU binary """ print(sys.argv) - os.execlp('criu', *['criu'] + args) - raise OSError(errno.ENOENT, "No such command") + + if "--criu-binary" in args: + try: + opt_index = args.index("--criu-binary") + path = args[opt_index + 1] + del args[opt_index:opt_index + 2] + args.insert(0, "criu") + os.execv(path, args) + raise OSError(errno.ENOENT, "No such command") + except (ValueError, IndexError, FileNotFoundError): + raise OSError(errno.ENOENT, "--criu-binary missing argument") + else: + args.insert(0, "criu") + os.execvp("criu", args) + raise OSError(errno.ENOENT, "No such command") # pidns_holder creates a process that is reparented to the init. @@ -108,8 +136,8 @@ def wrap_restore(): if '--restore-sibling' in restore_args: raise OSError(errno.EINVAL, "--restore-sibling is not supported") - # Unshare pid and mount namespaces - if _unshare(CLONE_NEWNS | CLONE_NEWPID) != 0: + # Unshare pid namespace + if _unshare(CLONE_NEWPID) != 0: _errno = ctypes.get_errno() raise OSError(_errno, errno.errorcode[_errno]) @@ -121,12 +149,65 @@ def wrap_restore(): restore_detached = True restore_args.remove('--restore-detached') + restore_pidfile = None + if '--pidfile' in restore_args: + try: + opt_index = restore_args.index('--pidfile') + restore_pidfile = restore_args[opt_index + 1] + del restore_args[opt_index:opt_index + 2] + except (ValueError, IndexError, FileNotFoundError): + raise OSError(errno.ENOENT, "--pidfile missing argument") + + if not restore_pidfile.startswith('/'): + for base_dir_opt in ['--work-dir', '-W', '--images-dir', '-D']: + if base_dir_opt in restore_args: + try: + opt_index = restore_args.index(base_dir_opt) + restore_pidfile = os.path.join(restore_args[opt_index + 1], restore_pidfile) + break + except (ValueError, IndexError, FileNotFoundError): + raise OSError(errno.ENOENT, base_dir_opt + " missing argument") + criu_pid = os.fork() if criu_pid == 0: + # Unshare mount namespace + if _unshare(CLONE_NEWNS) != 0: + _errno = ctypes.get_errno() + raise OSError(_errno, errno.errorcode[_errno]) + os.setsid() + # Set stdin tty to be a controlling tty of our new session, this is + # required by --shell-job option, as for it CRIU would try to set a + # process group of restored root task to be a foreground group on the + # terminal. + if '--shell-job' in restore_args or '-j' in restore_args: + if os.isatty(sys.stdin.fileno()): + fcntl.ioctl(sys.stdin.fileno(), termios.TIOCSCTTY, 1) + else: + raise OSError(errno.EINVAL, 'The stdin is not a tty for a --shell-job') + _mount_new_proc() run_criu(restore_args) + if restore_pidfile: + restored_pid = None + retry = 5 + + while not restored_pid and retry: + with open('/proc/%d/task/%d/children' % (criu_pid, criu_pid)) as f: + line = f.readline().strip() + if len(line): + restored_pid = line + break + retry -= 1 + time.sleep(1) + + if restored_pid: + with open(restore_pidfile, 'w+') as f: + f.write(restored_pid) + else: + print("Warn: Search of restored pid for --pidfile option timeouted") + if restore_detached: return 0 @@ -135,7 +216,7 @@ def wrap_restore(): def get_varg(args): for i in range(1, len(sys.argv)): - if not sys.argv[i] in args: + if sys.argv[i] not in args: continue if i + 1 >= len(sys.argv): @@ -153,9 +234,9 @@ def _set_namespace(fd): raise OSError(_errno, errno.errorcode[_errno]) -def is_my_namespace(fd): +def is_my_namespace(fd, ns): """Returns True if fd refers to current namespace""" - return os.stat('/proc/self/ns/pid').st_ino != os.fstat(fd).st_ino + return os.stat('/proc/self/ns/%s' % ns).st_ino == os.fstat(fd).st_ino def set_pidns(tpid, pid_idx): @@ -165,7 +246,7 @@ def set_pidns(tpid, pid_idx): pid namespace. """ ns_fd = os.open('/proc/%s/ns/pid' % tpid, os.O_RDONLY) - if is_my_namespace(ns_fd): + if not is_my_namespace(ns_fd, "pid"): for line in open('/proc/%s/status' % tpid): if not line.startswith('NSpid:'): continue @@ -190,7 +271,7 @@ def set_mntns(tpid): will be the same in target mntns. """ ns_fd = os.open('/proc/%s/ns/mnt' % tpid, os.O_RDONLY) - if is_my_namespace(ns_fd): + if not is_my_namespace(ns_fd, "mnt"): root_st = os.stat('/') cwd_st = os.stat('.') cwd_path = os.path.realpath('.') diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak index 592552cb8..727e9689e 100644 --- a/scripts/feature-tests.mak +++ b/scripts/feature-tests.mak @@ -35,34 +35,6 @@ int main(void) } endef -define FEATURE_TEST_STRLCPY - -#include - -#ifdef CONFIG_HAS_LIBBSD -# include -#endif - -int main(void) -{ - return strlcpy(NULL, NULL, 0); -} -endef - -define FEATURE_TEST_STRLCAT - -#include - -#ifdef CONFIG_HAS_LIBBSD -# include -#endif - -int main(void) -{ - return strlcat(NULL, NULL, 0); -} -endef - define FEATURE_TEST_PTRACE_PEEKSIGINFO #include @@ -137,19 +109,6 @@ ENTRY(main) END(main) endef -define FEATURE_TEST_FSCONFIG - -#include - -int main(void) -{ - if (FSCONFIG_CMD_CREATE > 0) - return 0; - return 0; -} - -endef - define FEATURE_TEST_NFTABLES_LIB_API_0 #include @@ -196,3 +155,22 @@ int main(void) return 0; } endef + +define FEATURE_TEST_NO_LIBC_RSEQ_DEFS + +#ifdef __has_include +#if __has_include(\"sys/rseq.h\") +#include +#endif +#endif + +enum rseq_cpu_id_state { + RSEQ_CPU_ID_UNINITIALIZED = -1, + RSEQ_CPU_ID_REGISTRATION_FAILED = -2, +}; + +int main(void) +{ + return 0; +} +endef diff --git a/scripts/fetch-clang-format.sh b/scripts/fetch-clang-format.sh index 0e9545f2d..5b6037d61 100755 --- a/scripts/fetch-clang-format.sh +++ b/scripts/fetch-clang-format.sh @@ -8,9 +8,11 @@ URL="https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/.c curl -s "${URL}" | sed -e " s,^\( *\)#\([A-Z]\),\1\2,g; s,ControlStatements,ControlStatementsExceptForEachMacros,g; - s,ColumnLimit: 80,ColumnLimit: 120,g; + s,ColumnLimit: 80,ColumnLimit: 0,g; s,Intended for clang-format >= 4,Intended for clang-format >= 11,g; + s,ForEachMacros:,ForEachMacros:\n - 'for_each_bit',g; s,ForEachMacros:,ForEachMacros:\n - 'for_each_pstree_item',g; s,\(AlignTrailingComments:.*\)$,\1\nAlignConsecutiveMacros: true,g; s,AlignTrailingComments: false,AlignTrailingComments: true,g; + s,\(IndentCaseLabels: false\),\1\nIndentGotoLabels: false,g; " > .clang-format diff --git a/scripts/github-indent-warnings.py b/scripts/github-indent-warnings.py new file mode 100755 index 000000000..04f82d6c1 --- /dev/null +++ b/scripts/github-indent-warnings.py @@ -0,0 +1,33 @@ +#!/usr/bin/python3 +import sys +import re + +re_file = r'^diff --git a/(\S\S*)\s.*$' +re_line = r'^@@ -(\d\d*)\D.*@@.*$' + +if __name__ == '__main__': + if len(sys.argv) != 1 and len(sys.argv) != 2: + print(f'usage: {sys.argv[0]} ') + print(f'usage: | {sys.argv[0]}') + exit(1) + + input_file = sys.stdin.fileno() + if len(sys.argv) == 2: + input_file = sys.argv[1] + + with open(input_file, 'r') as fi: + file_name = None + line_number = None + for line in fi: + file_matches = re.findall(re_file, line) + if len(file_matches) == 1: + file_name = file_matches[0] + continue + + if file_name is None: + continue + + line_matches = re.findall(re_line, line) + if len(line_matches) == 1: + line_number = int(line_matches[0]) + 3 + print(f'::warning file={file_name},line={line_number}::clang-format: Possible coding style problem (https://github.com/checkpoint-restore/criu/blob/criu-dev/CONTRIBUTING.md#automatic-tools-to-fix-coding-style)') diff --git a/scripts/install-debian-pkgs.sh b/scripts/install-debian-pkgs.sh deleted file mode 100755 index 540c2c094..000000000 --- a/scripts/install-debian-pkgs.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -# Install required packages for development environment in Debian Distro - -REQ_PKGS=${REQ_PKGS:=contrib/debian/dev-packages.lst} - -help_msg="Install required packages for development environment in Debian Distro -Usage: - scripts/install-debian-pkgs.sh" - -function print_help() -{ - exec echo -e "$help_msg" -} - -function process() -{ - sudo apt-get update - sudo apt-get install -yq "$( sed 's/\#.*$//' ${REQ_PKGS} )" -} - -if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then - print_help -else - process -fi diff --git a/scripts/magic-gen.py b/scripts/magic-gen.py index 3b1f29fb5..38dff1424 100755 --- a/scripts/magic-gen.py +++ b/scripts/magic-gen.py @@ -1,4 +1,4 @@ -#!/bin/env python2 +#!/bin/env python3 import sys diff --git a/scripts/nmk/scripts/include.mk b/scripts/nmk/scripts/include.mk index c1c1e94af..603c322cf 100644 --- a/scripts/nmk/scripts/include.mk +++ b/scripts/nmk/scripts/include.mk @@ -20,7 +20,9 @@ ARCH ?= $(shell echo $(SUBARCH) | sed \ -e s/ppc64.*/ppc64/ \ -e s/mips.*/mips/ \ -e s/sh[234].*/sh/ \ - -e s/aarch64.*/aarch64/) + -e s/aarch64.*/aarch64/ \ + -e s/riscv64.*/riscv64/ \ + -e s/loongarch64.*/loongarch64/) export SUBARCH ARCH diff --git a/scripts/nmk/scripts/main.mk b/scripts/nmk/scripts/main.mk index 493a164f8..7f11bda23 100644 --- a/scripts/nmk/scripts/main.mk +++ b/scripts/nmk/scripts/main.mk @@ -1,7 +1,7 @@ ifndef ____nmk_defined__main # -# Genaral inclusion statement +# General inclusion statement ifndef ____nmk_defined__include include $(__nmk_dir)include.mk diff --git a/scripts/nmk/scripts/tools.mk b/scripts/nmk/scripts/tools.mk index 1681d4e90..de5782c13 100644 --- a/scripts/nmk/scripts/tools.mk +++ b/scripts/nmk/scripts/tools.mk @@ -23,7 +23,7 @@ MAKE := make MKDIR := mkdir -p AWK := awk PERL := perl -FULL_PYTHON := $(shell which python3 2>/dev/null || which python2 2>/dev/null) +FULL_PYTHON := $(shell command -v python3 2>/dev/null) PYTHON ?= $(shell basename $(FULL_PYTHON)) FIND := find SH := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ @@ -36,7 +36,7 @@ CTAGS := ctags export RM HOSTLD LD HOSTCC CC CPP AS AR STRIP OBJCOPY OBJDUMP export NM SH MAKE MKDIR AWK PERL PYTHON SH CSCOPE -export USE_ASCIIDOCTOR ?= $(shell which asciidoctor 2>/dev/null) +export USE_ASCIIDOCTOR ?= $(shell command -v asciidoctor 2>/dev/null) # # Footer. diff --git a/scripts/protobuf-gen.sh b/scripts/protobuf-gen.sh index 0c738f13a..25d2feaeb 100644 --- a/scripts/protobuf-gen.sh +++ b/scripts/protobuf-gen.sh @@ -1,15 +1,15 @@ #!/bin/bash -# shellcheck disable=SC2013,SC1004 - TR="y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/" -for x in $(sed -n '/PB_AUTOGEN_START/,/PB_AUTOGEN_STOP/ { +sed -n '/PB_AUTOGEN_START/,/PB_AUTOGEN_STOP/ { /PB_AUTOGEN_ST/d; + /^[ \t]*$/d; s/,.*$//; s/\tPB_//; p; - }' criu/include/protobuf-desc.h); do + }' criu/include/protobuf-desc.h | \ +while IFS= read -r x; do x_la=$(echo "$x" | sed $TR) x_uf=$(echo "$x" | sed -nr 's/^./&#\\\ /; diff --git a/scripts/ruff.toml b/scripts/ruff.toml new file mode 100644 index 000000000..2b0385976 --- /dev/null +++ b/scripts/ruff.toml @@ -0,0 +1,4 @@ +# Ignore `E401` (import violations) in all `__init__.py` files +[lint.per-file-ignores] +"__init__.py" = ["F401"] + diff --git a/scripts/uninstall_module.py b/scripts/uninstall_module.py new file mode 100755 index 000000000..2da63c800 --- /dev/null +++ b/scripts/uninstall_module.py @@ -0,0 +1,76 @@ +#!/usr/bin/python3 +""" +`pip uninstall` doesn't support `--prefix`. +https://github.com/pypa/pip/issues/11213 +""" +import argparse +import os +import shutil +import site +import subprocess +import sys + +# With Python 3.13 the subprocess module now uses the `posix_spawn()` +# function which requires loading the `signal` module: +# https://docs.python.org/3/whatsnew/3.13.html#subprocess +# +# We need to load this module here, before PYTHONPATH and sys.path +# have been modified to use the path specified with `--prefix`. +# +# flake8: noqa: F401 +import signal + +import importlib_metadata + + +def add_site_dir(prefix: str): + """ + Add site directory with prefix to sys.path and update PYTHONPATH. + """ + # If prefix is used, we need to make sure that we + # do not uninstall other packages from the system paths. + sys.path = [] + site.PREFIXES = [prefix] + pkgs = site.getsitepackages() + for path in pkgs: + site.addsitedir(path) + if 'dist-packages' in path: + # Ubuntu / Debian might use both dist- and site- packages. + site.addsitedir(path.replace('dist-packages', 'site-packages')) + os.environ['PYTHONPATH'] = os.pathsep.join(sys.path) + + +def uninstall_module(package_name: str, prefix=None): + """ + Enable support for '--prefix' with 'pip uninstall'. + """ + dist_info_path = None + if prefix: + add_site_dir(prefix) + try: + distribution = next(importlib_metadata.Distribution.discover(name=package_name)) + dist_info_path = str(distribution._path) + except StopIteration: + print(f"Skipping {package_name} as it is not installed.") + sys.exit(0) + + command = [sys.executable, '-m', 'pip', 'uninstall', '-y', package_name] + try: + subprocess.check_call(command, env=os.environ) + if dist_info_path and os.path.isdir(dist_info_path): + # .dist-info files are not cleaned up when the package + # has been installed with --prefix. + # https://github.com/pypa/pip/issues/5573 + shutil.rmtree(dist_info_path) + if 'dist-packages' in dist_info_path: + shutil.rmtree(dist_info_path.replace('dist-packages', 'site-packages')) + except subprocess.CalledProcessError as err: + print(f'Error uninstalling package {package_name}: {err}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('module_name', help='The name of the module to uninstall') + parser.add_argument('--prefix', help='The prefix where the module was installed') + args = parser.parse_args() + uninstall_module(args.module_name, args.prefix) diff --git a/soccr/soccr.c b/soccr/soccr.c index abea93703..8e1ce1c63 100644 --- a/soccr/soccr.c +++ b/soccr/soccr.c @@ -781,7 +781,7 @@ int libsoccr_restore(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsi return 0; } -static int __send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) +static int __send_queue(struct libsoccr_sk *sk, const char *queue, char *buf, __u32 len) { int ret, err = -1, max_chunk; int off; @@ -816,7 +816,7 @@ static int __send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) continue; } - logerr("Can't restore %d queue data (%d), want (%d:%d:%d)", queue, ret, chunk, len, max_chunk); + logerr("Can't restore %s queue data (%d), want (%d-%d:%d:%d)", queue, ret, off, chunk, len, max_chunk); goto err; } off += ret; @@ -837,7 +837,7 @@ static int send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) return -1; } - return __send_queue(sk, queue, buf, len); + return __send_queue(sk, queue == TCP_RECV_QUEUE ? "recv" : "send", buf, len); } static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size, int queue, @@ -876,7 +876,7 @@ static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_dat * they can be restored without any tricks. */ tcp_repair_off(sk->fd); - if (__send_queue(sk, TCP_SEND_QUEUE, buf + len, ulen)) + if (__send_queue(sk, "not-sent send", buf + len, ulen)) return -3; if (tcp_repair_on(sk->fd)) return -4; diff --git a/soccr/test/Makefile b/soccr/test/Makefile index 458540045..499901b0c 100644 --- a/soccr/test/Makefile +++ b/soccr/test/Makefile @@ -21,7 +21,6 @@ tcp-conn-v6: tcp-conn-v6.c test: tcp-constructor tcp-conn tcp-conn-v6 unshare -n sh -c "ip link set up dev lo; ./tcp-conn" unshare -n sh -c "ip link set up dev lo; ./tcp-conn-v6" - python run.py ./$(RUN) + python3 run.py ./$(RUN) .PHONY: test - diff --git a/soccr/test/run.py b/soccr/test/run.py index 1ffe58a58..57c556e36 100644 --- a/soccr/test/run.py +++ b/soccr/test/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys, os import hashlib diff --git a/soccr/test/tcp-test.py b/soccr/test/tcp-test.py index ff3fe29dc..b48f532eb 100755 --- a/soccr/test/tcp-test.py +++ b/soccr/test/tcp-test.py @@ -1,6 +1,5 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 -from __future__ import print_function import sys, socket import hashlib diff --git a/test/Makefile b/test/Makefile index 8416b1961..0bfdab680 100644 --- a/test/Makefile +++ b/test/Makefile @@ -12,7 +12,7 @@ all: $(MAKE) zdtm-freezer .PHONY: all -TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job +TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job criu-ns skip-file-rwx-check other: for t in $(TESTS); do \ @@ -45,10 +45,6 @@ zdtm-freezer: ./zdtm.py run --test zdtm/transition/thread-bomb --pre 3 --freezecg zdtm:f .PHONY: zdtm-freezer -fault-injection: - $(MAKE) -C fault-injection -.PHONY: fault-injection - override CFLAGS += -D_GNU_SOURCE clean_root: diff --git a/test/check_actions.py b/test/check_actions.py deleted file mode 100755 index 4973e3938..000000000 --- a/test/check_actions.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python - -import sys -import os - -actions = set(['pre-dump', 'pre-restore', 'post-dump', 'setup-namespaces', \ - 'post-setup-namespaces', 'post-restore', 'post-resume', \ - 'network-lock', 'network-unlock' ]) -errors = [] -af = os.path.dirname(os.path.abspath(__file__)) + '/actions_called.txt' - -for act in open(af): - act = act.strip().split() - act.append('EMPTY') - act.append('EMPTY') - - if act[0] == 'EMPTY': - raise Exception("Error in test, bogus actions line") - - if act[1] == 'EMPTY': - errors.append('Action %s misses CRTOOLS_IMAGE_DIR' % act[0]) - - if act[0] in ('post-dump', 'setup-namespaces', 'post-setup-namespaces', \ - 'post-restore', 'post-resume', 'network-lock', 'network-unlock'): - if act[2] == 'EMPTY': - errors.append('Action %s misses CRTOOLS_INIT_PID' % act[0]) - elif not act[2].isdigit() or int(act[2]) == 0: - errors.append('Action %s PID is not number (%s)' % - (act[0], act[2])) - - actions -= set([act[0]]) - -if actions: - errors.append('Not all actions called: %r' % actions) - -if errors: - for x in errors: - print(x) - sys.exit(1) - -print('PASS') diff --git a/test/crit-recode.py b/test/crit-recode.py index 4135681e1..f119271d8 100755 --- a/test/crit-recode.py +++ b/test/crit-recode.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import pycriu import sys import os diff --git a/test/cuda-checkpoint/.gitignore b/test/cuda-checkpoint/.gitignore new file mode 100644 index 000000000..717fb7028 --- /dev/null +++ b/test/cuda-checkpoint/.gitignore @@ -0,0 +1 @@ +cuda-checkpoint diff --git a/test/cuda-checkpoint/Makefile b/test/cuda-checkpoint/Makefile new file mode 100644 index 000000000..c59dadddc --- /dev/null +++ b/test/cuda-checkpoint/Makefile @@ -0,0 +1,17 @@ +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) + +BIN := cuda-checkpoint +SRC := cuda-checkpoint.c +DEP := $(SRC:%.c=%.d) +OBJ := $(SRC:%.c=%.o) +TARGETS := $(BIN) + +include ../zdtm/Makefile.inc + +all: $(TARGETS) +.PHONY: all + +clean-more: + $(RM) $(TARGETS) +.PHONY: clean-more +clean: clean-more diff --git a/test/cuda-checkpoint/cuda-checkpoint.c b/test/cuda-checkpoint/cuda-checkpoint.c new file mode 100644 index 000000000..3b7ce8b9f --- /dev/null +++ b/test/cuda-checkpoint/cuda-checkpoint.c @@ -0,0 +1,57 @@ +/* The mocked version of cuda-checkpoint. */ +#include +#include +#include + +int main(int argc, char *argv[]) +{ + int c; + + while (1) { + int option_index = 0; + static struct option long_options[] = { + { "pid", required_argument, 0, 'p' }, + { "get-state", no_argument, 0, 's' }, + { "get-restore-tid", no_argument, 0, 'g' }, + { "action", required_argument, 0, 'a' }, + { "timeout", required_argument, 0, 't' }, + { "help", no_argument, 0, 'h' }, + { 0, 0, 0, 0 } + }; + + c = getopt_long(argc, argv, "p:ga:ht:", + long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'p': + printf("%s\n", optarg); + break; + case 'g': + case 'a': + case 't': + break; + case 's': + printf("running\n"); + break; + case 'h': + printf("--action - execute an action"); + break; + + default: + fprintf(stderr, "getopt returned character code 0%o ??\n", c); + return 1; + } + } + + if (optind < argc) { + fprintf(stderr, "non-option ARGV-elements: "); + while (optind < argc) + fprintf(stderr, "%s ", argv[optind++]); + fprintf(stderr, "\n"); + return 1; + } + + return 0; +} diff --git a/test/exhaustive/pipe.py b/test/exhaustive/pipe.py index 7f1c53d34..afe20846a 100755 --- a/test/exhaustive/pipe.py +++ b/test/exhaustive/pipe.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import argparse import os diff --git a/test/exhaustive/unix.py b/test/exhaustive/unix.py index 5b4c972cb..689b1fb3a 100755 --- a/test/exhaustive/unix.py +++ b/test/exhaustive/unix.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import os @@ -462,7 +462,7 @@ fail_desc = { def chk_real_state(st): - # Before enything else -- check that we still have + # Before anything else -- check that we still have # all the sockets at hands for sk in st.sockets: if not sk.visible: diff --git a/test/inhfd/memfd.py.checkskip b/test/inhfd/memfd.py.checkskip index 252778969..32c57d929 100755 --- a/test/inhfd/memfd.py.checkskip +++ b/test/inhfd/memfd.py.checkskip @@ -1,7 +1,7 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import ctypes libc = ctypes.CDLL(None) -# libc may not have memfd_create (e.g., centos on travis) +# libc may not have memfd_create (e.g., centos) libc.memfd_create("test".encode('utf8'), 0) diff --git a/test/javaTests/pom.xml b/test/javaTests/pom.xml index faae44d1b..ddb6c89cf 100644 --- a/test/javaTests/pom.xml +++ b/test/javaTests/pom.xml @@ -38,7 +38,7 @@ org.testng testng - 6.3.1 + 7.7.0 diff --git a/test/jenkins/actions.sh b/test/jenkins/actions.sh deleted file mode 100755 index 801904500..000000000 --- a/test/jenkins/actions.sh +++ /dev/null @@ -1,8 +0,0 @@ -# Check how crit de/encodes images -set -e -source `dirname $0`/criu-lib.sh -# prep -rm -f actions_called.txt -./test/zdtm.py run -t zdtm/static/env00 --script "$(pwd)/test/show_action.sh" || fail -./test/check_actions.py || fail -exit 0 diff --git a/test/jenkins/criu-dedup.sh b/test/jenkins/criu-dedup.sh index 842d218bd..edb1b653d 100755 --- a/test/jenkins/criu-dedup.sh +++ b/test/jenkins/criu-dedup.sh @@ -4,7 +4,7 @@ set -e source `dirname $0`/criu-lib.sh prep -./test/zdtm.py run --all --keep-going --report report --parallel 4 -f h --pre 2 --dedup -x maps04 -x maps007 -x maps09 -x maps10 || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 -f h --pre 2 --dedup -x maps04 -x maps007 || fail # Additionally run these tests as they touch a lot of # memory and it makes sense to additionally check it diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index f41073230..6ee7ce33a 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -9,7 +9,7 @@ prep ./test/zdtm.py run -t zdtm/static/maps00 --fault 3 --report report -f h || fail # FIXME: fhandles looks broken on btrfs -cat /proc/self/mountinfo | grep -P "/.* / " | grep -q btrfs || NOBTRFS=$? +findmnt --noheadings --target . | grep -q btrfs || NOBTRFS=$? if [ $NOBTRFS -eq 1 ] ; then ./test/zdtm.py run -t zdtm/static/inotify_irmap --fault 128 --pre 2 -f uns || fail fi @@ -39,3 +39,13 @@ fi ./test/zdtm.py run -t zdtm/static/fpu03 --fault 134 -f h --norst || fail # also check for the main thread corruption ./test/zdtm.py run -t zdtm/static/fpu00 --fault 134 -f h --norst || fail + +# check set_compel_interrupt_only_mode +./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 +./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 --norst +# check set_compel_interrupt_only_mode when test cgroup is frozen +./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:f --fault 137 + +if ./test/zdtm.py run -t zdtm/static/vfork00 --fault 136 --report report -f h ; then + fail +fi diff --git a/test/jenkins/criu-lazy-migration.pipeline b/test/jenkins/criu-lazy-migration.pipeline index 2c863f170..45dc2c776 100644 --- a/test/jenkins/criu-lazy-migration.pipeline +++ b/test/jenkins/criu-lazy-migration.pipeline @@ -21,7 +21,6 @@ pipeline { stage('Test'){ steps { sh './test/jenkins/run_ct sh -c "mount --make-rprivate / && mount --rbind . /mnt && cd /mnt && ./test/jenkins/criu-lazy-migration.sh"' - junit 'test/report/criu-testreport*.xml' } } } diff --git a/test/jenkins/criu-lazy-migration.sh b/test/jenkins/criu-lazy-migration.sh index b23f31c79..02a212e0d 100755 --- a/test/jenkins/criu-lazy-migration.sh +++ b/test/jenkins/criu-lazy-migration.sh @@ -15,7 +15,7 @@ LAZY_MIGRATE_EXCLUDE="-x fifo_loop -x file_locks -x ptrace_sig -x overmount_file --lazy-migrate $LAZY_EXCLUDE $LAZY_MIGRATE_EXCLUDE || fail # During pre-dump + lazy-pages we leave VM_NOHUGEPAGE set -LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02 -x maps09 -x maps10" +LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02" # lazy restore from images with pre-dumps ./test/zdtm.py run --all --keep-going --report report --parallel 4 -f uns \ diff --git a/test/jenkins/criu-lazy-pages.sh b/test/jenkins/criu-lazy-pages.sh index f62912090..9ef721739 100755 --- a/test/jenkins/criu-lazy-pages.sh +++ b/test/jenkins/criu-lazy-pages.sh @@ -12,7 +12,7 @@ source `dirname $0`/criu-lazy-common.sh --lazy-pages $LAZY_EXCLUDE || fail # During pre-dump + lazy-pages we leave VM_NOHUGEPAGE set -LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02 -x maps09 -x maps10" +LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02" # lazy restore from images with pre-dumps ./test/zdtm.py run --all --keep-going --report report --parallel 4 \ diff --git a/test/jenkins/criu-pre-dump.sh b/test/jenkins/criu-pre-dump.sh index b2972d941..137f7c23f 100755 --- a/test/jenkins/criu-pre-dump.sh +++ b/test/jenkins/criu-pre-dump.sh @@ -5,6 +5,5 @@ set -e source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump -# FIXME: https://github.com/checkpoint-restore/criu/issues/1868 -./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 -x 'maps04' -x 'maps09' -x 'maps10' || fail -./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --page-server -x 'maps04' -x 'maps09' -x 'maps10' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 -x 'maps04' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --page-server -x 'maps04' || fail diff --git a/test/jenkins/criu-remote-lazy-pages.sh b/test/jenkins/criu-remote-lazy-pages.sh index 48787f3f6..1c677e333 100755 --- a/test/jenkins/criu-remote-lazy-pages.sh +++ b/test/jenkins/criu-remote-lazy-pages.sh @@ -12,7 +12,7 @@ source `dirname $0`/criu-lazy-common.sh --remote-lazy-pages $LAZY_EXCLUDE -x maps04 || fail # During pre-dump + lazy-pages we leave VM_NOHUGEPAGE set -LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02 -x maps09 -x maps10" +LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps02" # lazy restore from "remote" dump with pre-dumps ./test/zdtm.py run --all --keep-going --report report --parallel 4 \ diff --git a/test/jenkins/criu-snap.sh b/test/jenkins/criu-snap.sh index d8fdf02b3..b08c57f52 100755 --- a/test/jenkins/criu-snap.sh +++ b/test/jenkins/criu-snap.sh @@ -5,5 +5,5 @@ set -e source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump -./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps -x 'maps04' -x 'maps09' -x 'maps10' || fail -./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps --page-server -x 'maps04' -x 'maps09' -x 'maps10' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps -x 'maps04' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --pre 3 --snaps --page-server -x 'maps04' || fail diff --git a/test/libfault/Makefile b/test/libfault/Makefile new file mode 100644 index 000000000..cbe47fdf2 --- /dev/null +++ b/test/libfault/Makefile @@ -0,0 +1,21 @@ +CC = gcc +CFLAGS = -c -fPIC -ldl + +SRC = libfault.c +OBJ = $(SRC:.c=.o) + +LIB = libfault.so + +.PHONY: all clean run + +all: $(LIB) + +$(LIB): $(OBJ) + $(CC) -shared -o $(LIB) $(OBJ) + +$(OBJ): $(SRC) + $(CC) $(CFLAGS) $< + +clean: + rm -f $(OBJ) $(LIB) + diff --git a/test/libfault/libfault.c b/test/libfault/libfault.c new file mode 100644 index 000000000..650bf08ca --- /dev/null +++ b/test/libfault/libfault.c @@ -0,0 +1,31 @@ +#define _GNU_SOURCE +#include +#include +#include + +ssize_t (*original_pread)(int fd, void *buf, size_t count, off_t offset) = NULL; + +/** + * This function is a wrapper around pread() that is used for testing CRIU's + * handling of cases where pread() returns less data than requested. + * + * pmc_fill() in criu/pagemap.c is a good example of where this can happen. + */ +ssize_t pread64(int fd, void *buf, size_t count, off_t offset) +{ + if (!original_pread) { + original_pread = dlsym(RTLD_NEXT, "pread"); + if (!original_pread) { + errno = EIO; + return -1; + } + } + + /* The following aims to simulate the case when pread() returns less + * data than requested. We need to ensure that CRIU handles such cases. */ + if (count > 2048) { + count -= 1024; + } + + return original_pread(fd, buf, count, offset); +} diff --git a/test/others/action-script/.gitignore b/test/others/action-script/.gitignore new file mode 100644 index 000000000..ca9a0b541 --- /dev/null +++ b/test/others/action-script/.gitignore @@ -0,0 +1 @@ +actions_called.txt diff --git a/test/others/action-script/Makefile b/test/others/action-script/Makefile new file mode 100644 index 000000000..594edc070 --- /dev/null +++ b/test/others/action-script/Makefile @@ -0,0 +1,3 @@ +run: + ./run.sh +.PHONY: run diff --git a/test/others/action-script/check_actions.py b/test/others/action-script/check_actions.py new file mode 100755 index 000000000..0140d8762 --- /dev/null +++ b/test/others/action-script/check_actions.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 + +import os +import sys + +EXPECTED_ACTIONS = [ + 'pre-dump', + 'network-lock', + 'post-dump', + 'pre-restore', + 'setup-namespaces', + 'post-setup-namespaces', + 'post-restore', + 'network-unlock', + 'pre-resume', + 'post-resume', +] + +errors = [] +actions_called = [] +actions_called_file = os.path.join(os.path.dirname(__file__), 'actions_called.txt') + +with open(actions_called_file) as f: + for index, line in enumerate(f): + parts = line.strip().split() + parts += ['EMPTY'] * (3 - len(parts)) + action_hook, image_dir, pid = parts + + if action_hook == 'EMPTY': + raise ValueError("Error in test: bogus actions line") + + expected_action = EXPECTED_ACTIONS[index] if index < len(EXPECTED_ACTIONS) else None + if action_hook != expected_action: + raise ValueError(f"Invalid action: {action_hook} != {expected_action}") + + if image_dir == 'EMPTY': + errors.append(f'Action {action_hook} misses CRTOOLS_IMAGE_DIR') + + if action_hook != 'pre-restore': + if pid == 'EMPTY': + errors.append(f'Action {action_hook} misses CRTOOLS_INIT_PID') + elif not pid.isdigit() or int(pid) == 0: + errors.append(f'Action {action_hook} PID is not a valid number ({pid})') + + actions_called.append(action_hook) + +if actions_called != EXPECTED_ACTIONS: + errors.append(f'Not all actions called: {actions_called!r}') + +if errors: + print('\n'.join(errors)) + sys.exit(1) + +print('Check Actions PASS') diff --git a/test/others/action-script/run.sh b/test/others/action-script/run.sh new file mode 100755 index 000000000..574f6fc86 --- /dev/null +++ b/test/others/action-script/run.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -e + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + +rm -f "${SCRIPT_DIR}"/actions_called.txt +"${SCRIPT_DIR}"/../../zdtm.py run -t zdtm/static/env00 -f ns --script "$SCRIPT_DIR/show_action.sh" || exit 1 +"${SCRIPT_DIR}"/check_actions.py || exit 1 + +exit 0 diff --git a/test/show_action.sh b/test/others/action-script/show_action.sh similarity index 66% rename from test/show_action.sh rename to test/others/action-script/show_action.sh index 86468b67a..afbfc3f27 100755 --- a/test/show_action.sh +++ b/test/others/action-script/show_action.sh @@ -1,3 +1,4 @@ #!/bin/bash + echo "${CRTOOLS_SCRIPT_ACTION} ${CRTOOLS_IMAGE_DIR} ${CRTOOLS_INIT_PID}" \ - >> "$(dirname $0)/actions_called.txt" + >> "$(dirname "$0")/actions_called.txt" diff --git a/test/others/app-emu/java/HelloWorld/run.sh b/test/others/app-emu/java/HelloWorld/run.sh index 0ed6afd14..e6dcbd9fc 100644 --- a/test/others/app-emu/java/HelloWorld/run.sh +++ b/test/others/app-emu/java/HelloWorld/run.sh @@ -18,7 +18,7 @@ setsid java HelloWorld & pid=${!} -echo Lanuched java application with pid $pid in background +echo Launched java application with pid $pid in background ${criu} dump -D dump -o dump.log -v4 --shell-job -t ${pid} || { echo "Dump failed" diff --git a/test/others/app-emu/make/run.sh b/test/others/app-emu/make/run.sh index 7cb44c770..d871b7d9c 100644 --- a/test/others/app-emu/make/run.sh +++ b/test/others/app-emu/make/run.sh @@ -28,7 +28,7 @@ setsid make -j4 & pid=${!} -echo Lanuched make in $pid background +echo Launched make in $pid background sleep 2 ${criu} dump --shell-job -D dump -o dump.log -v4 -t ${pid} || { diff --git a/test/others/bers/bers.c b/test/others/bers/bers.c index 37cf84dd3..b291e3bcb 100644 --- a/test/others/bers/bers.c +++ b/test/others/bers/bers.c @@ -391,7 +391,7 @@ usage: pr_msg(" -f|--files create files for each task\n"); pr_msg(" -m|--memory allocate megabytes for each task\n"); pr_msg(" --memory-chunks split memory to equal parts\n"); - pr_msg(" --mem-fill fill memory with data dependin on :\n"); + pr_msg(" --mem-fill fill memory with data depending on :\n"); pr_msg(" all fill every byte of memory\n"); pr_msg(" light fill first bytes of every page\n"); pr_msg(" dirtify fill every page\n"); diff --git a/test/others/config-file/run.sh b/test/others/config-file/run.sh index 92195883e..26b835b45 100755 --- a/test/others/config-file/run.sh +++ b/test/others/config-file/run.sh @@ -11,7 +11,7 @@ set -xbm -#shellcheck disable=SC1091 +# shellcheck source=test/others/env.sh source ../env.sh if [ ! -d /etc/criu ]; then diff --git a/test/others/crit/test.sh b/test/others/crit/test.sh index 0d38043d7..2698bbd3c 100755 --- a/test/others/crit/test.sh +++ b/test/others/crit/test.sh @@ -1,11 +1,12 @@ #!/bin/bash -# shellcheck disable=SC1091,SC2002 +# shellcheck disable=SC2002 set -x +# shellcheck source=test/others/env.sh source ../env.sh -images_list="" +images_list=() function gen_imgs { PID=$(../loop) @@ -16,15 +17,15 @@ function gen_imgs { exit 1 fi - images_list=$(ls -1 ./*.img) - if [ -z "$images_list" ]; then + images_list=(./*.img) + if [ "${#images_list[@]}" -eq 0 ]; then echo "Failed to generate images" exit 1 fi } function run_test1 { - for x in $images_list + for x in "${images_list[@]}" do echo "=== $x" if [[ $x == *pages* ]]; then @@ -45,15 +46,16 @@ function run_test1 { function run_test2 { - mapfile -t array <<< "$images_list" - - PROTO_IN=${array[0]} + PROTO_IN="${images_list[0]}" JSON_IN=$(mktemp -p ./ tmp.XXXXXXXXXX.json) OUT=$(mktemp -p ./ tmp.XXXXXXXXXX.log) # prepare ${CRIT} decode -i "${PROTO_IN}" -o "${JSON_IN}" + # show info about image + ${CRIT} info "${PROTO_IN}" + # proto in - json out decode cat "${PROTO_IN}" | ${CRIT} decode || exit 1 cat "${PROTO_IN}" | ${CRIT} decode -o "${OUT}" || exit 1 @@ -99,6 +101,8 @@ function run_test2 { ${CRIT} x ./ rss || exit 1 } +${CRIT} --version + gen_imgs run_test1 run_test2 diff --git a/test/others/criu-coredump/test.sh b/test/others/criu-coredump/test.sh index dd774e298..2be82e64c 100755 --- a/test/others/criu-coredump/test.sh +++ b/test/others/criu-coredump/test.sh @@ -1,11 +1,11 @@ #!/bin/bash set -x -# shellcheck disable=SC1091 +# shellcheck source=test/others/env.sh source ../env.sh || exit 1 function gen_imgs { - PID=$(../loop) + PID=$(../loop with a very very very very very very very very very very very very long cmdline) if ! $CRIU dump -v4 -o dump.log -D ./ -t "$PID"; then echo "Failed to checkpoint process $PID" cat dump.log @@ -43,5 +43,12 @@ function run_test { echo "= done" } +UNAME_M=$(uname -m) + +if [[ "$UNAME_M" != "aarch64" && "$UNAME_M" != "armv7l" &&"$UNAME_M" != "x86_64" ]]; then + echo "criu-coredump only supports aarch64 armv7l, and x86_64. skipping." + exit 0 +fi + gen_imgs run_test diff --git a/test/others/criu-ns/Makefile b/test/others/criu-ns/Makefile new file mode 100644 index 000000000..4d901a111 --- /dev/null +++ b/test/others/criu-ns/Makefile @@ -0,0 +1,3 @@ +run: + @make -C ../.. zdtm_ct + ../../zdtm_ct run.py diff --git a/test/others/criu-ns/run.py b/test/others/criu-ns/run.py new file mode 100755 index 000000000..0a36438e8 --- /dev/null +++ b/test/others/criu-ns/run.py @@ -0,0 +1,245 @@ +#!/usr/bin/env python3 + +import fcntl +import os +import pathlib +import pty +import shutil +import subprocess +import sys +import termios +import time + + +CRIU_BIN = "../../../criu/criu" +CRIU_NS = "../../../scripts/criu-ns" +IMG_DIR = "dumpdir" +DUMP_LOG = "dump.log" +RESTORE_LOG = "restore.log" +PIDFILE = "pidfile" + + +def check_dumpdir(path=IMG_DIR): + if os.path.isdir(path): + shutil.rmtree(path) + os.mkdir(path, 0o755) + + +def run_task_with_own_pty(task): + fd_m, fd_s = pty.openpty() + + pid = os.fork() + if pid == 0: + os.close(fd_m) + os.setsid() + os.dup2(fd_s, 0) + os.dup2(fd_s, 1) + os.dup2(fd_s, 2) + fcntl.ioctl(fd_s, termios.TIOCSCTTY, 1) + os.close(fd_s) + task() + exit(0) + + os.close(fd_s) + fd_m = os.fdopen(fd_m, "rb") + os.set_blocking(fd_m.fileno(), False) + + while True: + try: + data = fd_m.read() + except IOError: + break + if data is not None: + print(data.decode("utf-8")) + + _, status = os.waitpid(pid, 0) + + try: + data = fd_m.read() + except IOError as err: + print(err) + + if data is not None: + print(data.decode("utf-8")) + fd_m.close() + + if status != 0: + print("task %s exited badly: %d" % (task.__name__, status)) + exit(1) + + return 0 + + +def create_pty(): + fd_m, fd_s = pty.openpty() + return (os.fdopen(fd_m, "wb"), os.fdopen(fd_s, "wb")) + + +def create_isolated_dumpee(): + pathlib.Path("running").touch() + fd_m, fd_s = create_pty() + pid = os.fork() + if pid == 0: + os.setsid() + os.dup2(fd_s.fileno(), 0) + os.dup2(fd_s.fileno(), 1) + os.dup2(fd_s.fileno(), 2) + fcntl.ioctl(fd_s.fileno(), termios.TIOCSCTTY, 1) + while True: + if not os.access("running", os.F_OK): + sys.exit(0) + time.sleep(1) + fd_m.close() + fd_s.close() + return pid + + +def criu_ns_dump(pid, shell_job=False): + cmd = [CRIU_NS, "dump", "-D", IMG_DIR, "-v4", "-t", str(pid), + "--log-file", DUMP_LOG, "--criu-binary", CRIU_BIN] + if shell_job: + cmd.append("--shell-job") + ret = subprocess.Popen(cmd).wait() + return ret + + +def criu_ns_restore(shell_job=False, restore_detached=False): + cmd = [CRIU_NS, "restore", "-D", IMG_DIR, "-v4", "--log-file", + RESTORE_LOG, "--criu-binary", CRIU_BIN] + if shell_job: + cmd.append("--shell-job") + if restore_detached: + cmd += ["--restore-detached", "--pidfile", PIDFILE] + ret = subprocess.Popen(cmd).wait() + return ret + + +def read_log_file(filename): + logfile_path = os.path.join(IMG_DIR, filename) + with open(logfile_path) as logfile: + print(logfile.read()) + + +def test_dump_and_restore_with_shell_job(): + print("Test criu-ns dump and restore with --shell-job option") + check_dumpdir() + pathlib.Path("running").touch() + pid = os.fork() + if pid == 0: + while True: + if not os.access("running", os.F_OK): + sys.exit(0) + time.sleep(1) + + ret = criu_ns_dump(pid, shell_job=True) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + os.unlink("running") + fd_m, fd_s = create_pty() + pid = os.fork() + if pid == 0: + os.setsid() + fd_m.close() + # since criu-ns takes control of the tty stdin + os.dup2(fd_s.fileno(), 0) + ret = criu_ns_restore(shell_job=True) + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + os._exit(0) + + fd_s.close() + os.waitpid(pid, 0) + + +def test_dump_and_restore_without_shell_job(restore_detached=False): + print("Test criu-ns dump and restore with an isolated process" + "(%d)" % restore_detached) + check_dumpdir() + pid = create_isolated_dumpee() + ret = criu_ns_dump(pid) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + if not restore_detached: + os.unlink("running") + + pid = os.fork() + if pid == 0: + os.setsid() + ret = criu_ns_restore(restore_detached=restore_detached) + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + os._exit(0) + + os.waitpid(pid, 0) + + +def test_dump_and_restore_in_pidns(): + if os.system("grep NSpid /proc/self/status"): + return + + print("Test criu-ns dump and restore in namespaces") + + def _dump(): + pid = create_isolated_dumpee() + ret = criu_ns_dump(pid) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + def _restore(): + ret = criu_ns_restore(restore_detached=True) + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + + def _get_restored_pid(): + restored_pid = 0 + pidfile_path = os.path.join(IMG_DIR, PIDFILE) + if not os.path.exists(pidfile_path): + raise FileNotFoundError("pidfile not found") + with open(pidfile_path, "r") as pidfile: + restored_pid = pidfile.read().strip() + return int(restored_pid) + + def _redump(): + global IMG_DIR + try: + restored_pid = _get_restored_pid() + except FileNotFoundError: + sys.exit(1) + IMG_DIR = "dumpdir2" + check_dumpdir(IMG_DIR) + ret = criu_ns_dump(restored_pid) + if ret != 0: + read_log_file(DUMP_LOG) + sys.exit(ret) + + def _re_restore(): + os.unlink("running") + ret = criu_ns_restore() + if ret != 0: + read_log_file(RESTORE_LOG) + sys.exit(ret) + + check_dumpdir() + _dump() + _restore() + _redump() + _re_restore() + + +def main(): + test_dump_and_restore_with_shell_job() + test_dump_and_restore_without_shell_job() + test_dump_and_restore_without_shell_job(restore_detached=True) + test_dump_and_restore_in_pidns() + + +if __name__ == "__main__": + run_task_with_own_pty(main) diff --git a/test/others/env.sh b/test/others/env.sh index 45066f760..6fa2c9691 100755 --- a/test/others/env.sh +++ b/test/others/env.sh @@ -1,17 +1,13 @@ #!/bin/sh -CRIU=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../criu/criu) +BASE_DIR="$(readlink -f "$(dirname "${BASH_SOURCE[0]}")/../../")" + +CRIU="${BASE_DIR}/criu/criu" criu=$CRIU -if [ $(which python3) ]; then - PYTHON=python3 -elif [ $(which python2) ]; then - PYTHON=python2 -else - echo "FAIL: Neither python3 nor python2" - exit 1 -fi -#export PYTHON -CRIT=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../crit/crit-"${PYTHON}") + +export PYTHONPATH="${BASE_DIR}/lib:${BASE_DIR}/crit:${PYTHONPATH-}" +CRIT="python3 -m crit" crit=$CRIT -CRIU_COREDUMP=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../coredump/coredump-"${PYTHON}") + +CRIU_COREDUMP="${BASE_DIR}/coredump/coredump" criu_coredump=$CRIU_COREDUMP diff --git a/test/others/ext-tty/run.py b/test/others/ext-tty/run.py index 8109033cb..2c268a2c8 100755 --- a/test/others/ext-tty/run.py +++ b/test/others/ext-tty/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import subprocess import os, sys, time, signal, pty diff --git a/test/others/libcriu/.gitignore b/test/others/libcriu/.gitignore index 0f6e52bb4..30a56999c 100644 --- a/test/others/libcriu/.gitignore +++ b/test/others/libcriu/.gitignore @@ -8,3 +8,4 @@ test_pre_dump test_feature_check output/ libcriu.so.* +test_rpc_config diff --git a/test/others/libcriu/Makefile b/test/others/libcriu/Makefile index ae7330533..927f17c23 100644 --- a/test/others/libcriu/Makefile +++ b/test/others/libcriu/Makefile @@ -3,10 +3,12 @@ include ../../../../criu/Makefile.versions TESTS += test_sub TESTS += test_self TESTS += test_notify +TESTS += test_rpc_config TESTS += test_iters TESTS += test_errno TESTS += test_join_ns TESTS += test_pre_dump +TESTS += test_check TESTS += test_feature_check all: $(TESTS) diff --git a/test/others/libcriu/run.sh b/test/others/libcriu/run.sh index 77bdfb87e..6b36d4496 100755 --- a/test/others/libcriu/run.sh +++ b/test/others/libcriu/run.sh @@ -9,7 +9,7 @@ TEST_LOG="${TEST_DIR}/test.log" DUMP_LOG="${TEST_DIR}/dump.log" RESTORE_LOG="${TEST_DIR}/restore.log" -# shellcheck disable=1091 +# shellcheck source=test/others/env.sh source "${MAIN_DIR}/../env.sh" || exit 1 echo "== Clean" @@ -55,6 +55,7 @@ run_test() { run_test test_sub run_test test_self run_test test_notify +run_test test_rpc_config if [ "$(uname -m)" = "x86_64" ]; then # Skip this on aarch64 as aarch64 has no dirty page tracking run_test test_iters @@ -62,6 +63,7 @@ if [ "$(uname -m)" = "x86_64" ]; then fi run_test test_errno run_test test_join_ns +run_test test_check if criu check --feature mem_dirty_track > /dev/null; then export CRIU_FEATURE_MEM_TRACK=1 fi diff --git a/test/others/libcriu/test_check.c b/test/others/libcriu/test_check.c new file mode 100644 index 000000000..4af3b3630 --- /dev/null +++ b/test/others/libcriu/test_check.c @@ -0,0 +1,17 @@ +#include +#include "criu.h" +#include "lib.h" + +int main(int argc, char **argv) +{ + int ret; + + printf("--- Start check ---\n"); + criu_init_opts(); + criu_set_service_binary(argv[1]); + + if (criu_check()) + return -1; + + return 0; +} diff --git a/test/others/libcriu/test_rpc_config.c b/test/others/libcriu/test_rpc_config.c new file mode 100644 index 000000000..529f13637 --- /dev/null +++ b/test/others/libcriu/test_rpc_config.c @@ -0,0 +1,223 @@ +#include "criu.h" +#include "lib.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RANDOM_NAME_LEN 6 +#define PATH_BUF_SIZE 128 + +static volatile sig_atomic_t stop = 0; +static char base_name[RANDOM_NAME_LEN + 1]; +static char log_file[PATH_BUF_SIZE]; +static char conf_file[PATH_BUF_SIZE]; + +static void handle_signal(int sig) +{ + (void)sig; + stop = 1; +} + +static void generate_random_base_name(void) +{ + const char charset[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + size_t charset_len; + int i; + + charset_len = sizeof(charset) - 1; + + for (i = 0; i < RANDOM_NAME_LEN; i++) { + base_name[i] = charset[rand() % charset_len]; + } + base_name[i] = '\0'; + + snprintf(log_file, sizeof(log_file), "/tmp/criu-%s.log", base_name); + snprintf(conf_file, sizeof(conf_file), "/tmp/criu-%s.conf", base_name); +} + +static int create_criu_config_file(void) +{ + int fd; + FILE *fp; + + srand(time(NULL)); + generate_random_base_name(); + + fd = open(conf_file, O_CREAT | O_EXCL | O_WRONLY, 0600); + if (fd < 0) { + perror("Failed to create config file"); + return -1; + } + + fp = fdopen(fd, "w"); + if (!fp) { + perror("fdopen failed"); + close(fd); + unlink(conf_file); + return -1; + } + + fprintf(fp, "log-file=%s\n", log_file); + fflush(fp); + fclose(fp); + + return 0; +} + +static int check_log_file(void) +{ + struct stat st; + + if (stat(log_file, &st) < 0) { + perror("Config file does not exist"); + return -1; + } + + if (st.st_size == 0) { + fprintf(stderr, "Config file is empty\n"); + return -1; + } + + unlink(log_file); + return 0; +} + +int main(int argc, char **argv) +{ + int pipe_fd[2]; + pid_t pid; + int ret; + int child_ret; + + int img_fd = open(argv[2], O_DIRECTORY); + if (img_fd < 0) { + perror("Failed to open images directory"); + goto cleanup; + } + + if (create_criu_config_file() < 0) { + printf("Failed to create config file\n"); + return EXIT_FAILURE; + } + + if (pipe(pipe_fd) < 0) { + perror("pipe"); + return EXIT_FAILURE; + } + + pid = fork(); + if (pid < 0) { + perror("fork failed"); + return EXIT_FAILURE; + } + + if (pid == 0) { + /** child process **/ + printf(" `- loop: initializing\n"); + + if (setsid() < 0 || signal(SIGUSR1, handle_signal) == SIG_ERR) { + _exit(EXIT_FAILURE); + } + + close(STDIN_FILENO); + close(STDOUT_FILENO); + close(STDERR_FILENO); + close(pipe_fd[0]); + + child_ret = SUCC_ECODE; + write(pipe_fd[1], &child_ret, sizeof(child_ret)); + close(pipe_fd[1]); + + while (!stop) { + sleep(1); + } + + _exit(SUCC_ECODE); + } + + /** parent process **/ + close(pipe_fd[1]); + + ret = -1; + if (read(pipe_fd[0], &ret, sizeof(ret)) != sizeof(ret) || ret != SUCC_ECODE) { + printf("Error starting loop\n"); + goto cleanup; + } + + read(pipe_fd[0], &ret, 1); + close(pipe_fd[0]); + + printf("--- Loop process started (pid: %d) ---\n", pid); + + printf("--- Checkpoint ---\n"); + criu_init_opts(); + criu_set_service_binary(argv[1]); + criu_set_images_dir_fd(img_fd); + criu_set_pid(pid); + criu_set_log_level(CRIU_LOG_DEBUG); + + /* The RPC config file should overwrite the log-file set below */ + printf("Setting dump RPC config file: %s\n", conf_file); + criu_set_config_file(conf_file); + criu_set_log_file("dump.log"); + + ret = criu_dump(); + if (ret < 0) { + what_err_ret_mean(ret); + kill(pid, SIGKILL); + printf("criu dump failed\n"); + goto cleanup; + } + + printf(" `- Dump succeeded\n"); + waitpid(pid, NULL, 0); + + if (check_log_file()) { + printf("Error: log file not overwritten by RPC config file\n"); + goto cleanup; + } + + printf("--- Restore loop ---\n"); + criu_init_opts(); + criu_set_images_dir_fd(img_fd); + criu_set_log_level(CRIU_LOG_DEBUG); + + /* The RPC config file should overwrite the log-file set below */ + printf("Setting restore RPC config file: %s\n", conf_file); + criu_set_config_file(conf_file); + criu_set_log_file("restore.log"); + + pid = criu_restore_child(); + if (pid <= 0) { + what_err_ret_mean(pid); + ret = EXIT_FAILURE; + goto cleanup; + } + + printf(" `- Restore returned pid %d\n", pid); + kill(pid, SIGUSR1); + + if (check_log_file()) { + printf("Error: log file not overwritten by RPC config file\n"); + goto cleanup; + } + +cleanup: + if (waitpid(pid, &ret, 0) < 0) { + perror("waitpid failed"); + return EXIT_FAILURE; + } + + printf("Remove RPC config file: %s\n", conf_file); + unlink(conf_file); + return chk_exit(ret, SUCC_ECODE); +} diff --git a/test/others/mem-snap/run-predump-2.sh b/test/others/mem-snap/run-predump-2.sh index 46af8063b..5ef1422b4 100755 --- a/test/others/mem-snap/run-predump-2.sh +++ b/test/others/mem-snap/run-predump-2.sh @@ -28,7 +28,7 @@ function stop_test { wtime=1 cd ../../zdtm/static/ make maps04.stop - cat maps04.out | fgrep PASS || fail "Test failed" + fgrep PASS maps04.out || fail "Test failed" echo "OK" } diff --git a/test/others/mem-snap/run-predump.sh b/test/others/mem-snap/run-predump.sh index d06d2d8fc..06ba74737 100755 --- a/test/others/mem-snap/run-predump.sh +++ b/test/others/mem-snap/run-predump.sh @@ -72,6 +72,6 @@ ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to r cd ../../zdtm/static/ make mem-touch.stop -cat mem-touch.out | fgrep PASS || fail "Test failed" +fgrep PASS mem-touch.out || fail "Test failed" echo "Test PASSED" diff --git a/test/others/mem-snap/run-snap-auto-dedup.sh b/test/others/mem-snap/run-snap-auto-dedup.sh index f77aa1fcb..a3801f5b4 100755 --- a/test/others/mem-snap/run-snap-auto-dedup.sh +++ b/test/others/mem-snap/run-snap-auto-dedup.sh @@ -84,7 +84,7 @@ ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to r cd ../../zdtm/static/ make mem-touch.stop -cat mem-touch.out | fgrep PASS || fail "Test failed" +fgrep PASS mem-touch.out || fail "Test failed" if [[ $dedup_ok_2 -ne 0 || $dedup_ok_1 -ne 0 ]]; then fail "Dedup test failed" diff --git a/test/others/mem-snap/run-snap-dedup-on-restore.sh b/test/others/mem-snap/run-snap-dedup-on-restore.sh index 6ae050bc7..5dbb5bf44 100755 --- a/test/others/mem-snap/run-snap-dedup-on-restore.sh +++ b/test/others/mem-snap/run-snap-dedup-on-restore.sh @@ -78,7 +78,7 @@ fi cd ../../zdtm/static/ make mem-touch.stop -cat mem-touch.out | fgrep PASS || fail "Test failed" +fgrep PASS mem-touch.out || fail "Test failed" if [ $restore_dedup_ok -ne 0 ]; then fail "Dedup test failed" diff --git a/test/others/mem-snap/run-snap-dedup.sh b/test/others/mem-snap/run-snap-dedup.sh index 27fcd55a9..40db95325 100755 --- a/test/others/mem-snap/run-snap-dedup.sh +++ b/test/others/mem-snap/run-snap-dedup.sh @@ -90,7 +90,7 @@ ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to r cd ../../zdtm/static/ make mem-touch.stop -cat mem-touch.out | fgrep PASS || fail "Test failed" +fgrep PASS mem-touch.out || fail "Test failed" if [[ $dedup_ok_2 -ne 0 || $dedup_ok_1 -ne 0 ]]; then fail "Dedup test failed" diff --git a/test/others/mem-snap/run-snap-maps04.sh b/test/others/mem-snap/run-snap-maps04.sh index 2def909d9..267d51deb 100755 --- a/test/others/mem-snap/run-snap-maps04.sh +++ b/test/others/mem-snap/run-snap-maps04.sh @@ -58,7 +58,7 @@ ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log --auto-dedup -d -v4 || fa make -C ../../zdtm/static/ maps04.stop sleep 1 -cat "../zdtm/static/maps04.out" | fgrep PASS || fail "Test failed" +fgrep PASS "../zdtm/static/maps04.out" || fail "Test failed" size=$(du -sh -BK dump/1/pages-*.img | grep -Eo '[0-9]+' | head -1) if [ $size -ne 0 ] ; then diff --git a/test/others/mem-snap/run-snap.sh b/test/others/mem-snap/run-snap.sh index b97bd295e..c91cd0098 100755 --- a/test/others/mem-snap/run-snap.sh +++ b/test/others/mem-snap/run-snap.sh @@ -69,6 +69,6 @@ ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to r cd ../../zdtm/static/ make mem-touch.stop -cat mem-touch.out | fgrep PASS || fail "Test failed" +fgrep PASS mem-touch.out || fail "Test failed" echo "Test PASSED" diff --git a/test/others/mnt-ext-dev/run.sh b/test/others/mnt-ext-dev/run.sh index 5a1f44450..5cdbc45a8 100755 --- a/test/others/mnt-ext-dev/run.sh +++ b/test/others/mnt-ext-dev/run.sh @@ -2,16 +2,14 @@ set -e -x # construct root -python ../../zdtm.py run -t zdtm/static/env00 --iter 0 -f ns +python3 ../../zdtm.py run -t zdtm/static/env00 --iter 0 -f ns truncate -s 0 zdtm.loop truncate -s 50M zdtm.loop mkfs.ext4 -F zdtm.loop dev=`losetup --find --show zdtm.loop` -mkdir -p ../../dev -cp -ap $dev ../../dev export ZDTM_MNT_EXT_DEV=$dev -python ../../zdtm.py run $EXTRA_OPTS -t zdtm/static/mnt_ext_dev || ret=$? +python3 ../../zdtm.py run $EXTRA_OPTS -t zdtm/static/mnt_ext_dev || ret=$? losetup -d $dev unlink zdtm.loop exit $ret diff --git a/test/others/mounts/mounts.sh b/test/others/mounts/mounts.sh index 19116d0cf..bed156a50 100755 --- a/test/others/mounts/mounts.sh +++ b/test/others/mounts/mounts.sh @@ -12,7 +12,7 @@ cd $INMNTNS mount --make-rprivate / -for i in `cat /proc/self/mounts | awk '{ print $2 }'`; do +for i in `awk '{ print $2 }' < /proc/self/mounts`; do [ '/' = "$i" ] && continue [ '/proc' = "$i" ] && continue [ '/dev' = "$i" ] && continue @@ -20,7 +20,7 @@ for i in `cat /proc/self/mounts | awk '{ print $2 }'`; do umount -l $i done -python mounts.py +python3 mounts.py kill $INMNTNS_PID while :; do sleep 10 diff --git a/test/others/mounts/run.sh b/test/others/mounts/run.sh index 35927fb5e..d665a726a 100755 --- a/test/others/mounts/run.sh +++ b/test/others/mounts/run.sh @@ -12,12 +12,12 @@ kill -0 $pid || exit cat /proc/$pid/mountinfo | sort -k 4 echo "Suspend server" ${CRIU} dump -D dump -o dump.log -t $pid -v4 || { - cat dump/dump.log | grep Error + grep Error dump/dump.log exit 1 } echo "Resume server" ${CRIU} restore -d -D dump -o restore.log -v4 || { - cat dump/dump.log | grep Error + grep Error dump/dump.log exit 1 } cat /proc/$pid/mountinfo | sort -k 4 diff --git a/test/others/ns_ext/run.sh b/test/others/ns_ext/run.sh index 2e9a6fe86..5d1e139d7 100755 --- a/test/others/ns_ext/run.sh +++ b/test/others/ns_ext/run.sh @@ -2,10 +2,13 @@ set -x +if ! ../../zdtm/static/macvlan.checkskip; then + echo "No macvlan support. Skipping" + exit 0 +fi + if [[ "$1" == "pid" ]]; then NS=pid - # CentOS 7 kernels do not have NSpid -> skip this test - grep NSpid /proc/self/status || exit 0 else NS=net fi @@ -61,7 +64,7 @@ exec 33< $MNT1 exec 34< $MNT2 $CRIU dump -v4 -t $pid -o dump.log -D images --external $NS[$ino]:test_ns --external $NS[$ino2]:test_ns2 RESULT=$? -cat images/dump.log | grep -B 5 Error || echo ok +grep -B 5 Error images/dump.log || echo ok [ "$RESULT" != "0" ] && { echo "CRIU dump failed" echo FAIL @@ -70,7 +73,7 @@ cat images/dump.log | grep -B 5 Error || echo ok $CRIU restore -v4 -o restore.log -D images --inherit-fd fd[33]:test_ns --inherit-fd fd[34]:test_ns2 -d RESULT=$? -cat images/restore.log | grep -B 5 Error || echo ok +grep -B 5 Error images/restore.log || echo ok [ "$RESULT" != "0" ] && { echo "CRIU restore failed" echo FAIL diff --git a/test/others/ns_ext/run_pidns.sh b/test/others/ns_ext/run_pidns.sh index 7ac855a18..db12106e0 100755 --- a/test/others/ns_ext/run_pidns.sh +++ b/test/others/ns_ext/run_pidns.sh @@ -2,9 +2,6 @@ set -e -# CentOS 7 kernels do not have NSpid -> skip this test -grep NSpid /proc/self/status || exit 0 - # This test creates a process in non-host pidns and then dumps it and restores # it into host pidns. We use pid >100000 in non-host pidns to make sure it does # not intersect with some host pid on restore but it is potentially racy so @@ -36,7 +33,7 @@ mkdir -p images_pidns echo "$CRIU dump -v4 -o dump.log -t $PID -D images_pidns --external $PIDNS:exti" $CRIU dump -v4 -o dump.log -t $PID -D images_pidns --external $PIDNS:exti RESULT=$? -cat images_pidns/dump.log | grep -B 5 Error || echo ok +grep -B 5 Error images_pidns/dump.log || echo ok [ "$RESULT" != "0" ] && { echo "CRIU dump failed" echo FAIL @@ -48,7 +45,7 @@ exec {pidns_fd}< /proc/self/ns/pid echo "$CRIU restore -v4 -o restore.log -D images_pidns --restore-detached --inherit-fd fd[$pidns_fd]:exti" $CRIU restore -v4 -o restore.log -D images_pidns --restore-detached --inherit-fd fd[$pidns_fd]:exti --pidfile test.pidfile RESULT=$? -cat images_pidns/restore.log | grep -B 5 Error || echo ok +grep -B 5 Error images_pidns/restore.log || echo ok [ "$RESULT" != "0" ] && { echo "CRIU restore failed" echo FAIL diff --git a/test/others/pycriu/.gitignore b/test/others/pycriu/.gitignore new file mode 100644 index 000000000..567609b12 --- /dev/null +++ b/test/others/pycriu/.gitignore @@ -0,0 +1 @@ +build/ diff --git a/test/others/pycriu/Makefile b/test/others/pycriu/Makefile new file mode 100644 index 000000000..b6e3b4814 --- /dev/null +++ b/test/others/pycriu/Makefile @@ -0,0 +1,63 @@ +.SHELLFLAGS := -eu -o pipefail -c +.ONESHELL: + +CRIU ?= ../../../criu/criu +BUILD_DIR ?= build +SOCKET_NAME ?= criu_service.socket +PIDFILE_NAME ?= pidfile +SERVICE_LOG ?= service.log +PYTHON ?= python3 + +PIDFILE := $(BUILD_DIR)/$(PIDFILE_NAME) +CRIU_SOCKET := $(BUILD_DIR)/$(SOCKET_NAME) +STATUS_FIFO := $(BUILD_DIR)/startup.status +STATUS_FD := 200 + +run: start + cleanup() { $(MAKE) --no-print-directory stop || true; } + trap cleanup EXIT INT TERM + "$(PYTHON)" test_check.py + "$(PYTHON)" test_check_fail.py + "$(PYTHON)" test_check_images_dir.py + "$(PYTHON)" test_check_work_dir_fd.py + +start: + mkdir -p "$(BUILD_DIR)" + if [ -s "$(PIDFILE)" ] && kill -0 "$$(cat "$(PIDFILE)")" 2>/dev/null; then + echo "Service running (PID $$(cat "$(PIDFILE)"))." + exit 0 + fi + if ! command -v "$(CRIU)" >/dev/null 2>&1; then + echo "CRIU not found at $(CRIU)" + exit 1 + fi + mkfifo "$(STATUS_FIFO)" + exec $(STATUS_FD)<>"$(STATUS_FIFO)" + "$(CRIU)" service \ + -v4 \ + -W "$(BUILD_DIR)" \ + --address "$(SOCKET_NAME)" \ + -d \ + --pidfile "$(PIDFILE_NAME)" \ + -o "$(SERVICE_LOG)" \ + --status-fd "$(STATUS_FD)" + "$(PYTHON)" read.py "$(STATUS_FIFO)" + +stop: + if [ ! -s "$(PIDFILE)" ]; then + echo "pidfile missing or empty" + exit 1 + fi + pid=$$(cat "$(PIDFILE)") + if kill -0 "$$pid" 2>/dev/null; then + kill -9 "$$pid" || true + fi + rm -f "$(PIDFILE)" "$(CRIU_SOCKET)" "$(STATUS_FIFO)" + +clean: + if [ -s "$(PIDFILE)" ] && kill -0 "$$(cat "$(PIDFILE)")" 2>/dev/null; then + kill -9 "$$(cat "$(PIDFILE)")" || true + fi + rm -rf "$(BUILD_DIR)" + +.PHONY: start stop clean run \ No newline at end of file diff --git a/test/others/pycriu/read.py b/test/others/pycriu/read.py new file mode 120000 index 000000000..c2c1e1365 --- /dev/null +++ b/test/others/pycriu/read.py @@ -0,0 +1 @@ +../rpc/read.py \ No newline at end of file diff --git a/test/others/pycriu/test_check.py b/test/others/pycriu/test_check.py new file mode 100755 index 000000000..9888158db --- /dev/null +++ b/test/others/pycriu/test_check.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def main(): + socket_path = os.path.join(SCRIPT_DIR, "build", "criu_service.socket") + + criu = pycriu.criu() + criu.use_sk(socket_path) + + try: + criu.check() + except Exception as e: + print(f"FAIL: {e}") + return 1 + + print("PASS") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/pycriu/test_check_fail.py b/test/others/pycriu/test_check_fail.py new file mode 100755 index 000000000..b5634c60b --- /dev/null +++ b/test/others/pycriu/test_check_fail.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def main(): + socket_path = os.path.join(SCRIPT_DIR, "build", "criu_service.socket") + + criu = pycriu.criu() + criu.use_sk(socket_path) + + # Intentionally set only log_file (no images/work dir) to ensure check() fails + criu.opts.log_file = "check.log" + + try: + criu.check() + except Exception: + print("PASS") + return 0 + + print("FAIL: check() did not fail when log_file is set without images/work dir") + return 1 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/pycriu/test_check_images_dir.py b/test/others/pycriu/test_check_images_dir.py new file mode 100755 index 000000000..f479c2a88 --- /dev/null +++ b/test/others/pycriu/test_check_images_dir.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def _log_path(images_dir, log_file): + return log_file if os.path.isabs(log_file) else os.path.join(images_dir, log_file) + +def main(): + build_dir = os.path.join(SCRIPT_DIR, "build") + socket_path = os.path.join(build_dir, "criu_service.socket") + + criu = pycriu.criu() + criu.use_sk(socket_path) + + criu.opts.images_dir = build_dir + criu.opts.log_file = "check.log" + criu.opts.log_level = 4 + + try: + criu.check() + except Exception as e: + lp = _log_path(build_dir, criu.opts.log_file) + msg = f"FAIL: {e} ({'see log: ' + lp if os.path.exists(lp) else 'no log found'})" + print(msg) + return 1 + + lp = _log_path(build_dir, criu.opts.log_file) + if not (os.path.isfile(lp) and os.path.getsize(lp) > 0): + print(f"FAIL: log file missing or empty: {lp}") + return 1 + + print("PASS") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/pycriu/test_check_work_dir_fd.py b/test/others/pycriu/test_check_work_dir_fd.py new file mode 100755 index 000000000..e20a83097 --- /dev/null +++ b/test/others/pycriu/test_check_work_dir_fd.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +import os +import sys + +# Add ../../../lib so we can import pycriu +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) +if LIB_DIR not in sys.path: + sys.path.insert(0, LIB_DIR) + +import pycriu # noqa: E402 + +def main(): + build_dir = os.path.join(SCRIPT_DIR, "build") + socket_path = os.path.join(build_dir, "criu_service.socket") + os.makedirs(build_dir, exist_ok=True) + + # Open a directory FD to use as work_dir_fd (prefer O_PATH if available) + flags = getattr(os, "O_PATH", 0) or os.O_RDONLY + fd = os.open(build_dir, flags) + + criu = pycriu.criu() + criu.use_sk(socket_path) + + criu.opts.work_dir_fd = fd + criu.opts.log_file = "check.log" + criu.opts.log_level = 4 + + try: + criu.check() + except Exception as e: + print(f"FAIL: {e}") + return 1 + finally: + try: + os.close(fd) + except Exception: + pass + + print("PASS") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index fc64f0c97..c0e56d528 100644 --- a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -4,13 +4,22 @@ all: test-c rpc_pb2.py criu CFLAGS += -g -Werror -Wall -I. LDLIBS += -lprotobuf-c -PYTHON ?= python +PYTHON ?= python3 run: all @make -C .. loop - mkdir -p build + mkdir -p build/{imgs_errno,imgs_ps,imgs_c,imgs_loop,imgs_py} chmod a+rwx build + chmod a+rwx build/{imgs_errno,imgs_ps,imgs_c,imgs_loop,imgs_py} rm -f build/status + rm -f build/_marker_* + @# Create all log files to be accessible for anybody + @# so that they can be displayed by any user. + for i in imgs_errno/criu.log imgs_ps/page-server.log imgs_ps/dump.log \ + imgs_c/restore-c.log imgs_loop/criu.log imgs_loop/dump-loop.log \ + imgs_py/criu.log imgs_py/restore-py.log imgs_c/criu.log service.log; do \ + touch build/$$i; chmod 666 build/$$i; \ + done sudo -g '#1000' -u '#1000' mkfifo build/status @# Need to start the criu daemon here to access the pidfile. @# The script read.py is used to wait until 'criu service' @@ -39,7 +48,7 @@ rpc_pb2.py: rpc.proto protoc --proto_path=. --python_out=. rpc.proto rpc.pb-c.c: rpc.proto - protoc-c --proto_path=. --c_out=. rpc.proto + protoc --proto_path=. --c_out=. rpc.proto clean: rm -rf build rpc.pb-c.o test-c.o test-c rpc.pb-c.c rpc.pb-c.h rpc_pb2.py rpc_pb2.pyc criu diff --git a/test/others/rpc/action-script.sh b/test/others/rpc/action-script.sh new file mode 100755 index 000000000..991e315de --- /dev/null +++ b/test/others/rpc/action-script.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +MARKER_FILE="_marker_${CRTOOLS_SCRIPT_ACTION}" + +if [ -z "$CRTOOLS_SCRIPT_ACTION" ]; then + echo "Error: CRTOOLS_SCRIPT_ACTION is not set." + exit 2 +fi + +if [ ! -f "$MARKER_FILE" ]; then + touch "$MARKER_FILE" +else + echo "Error: Running the same action hook for the second time" + exit 1 +fi + +exit 0 diff --git a/test/others/rpc/config_file.py b/test/others/rpc/config_file.py index 90c80fcae..c1a8276d8 100755 --- a/test/others/rpc/config_file.py +++ b/test/others/rpc/config_file.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import argparse import os @@ -13,6 +13,9 @@ from setup_swrk import setup_swrk log_file = 'config_file_test.log' does_not_exist = 'does-not.exist' +script_path = os.path.dirname(os.path.abspath(__file__)) +action_script_file = os.path.join(script_path, 'action-script.sh') + def setup_config_file(content): # Creating a temporary file which will be used as configuration file. @@ -89,29 +92,37 @@ def test_broken_configuration_file(): sys.exit(-1) -def search_in_log_file(log, message): - with open(os.path.join(args['dir'], log)) as f: +def search_in_log_file(log_path, message): + with open(log_path) as f: if message not in f.read(): - print( - 'FAIL: Missing the expected error message (%s) in the log file' - % message) + print('FAIL: Missing the expected error message (%s) in the log file' % message) sys.exit(-1) +def print_log_file(log_path): + print("\n--- Begin log file: %s ---" % log_path) + with open(log_path, 'r') as f: + print(f.read()) + print("--- End log file ---\n") + + def check_results(resp, log): # Check if the specified log file exists - if not os.path.isfile(os.path.join(args['dir'], log)): + log_path = os.path.join(args['dir'], log) + if not os.path.isfile(log_path): print('FAIL: Expected log file %s does not exist' % log) sys.exit(-1) # Dump should have failed with: 'The criu itself is within dumped tree' if resp.type != rpc.DUMP: print('FAIL: Unexpected msg type %r' % resp.type) + print_log_file(log_path) sys.exit(-1) if 'The criu itself is within dumped tree' not in resp.cr_errmsg: print('FAIL: Missing the expected error message in RPC response') + print_log_file(log_path) sys.exit(-1) # Look into the log file for the same message - search_in_log_file(log, 'The criu itself is within dumped tree') + search_in_log_file(log_path, 'The criu itself is within dumped tree') def test_rpc_without_configuration_file(): @@ -156,6 +167,7 @@ def test_rpc_with_configuration_file_overwriting_rpc(): # file settings in the default configuration. log = does_not_exist content = 'log-file ' + log + '\n' + content += 'action-script ' + action_script_file + '\n' content += 'no-tcp-established\nno-shell-job' path = setup_config_file(content) # Only set the configuration file via RPC; @@ -180,11 +192,18 @@ args = vars(parser.parse_args()) cleanup_output(args['dir']) +print("*** Test broken config file ***") test_broken_configuration_file() cleanup_output(args['dir']) + +print("*** Test RPC without config file ***") test_rpc_without_configuration_file() cleanup_output(args['dir']) + +print("*** Test RPC with config file ***") test_rpc_with_configuration_file() cleanup_output(args['dir']) + +print("*** Test configuration file overwriting RPC ***") test_rpc_with_configuration_file_overwriting_rpc() cleanup_output(args['dir']) diff --git a/test/others/rpc/errno.py b/test/others/rpc/errno.py index f84757efd..ea841199f 100755 --- a/test/others/rpc/errno.py +++ b/test/others/rpc/errno.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 # Test criu errno import socket, os, errno @@ -40,7 +40,7 @@ class test: resp.ParseFromString(self.s.recv(self._MAX_MSG_SIZE)) return resp - def check_resp(self, resp, typ, err): + def check_resp(self, resp, typ, err, errmsg = None): if resp.type != typ: raise Exception('Unexpected response type ' + str(resp.type)) @@ -50,6 +50,9 @@ class test: if err and resp.cr_errno != err: raise Exception('Unexpected cr_errno ' + str(resp.cr_errno)) + if errmsg and errmsg not in str(resp.cr_errmsg): + raise Exception('Unexpected cr_msg \'' + str(resp.cr_errmsg) + '\'') + def no_process(self): print('Try to dump unexisting process') # Get pid of non-existing process. @@ -67,6 +70,7 @@ class test: req = self.get_base_req() req.type = rpc.DUMP req.opts.pid = pid + req.opts.network_lock = rpc.SKIP self.send_req(req) resp = self.recv_resp() @@ -84,6 +88,7 @@ class test: req = self.get_base_req() req.type = rpc.DUMP req.opts.leave_running = True + req.opts.network_lock = rpc.SKIP self.send_req(req) resp = self.recv_resp() @@ -130,11 +135,27 @@ class test: print('Success') + def child_first_err(self): + print('Receive correct first error message') + + req = self.get_base_req() + req.type = rpc.CHECK + # Log file must not have subdirectory + req.opts.log_file = "/foo/bar.log" + + self.send_req(req) + resp = self.recv_resp() + + self.check_resp(resp, rpc.CHECK, None, "No subdirs are allowed in log_file name") + + print('Success') + def run(self): self.no_process() self.process_exists() self.bad_options() self.bad_request() + self.child_first_err() t = test() diff --git a/test/others/rpc/ps_test.py b/test/others/rpc/ps_test.py index b51357d42..259f22e77 100755 --- a/test/others/rpc/ps_test.py +++ b/test/others/rpc/ps_test.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import socket, os, sys, errno import rpc_pb2 as rpc @@ -23,6 +23,7 @@ req.type = rpc.PAGE_SERVER req.opts.log_file = 'page-server.log' req.opts.log_level = 4 req.opts.images_dir_fd = os.open(args['dir'], os.O_DIRECTORY) +req.opts.network_lock = rpc.SKIP s.send(req.SerializeToString()) diff --git a/test/others/rpc/read.py b/test/others/rpc/read.py old mode 100644 new mode 100755 diff --git a/test/others/rpc/restore-loop.py b/test/others/rpc/restore-loop.py index 84a2ce56d..67110c2cf 100755 --- a/test/others/rpc/restore-loop.py +++ b/test/others/rpc/restore-loop.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import socket, os, sys import rpc_pb2 as rpc diff --git a/test/others/rpc/run.sh b/test/others/rpc/run.sh index 9be577587..b6158dfea 100755 --- a/test/others/rpc/run.sh +++ b/test/others/rpc/run.sh @@ -1,16 +1,9 @@ #!/bin/bash -set -ex - -if [ -e /etc/os-release ]; then - . /etc/os-release - if [ "$ID" == "centos" ] && [[ "$VERSION_ID" == "7"* ]];then - echo "Skipping tests on CentOS 7 because they do not work in CI" - exit 0 - fi -fi +set -e CRIU=./criu +FAIL=1 export PROTODIR=`readlink -f "${PWD}/../../protobuf"` @@ -27,6 +20,13 @@ function stop_server { title_print "Shutdown service server" kill -SIGTERM $(cat build/pidfile) unlink build/pidfile + if [ "${FAIL}" == "1" ]; then + for i in build/output*; do + echo "File: $i" + cat $i + done + find . -name "*.log" -print -exec cat {} \; || true + fi } function test_c { @@ -59,7 +59,7 @@ function test_restore_loop { title_print "Dump loop process" # So theoretically '-j' (--shell-job) should not be necessary, but on alpine # this test fails without it. - ${CRIU} dump -j -v4 -o dump-loop.log -D build/imgs_loop -t ${P} + ${CRIU} dump -j -v4 -o dump-loop.log --network-lock skip -D build/imgs_loop -t ${P} title_print "Run restore-loop" ./restore-loop.py build/criu_service.socket build/imgs_loop @@ -88,6 +88,8 @@ test_restore_loop test_ps test_errno +FAIL=0 + stop_server trap 'echo "Success"' EXIT diff --git a/test/others/rpc/setup_swrk.py b/test/others/rpc/setup_swrk.py index c7f84f952..ffaa01de4 100644 --- a/test/others/rpc/setup_swrk.py +++ b/test/others/rpc/setup_swrk.py @@ -5,12 +5,6 @@ import subprocess def setup_swrk(): print('Connecting to CRIU in swrk mode.') s1, s2 = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET) - - kwargs = {} - if sys.version_info.major == 3: - kwargs["pass_fds"] = [s1.fileno()] - - swrk = subprocess.Popen(['./criu', "swrk", "%d" % s1.fileno()], **kwargs) + swrk = subprocess.Popen(['./criu', "swrk", "%d" % s1.fileno()], pass_fds=[s1.fileno()]) s1.close() return swrk, s2 - diff --git a/test/others/rpc/test-c.c b/test/others/rpc/test-c.c index 792dbbf9c..b3507975f 100644 --- a/test/others/rpc/test-c.c +++ b/test/others/rpc/test-c.c @@ -99,6 +99,8 @@ int main(int argc, char *argv[]) req.opts->images_dir_fd = dir_fd; req.opts->has_log_level = true; req.opts->log_level = 4; + req.opts->has_network_lock = true; + req.opts->network_lock = CRIU_NETWORK_LOCK_METHOD__SKIP; /* * Connect to service socket diff --git a/test/others/rpc/test.py b/test/others/rpc/test.py index 80f6338f4..6f692f755 100755 --- a/test/others/rpc/test.py +++ b/test/others/rpc/test.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import socket, os, sys import rpc_pb2 as rpc @@ -24,6 +24,7 @@ req.type = rpc.DUMP req.opts.leave_running = True req.opts.log_level = 4 req.opts.images_dir_fd = os.open(args['dir'], os.O_DIRECTORY) +req.opts.network_lock = rpc.SKIP # Send request s.send(req.SerializeToString()) diff --git a/test/others/rpc/version.py b/test/others/rpc/version.py index 9d7fa745b..a18cd5b7b 100755 --- a/test/others/rpc/version.py +++ b/test/others/rpc/version.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import sys import rpc_pb2 as rpc diff --git a/test/others/shell-job/run.py b/test/others/shell-job/run.py index a59945d6a..969965f00 100755 --- a/test/others/shell-job/run.py +++ b/test/others/shell-job/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import os, pty, sys, subprocess import termios, fcntl, time diff --git a/test/others/skip-file-rwx-check/Makefile b/test/others/skip-file-rwx-check/Makefile new file mode 100644 index 000000000..419d592b7 --- /dev/null +++ b/test/others/skip-file-rwx-check/Makefile @@ -0,0 +1,7 @@ +.PHONY: run clean + +run: + ./run.sh + +clean: + rm -rf testfile *.img dump.log restore-expected-fail.log restore.log stats-dump stats-restore diff --git a/test/others/skip-file-rwx-check/run.sh b/test/others/skip-file-rwx-check/run.sh new file mode 100755 index 000000000..0776ebf61 --- /dev/null +++ b/test/others/skip-file-rwx-check/run.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +set -o errexit +set -o nounset +set -o pipefail +set -o xtrace + +source ../env.sh + +make clean +touch testfile +chmod +w testfile +bash -c 'exec 3= 5: - import importlib.util - spec = importlib.util.spec_from_file_location(name, path) - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) - else: - import imp - mod = imp.load_source(name, path) + import importlib.util + spec = importlib.util.spec_from_file_location(name, path) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) return mod class inhfd_test: - def __init__(self, name, desc, flavor, freezer): + def __init__(self, name, desc, flavor, freezer, rootless): + if rootless: + raise test_fail_exc("This kind of test does not currently support rootless mode") self.__name = os.path.basename(name) print("Load %s" % name) self.__fdtyp = load_module_from_file(self.__name, name) @@ -801,8 +874,8 @@ class inhfd_test: class groups_test(zdtm_test): - def __init__(self, name, desc, flavor, freezer): - zdtm_test.__init__(self, 'zdtm/lib/groups', desc, flavor, freezer) + def __init__(self, name, desc, flavor, freezer, rootless): + zdtm_test.__init__(self, 'zdtm/lib/groups', desc, flavor, freezer, rootless) if flavor.ns: self.__real_name = name with open(name) as fd: @@ -814,7 +887,7 @@ class groups_test(zdtm_test): self._bins += self.__subs self._deps += get_test_desc('zdtm/lib/groups')['deps'] - self._env = {'ZDTM_TESTS': self.__real_name} + self._env['ZDTM_TESTS'] = self.__real_name def __get_start_cmd(self, name): tdir = os.path.dirname(name) @@ -824,7 +897,7 @@ class groups_test(zdtm_test): subprocess.check_call(s_args + [tname + '.cleanout']) s = subprocess.Popen(s_args + ['--dry-run', tname + '.pid'], stdout=subprocess.PIPE) - out, _ = s.communicate() + out, _ = s.communicate(timeout=self.__timeout) cmd = out.decode().splitlines()[-1].strip() return 'cd /' + tdir + ' && ' + cmd @@ -868,15 +941,22 @@ class criu_cli: fault=None, strace=[], preexec=None, - nowait=False): + preload_libfault=False, + nowait=False, + timeout=60): env = dict( os.environ, - ASAN_OPTIONS="log_path=asan.log:disable_coredump=0:detect_leaks=0") + ASAN_OPTIONS="log_path=asan.log:disable_coredump=0:detect_leaks=0", + CRIU_LIBS_DIR=PLUGINS_DIR + ) if fault: print("Forcing %s fault" % fault) env['CRIU_FAULT'] = fault + if preload_libfault: + env['LD_PRELOAD'] = LIBFAULT_PATH + cr = subprocess.Popen(strace + [criu_bin, action, "--no-default-config"] + args, env=env, @@ -884,7 +964,11 @@ class criu_cli: preexec_fn=preexec) if nowait: return cr - return cr.wait() + return cr.wait(timeout=timeout) + + @staticmethod + def exit_signal(ret): + return ret < 0 class criu_rpc_process: @@ -967,7 +1051,9 @@ class criu_rpc: fault=None, strace=[], preexec=None, - nowait=False): + preload_libfault=False, + nowait=False, + timeout=None): if fault: raise test_fail_exc('RPC and FAULT not supported') if strace: @@ -1004,8 +1090,11 @@ class criu_rpc: else: raise test_fail_exc('RPC for %s required' % action) except crpc.CRIUExceptionExternal as e: - print("Fail", e) - ret = -1 + if e.typ != e.resp_typ: + ret = -2 + else: + print("Fail", e) + ret = -1 else: ret = 0 @@ -1018,6 +1107,10 @@ class criu_rpc: return ret + @staticmethod + def exit_signal(ret): + return ret == -2 + class criu: def __init__(self, opts): @@ -1039,6 +1132,7 @@ class criu: self.__dedup = bool(opts['dedup']) self.__mdedup = bool(opts['noauto_dedup']) self.__user = bool(opts['user']) + self.__rootless = bool(opts['rootless']) self.__leave_stopped = bool(opts['stop']) self.__stream = bool(opts['stream']) self.__show_stats = bool(opts['show_stats']) @@ -1048,9 +1142,29 @@ class criu: self.__img_streamer_process = None self.__tls = self.__tls_options() if opts['tls'] else [] self.__criu_bin = opts['criu_bin'] + + global crpc + pycriu_search_path = opts.get('pycriu_search_path') + if pycriu_search_path: + sys.path.insert(0, pycriu_search_path) + + try: + import pycriu as crpc + if pycriu_search_path: + print(f"pycriu loaded from: {crpc.__file__}") + except ImportError: + if not pycriu_search_path: + print("Consider building CRIU or using '--pycriu-search-path' option.") + raise + finally: + if pycriu_search_path: + sys.path.pop(0) + self.__crit_bin = opts['crit_bin'] self.__pre_dump_mode = opts['pre_dump_mode'] + self.__preload_libfault = bool(opts['preload_libfault']) self.__mntns_compat_mode = bool(opts['mntns_compat_mode']) + self.__cuda_checkpoint = bool(opts['mocked_cuda_checkpoint']) if opts['rpc']: self.__criu = criu_rpc @@ -1133,11 +1247,17 @@ class criu: s_args = ["--log-file", log, "--images-dir", self.__ddir(), "--verbosity=4"] + opts + if self.__cuda_checkpoint: + s_args += [ "--libdir" , os.path.join(os.getcwd(), "..", "plugins", "cuda") ] + with open(os.path.join(self.__ddir(), action + '.cropt'), 'w') as f: f.write(' '.join(s_args) + '\n') print("Run criu " + action) + if self.__rootless: + s_args += ["--unprivileged"] + strace = [] if self.__sat: fname = os.path.join(self.__ddir(), action + '.strace') @@ -1156,7 +1276,10 @@ class criu: if action == "restore": preexec = None else: - preexec = self.__user and self.set_user_id or None + if os.getuid(): + preexec = None + else: + preexec = self.__user and self.set_user_id or None __ddir = self.__ddir() @@ -1171,8 +1294,10 @@ class criu: with open("/proc/sys/kernel/ns_last_pid") as ns_last_pid_fd: ns_last_pid = ns_last_pid_fd.read() + preload_libfault = self.__preload_libfault and action in ['dump', 'pre-dump', 'restore'] + ret = self.__criu.run(action, s_args, self.__criu_bin, self.__fault, - strace, preexec, nowait) + strace, preexec, preload_libfault, nowait) if nowait: os.close(status_fds[1]) @@ -1212,8 +1337,8 @@ class criu: return rst_succeeded = os.access( os.path.join(__ddir, "restore-succeeded"), os.F_OK) - if self.__test.blocking() or (self.__sat and action == 'restore' and - rst_succeeded): + if (self.__test.blocking() and not self.__criu.exit_signal(ret)) or \ + (self.__sat and action == 'restore' and rst_succeeded): raise test_fail_expected_exc(action) else: raise test_fail_exc("CRIU %s" % action) @@ -1476,15 +1601,17 @@ class criu: except Exception: return False - return criu_cli.run( - "check", - ["--no-default-config", "--verbosity=0", "--feature", feature], - opts['criu_bin']) == 0 + args = ["--no-default-config", "-verbosity=0", "--feature", feature] + if opts['rootless']: + args += ["--unprivileged"] + + return criu_cli.run("check", args, opts['criu_bin']) == 0 @staticmethod def available(): if not os.access(opts['criu_bin'], os.X_OK): print("CRIU binary not found at %s" % opts['criu_bin']) + print("Consider building CRIU or using '--criu-bin' option.") sys.exit(1) def kill(self): @@ -1651,6 +1778,15 @@ def get_visible_state(test): return files, maps, mounts +def has_vsyscall(maps): + vsyscall = u"ffffffffff600000-ffffffffff601000" + for i in maps: + if vsyscall in i: + return i + + return None + + def check_visible_state(test, state, opts): new = get_visible_state(test) @@ -1666,9 +1802,9 @@ def check_visible_state(test, state, opts): new_maps = new[1][pid] if os.getenv("COMPAT_TEST"): # the vsyscall vma isn't unmapped from x32 processes - vsyscall = u"ffffffffff600000-ffffffffff601000 r-xp" - if vsyscall in new_maps and vsyscall not in old_maps: - new_maps.remove(vsyscall) + entry = has_vsyscall(new_maps) + if entry and has_vsyscall(old_maps) is None: + new_maps.remove(entry) if old_maps != new_maps: print("%s: Old maps lost: %s" % (pid, old_maps - new_maps)) print("%s: New maps appeared: %s" % (pid, new_maps - old_maps)) @@ -1891,7 +2027,7 @@ def do_run_test(tname, tdesc, flavs, opts): if opts['dry_run']: continue flav = flavors[f](opts) - t = tclass(tname, tdesc, flav, fcg) + t = tclass(tname, tdesc, flav, fcg, opts['rootless']) cr_api = criu(opts) try: @@ -1942,8 +2078,6 @@ class Launcher: self.__subs = {} self.__fail = False self.__file_report = None - self.__junit_file = None - self.__junit_test_cases = None self.__failed = [] self.__nr_skip = 0 if self.__max > 1 and self.__total > 1: @@ -1955,22 +2089,14 @@ class Launcher: if opts['report'] and (opts['keep_going'] or self.__total == 1): global TestSuite, TestCase - from junit_xml import TestCase, TestSuite now = datetime.datetime.now() att = 0 reportname = os.path.join(report_dir, "criu-testreport.tap") - junitreport = os.path.join(report_dir, "criu-testreport.xml") - while os.access(reportname, os.F_OK) or os.access( - junitreport, os.F_OK): + while os.access(reportname, os.F_OK): reportname = os.path.join(report_dir, "criu-testreport" + ".%d.tap" % att) - junitreport = os.path.join(report_dir, - "criu-testreport" + ".%d.xml" % att) att += 1 - self.__junit_file = open(junitreport, 'a') - self.__junit_test_cases = [] - self.__file_report = open(reportname, 'a') print(u"TAP version 13", file=self.__file_report) print(u"# Hardware architecture: " + arch, file=self.__file_report) @@ -1979,12 +2105,20 @@ class Launcher: file=self.__file_report) print(u"# ", file=self.__file_report) print(u"1.." + str(nr_tests), file=self.__file_report) - with open("/proc/sys/kernel/tainted") as taintfd: - self.__taint = taintfd.read() + self.__taint = self.__read_kernel_tainted() if int(self.__taint, 0) != 0: - print("The kernel is tainted: %r" % self.__taint) - if not opts["ignore_taint"] and os.getenv("ZDTM_IGNORE_TAINT") != '1': - raise Exception("The kernel is tainted: %r" % self.__taint) + self.__report_kernel_taint("The kernel is tainted: %r" % self.__taint) + + @staticmethod + def __read_kernel_tainted(): + with open("/proc/sys/kernel/tainted") as taintfd: + return taintfd.read().strip() + + @staticmethod + def __report_kernel_taint(msg): + print(msg) + if not opts["ignore_taint"] and os.getenv("ZDTM_IGNORE_TAINT") != "1": + raise Exception(msg) def __show_progress(self, msg): perc = int(self.__nr * 16 / self.__total) @@ -1997,10 +2131,6 @@ class Launcher: self.__runtest += 1 self.__nr_skip += 1 - if self.__junit_test_cases is not None: - tc = TestCase(name) - tc.add_skipped_info(reason) - self.__junit_test_cases.append(tc) if self.__file_report: testline = u"ok %d - %s # SKIP %s" % (self.__runtest, name, reason) print(testline, file=self.__file_report) @@ -2010,11 +2140,12 @@ class Launcher: if len(self.__subs) >= self.__max: self.wait() - with open("/proc/sys/kernel/tainted") as taintfd: - taint = taintfd.read() + taint = self.__read_kernel_tainted() if self.__taint != taint: - raise Exception("The kernel is tainted: %r (%r)" % - (taint, self.__taint)) + prev_taint = self.__taint + self.__taint = taint + self.__report_kernel_taint( + "The kernel is tainted: %r (was %r)" % (taint, prev_taint)) ''' The option --link-remap allows criu to hardlink open files back to the @@ -2042,7 +2173,9 @@ class Launcher: 'sat', 'script', 'rpc', 'criu_config', 'lazy_pages', 'join_ns', 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', - 'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode') + 'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode', + 'rootless', 'preload_libfault', 'mocked_cuda_checkpoint', + 'pycriu_search_path') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2052,8 +2185,14 @@ class Launcher: logf = None log = None + if opts['rootless'] and os.getuid() == 0: + os.setgid(NON_ROOT_UID) + os.setuid(NON_ROOT_UID) + env = dict(os.environ, CR_CT_TEST_INFO=arg) + if opts['mocked_cuda_checkpoint']: + env['PATH'] = os.path.join(os.getcwd(), "cuda-checkpoint") + ":" + env["PATH"] sub = subprocess.Popen(["./zdtm_ct", "zdtm.py"], - env=dict(os.environ, CR_CT_TEST_INFO=arg), + env=env, stdout=log, stderr=subprocess.STDOUT, close_fds=True) @@ -2093,11 +2232,6 @@ class Launcher: # The following wait() is not useful for our domain logic. # It's useful for taming warnings in subprocess.Popen.__del__() sub['sub'].wait() - tc = None - if self.__junit_test_cases is not None: - tc = TestCase(sub['name'], - elapsed_sec=time.time() - sub['start']) - self.__junit_test_cases.append(tc) if status != 0: self.__fail = True failed_flavor = decode_flav(os.WEXITSTATUS(status)) @@ -2108,7 +2242,6 @@ class Launcher: with open(sub['log']) as sublog: output = sublog.read() details = {'output': output} - tc.add_error_info(output=output) print(testline, file=self.__file_report) print("%s" % yaml.safe_dump(details, explicit_start=True, @@ -2154,10 +2287,6 @@ class Launcher: if not opts['fault'] and check_core_files(): self.__fail = True if self.__file_report: - ts = TestSuite(opts['title'], self.__junit_test_cases, - os.getenv("NODE_NAME")) - self.__junit_file.write(TestSuite.to_xml_string([ts])) - self.__junit_file.close() self.__file_report.close() if opts['keep_going']: @@ -2193,9 +2322,21 @@ def all_tests(opts): continue files.append(fp) excl = list(map(lambda x: os.path.join(desc['dir'], x), desc['exclude'])) - tlist = list(filter( + tlist = list(sorted(filter( lambda x: not x.endswith('.checkskip') and not x.endswith('.hook') and - x not in excl, map(lambda x: x.strip(), files))) + x not in excl, map(lambda x: x.strip(), files)))) + + if opts.get('test_shard_count'): + if opts.get('test_shard_index') is None: + raise KeyError('--test_shard_count > 0 must come with --test_shard_index') + slice_idx = opts['test_shard_index'] + slices = opts['test_shard_count'] + if slice_idx >= slices: + raise IndexError('--test_shard_index not less than --test_shard_count ({} >= {})'.format(slice_idx, slices)) + slist = list(tlist[slice_idx::slices]) + print("We're shard #{} of {}. Running {} of {} tests.\n".format(slice_idx, slices, len(slist), len(tlist))) + tlist = slist + return tlist @@ -2306,11 +2447,6 @@ def run_tests(opts): return torun = list(torun) - if opts['keep_going'] and len(torun) < 2: - print( - "[WARNING] Option --keep-going is more useful when running multiple tests" - ) - opts['keep_going'] = False if opts['exclude']: excl = re.compile(".*(" + "|".join(opts['exclude']) + ")") @@ -2353,6 +2489,7 @@ def run_tests(opts): "Specify --criu-image-streamer-dir or modify PATH to provide an alternate location") .format(streamer_dir)) + usernsIsSupported = criu.check("userns") launcher = Launcher(opts, len(torun)) try: for t in torun: @@ -2422,7 +2559,7 @@ def run_tests(opts): run_flavs = set(test_flavs) & set(opts_flavs) else: run_flavs = set([test_flavs.pop()]) - if not criu.check("userns"): + if not usernsIsSupported: run_flavs -= set(['uns']) if opts['user']: # FIXME -- probably uns will make sense @@ -2591,6 +2728,10 @@ def set_nr_hugepages(nr): with open("/proc/sys/vm/nr_hugepages", "w") as f: f.write("{}\n".format(nr)) return orig_hugepages + except PermissionError as err: + # EACCES is expected when running as non-root, otherwise re-raise the exception. + if err.errno != errno.EACCES or os.getuid() == 0: + raise except OSError as err: if err.errno != errno.EOPNOTSUPP: raise @@ -2664,6 +2805,10 @@ def get_cli_args(): rp.add_argument("--freezecg", help="Use freeze cgroup (path:state)") rp.add_argument("--user", help="Run CRIU as regular user", action='store_true') + rp.add_argument( + "--rootless", + help="Run CRIU rootless (uid!=0) (needs CAP_CHECKPOINT_RESTORE)", + action='store_true') rp.add_argument("--rpc", help="Run CRIU via RPC rather than CLI", action='store_true') @@ -2711,6 +2856,9 @@ def get_cli_args(): rp.add_argument("--criu-bin", help="Path to criu binary", default='../criu/criu') + rp.add_argument("--pycriu-search-path", + help=f"Path to search for pycriu module first (e.g., {site.getsitepackages()[0]})", + default=None) rp.add_argument("--crit-bin", help="Path to crit binary", default='../crit/crit') @@ -2724,6 +2872,19 @@ def get_cli_args(): rp.add_argument("--mntns-compat-mode", help="Use old compat mounts restore engine", action='store_true') + rp.add_argument("--test-shard-index", type=int, default=None, + help="Select tests for a shard (0-based)") + rp.add_argument("--test-shard-count", type=int, default=0, + help="Specify how many shards are being run (0=sharding disabled; must be the same for all shards)") + rp.add_argument("--preload-libfault", action="store_true", help="Run criu with library preload to simulate special cases") + rp.add_argument("--criu-plugin", + help="Run tests with CRIU plugin", + choices=['amdgpu', 'cuda', 'inventory_test_enabled', 'inventory_test_disabled'], + nargs='+', + default=None) + rp.add_argument("--mocked-cuda-checkpoint", + action="store_true", + help="Run criu with the cuda plugin and the mocked cuda-checkpoint tool") lp = sp.add_parser("list", help="List tests") lp.set_defaults(action=list_tests) @@ -2788,7 +2949,7 @@ if __name__ == '__main__': if opts['debug']: sys.settrace(traceit) - if opts['action'] == 'run': + if opts['action'] == run_tests: criu.available() for tst in test_classes.values(): tst.available() diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index d34523315..c95b4ef6a 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -23,12 +23,12 @@ ifeq ($(ARCH),arm) ARMV := $(shell echo $(SUBARCH) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') ifeq ($(ARMV),6) - USERCFLAGS += -march=armv6 + ARCHCFLAGS += -march=armv6 else ifeq ($(ARMV),7) - USERCFLAGS += -march=armv7-a+fp + ARCHCFLAGS += -march=armv7-a+fp else ifeq ($(ARMV),8) - # To build aarch32 on armv8 Travis-CI (see criu Makefile) - USERCFLAGS += -march=armv7-a + # To build aarch32 on armv8 (see criu Makefile) + ARCHCFLAGS += -march=armv7-a ARMV := 7 endif endif @@ -40,8 +40,8 @@ endif PKG_CONFIG ?= pkg-config CFLAGS += -g -O2 -Wall -Werror -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 CFLAGS += -Wdeclaration-after-statement -Wstrict-prototypes -CFLAGS += $(USERCFLAGS) -CFLAGS += -D_GNU_SOURCE +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) +CFLAGS += -D_GNU_SOURCE -D_LARGEFILE64_SOURCE CPPFLAGS += -iquote $(LIBDIR)/arch/$(ARCH)/include ifeq ($(strip $(V)),) @@ -66,6 +66,11 @@ endif export PKG_CONFIG_PATH endif +ifeq ($(SHSTK_ENABLE),1) + CFLAGS += -mshstk + LDFLAGS += -Wl,-z,shstk +endif + define pkg-libs $(shell PKG_CONFIG_PATH="$(PKG_CONFIG_PATH)" $(PKG_CONFIG) --libs $(1)) endef @@ -74,9 +79,17 @@ define pkg-cflags $(shell PKG_CONFIG_PATH="$(PKG_CONFIG_PATH)" $(PKG_CONFIG) --cflags $(1)) endef +ifeq ($(GCS_ENABLE),1) + CFLAGS += -mbranch-protection=standard + LDFLAGS += -z experimental-gcs=check + TEST_ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=1:glibc.cpu.aarch64_gcs_policy=2 +else + TEST_ENV = +endif + %.d: %.c $(E) " DEP " $@ - $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -MM -MP -c $< -o $@ + $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -MM -MP $< -o $@ %.o: %.c | %.d $(E) " CC " $@ diff --git a/test/zdtm/criu_config.py b/test/zdtm/criu_config.py index 487becfb4..9fd292747 100644 --- a/test/zdtm/criu_config.py +++ b/test/zdtm/criu_config.py @@ -11,6 +11,7 @@ class criu_config: fault=None, strace=[], preexec=None, + preload=False, nowait=False): config_path = tempfile.mktemp(".conf", "criu-%s-" % action) @@ -40,3 +41,7 @@ class criu_config: if nowait: return cr return cr.wait() + + @staticmethod + def exit_signal(ret): + return ret < 0 diff --git a/test/zdtm/lib/Makefile b/test/zdtm/lib/Makefile index 3ec58dfaf..428d726d6 100644 --- a/test/zdtm/lib/Makefile +++ b/test/zdtm/lib/Makefile @@ -1,10 +1,10 @@ LIBDIR := . -CFLAGS += $(USERCFLAGS) +CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) LIB := libzdtmtst.a -LIBSRC := datagen.c msg.c parseargs.c test.c streamutil.c lock.c ns.c tcp.c unix.c fs.c sysctl.c mem.c +LIBSRC := datagen.c msg.c parseargs.c test.c streamutil.c lock.c ns.c tcp.c unix.c fs.c sysctl.c mem.c file.c mountinfo.c PKG_CONFIG ?= pkg-config pkg-config-check = $(shell sh -c '$(PKG_CONFIG) $(1) && echo y') @@ -34,4 +34,4 @@ clean: clean-more $(LIB): $(LIBOBJ) $(E) " AR " $@ - $(Q)ar rcs $@ $^ + $(Q)$(AR) rcs $@ $^ diff --git a/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h b/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h new file mode 100644 index 000000000..1803aaeb4 --- /dev/null +++ b/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h @@ -0,0 +1,49 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +typedef uint32_t atomic_t; + +#define atomic_get(v) (*(volatile int *)v) +#define atomic_set(v, i) (*(v) = (i)) + +static inline int __atomic_add(int i, atomic_t *v) +{ + int result; + asm volatile("amadd_db.w %1, %2, %0" : "+ZB"(*v), "=&r"(result) : "r"(i) : "memory"); + return result + i; +} + +static inline void atomic_add(int i, atomic_t *v) +{ + __atomic_add(i, v); +} + +static inline int atomic_add_return(int i, atomic_t *v) +{ + return __atomic_add(i, v); +} + +#define atomic_sub(i, v) atomic_add(-(int)i, v) +#define atomic_sub_return(i, v) atomic_add_return(-(int)i, v) +#define atomic_inc(v) atomic_add_return(1, v) +#define atomic_dec(v) atomic_sub_return(1, v) +#define atomic_dec_return(v) atomic_sub_return(1, v) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + int ret; + asm volatile("1: \n" + " ll.w %0, %1 \n" + " bne %0, %2, 2f \n" + " or $t0, %3, $zero \n" + " sc.w $t0, %1 \n" + " beqz $t0, 1b \n" + "2: \n" + " dbar 0 \n" + : "=&r"(ret), "+ZB"(*ptr) + : "r"(old), "r"(new) + : "t0", "memory"); + return ret; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/test/zdtm/lib/arch/riscv64/include/asm/atomic.h b/test/zdtm/lib/arch/riscv64/include/asm/atomic.h new file mode 100644 index 000000000..a4faf1322 --- /dev/null +++ b/test/zdtm/lib/arch/riscv64/include/asm/atomic.h @@ -0,0 +1,107 @@ +#ifndef __CR_ATOMIC_H__ +#define __CR_ATOMIC_H__ + +typedef uint32_t atomic_t; + +/* Copied from the Linux header arch/riscv/include/asm/barrier.h */ + +#define nop() __asm__ __volatile__("nop") + +#define RISCV_FENCE(p, s) __asm__ __volatile__("fence " #p "," #s : : : "memory") + +/* These barriers need to enforce ordering on both devices or memory. */ +#define mb() RISCV_FENCE(iorw, iorw) +#define rmb() RISCV_FENCE(ir, ir) +#define wmb() RISCV_FENCE(ow, ow) + +/* These barriers do not need to enforce ordering on devices, just memory. */ +#define __smp_mb() RISCV_FENCE(rw, rw) +#define __smp_rmb() RISCV_FENCE(r, r) +#define __smp_wmb() RISCV_FENCE(w, w) + +#define __smp_store_release(p, v) \ + do { \ + compiletime_assert_atomic_type(*p); \ + RISCV_FENCE(rw, w); \ + WRITE_ONCE(*p, v); \ + } while (0) + +#define __smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + RISCV_FENCE(r, rw); \ + ___p1; \ + }) + +/* Copied from the Linux kernel header arch/riscv/include/asm/atomic.h */ + +static inline int atomic_read(const atomic_t *v) +{ + return (*(volatile int *)v); +} + +static inline void atomic_set(atomic_t *v, int i) +{ + *v = i; +} + +#define atomic_get atomic_read + +static inline int atomic_add_return(int i, atomic_t *v) +{ + int result; + + asm volatile("amoadd.w.aqrl %1, %2, %0" : "+A"(*v), "=r"(result) : "r"(i) : "memory"); + __smp_mb(); + return result + i; +} + +static inline int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i, v); +} + +static inline int atomic_inc(atomic_t *v) +{ + return atomic_add_return(1, v) - 1; +} + +static inline int atomic_add(int val, atomic_t *v) +{ + return atomic_add_return(val, v) - val; +} + +static inline int atomic_dec(atomic_t *v) +{ + return atomic_sub_return(1, v) + 1; +} + +/* true if the result is 0, or false for all other cases. */ +#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +#define atomic_inc_return(v) (atomic_add_return(1, v)) + +static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) +{ + unsigned long tmp; + int oldval; + + __smp_mb(); + + asm volatile("1:\n" + " lr.w %1, %2\n" + " bne %1, %3, 2f\n" + " sc.w %0, %4, %2\n" + " bnez %0, 1b\n" + "2:" + : "=&r"(tmp), "=&r"(oldval), "+A"(*ptr) + : "r"(old), "r"(new) + : "memory"); + + __smp_mb(); + return oldval; +} + +#endif /* __CR_ATOMIC_H__ */ diff --git a/test/zdtm/lib/file.c b/test/zdtm/lib/file.c new file mode 100644 index 000000000..57d85421d --- /dev/null +++ b/test/zdtm/lib/file.c @@ -0,0 +1,46 @@ +#include +#include +#include "zdtmtst.h" + +int write_value(const char *path, const char *value) +{ + int fd, l; + + fd = open(path, O_WRONLY); + if (fd < 0) { + pr_perror("open %s", path); + return -1; + } + + l = write(fd, value, strlen(value)); + if (l < 0) { + pr_perror("failed to write %s to %s", value, path); + close(fd); + return -1; + } + + close(fd); + return 0; +} + +int read_value(const char *path, char *value, int size) +{ + int fd, ret; + + fd = open(path, O_RDONLY); + if (fd < 0) { + pr_perror("open %s", path); + return -1; + } + + ret = read(fd, (void *)value, size); + if (ret < 0) { + pr_perror("read %s", path); + close(fd); + return -1; + } + + value[ret] = '\0'; + close(fd); + return 0; +} diff --git a/test/zdtm/lib/fs.c b/test/zdtm/lib/fs.c index 7b8be5f9f..efcc7a1d0 100644 --- a/test/zdtm/lib/fs.c +++ b/test/zdtm/lib/fs.c @@ -54,7 +54,7 @@ mnt_info_t *get_cwd_mnt_info(void) while (fgets(str, sizeof(str), f)) { char *hyphen = strchr(str, '-'); - ret = sscanf(str, "%i %i %u:%u %s %s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); + ret = sscanf(str, "%i %i %u:%u %4095s %4095s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); if (ret != 6 || !hyphen) goto err; ret = sscanf(hyphen + 1, " %ms", &fsname); @@ -108,6 +108,7 @@ int get_cwd_check_perm(char **result) "Bit 'x' should be set in all path components of " "this directory\n", cwd, getuid(), getgid(), errno, strerror(errno)); + free(cwd); return -1; } diff --git a/test/zdtm/lib/list.h b/test/zdtm/lib/list.h new file mode 100644 index 000000000..97d0f1e06 --- /dev/null +++ b/test/zdtm/lib/list.h @@ -0,0 +1,389 @@ +#ifndef __ZDTM_LIST_H__ +#define __ZDTM_LIST_H__ + +/* + * Double linked lists. + */ + +#include +#include "zdtmtst.h" + +#define POISON_POINTER_DELTA 0 +#define LIST_POISON1 ((void *)0x00100100 + POISON_POINTER_DELTA) +#define LIST_POISON2 ((void *)0x00200200 + POISON_POINTER_DELTA) + +struct list_head { + struct list_head *prev, *next; +}; + +#define LIST_HEAD_INIT(name) \ + { \ + &(name), &(name) \ + } +#define LIST_HEAD(name) struct list_head name = LIST_HEAD_INIT(name) + +static inline void INIT_LIST_HEAD(struct list_head *list) +{ + list->next = list; + list->prev = list; +} + +static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +static inline void __list_del(struct list_head *prev, struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +static inline void __list_del_entry(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = LIST_POISON1; + entry->prev = LIST_POISON2; +} + +static inline void list_replace(struct list_head *old, struct list_head *new) +{ + new->next = old->next; + new->next->prev = new; + new->prev = old->prev; + new->prev->next = new; +} + +static inline void list_replace_init(struct list_head *old, struct list_head *new) +{ + list_replace(old, new); + INIT_LIST_HEAD(old); +} + +static inline void list_del_init(struct list_head *entry) +{ + __list_del_entry(entry); + INIT_LIST_HEAD(entry); +} + +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del_entry(list); + list_add(list, head); +} + +static inline void list_move_tail(struct list_head *list, struct list_head *head) +{ + __list_del_entry(list); + list_add_tail(list, head); +} + +static inline int list_is_last(const struct list_head *list, const struct list_head *head) +{ + return list->next == head; +} + +static inline int list_is_first(const struct list_head *list, const struct list_head *head) +{ + return list->prev == head; +} + +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} + +static inline int list_empty_careful(const struct list_head *head) +{ + struct list_head *next = head->next; + return (next == head) && (next == head->prev); +} +static inline void list_rotate_left(struct list_head *head) +{ + struct list_head *first; + + if (!list_empty(head)) { + first = head->next; + list_move_tail(first, head); + } +} + +static inline int list_is_singular(const struct list_head *head) +{ + return !list_empty(head) && (head->next == head->prev); +} + +static inline void __list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) +{ + struct list_head *new_first = entry->next; + list->next = head->next; + list->next->prev = list; + list->prev = entry; + entry->next = list; + head->next = new_first; + new_first->prev = head; +} + +static inline void list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) +{ + if (list_empty(head)) + return; + if (list_is_singular(head) && (head->next != entry && head != entry)) + return; + if (entry == head) + INIT_LIST_HEAD(list); + else + __list_cut_position(list, head, entry); +} + +static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + + first->prev = prev; + prev->next = first; + + last->next = next; + next->prev = last; +} + +static inline void list_splice(const struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head, head->next); +} + +static inline void list_splice_tail(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head->prev, head); +} + +static inline void list_splice_init(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head, head->next); + INIT_LIST_HEAD(list); + } +} + +static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head->prev, head); + INIT_LIST_HEAD(list); + } +} + +#define list_entry(ptr, type, member) container_of(ptr, type, member) + +#define list_first_entry(ptr, type, member) list_entry((ptr)->next, type, member) + +#define list_for_each(pos, head) for (pos = (head)->next; pos != (head); pos = pos->next) + +#define list_for_each_prev(pos, head) for (pos = (head)->prev; pos != (head); pos = pos->prev) + +#define list_for_each_safe(pos, n, head) for (pos = (head)->next, n = pos->next; pos != (head); pos = n, n = pos->next) + +#define list_for_each_prev_safe(pos, n, head) \ + for (pos = (head)->prev, n = pos->prev; pos != (head); pos = n, n = pos->prev) + +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member); &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + +#define list_prepare_entry(pos, head, member) ((pos) ?: list_entry(head, typeof(*pos), member)) + +#define list_for_each_entry_continue(pos, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member); &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +#define list_for_each_entry_continue_reverse(pos, head, member) \ + for (pos = list_entry(pos->member.prev, typeof(*pos), member); &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + +#define list_for_each_entry_from(pos, head, member) \ + for (; &pos->member != (head); pos = list_entry(pos->member.next, typeof(*pos), member)) + +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +#define list_for_each_entry_safe_continue(pos, n, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +#define list_for_each_entry_safe_from(pos, n, head, member) \ + for (n = list_entry(pos->member.next, typeof(*pos), member); &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +#define list_for_each_entry_safe_reverse(pos, n, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member), \ + n = list_entry(pos->member.prev, typeof(*pos), member); \ + &pos->member != (head); pos = n, n = list_entry(n->member.prev, typeof(*n), member)) + +#define list_safe_reset_next(pos, n, member) n = list_entry(pos->member.next, typeof(*pos), member) + +/* + * Double linked lists with a single pointer list head. + */ + +struct hlist_head { + struct hlist_node *first; +}; + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +#define HLIST_HEAD_INIT \ + { \ + .first = NULL \ + } +#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } +#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) + +static inline void INIT_HLIST_NODE(struct hlist_node *h) +{ + h->next = NULL; + h->pprev = NULL; +} + +static inline int hlist_unhashed(const struct hlist_node *h) +{ + return !h->pprev; +} + +static inline int hlist_empty(const struct hlist_head *h) +{ + return !h->first; +} + +static inline void __hlist_del(struct hlist_node *n) +{ + struct hlist_node *next = n->next; + struct hlist_node **pprev = n->pprev; + *pprev = next; + if (next) + next->pprev = pprev; +} + +static inline void hlist_del(struct hlist_node *n) +{ + __hlist_del(n); + n->next = LIST_POISON1; + n->pprev = LIST_POISON2; +} + +static inline void hlist_del_init(struct hlist_node *n) +{ + if (!hlist_unhashed(n)) { + __hlist_del(n); + INIT_HLIST_NODE(n); + } +} + +static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) +{ + struct hlist_node *first = h->first; + n->next = first; + if (first) + first->pprev = &n->next; + h->first = n; + n->pprev = &h->first; +} + +/* next must be != NULL */ +static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) +{ + n->pprev = next->pprev; + n->next = next; + next->pprev = &n->next; + *(n->pprev) = n; +} + +static inline void hlist_add_after(struct hlist_node *n, struct hlist_node *next) +{ + next->next = n->next; + n->next = next; + next->pprev = &n->next; + + if (next->next) + next->next->pprev = &next->next; +} + +/* after that we'll appear to be on some hlist and hlist_del will work */ +static inline void hlist_add_fake(struct hlist_node *n) +{ + n->pprev = &n->next; +} + +/* + * Move a list from one list head to another. Fixup the pprev + * reference of the first entry if it exists. + */ +static inline void hlist_move_list(struct hlist_head *old, struct hlist_head *new) +{ + new->first = old->first; + if (new->first) + new->first->pprev = &new->first; + old->first = NULL; +} + +#define hlist_entry(ptr, type, member) container_of(ptr, type, member) + +#define hlist_for_each(pos, head) for (pos = (head)->first; pos; pos = pos->next) + +#define hlist_for_each_safe(pos, n, head) \ + for (pos = (head)->first; pos && ({ \ + n = pos->next; \ + 1; \ + }); \ + pos = n) + +#define hlist_entry_safe(ptr, type, member) (ptr) ? hlist_entry(ptr, type, member) : NULL + +#define hlist_for_each_entry(pos, head, member) \ + for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member); pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +#define hlist_for_each_entry_continue(pos, member) \ + for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member); pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +#define hlist_for_each_entry_from(pos, member) \ + for (; pos; pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + +#define hlist_for_each_entry_safe(pos, n, head, member) \ + for (pos = hlist_entry_safe((head)->first, typeof(*pos), member); pos && ({ \ + n = pos->member.next; \ + 1; \ + }); \ + pos = hlist_entry_safe(n, typeof(*pos), member)) + +#endif /* __ZDTM_LIST_H__ */ diff --git a/test/zdtm/lib/lock.h b/test/zdtm/lib/lock.h index 2b23550be..cc5306e06 100644 --- a/test/zdtm/lib/lock.h +++ b/test/zdtm/lib/lock.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "asm/atomic.h" #define BUG_ON(condition) \ diff --git a/test/zdtm/lib/mountinfo.c b/test/zdtm/lib/mountinfo.c new file mode 100644 index 000000000..d6ab67a3f --- /dev/null +++ b/test/zdtm/lib/mountinfo.c @@ -0,0 +1,490 @@ +#include +#include + +#include "mountinfo.h" +#include "fs.h" +#include "xmalloc.h" + +/* + * mountinfo contains mangled paths. space, tab and back slash were replaced + * with usual octal escape. This function replaces these symbols back. + */ +static void cure_path(char *path) +{ + int i, len, off = 0; + + if (strchr(path, '\\') == NULL) /* fast path */ + return; + + len = strlen(path); + for (i = 0; i < len; i++) { + if (!strncmp(path + i, "\\040", 4)) { + path[i - off] = ' '; + goto replace; + } else if (!strncmp(path + i, "\\011", 4)) { + path[i - off] = '\t'; + goto replace; + } else if (!strncmp(path + i, "\\134", 4)) { + path[i - off] = '\\'; + goto replace; + } + if (off) + path[i - off] = path[i]; + continue; + replace: + off += 3; + i += 3; + } + path[len - off] = 0; +} + +static struct mountinfo_zdtm *mountinfo_zdtm_alloc(struct mntns_zdtm *mntns) +{ + struct mountinfo_zdtm *new; + + new = xzalloc(sizeof(struct mountinfo_zdtm)); + if (new) + list_add_tail(&new->list, &mntns->mountinfo_list); + return new; +} + +static void mountinfo_zdtm_free(struct mountinfo_zdtm *mountinfo) +{ + list_del(&mountinfo->list); + xfree(mountinfo->mountpoint); + xfree(mountinfo->root); + xfree(mountinfo->fstype); + xfree(mountinfo); +} + +static void mountinfo_zdtm_free_all(struct mntns_zdtm *mntns) +{ + struct mountinfo_zdtm *mountinfo, *tmp; + + list_for_each_entry_safe(mountinfo, tmp, &mntns->mountinfo_list, list) + mountinfo_zdtm_free(mountinfo); +} + +#define BUF_SIZE 4096 +char buf[BUF_SIZE]; + +int mntns_parse_mountinfo(struct mntns_zdtm *mntns) +{ + FILE *f; + int ret; + + INIT_LIST_HEAD(&mntns->mountinfo_list); + + f = fopen("/proc/self/mountinfo", "r"); + if (!f) { + pr_perror("Failed to open mountinfo"); + return -1; + } + + while (fgets(buf, BUF_SIZE, f)) { + struct mountinfo_zdtm *new; + unsigned int kmaj, kmin; + char *str, *hyphen, *shared, *master; + int n; + + new = mountinfo_zdtm_alloc(mntns); + if (!new) { + pr_perror("Failed to alloc mountinfo_zdtm"); + goto free; + } + + ret = sscanf(buf, "%i %i %u:%u %ms %ms %*s %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, + &new->root, &new->mountpoint, &n); + if (ret != 6) { + pr_perror("Failed to parse mountinfo line \"%s\"", buf); + goto free; + } + cure_path(new->root); + cure_path(new->mountpoint); + new->s_dev = MKKDEV(kmaj, kmin); + + str = buf + n; + hyphen = strstr(buf, " - "); + if (!hyphen) { + pr_perror("Failed to find \" - \" in mountinfo line \"%s\"", buf); + goto free; + } + *hyphen++ = '\0'; + + shared = strstr(str, "shared:"); + if (shared) + new->shared_id = atoi(shared + 7); + master = strstr(str, "master:"); + if (master) + new->master_id = atoi(master + 7); + + ret = sscanf(hyphen, "- %ms", &new->fstype); + if (ret != 1) { + pr_perror("Failed to parse fstype in mountinfo tail \"%s\"", hyphen); + goto free; + } + } + + fclose(f); + return 0; +free: + mountinfo_zdtm_free_all(mntns); + fclose(f); + return -1; +} + +static struct mountinfo_topology *mountinfo_topology_alloc(struct mntns_zdtm *mntns, struct mountinfo_zdtm *mountinfo) +{ + struct mountinfo_topology *new; + + new = xzalloc(sizeof(struct mountinfo_topology)); + if (new) { + new->mountinfo = mountinfo; + new->topology_id = -1; + INIT_LIST_HEAD(&new->children); + INIT_LIST_HEAD(&new->siblings); + list_add_tail(&new->list, &mntns->topology_list); + INIT_LIST_HEAD(&new->sharing_list); + } + return new; +} + +static void mountinfo_topology_free(struct mountinfo_topology *topology) +{ + list_del(&topology->list); + xfree(topology); +} + +static void mountinfo_topology_free_all(struct mntns_zdtm *mntns) +{ + struct mountinfo_topology *topology, *tmp; + + list_for_each_entry_safe(topology, tmp, &mntns->topology_list, list) + mountinfo_topology_free(topology); +} + +static struct mountinfo_topology *mountinfo_topology_lookup_parent(struct mntns_zdtm *mntns, + struct mountinfo_topology *topology) +{ + struct mountinfo_topology *parent; + + list_for_each_entry(parent, &mntns->topology_list, list) { + if (parent->mountinfo->mnt_id == topology->mountinfo->parent_mnt_id) + return parent; + } + + return NULL; +} + +static struct mountinfo_topology *mt_subtree_next(struct mountinfo_topology *mt, struct mountinfo_topology *root) +{ + if (!list_empty(&mt->children)) + return list_entry(mt->children.next, struct mountinfo_topology, siblings); + + while (mt->parent && mt != root) { + if (mt->siblings.next == &mt->parent->children) + mt = mt->parent; + else + return list_entry(mt->siblings.next, struct mountinfo_topology, siblings); + } + + return NULL; +} + +static void __mt_resort_siblings(struct mountinfo_topology *parent) +{ + LIST_HEAD(list); + + while (!list_empty(&parent->children)) { + struct mountinfo_topology *m, *p; + + m = list_first_entry(&parent->children, struct mountinfo_topology, siblings); + list_del(&m->siblings); + + list_for_each_entry(p, &list, siblings) + if (strcmp(p->mountinfo->mountpoint, m->mountinfo->mountpoint) < 0) + break; + + list_add_tail(&m->siblings, &p->siblings); + } + + list_splice(&list, &parent->children); +} + +static void mntns_mt_resort_siblings(struct mntns_zdtm *mntns) +{ + struct mountinfo_topology *mt = mntns->tree; + LIST_HEAD(mtlist); + int i = 0; + + while (1) { + /* Assign topology id to mt in dfs order */ + mt->topology_id = i++; + list_move_tail(&mt->list, &mtlist); + __mt_resort_siblings(mt); + mt = mt_subtree_next(mt, mntns->tree); + if (!mt) + break; + } + + /* Update mntns->topology_list in dfs order */ + list_splice(&mtlist, &mntns->topology_list); +} + +static struct sharing_group *sharing_group_find_or_alloc(struct mntns_zdtm *mntns, int shared_id, int master_id, + unsigned int s_dev) +{ + struct sharing_group *sg; + + list_for_each_entry(sg, &mntns->sharing_groups_list, list) { + if ((sg->shared_id == shared_id) && (sg->master_id == master_id)) { + if (sg->s_dev != s_dev) { + pr_err("Sharing/devid inconsistency\n"); + return NULL; + } + return sg; + } + } + + sg = xzalloc(sizeof(struct sharing_group)); + if (!sg) + return NULL; + + sg->shared_id = shared_id; + sg->master_id = master_id; + sg->s_dev = s_dev; + sg->topology_id = -1; + + INIT_LIST_HEAD(&sg->children); + INIT_LIST_HEAD(&sg->siblings); + INIT_LIST_HEAD(&sg->mounts_list); + + list_add_tail(&sg->list, &mntns->sharing_groups_list); + + return sg; +} + +static void sharing_group_free(struct sharing_group *sg) +{ + list_del(&sg->list); + xfree(sg); +} + +static void sharing_group_free_all(struct mntns_zdtm *mntns) +{ + struct sharing_group *sg, *tmp; + + list_for_each_entry_safe(sg, tmp, &mntns->sharing_groups_list, list) + sharing_group_free(sg); +} + +static struct sharing_group *sharing_group_lookup_parent(struct mntns_zdtm *mntns, struct sharing_group *sg) +{ + struct sharing_group *parent; + + list_for_each_entry(parent, &mntns->sharing_groups_list, list) { + if (parent->shared_id == sg->master_id) + return parent; + } + + /* Create "external" sharing */ + parent = sharing_group_find_or_alloc(mntns, sg->master_id, 0, sg->s_dev); + if (parent) + return parent; + + return NULL; +} + +static int mntns_build_tree(struct mntns_zdtm *mntns) +{ + struct mountinfo_topology *topology, *parent, *tree = NULL; + struct mountinfo_zdtm *mountinfo; + struct sharing_group *sg, *sg_parent; + + INIT_LIST_HEAD(&mntns->topology_list); + + /* Prealloc mount tree */ + list_for_each_entry(mountinfo, &mntns->mountinfo_list, list) { + topology = mountinfo_topology_alloc(mntns, mountinfo); + if (!topology) + goto err; + } + + /* Build mount tree */ + list_for_each_entry(topology, &mntns->topology_list, list) { + parent = mountinfo_topology_lookup_parent(mntns, topology); + if (!parent) { + if (tree) { + pr_err("Bad mount tree with too roots %d and %d\n", tree->mountinfo->mnt_id, + parent->mountinfo->mnt_id); + goto err; + } + tree = topology; + } else { + topology->parent = parent; + list_add_tail(&topology->siblings, &parent->children); + } + } + mntns->tree = tree; + + /* Sort mounts by mountpoint */ + mntns_mt_resort_siblings(mntns); + + INIT_LIST_HEAD(&mntns->sharing_groups_list); + + /* Prealloc sharing groups */ + list_for_each_entry(topology, &mntns->topology_list, list) { + if (!topology->mountinfo->shared_id && !topology->mountinfo->master_id) + continue; + + /* + * Due to mntns->topology_list is sorted in dfs order + * sharing groups are also sorted the same + */ + sg = sharing_group_find_or_alloc(mntns, topology->mountinfo->shared_id, topology->mountinfo->master_id, + topology->mountinfo->s_dev); + if (!sg) + goto err; + + list_add_tail(&topology->sharing_list, &sg->mounts_list); + topology->sharing = sg; + + /* Set sharing group topology id to minimal topology id of it's mounts */ + if (sg->topology_id == -1 || topology->topology_id < sg->topology_id) + sg->topology_id = topology->topology_id; + } + + /* Build sharing group trees */ + list_for_each_entry(sg, &mntns->sharing_groups_list, list) { + if (sg->master_id) { + sg_parent = sharing_group_lookup_parent(mntns, sg); + sg->parent = sg_parent; + list_add(&sg->siblings, &sg_parent->children); + } + } + + return 0; +err: + mountinfo_topology_free_all(mntns); + sharing_group_free_all(mntns); + return -1; +} + +static int mountinfo_topology_list_compare(struct mntns_zdtm *mntns_a, struct mntns_zdtm *mntns_b) +{ + struct mountinfo_topology *topology_a, *topology_b; + + topology_a = list_first_entry(&mntns_a->topology_list, struct mountinfo_topology, list); + topology_b = list_first_entry(&mntns_b->topology_list, struct mountinfo_topology, list); + + while (&topology_a->list != &mntns_a->topology_list && &topology_b->list != &mntns_b->topology_list) { + if (topology_a->topology_id != topology_b->topology_id) { + pr_err("Mounts %d and %d have different topology id %d and %d\n", topology_a->mountinfo->mnt_id, + topology_b->mountinfo->mnt_id, topology_a->topology_id, topology_b->topology_id); + return -1; + } + + if (topology_a->parent && topology_b->parent) { + if (topology_a->parent->topology_id != topology_b->parent->topology_id) { + pr_err("Mounts %d and %d have different parent topology id %d and %d\n", + topology_a->mountinfo->mnt_id, topology_b->mountinfo->mnt_id, + topology_a->parent->topology_id, topology_b->parent->topology_id); + return -1; + } + } else if (topology_a->parent || topology_b->parent) { + pr_err("One of mounts %d and %d has parent and other doesn't\n", topology_a->mountinfo->mnt_id, + topology_b->mountinfo->mnt_id); + return -1; + } + + if (topology_a->sharing && topology_b->sharing) { + if (topology_a->sharing->topology_id != topology_b->sharing->topology_id) { + pr_err("Mounts %d and %d have different sharing topology id %d and %d\n", + topology_a->mountinfo->mnt_id, topology_b->mountinfo->mnt_id, + topology_a->sharing->topology_id, topology_b->sharing->topology_id); + return -1; + } + } else if (topology_a->sharing || topology_b->sharing) { + pr_err("One of mounts %d and %d has sharing and other doesn't\n", topology_a->mountinfo->mnt_id, + topology_b->mountinfo->mnt_id); + return -1; + } + + topology_a = list_entry(topology_a->list.next, struct mountinfo_topology, list); + topology_b = list_entry(topology_b->list.next, struct mountinfo_topology, list); + } + if (&topology_a->list != &mntns_a->topology_list || &topology_b->list != &mntns_b->topology_list) { + pr_err("Mount tree topology length mismatch\n"); + return -1; + } + + return 0; +} + +static int sharing_group_list_compare(struct mntns_zdtm *mntns_a, struct mntns_zdtm *mntns_b) +{ + struct sharing_group *sg_a, *sg_b; + + sg_a = list_first_entry(&mntns_a->sharing_groups_list, struct sharing_group, list); + sg_b = list_first_entry(&mntns_b->sharing_groups_list, struct sharing_group, list); + + while (&sg_a->list != &mntns_a->sharing_groups_list && &sg_b->list != &mntns_b->sharing_groups_list) { + if (sg_a->topology_id != sg_b->topology_id) { + pr_err("Sharings (%d,%d) and (%d,%d) have different sharing topology id %d and %d\n", + sg_a->shared_id, sg_a->master_id, sg_b->shared_id, sg_b->master_id, sg_a->topology_id, + sg_b->topology_id); + return -1; + } + + if (sg_a->parent && sg_b->parent) { + if (sg_a->parent->topology_id != sg_b->parent->topology_id) { + pr_err("Sharings (%d,%d) and (%d,%d) have different parent topology id %d and %d\n", + sg_a->shared_id, sg_a->master_id, sg_b->shared_id, sg_b->master_id, + sg_a->parent->topology_id, sg_b->parent->topology_id); + return -1; + } + } else if (sg_a->parent || sg_b->parent) { + pr_err("One of sharings (%d,%d) and (%d,%d) has parent and other doesn't\n", sg_a->shared_id, + sg_a->master_id, sg_b->shared_id, sg_b->master_id); + return -1; + } + + sg_a = list_entry(sg_a->list.next, struct sharing_group, list); + sg_b = list_entry(sg_b->list.next, struct sharing_group, list); + } + + if (&sg_a->list != &mntns_a->sharing_groups_list || &sg_b->list != &mntns_b->sharing_groups_list) { + pr_err("Mount tree sharing topology length mismatch\n"); + return -1; + } + + return 0; +} + +int mntns_compare(struct mntns_zdtm *mntns_a, struct mntns_zdtm *mntns_b) +{ + if (mntns_build_tree(mntns_a)) { + pr_err("Failed to build first mountinfo topology tree\n"); + return -1; + } + + if (mntns_build_tree(mntns_b)) { + pr_err("Failed to build second mountinfo topology tree\n"); + return -1; + } + + if (mountinfo_topology_list_compare(mntns_a, mntns_b)) + return -1; + + if (sharing_group_list_compare(mntns_a, mntns_b)) + return -1; + + return 0; +} + +void mntns_free_all(struct mntns_zdtm *mntns) +{ + mountinfo_zdtm_free_all(mntns); + mountinfo_topology_free_all(mntns); + sharing_group_free_all(mntns); +} diff --git a/test/zdtm/lib/mountinfo.h b/test/zdtm/lib/mountinfo.h new file mode 100644 index 000000000..6d90e2c10 --- /dev/null +++ b/test/zdtm/lib/mountinfo.h @@ -0,0 +1,70 @@ +#ifndef __ZDTM_MOUNTINFO__ +#define __ZDTM_MOUNTINFO__ + +#include "list.h" + +struct mountinfo_zdtm { + int mnt_id; + int parent_mnt_id; + char *mountpoint; + char *root; + unsigned int s_dev; + int shared_id; + int master_id; + char *fstype; + + /* list of all mounts */ + struct list_head list; +}; + +struct mntns_zdtm { + struct list_head mountinfo_list; + struct list_head topology_list; + struct mountinfo_topology *tree; + struct list_head sharing_groups_list; +}; + +#define MNTNS_ZDTM_INIT(name) \ + { \ + .mountinfo_list = LIST_HEAD_INIT(name.mountinfo_list), \ + .topology_list = LIST_HEAD_INIT(name.topology_list), \ + .sharing_groups_list = LIST_HEAD_INIT(name.sharing_groups_list), \ + } +#define MNTNS_ZDTM(name) struct mntns_zdtm name = MNTNS_ZDTM_INIT(name) + +struct sharing_group { + int shared_id; + int master_id; + unsigned int s_dev; + + struct sharing_group *parent; + struct list_head children; + struct list_head siblings; + + int topology_id; + + struct list_head mounts_list; + + struct list_head list; +}; + +struct mountinfo_topology { + struct mountinfo_zdtm *mountinfo; + + struct mountinfo_topology *parent; + struct list_head children; + struct list_head siblings; + + int topology_id; + + struct sharing_group *sharing; + struct list_head sharing_list; + + struct list_head list; +}; + +extern int mntns_parse_mountinfo(struct mntns_zdtm *mntns); +extern void mntns_free_all(struct mntns_zdtm *mntns); +extern int mntns_compare(struct mntns_zdtm *mntns_a, struct mntns_zdtm *mntns_b); + +#endif diff --git a/test/zdtm/lib/msg.c b/test/zdtm/lib/msg.c index 1cf92e3e0..9ba1c47a4 100644 --- a/test/zdtm/lib/msg.c +++ b/test/zdtm/lib/msg.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -55,7 +56,7 @@ void test_msg(const char *format, ...) off += strftime(buf, sizeof(buf), "%H:%M:%S", tm); } - off += sprintf(buf + off, ".%.3ld: ", tv.tv_usec / 1000); + off += sprintf(buf + off, ".%.3" PRId64 ": ", (int64_t)(tv.tv_usec / 1000)); off += sprintf(buf + off, "%5d: ", getpid()); skip: diff --git a/test/zdtm/lib/ns.c b/test/zdtm/lib/ns.c index 6f6cccc99..822e09c92 100644 --- a/test/zdtm/lib/ns.c +++ b/test/zdtm/lib/ns.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -27,8 +28,9 @@ extern int pivot_root(const char *new_root, const char *put_old); static int prepare_mntns(void) { int dfd, ret; - char *root, *criu_path; + char *root, *criu_path, *dev_path, *zdtm_bind; char path[PATH_MAX]; + char bind_path[PATH_MAX]; root = getenv("ZDTM_ROOT"); if (!root) { @@ -51,6 +53,34 @@ static int prepare_mntns(void) return -1; } + zdtm_bind = getenv("ZDTM_BIND"); + if (zdtm_bind) { + /* + * Bindmount the directory to itself. + * e.g.: The mnt_ro_root test makes "/" mount readonly, but we + * still want to write logs to /zdtm/static/ so let's make it + * separate writable bind mount. + */ + snprintf(bind_path, sizeof(bind_path), "%s/%s", root, zdtm_bind); + if (mount(bind_path, bind_path, NULL, MS_BIND, NULL)) { + fprintf(stderr, "Can't bind-mount ZDTM_BIND: %m\n"); + return -1; + } + } + + dev_path = getenv("ZDTM_DEV"); + if (dev_path) { + snprintf(path, sizeof(path), "%s/dev", root); + if (mount(dev_path, path, NULL, MS_BIND, NULL)) { + pr_perror("Unable to mount %s", path); + return -1; + } + if (mount(NULL, path, NULL, MS_PRIVATE, NULL)) { + pr_perror("Unable to mount %s", path); + return -1; + } + } + criu_path = getenv("ZDTM_CRIU"); if (criu_path) { snprintf(path, sizeof(path), "%s%s", root, criu_path); @@ -218,7 +248,7 @@ static inline int _settime(clockid_t clk_id, time_t offset) if (clk_id == CLOCK_MONOTONIC_COARSE || clk_id == CLOCK_MONOTONIC_RAW) clk_id = CLOCK_MONOTONIC; - len = snprintf(buf, sizeof(buf), "%d %ld 0", clk_id, offset); + len = snprintf(buf, sizeof(buf), "%d %" PRId64 " 0", clk_id, (int64_t)offset); fd = open("/proc/self/timens_offsets", O_WRONLY); if (fd < 0) { diff --git a/test/zdtm/lib/sysctl.c b/test/zdtm/lib/sysctl.c index 9583ec3df..3b1ebc168 100644 --- a/test/zdtm/lib/sysctl.c +++ b/test/zdtm/lib/sysctl.c @@ -3,6 +3,49 @@ #include "zdtmtst.h" #include "sysctl.h" +int sysctl_read_str(const char *name, char *data, size_t size) +{ + int fd, ret; + + fd = open(name, O_RDONLY); + if (fd < 0) { + pr_perror("Can't open %s", name); + return -1; + } + + ret = read(fd, data, size - 1); + if (ret < 0) { + pr_perror("Can't read %s", name); + close(fd); + return -1; + } + data[ret] = '\0'; + close(fd); + + return 0; +} + +int sysctl_write_str(const char *name, char *data) +{ + int fd, ret; + + fd = open(name, O_WRONLY); + if (fd < 0) { + pr_perror("Can't open %s", name); + return -1; + } + + ret = write(fd, data, strlen(data)); + if (ret < 0) { + pr_perror("Can't write %s into %s", data, name); + close(fd); + return -1; + } + close(fd); + + return 0; +} + int sysctl_read_int(const char *name, int *data) { int fd; diff --git a/test/zdtm/lib/sysctl.h b/test/zdtm/lib/sysctl.h index 67129102f..d435bd7e9 100644 --- a/test/zdtm/lib/sysctl.h +++ b/test/zdtm/lib/sysctl.h @@ -3,5 +3,7 @@ extern int sysctl_read_int(const char *name, int *data); extern int sysctl_write_int(const char *name, int val); +extern int sysctl_read_str(const char *name, char *data, size_t size); +extern int sysctl_write_str(const char *name, char *data); #endif diff --git a/test/zdtm/lib/test.c b/test/zdtm/lib/test.c index 57eb42046..95017e42e 100644 --- a/test/zdtm/lib/test.c +++ b/test/zdtm/lib/test.c @@ -239,34 +239,37 @@ void test_init(int argc, char **argv) exit(1); } - val = getenv("ZDTM_GROUPS"); - if (val) { - char *tok = NULL; - unsigned int size = 0, groups[NGROUPS_MAX]; + val = getenv("ZDTM_ROOTLESS"); + if (!val) { + val = getenv("ZDTM_GROUPS"); + if (val) { + char *tok = NULL; + unsigned int size = 0, groups[NGROUPS_MAX]; - tok = strtok(val, " "); - while (tok) { - size++; - groups[size - 1] = atoi(tok); - tok = strtok(NULL, " "); + tok = strtok(val, " "); + while (tok) { + size++; + groups[size - 1] = atoi(tok); + tok = strtok(NULL, " "); + } + + if (setgroups(size, groups)) { + fprintf(stderr, "Can't set groups: %m"); + exit(1); + } } - if (setgroups(size, groups)) { - fprintf(stderr, "Can't set groups: %m"); + val = getenv("ZDTM_GID"); + if (val && (setgid(atoi(val)) == -1)) { + fprintf(stderr, "Can't set gid: %m"); exit(1); } - } - val = getenv("ZDTM_GID"); - if (val && (setgid(atoi(val)) == -1)) { - fprintf(stderr, "Can't set gid: %m"); - exit(1); - } - - val = getenv("ZDTM_UID"); - if (val && (setuid(atoi(val)) == -1)) { - fprintf(stderr, "Can't set gid: %m"); - exit(1); + val = getenv("ZDTM_UID"); + if (val && (setuid(atoi(val)) == -1)) { + fprintf(stderr, "Can't set gid: %m"); + exit(1); + } } if (prctl(PR_SET_DUMPABLE, 1)) { @@ -403,7 +406,7 @@ pid_t sys_clone_unified(unsigned long flags, void *child_stack, void *parent_tid { #ifdef __x86_64__ return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, child_tid, newtls); -#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__) +#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__ || __loongarch64 || __riscv) return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, newtls, child_tid); #elif __s390x__ return (pid_t)syscall(__NR_clone, child_stack, flags, parent_tid, child_tid, newtls); diff --git a/test/zdtm/lib/unix.c b/test/zdtm/lib/unix.c index 49773dedd..288f1df24 100644 --- a/test/zdtm/lib/unix.c +++ b/test/zdtm/lib/unix.c @@ -5,7 +5,7 @@ int unix_fill_sock_name(struct sockaddr_un *name, char *relFilename) { - char *cwd; + cleanup_free char *cwd = NULL; if (get_cwd_check_perm(&cwd)) { pr_err("failed to get current working directory with valid permissions.\n"); diff --git a/test/zdtm/lib/xmalloc.h b/test/zdtm/lib/xmalloc.h new file mode 100644 index 000000000..95e0d4043 --- /dev/null +++ b/test/zdtm/lib/xmalloc.h @@ -0,0 +1,68 @@ +#ifndef __ZDTM_XMALLOC_H__ +#define __ZDTM_XMALLOC_H__ + +#include +#include + +#ifndef pr_err +#error "Macro pr_err is needed." +#endif + +#define __xalloc(op, size, ...) \ + ({ \ + void *___p = op(__VA_ARGS__); \ + if (!___p) \ + pr_err("%s: Can't allocate %li bytes\n", __func__, (long)(size)); \ + ___p; \ + }) + +#define xstrdup(str) __xalloc(strdup, strlen(str) + 1, str) +#define xmalloc(size) __xalloc(malloc, size, size) +#define xzalloc(size) __xalloc(calloc, size, 1, size) +#define xrealloc(p, size) __xalloc(realloc, size, p, size) + +#define xfree(p) free(p) + +#define xrealloc_safe(pptr, size) \ + ({ \ + int __ret = -1; \ + void *new = xrealloc(*pptr, size); \ + if (new) { \ + *pptr = new; \ + __ret = 0; \ + } \ + __ret; \ + }) + +#define xmemdup(ptr, size) \ + ({ \ + void *new = xmalloc(size); \ + if (new) \ + memcpy(new, ptr, size); \ + new; \ + }) + +#define memzero_p(p) memset(p, 0, sizeof(*p)) +#define memzero(p, size) memset(p, 0, size) + +/* + * Helper for allocating trees with single xmalloc. + * This one advances the void *pointer on s bytes and + * returns the previous value. Use like this + * + * m = xmalloc(total_size); + * a = xptr_pull(&m, tree_root_t); + * a->b = xptr_pull(&m, leaf_a_t); + * a->c = xptr_pull(&m, leaf_c_t); + * ... + */ +static inline void *xptr_pull_s(void **m, size_t s) +{ + void *ret = (*m); + (*m) += s; + return ret; +} + +#define xptr_pull(m, type) xptr_pull_s(m, sizeof(type)) + +#endif /* __CR_XMALLOC_H__ */ diff --git a/test/zdtm/lib/zdtmtst.h b/test/zdtm/lib/zdtmtst.h index ed7c23ee2..b0e25702e 100644 --- a/test/zdtm/lib/zdtmtst.h +++ b/test/zdtm/lib/zdtmtst.h @@ -126,11 +126,25 @@ extern int write_pidfile(int pid); /* message helpers */ extern int test_log_init(const char *outfile, const char *suffix); extern int zdtm_seccomp; -#define pr_err(format, arg...) test_msg("ERR: %s:%d: " format, __FILE__, __LINE__, ##arg) -#define pr_perror(format, arg...) \ - test_msg("ERR: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) -#define fail(format, arg...) \ - test_msg("FAIL: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) +#define pr_err(format, arg...) \ + ({ \ + test_msg("ERR: %s:%d: " format, __FILE__, __LINE__, ##arg); \ + 1; \ + }) + +#define pr_perror(format, arg...) \ + ({ \ + test_msg("ERR: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, \ + strerror(errno)); \ + 1; \ + }) + +#define fail(format, arg...) \ + ({ \ + test_msg("FAIL: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, \ + strerror(errno)); \ + 1; \ + }) #define skip(format, arg...) test_msg("SKIP: %s:%d: " format "\n", __FILE__, __LINE__, ##arg) #define pass() test_msg("PASS\n") @@ -202,4 +216,13 @@ static inline void cleanup_closep(void *p) TEMP_FAILURE_RETRY(close(*pp)); } +extern int write_value(const char *path, const char *value); +extern int read_value(const char *path, char *value, int size); + +#define container_of(ptr, type, member) \ + ({ \ + const typeof(((type *)0)->member) *__mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); \ + }) + #endif /* _VIMITESU_H_ */ diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index 4a93659d4..e1df2e5fa 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -8,6 +8,7 @@ TST_NOFILE := \ sleeping00 \ pid00 \ caps00 \ + caps01 \ wait00 \ zombie00 \ zombie01 \ @@ -23,6 +24,7 @@ TST_NOFILE := \ sse20 \ mprotect00 \ timers \ + timers01 \ timerfd \ unbound_sock \ sched_prio00 \ @@ -35,6 +37,8 @@ TST_NOFILE := \ socket_udp-corked \ socket6_udp \ socket_udp_shutdown \ + socket_icmp \ + socket6_icmp \ sk-freebind \ sk-freebind-false \ socket_udplite \ @@ -53,15 +57,24 @@ TST_NOFILE := \ shm \ shm-mp \ ptrace_sig \ + pidfd_self \ + pidfd_of_thread \ + pidfd_dead \ + pidfd_diffdead \ + pidfd_child \ + pidfd_kill \ + fd_from_pidfd \ pipe00 \ pipe01 \ pipe02 \ pthread00 \ + pthread00-pac \ pthread01 \ pthread02 \ pthread_timers \ pthread_timers_h \ rseq00 \ + membarrier \ vdso00 \ vdso01 \ vdso02 \ @@ -84,7 +97,8 @@ TST_NOFILE := \ socket-tcp4v6 \ socket-tcp-local \ socket-tcp-reuseport \ - socket-tcp-nfconntrack \ + socket-tcp-ipt-nfconntrack \ + socket-tcp-nft-nfconntrack \ socket-tcp6-local \ socket-tcp4v6-local \ socket-tcpbuf \ @@ -123,6 +137,10 @@ TST_NOFILE := \ sock_opts00 \ sock_opts01 \ sock_opts02 \ + sock_ip_opts00 \ + sock_ip_opts01 \ + sock_tcp_opts00 \ + sock_tcp_opts01 \ sk-unix-unconn \ sk-unix-unconn-seqpacket \ ipc_namespace \ @@ -134,6 +152,7 @@ TST_NOFILE := \ maps05 \ maps09 \ maps10 \ + maps11 \ mlock_setuid \ xids00 \ groups \ @@ -184,6 +203,8 @@ TST_NOFILE := \ stopped01 \ stopped02 \ stopped12 \ + stopped03 \ + stopped04 \ rtc \ clean_mntns \ mntns_rw_ro_rw \ @@ -199,6 +220,7 @@ TST_NOFILE := \ scm04 \ scm05 \ scm06 \ + scm09 \ aio00 \ aio01 \ fd \ @@ -210,6 +232,7 @@ TST_NOFILE := \ seccomp_filter_tsync \ seccomp_filter_threads \ seccomp_filter_inheritance \ + seccomp_no_new_privs \ different_creds \ vsx \ bridge \ @@ -251,6 +274,8 @@ TST_NOFILE := \ memfd02 \ memfd02-hugetlb \ memfd03 \ + memfd04 \ + memfd05 \ shmemfd \ shmemfd-priv \ time \ @@ -260,19 +285,24 @@ TST_NOFILE := \ sigtrap \ sigtrap01 \ change_mnt_context \ + fd_offset \ # jobctl00 \ PKG_CONFIG ?= pkg-config pkg-config-check = $(shell sh -c '$(PKG_CONFIG) $(1) && echo y') +pkg-config-atleast-version = $(shell sh -c '$(PKG_CONFIG) --atleast-version=$(2) $(1) && echo y') ifeq ($(call pkg-config-check,libbpf),y) TST_NOFILE += \ bpf_hash \ - bpf_array + bpf_array endif ifneq ($(ARCH),arm) ifneq ($(COMPAT_TEST),y) - TST_NOFILE += maps03 + TST_NOFILE += maps03 +ifeq ($(call pkg-config-atleast-version,libtracefs,1.7),y) + TST_NOFILE += uprobes +endif endif endif @@ -289,6 +319,7 @@ TST_FILE = \ write_read02 \ write_read10 \ maps00 \ + maps12 \ link10 \ file_attr \ deleted_unix_sock \ @@ -303,6 +334,10 @@ TST_FILE = \ ghost_holes00 \ ghost_holes01 \ ghost_holes02 \ + ghost_holes_large00 \ + ghost_holes_large01 \ + ghost_multi_hole00 \ + ghost_multi_hole01 \ unlink_largefile \ mtime_mmap \ fifo \ @@ -347,6 +382,12 @@ TST_FILE = \ socket_close_data01 \ fifo_upon_unix_socket00 \ fifo_upon_unix_socket01 \ + sk-unix-listen01 \ + sk-unix-listen02 \ + sk-unix-listen03 \ + sk-unix-listen04 \ + sk-unix-restore-fs-share \ + mnt_ext_file_bind_auto \ TST_DIR = \ cwd00 \ @@ -377,10 +418,13 @@ TST_DIR = \ cgroup02 \ cgroup03 \ cgroup04 \ + cgroupv2_00 \ + cgroupv2_01 \ cgroup_ifpriomap \ cgroup_ignore \ cgroup_stray \ cgroup_yard \ + cgroup_threads \ unlink_fstat04 \ unlink_fstat041 \ mntns_remap \ @@ -389,6 +433,7 @@ TST_DIR = \ mntns_ghost \ mntns_ghost01 \ mntns_ro_root \ + mnt_ro_root \ mntns_link_ghost \ mntns_shared_bind \ mntns_shared_bind02 \ @@ -464,35 +509,41 @@ STATE_OUT = $(TST_STATE:%=%.out) include ../Makefile.inc +ifeq ($(ARCH),aarch64) + PAC_CFLAGS := -mbranch-protection=standard +else + PAC_CFLAGS := +endif + all: $(TST) criu-rtc.so install: all .PHONY: all install $(TST_NOFILE:%=%.pid): %.pid: % - $(> .gitignore $(Q)echo $(@:%.c=%.h) >> .gitignore $(E) " PBCC " $@ - $(Q)protoc-c --proto_path=. --c_out=. criu-rtc.proto + $(Q)protoc --proto_path=. --c_out=. criu-rtc.proto criu-rtc.so: criu-rtc.c criu-rtc.pb-c.c $(E) " LD " $@ diff --git a/test/zdtm/static/apparmor.c b/test/zdtm/static/apparmor.c index 713ffaa46..dc1636821 100644 --- a/test/zdtm/static/apparmor.c +++ b/test/zdtm/static/apparmor.c @@ -59,7 +59,7 @@ int checkprofile(void) return -1; } - len = fscanf(f, "%[^ \n]s", profile); + len = fscanf(f, "%1023[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/apparmor_stacking.c b/test/zdtm/static/apparmor_stacking.c index 76de8b8b4..0bc36048c 100644 --- a/test/zdtm/static/apparmor_stacking.c +++ b/test/zdtm/static/apparmor_stacking.c @@ -56,7 +56,7 @@ static int checkprofile(pid_t pid, char *expected) return -1; } - len = fscanf(f, "%[^ \n]s", profile); + len = fscanf(f, "%1023[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/caps01.c b/test/zdtm/static/caps01.c new file mode 100644 index 000000000..0f8a7101e --- /dev/null +++ b/test/zdtm/static/caps01.c @@ -0,0 +1,168 @@ +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that CapAmb are preserved"; +const char *test_author = "Liu Chao "; + +struct cap_hdr { + unsigned int version; + int pid; +}; + +struct cap_data { + unsigned int eff; + unsigned int prm; + unsigned int inh; +}; + +#define _LINUX_CAPABILITY_VERSION_3 0x20080522 +#define _LINUX_CAPABILITY_U32S_3 2 +#define CAP_DAC_OVERRIDE 1 +#define PR_CAP_AMBIENT 47 +#define PR_CAP_AMBIENT_IS_SET 1 +#define PR_CAP_AMBIENT_RAISE 2 +#define PR_CAP_AMBIENT_LOWER 3 + +int capget(struct cap_hdr *hdrp, struct cap_data *datap); +int capset(struct cap_hdr *hdrp, const struct cap_data *datap); + +static int cap_last_cap = 63; + +int main(int argc, char **argv) +{ + task_waiter_t t; + int pid, result_pipe[2]; + unsigned int amb[_LINUX_CAPABILITY_U32S_3]; + unsigned int amb_2[_LINUX_CAPABILITY_U32S_3]; + char res = 'x'; + FILE *f; + + test_init(argc, argv); + task_waiter_init(&t); + + f = fopen("/proc/sys/kernel/cap_last_cap", "r"); + if (f) { + if (fscanf(f, "%d", &cap_last_cap) != 1) { + pr_perror("Unable to read cal_last_cap"); + fclose(f); + return 1; + } + fclose(f); + } else + test_msg("/proc/sys/kernel/cap_last_cap is not available\n"); + + if (pipe(result_pipe)) { + pr_perror("Can't create pipe"); + return 1; + } + + pid = test_fork(); + if (pid == 0) { + int b, i, ret; + struct cap_hdr hdr; + struct cap_data data[_LINUX_CAPABILITY_U32S_3]; + + hdr.version = _LINUX_CAPABILITY_VERSION_3; + hdr.pid = 0; + + if (capget(&hdr, data) < 0) { + pr_perror("capget"); + return -1; + } + + hdr.version = _LINUX_CAPABILITY_VERSION_3; + hdr.pid = 0; + + data[0].eff &= ~((1 << CAP_CHOWN) | (1 << CAP_DAC_OVERRIDE)); + data[0].prm &= ~(1 << CAP_DAC_OVERRIDE); + data[0].inh = data[0].prm; + data[1].inh = data[1].prm; + + if (capset(&hdr, data) < 0) { + pr_perror("capset"); + return -1; + } + + for (b = 0; b < _LINUX_CAPABILITY_U32S_3; b++) { + amb[b] = data[b].prm; + for (i = 0; i < 32; i++) { + if (b * 32 + i > cap_last_cap) + break; + if ((amb[b] & (1 << i)) > 0) + ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i + b * 32, 0, 0); + else + ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_LOWER, i + b * 32, 0, 0); + if (ret) { + pr_perror("Unable to set ambient capability %d to %d: %d", i + b * 32, amb[b] & (1 << i), ret); + return -1; + } + } + } + + task_waiter_complete_current(&t); + task_waiter_wait4(&t, getppid()); + + for (b = 0; b < _LINUX_CAPABILITY_U32S_3; b++) { + amb_2[b] = 0; + for (i = 0; i < 32; i++) { + if (b * 32 + i > cap_last_cap) + break; + ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, i + b * 32, 0, 0); + if (ret < 0) { + pr_perror("Unable to read ambient capability %d: %d", i + b * 32, ret); + goto bad; + } + + amb_2[b] |= (ret << i); + } + } + + for (b = 0; b < _LINUX_CAPABILITY_U32S_3; b++) { + if (amb[b] != amb_2[b]) { + res = '1'; + goto bad; + } + } + + res = '0'; + bad: + write(result_pipe[1], &res, 1); + + if (res != '0') { + write(result_pipe[1], amb, sizeof(amb)); + write(result_pipe[1], amb_2, sizeof(amb_2)); + } + + close(result_pipe[0]); + close(result_pipe[1]); + _exit(0); + } + + task_waiter_wait4(&t, pid); + + test_daemon(); + test_waitsig(); + + task_waiter_complete_current(&t); + + read(result_pipe[0], &res, 1); + + if (res == '0') + pass(); + else { + read(result_pipe[0], amb, sizeof(amb)); + read(result_pipe[0], amb_2, sizeof(amb_2)); + test_msg("amb[]=%08x, %08x\n", amb[0], amb[1]); + test_msg("amb[]=%08x, %08x\n", amb_2[0], amb_2[1]); + fail("Fail: %c", res); + } + close(result_pipe[0]); + close(result_pipe[1]); + + return 0; +} diff --git a/test/zdtm/static/caps01.desc b/test/zdtm/static/caps01.desc new file mode 100644 index 000000000..2eac7e654 --- /dev/null +++ b/test/zdtm/static/caps01.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/test/zdtm/static/cgroup00.desc b/test/zdtm/static/cgroup00.desc index 3c6c4a7e2..42a3f2b73 100644 --- a/test/zdtm/static/cgroup00.desc +++ b/test/zdtm/static/cgroup00.desc @@ -1 +1 @@ -{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} +{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroup01.c b/test/zdtm/static/cgroup01.c index bc8515264..7bfb67762 100644 --- a/test/zdtm/static/cgroup01.c +++ b/test/zdtm/static/cgroup01.c @@ -79,7 +79,7 @@ int main(int argc, char **argv) if (!s) continue; - sscanf(paux, "%*d %*d %*d:%*d %*s %s", aux); + sscanf(paux, "%*d %*d %*d:%*d %*s %1023s", aux); test_msg("found cgroup at %s\n", aux); for (i = 0; i < 2; i++) { diff --git a/test/zdtm/static/cgroup01.desc b/test/zdtm/static/cgroup01.desc index 3c6c4a7e2..42a3f2b73 100644 --- a/test/zdtm/static/cgroup01.desc +++ b/test/zdtm/static/cgroup01.desc @@ -1 +1 @@ -{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} +{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroup02.c b/test/zdtm/static/cgroup02.c index 6229a8a08..8a925c0a4 100644 --- a/test/zdtm/static/cgroup02.c +++ b/test/zdtm/static/cgroup02.c @@ -75,7 +75,7 @@ bool test_exists(char *mountinfo_line, char *path) char aux[1024], paux[1024]; struct stat st; - sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %s", aux); + sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %1023s", aux); test_msg("found cgroup at %s\n", aux); ssprintf(paux, "%s/%s", aux, path); diff --git a/test/zdtm/static/cgroup02.desc b/test/zdtm/static/cgroup02.desc index df17a5789..eb5a9dd37 100644 --- a/test/zdtm/static/cgroup02.desc +++ b/test/zdtm/static/cgroup02.desc @@ -1,4 +1,4 @@ { 'dopts': '--manage-cgroups --cgroup-root name=zdtmtst:/prefix', - 'flags': 'suid', + 'flags': 'suid excl', 'flavor': 'h', 'ropts': '--manage-cgroups --cgroup-root /newroot --cgroup-root name=zdtmtst:/prefix'} diff --git a/test/zdtm/static/cgroup04.c b/test/zdtm/static/cgroup04.c index 5a424be12..f586a0628 100644 --- a/test/zdtm/static/cgroup04.c +++ b/test/zdtm/static/cgroup04.c @@ -17,45 +17,25 @@ const char *test_author = "Tycho Andersen "; char *dirname; TEST_OPTION(dirname, string, "cgroup directory name", 1); -static const char *cgname = "zdtmtst"; - -int write_value(const char *path, const char *value) -{ - int fd, l; - - fd = open(path, O_WRONLY); - if (fd < 0) { - pr_perror("open %s", path); - return -1; - } - - l = write(fd, value, strlen(value)); - close(fd); - if (l < 0) { - pr_perror("failed to write %s to %s", value, path); - return -1; - } - - return 0; -} +static const char *const cgname = "zdtmtst"; int mount_and_add(const char *controller, const char *path, const char *prop, const char *value) { char aux[1024], paux[1024], subdir[1024]; if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { - pr_perror("Can't make dir"); + pr_perror("Can't make dir %s", dirname); return -1; } sprintf(subdir, "%s/%s", dirname, controller); if (mkdir(subdir, 0700) < 0) { - pr_perror("Can't make dir"); + pr_perror("Can't make dir %s", subdir); return -1; } if (mount("none", subdir, "cgroup", 0, controller)) { - pr_perror("Can't mount cgroups"); + pr_perror("Can't mount cgroup controller %s at %s", controller, subdir); goto err_rd; } @@ -72,7 +52,8 @@ int mount_and_add(const char *controller, const char *path, const char *prop, co goto err_rs; ssprintf(paux, "%s/%s/special_prop_check", subdir, path); - mkdir(paux, 0600); + if (mkdir(paux, 0600) < 0) + pr_perror("Can't make dir %s", paux); return 0; err_rs: @@ -94,11 +75,11 @@ bool checkval(char *path, char *val) } n = read(fd, buf, sizeof(buf) - 1); + if (n < 0) + pr_perror("read %s", path); close(fd); - if (n < 0) { - pr_perror("read"); + if (n < 0) return false; - } buf[n] = 0; if (strcmp(val, buf)) { @@ -115,7 +96,7 @@ int main(int argc, char **argv) char buf[1024], path[PATH_MAX]; struct stat sb; - char *dev_allow[] = { + const char *const dev_allow[] = { "c *:* m", "b *:* m", "c 1:3 rwm", "c 1:5 rwm", "c 1:7 rwm", "c 5:0 rwm", "c 5:2 rwm", "c 1:8 rwm", "c 1:9 rwm", "c 136:* rwm", "c 10:229 rwm", }; @@ -146,12 +127,14 @@ int main(int argc, char **argv) sprintf(path, "%s/devices/%s/devices.list", dirname, cgname); if (!checkval(path, buf)) { + errno = 0; fail(); goto out; } sprintf(path, "%s/memory/%s/memory.limit_in_bytes", dirname, cgname); if (!checkval(path, "268435456\n")) { + errno = 0; fail(); goto out; } @@ -163,6 +146,7 @@ int main(int argc, char **argv) } if (!S_ISDIR(sb.st_mode)) { + errno = 0; fail("special_prop_check not a directory?"); goto out; } diff --git a/test/zdtm/static/cgroup04.checkskip b/test/zdtm/static/cgroup04.checkskip index 205f8fc53..1ccbada4d 100755 --- a/test/zdtm/static/cgroup04.checkskip +++ b/test/zdtm/static/cgroup04.checkskip @@ -1,3 +1,20 @@ #!/bin/bash +set -e -! test -f /sys/fs/cgroup/cgroup.controllers +test ! -f /sys/fs/cgroup/cgroup.controllers + +for ctl in devices memory; do + # Check that the controller is available. + + grep -q "^${ctl}\\s" /proc/cgroups + + # Check that the controller is not co-mounted with any other. + + # /proc/self/cgroup may have: + # "1:devices:/sys" + if ! grep -q "^[0-9]*:${ctl}:" /proc/self/cgroup; then + # but not eg: + # "1:devices,job:/sys" + grep -qE "^[0-9]*:([^:]*,)?${ctl}(,[^:]*)?:" /proc/self/cgroup && exit 1 + fi +done diff --git a/test/zdtm/static/cgroup_ifpriomap.checkskip b/test/zdtm/static/cgroup_ifpriomap.checkskip index 205f8fc53..f401ad1b2 100755 --- a/test/zdtm/static/cgroup_ifpriomap.checkskip +++ b/test/zdtm/static/cgroup_ifpriomap.checkskip @@ -1,3 +1,6 @@ #!/bin/bash +set -e -! test -f /sys/fs/cgroup/cgroup.controllers +test ! -f /sys/fs/cgroup/cgroup.controllers + +grep -q '^net_prio\s' /proc/cgroups diff --git a/test/zdtm/static/cgroup_stray.c b/test/zdtm/static/cgroup_stray.c index 0c0ed93cf..f5754410f 100644 --- a/test/zdtm/static/cgroup_stray.c +++ b/test/zdtm/static/cgroup_stray.c @@ -135,7 +135,7 @@ out: int main(int argc, char **argv) { int ret = -1, sk_pair[2], sk, status; - char path[PATH_MAX], c; + char path[PATH_MAX], c = 0; pid_t pid = 0; test_init(argc, argv); diff --git a/test/zdtm/static/cgroup_threads.c b/test/zdtm/static/cgroup_threads.c new file mode 100644 index 000000000..2c17e13a7 --- /dev/null +++ b/test/zdtm/static/cgroup_threads.c @@ -0,0 +1,184 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Check that cgroup layout of threads is preserved"; +const char *test_author = "Michał Cłapiński "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup directory name", 1); +static const char *cgname = "zdtmtst"; +#define SUBNAME "subcg_threads" +#define SUBNAME2 SUBNAME "/subsubcg" + +#define exit_group(code) syscall(__NR_exit_group, code) + +static int cg_move(char *name) +{ + int cgfd, l; + char paux[256]; + + sprintf(paux, "%s/%s", dirname, name); + if (mkdir(paux, 0600)) { + pr_perror("Can't create %s", paux); + return -1; + } + + sprintf(paux, "%s/%s/tasks", dirname, name); + + cgfd = open(paux, O_WRONLY); + if (cgfd < 0) { + pr_perror("Can't open tasks"); + return -1; + } + + l = write(cgfd, "0", 2); + close(cgfd); + + if (l < 0) { + pr_perror("Can't move self to subcg"); + return -1; + } + + return 0; +} + +static int cg_check(char *name) +{ + int found = 0; + FILE *cgf; + char paux[256], aux[128]; + + cgf = fopen("/proc/thread-self/cgroup", "r"); + if (cgf == NULL) + return -1; + + sprintf(aux, "name=%s:/%s", cgname, name); + while (fgets(paux, sizeof(paux), cgf)) { + char *s; + + s = strchr(paux, ':') + 1; + s[strlen(s) - 1] = '\0'; + test_msg("CMP [%s] vs [%s]\n", s, aux); + if (!strcmp(s, aux)) { + found = 1; + break; + } + } + + fclose(cgf); + + return found ? 0 : -1; +} + +int th_sync[2], rst_sync[2]; + +void *thread_fn(void *args) +{ + int status = cg_move(SUBNAME2); + + if (write(th_sync[1], &status, sizeof(status)) != sizeof(status)) { + pr_perror("write"); + exit_group(1); + } + + if (status == 0) { + if (read(rst_sync[0], &status, sizeof(status)) < 0) { + pr_perror("read"); + exit_group(1); + } + + status = cg_check(SUBNAME2); + if (write(th_sync[1], &status, sizeof(status)) != sizeof(status)) { + pr_perror("write"); + exit_group(1); + } + } + + pthread_exit(0); +} + +int main(int argc, char **argv) +{ + int status, exit_code = 1; + pthread_t thread; + char aux[64]; + + test_init(argc, argv); + + /* + * Pipe to talk to the kid. + * First, it reports that it's ready (int), + * then it reports the restore status (int). + */ + + if (pipe(th_sync)) { + pr_perror("pipe"); + return 1; + } + + /* "Restore happened" pipe */ + if (pipe(rst_sync)) { + pr_perror("pipe"); + return 1; + } + + if (mkdir(dirname, 0700) < 0) { + pr_perror("Can't make dir"); + goto out; + } + + sprintf(aux, "none,name=%s", cgname); + if (mount("none", dirname, "cgroup", 0, aux)) { + pr_perror("Can't mount cgroups"); + goto out_rd; + } + + if (cg_move(SUBNAME)) + goto out_rs; + + if (pthread_create(&thread, NULL, thread_fn, NULL)) { + pr_perror("Can't create a new thread"); + goto out_rs; + } + + status = -1; + read(th_sync[0], &status, sizeof(status)); + if (status != 0) { + pr_perror("Error moving into cgroups"); + close(rst_sync[0]); + goto out_rs; + } + + test_daemon(); + test_waitsig(); + + close(rst_sync[1]); + + status = -1; + if (read(th_sync[0], &status, sizeof(status)) < 0) { + pr_perror("read"); + goto out_rs; + } + if (status != 0) { + fail("child cg changed"); + goto out_rs; + } + + pass(); + exit_code = 0; + +out_rs: + umount(dirname); +out_rd: + rmdir(dirname); +out: + return exit_code; +} diff --git a/test/zdtm/static/cgroup_threads.desc b/test/zdtm/static/cgroup_threads.desc new file mode 100644 index 000000000..42a3f2b73 --- /dev/null +++ b/test/zdtm/static/cgroup_threads.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroup_threads.hook b/test/zdtm/static/cgroup_threads.hook new file mode 100755 index 000000000..f4b553d34 --- /dev/null +++ b/test/zdtm/static/cgroup_threads.hook @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 + +tname=$(mktemp -d cgclean.XXXXXX) +trap 'rmdir "${tname}"' EXIT + +mount -t cgroup none $tname -o "none,name=zdtmtst" +trap 'umount "${tname}"; rmdir "${tname}"' EXIT + +echo "Cleaning $tname" + +rmdir "$tname/subcg_threads/subsubcg/" || true +rmdir "$tname/subcg_threads/" || true + +echo "Left there is:" +ls "$tname" diff --git a/test/zdtm/static/cgroup_yard.desc b/test/zdtm/static/cgroup_yard.desc index 8736d6780..9ad4a9b57 100644 --- a/test/zdtm/static/cgroup_yard.desc +++ b/test/zdtm/static/cgroup_yard.desc @@ -1,6 +1,6 @@ { 'flavor': 'h', -'flags': 'suid', +'flags': 'suid excl', # We create the external cgroup yard in working directory during --pre-dump # hook. We have to go up a few directories to find the yard. 'opts': '--manage-cgroups --cgroup-yard ../../../../../../external_yard' diff --git a/test/zdtm/static/cgroup_yard.hook b/test/zdtm/static/cgroup_yard.hook index d06bc45fd..b70bd59e9 100755 --- a/test/zdtm/static/cgroup_yard.hook +++ b/test/zdtm/static/cgroup_yard.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import os diff --git a/test/zdtm/static/cgroupns.desc b/test/zdtm/static/cgroupns.desc index 80dd710e1..dc61e36cf 100644 --- a/test/zdtm/static/cgroupns.desc +++ b/test/zdtm/static/cgroupns.desc @@ -1,4 +1,4 @@ { 'feature': 'cgroupns', - 'flags': 'suid', + 'flags': 'suid excl', 'flavor': 'h', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroupv2_00.c b/test/zdtm/static/cgroupv2_00.c new file mode 100644 index 000000000..2c6780e0c --- /dev/null +++ b/test/zdtm/static/cgroupv2_00.c @@ -0,0 +1,86 @@ +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that some cgroup-v2 properties in kernel controllers are preserved"; +const char *test_author = "Bui Quang Minh "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup-v2 directory name", 1); +const char *cgname = "subcg00"; + +int main(int argc, char **argv) +{ + char path[1024], aux[1024]; + int ret = -1; + + test_init(argc, argv); + + if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + return -1; + } + + if (mount("cgroup2", dirname, "cgroup2", 0, NULL)) { + pr_perror("Can't mount cgroup-v2"); + return -1; + } + + sprintf(path, "%s/%s", dirname, cgname); + if (mkdir(path, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + goto out; + } + + /* Make cpuset controllers available in children directory */ + sprintf(path, "%s/%s", dirname, "cgroup.subtree_control"); + sprintf(aux, "%s", "+cpuset"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.subtree_control"); + sprintf(aux, "%s", "+cpuset"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.type"); + sprintf(aux, "%s", "threaded"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.procs"); + sprintf(aux, "%d", getpid()); + if (write_value(path, aux)) + goto out; + + test_daemon(); + test_waitsig(); + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.subtree_control"); + if (read_value(path, aux, sizeof(aux))) + goto out; + + if (strcmp(aux, "cpuset\n")) { + fail("cgroup.subtree_control mismatches"); + goto out; + } + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.type"); + if (read_value(path, aux, sizeof(aux))) + goto out; + + if (strcmp(aux, "threaded\n")) { + fail("cgroup.type mismatches"); + goto out; + } + + pass(); + + ret = 0; + +out: + sprintf(path, "%s", dirname); + umount(path); + return ret; +} diff --git a/test/zdtm/static/cgroupv2_00.checkskip b/test/zdtm/static/cgroupv2_00.checkskip new file mode 100755 index 000000000..375ed3564 --- /dev/null +++ b/test/zdtm/static/cgroupv2_00.checkskip @@ -0,0 +1,11 @@ +#!/bin/bash + +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + grep -q "cpuset" /sys/fs/cgroup/cgroup.controllers && exit 0 +fi + +if [ -d /sys/fs/cgroup/unified ]; then + grep -q "cpuset" /sys/fs/cgroup/unified/cgroup.controllers && exit 0 +fi + +exit 1 diff --git a/test/zdtm/static/cgroupv2_00.desc b/test/zdtm/static/cgroupv2_00.desc new file mode 100644 index 000000000..e70c84df8 --- /dev/null +++ b/test/zdtm/static/cgroupv2_00.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid excl', 'opts': '--manage-cgroups=full'} diff --git a/test/zdtm/static/cgroupv2_00.hook b/test/zdtm/static/cgroupv2_00.hook new file mode 100755 index 000000000..1002b1ec5 --- /dev/null +++ b/test/zdtm/static/cgroupv2_00.hook @@ -0,0 +1,16 @@ +#!/bin/bash + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 + +set -e +cgname="subcg00" +tname=$(mktemp -d cgclean.XXXXXX) +mount -t cgroup2 cgroup2 $tname + +echo "Cleaning $tname" +echo "-cpuset" > "$tname/$cgname/cgroup.subtree_control" + +set +e +rmdir "$tname/$cgname" +umount "$tname" +rmdir "$tname" diff --git a/test/zdtm/static/cgroupv2_01.c b/test/zdtm/static/cgroupv2_01.c new file mode 100644 index 000000000..f3a6d18ba --- /dev/null +++ b/test/zdtm/static/cgroupv2_01.c @@ -0,0 +1,180 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that cgroup-v2 threaded controllers"; +const char *test_author = "Bui Quang Minh "; + +char *dirname; +TEST_OPTION(dirname, string, "cgroup-v2 directory name", 1); +const char *cgname = "subcg01"; + +task_waiter_t t; + +#define gettid(code) syscall(__NR_gettid) + +void cleanup(void) +{ + char path[1024]; + + sprintf(path, "%s/%s/%s", dirname, cgname, "thread2"); + rmdir(path); + sprintf(path, "%s/%s/%s", dirname, cgname, "thread1"); + rmdir(path); + sprintf(path, "%s/%s", dirname, cgname); + rmdir(path); + sprintf(path, "%s", dirname); + umount(path); +} + +int is_in_cgroup(char *cgname) +{ + FILE *cgf; + char buffer[1024]; + + sprintf(buffer, "/proc/self/task/%ld/cgroup", gettid()); + cgf = fopen(buffer, "r"); + if (cgf == NULL) { + pr_err("Fail to open thread's cgroup procfs\n"); + return 0; + } + + while (fgets(buffer, sizeof(buffer), cgf)) { + if (strstr(buffer, cgname)) { + fclose(cgf); + return 1; + } + } + + fclose(cgf); + return 0; +} + +void *thread_func(void *arg) +{ + char path[1024], aux[1024]; + + sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread2", "cgroup.threads"); + sprintf(aux, "%ld", gettid()); + if (write_value(path, aux)) { + cleanup(); + exit(1); + } + + read_value(path, aux, sizeof(aux)); + + task_waiter_complete(&t, 1); + + /* Wait for restore */ + task_waiter_wait4(&t, 2); + + sprintf(path, "/%s/%s", cgname, "thread2"); + if (!is_in_cgroup(path)) { + fail("Thread2's cgroup is not restored"); + cleanup(); + exit(1); + } + + return NULL; +} + +int main(int argc, char **argv) +{ + char path[1024], aux[1024]; + pthread_t thread2; + int ret = 1; + + test_init(argc, argv); + task_waiter_init(&t); + + if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + return -1; + } + + if (mount("cgroup2", dirname, "cgroup2", 0, NULL)) { + pr_perror("Can't mount cgroup-v2"); + return -1; + } + + sprintf(path, "%s/%s", dirname, cgname); + if (mkdir(path, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + goto out; + } + + /* Make cpuset controllers available in children directory */ + sprintf(path, "%s/%s", dirname, "cgroup.subtree_control"); + sprintf(aux, "%s", "+cpuset"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.subtree_control"); + sprintf(aux, "%s", "+cpuset"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.procs"); + sprintf(aux, "%d", getpid()); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "thread1"); + if (mkdir(path, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + goto out; + } + + sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread1", "cgroup.type"); + sprintf(aux, "%s", "threaded"); + if (write_value(path, aux)) + goto out; + + sprintf(path, "%s/%s/%s", dirname, cgname, "thread2"); + if (mkdir(path, 0700) < 0 && errno != EEXIST) { + pr_perror("Can't make dir"); + goto out; + } + + sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread2", "cgroup.type"); + sprintf(aux, "%s", "threaded"); + if (write_value(path, aux)) + goto out; + + ret = pthread_create(&thread2, NULL, thread_func, NULL); + if (ret < 0) { + pr_err("pthread_create %s\n", strerror(ret)); + ret = 1; + goto out; + } + + sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread1", "cgroup.threads"); + sprintf(aux, "%ld", gettid()); + if (write_value(path, aux)) + goto out; + + task_waiter_wait4(&t, 1); + + test_daemon(); + test_waitsig(); + + task_waiter_complete(&t, 2); + + sprintf(path, "/%s/%s", cgname, "thread1"); + if (!is_in_cgroup(path)) { + fail("Main thread's cgroup is not restored"); + cleanup(); + exit(1); + } + pthread_join(thread2, NULL); + pass(); + + ret = 0; + +out: + cleanup(); + return ret; +} diff --git a/test/zdtm/static/cgroupv2_01.checkskip b/test/zdtm/static/cgroupv2_01.checkskip new file mode 100755 index 000000000..375ed3564 --- /dev/null +++ b/test/zdtm/static/cgroupv2_01.checkskip @@ -0,0 +1,11 @@ +#!/bin/bash + +if [ -f /sys/fs/cgroup/cgroup.controllers ]; then + grep -q "cpuset" /sys/fs/cgroup/cgroup.controllers && exit 0 +fi + +if [ -d /sys/fs/cgroup/unified ]; then + grep -q "cpuset" /sys/fs/cgroup/unified/cgroup.controllers && exit 0 +fi + +exit 1 diff --git a/test/zdtm/static/cgroupv2_01.desc b/test/zdtm/static/cgroupv2_01.desc new file mode 100644 index 000000000..e70c84df8 --- /dev/null +++ b/test/zdtm/static/cgroupv2_01.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid excl', 'opts': '--manage-cgroups=full'} diff --git a/test/zdtm/static/cgroupv2_01.hook b/test/zdtm/static/cgroupv2_01.hook new file mode 100755 index 000000000..2263fd014 --- /dev/null +++ b/test/zdtm/static/cgroupv2_01.hook @@ -0,0 +1,24 @@ +#!/bin/bash + +[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 + +set -e +cgname="subcg01" +tname=$(mktemp -d cgclean.XXXXXX) +mount -t cgroup2 cgroup2 $tname + +echo "Cleaning $tname" + +set +e +rmdir "$tname/$cgname/thread1" + +# When the test finishes, the cleanup() function removes this directory +# successfully because the thread in this controller exit and no other +# threads belong to this controller +if [ "$1" == "--pre-restore" ]; then + rmdir "$tname/$cgname/thread2" +fi + +rmdir "$tname/$cgname" +umount "$tname" +rmdir "$tname" diff --git a/test/zdtm/static/change_mnt_context.c b/test/zdtm/static/change_mnt_context.c index 6d436014b..8787ae5cf 100644 --- a/test/zdtm/static/change_mnt_context.c +++ b/test/zdtm/static/change_mnt_context.c @@ -46,7 +46,7 @@ int main(int argc, char **argv) if (!pos) continue; - result = sscanf(pos, " - %*s %*s %s", opts); + result = sscanf(pos, " - %*s %*s %1023s", opts); if (result != 1) { fail("Not able to sscanf line from mountinfo"); goto out; diff --git a/test/zdtm/static/child_opened_proc.c b/test/zdtm/static/child_opened_proc.c index 2125cd264..cfe04fa4b 100644 --- a/test/zdtm/static/child_opened_proc.c +++ b/test/zdtm/static/child_opened_proc.c @@ -10,7 +10,7 @@ #include "zdtmtst.h" const char *test_doc = "Check that tree prior to files opening"; -const char *test_author = "Stanislav Kinsbursky +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if fd obtained from pidfd_get_fd is C/R correctly\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_getfd(int pidfd, int targetfd, unsigned int flags) +{ + return syscall(__NR_pidfd_getfd, pidfd, targetfd, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +int main(int argc, char* argv[]) +{ + #define READ 0 + #define WRITE 1 + + int pidfd, child, p[2], child_read, read_data, status; + int data = 42; + + test_init(argc, argv); + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + child = fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } + + if (child == 0) { + close(p[WRITE]); + test_waitsig(); + return 0; + } + + pidfd = pidfd_open(child, 0); + if (pidfd < 0) { + pr_perror("pidfd_open failed"); + return 1; + } + + close(p[READ]); + if (write(p[WRITE], &data, sizeof(data)) != sizeof(data)) { + pr_perror("write"); + return 1; + } + close(p[WRITE]); + + child_read = pidfd_getfd(pidfd, p[READ], 0); + if (child_read < 0) { + pr_perror("pidfd_getfd"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (read(child_read, &read_data, sizeof(read_data)) != sizeof(read_data)) { + pr_perror("read"); + goto err_close; + } + + if (read_data != data) { + fail("data from fd obtained using pidfd_getfd incorrect"); + goto err_close; + } + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, 0)) { + pr_perror("Could not send signal"); + goto err_close; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + return 1; + } + + if (status != 0) { + fail("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); + return 1; + } + + pass(); + close(child_read); + close(pidfd); + return 0; +err_close: + close(child_read); + close(pidfd); + return 1; +} diff --git a/test/zdtm/static/fd_offset.c b/test/zdtm/static/fd_offset.c new file mode 100644 index 000000000..96255a4a1 --- /dev/null +++ b/test/zdtm/static/fd_offset.c @@ -0,0 +1,42 @@ +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check that criu properly restores offsets on ELF files"; +const char *test_author = "Michal Clapinski "; + +void check_offset(int fd) +{ + int offset = lseek(fd, 0, SEEK_CUR); + if (offset < 0) { + fail("lseek"); + exit(1); + } + if (offset != 0) { + fail("wrong offset; expected: 0, got: %d", offset); + exit(1); + } +} + +int main(int argc, char **argv) +{ + int fd; + + test_init(argc, argv); + + fd = open("/proc/self/exe", O_RDONLY); + if (fd < 0) { + fail("open"); + exit(1); + } + check_offset(fd); + + test_daemon(); + test_waitsig(); + + check_offset(fd); + + pass(); + return 0; +} diff --git a/test/zdtm/static/file_locks01.c b/test/zdtm/static/file_locks01.c index beea171f5..bfdca51d9 100644 --- a/test/zdtm/static/file_locks01.c +++ b/test/zdtm/static/file_locks01.c @@ -107,7 +107,7 @@ static int check_file_lock(int fd, char *expected_type, char *expected_option, u memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d %x:%x:%ld %*d %*s", fl_flag, fl_type, fl_option, &fl_owner, &maj, + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %x:%x:%ld %*d %*s", fl_flag, fl_type, fl_option, &fl_owner, &maj, &min, &i_no); if (num < 7) { pr_err("Invalid lock info\n"); diff --git a/test/zdtm/static/file_locks02.c b/test/zdtm/static/file_locks02.c index d2049ebaa..ae4827de9 100644 --- a/test/zdtm/static/file_locks02.c +++ b/test/zdtm/static/file_locks02.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d", fl_flag, fl_type, fl_option, &fl_owner); + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks03.c b/test/zdtm/static/file_locks03.c index 35ef41a21..228e66892 100644 --- a/test/zdtm/static/file_locks03.c +++ b/test/zdtm/static/file_locks03.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%s %s %s %d", fl_flag, fl_type, fl_option, &fl_owner); + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks04.c b/test/zdtm/static/file_locks04.c index 11d224fa7..7e0d2654e 100644 --- a/test/zdtm/static/file_locks04.c +++ b/test/zdtm/static/file_locks04.c @@ -34,7 +34,7 @@ static int check_file_locks(pid_t child_pid, int fd, int child_fd) continue; test_msg("c: %s", buf); - num = sscanf(buf, "%*s %*d:%s %s %s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, + num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { diff --git a/test/zdtm/static/file_locks06.checkskip b/test/zdtm/static/file_locks06.checkskip index 06ab58521..c5039a2d2 100755 --- a/test/zdtm/static/file_locks06.checkskip +++ b/test/zdtm/static/file_locks06.checkskip @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import fcntl import tempfile import struct diff --git a/test/zdtm/static/get_smaps_bits.c b/test/zdtm/static/get_smaps_bits.c index 31d0d92b2..3d952ac95 100644 --- a/test/zdtm/static/get_smaps_bits.c +++ b/test/zdtm/static/get_smaps_bits.c @@ -6,6 +6,10 @@ #define MAP_HUGETLB 0x40000 #endif +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + #ifndef MADV_HUGEPAGE #define MADV_HUGEPAGE 14 #endif @@ -18,6 +22,10 @@ #define MADV_DONTDUMP 16 #endif +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) { char *tok; @@ -41,6 +49,8 @@ static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) *flags |= MAP_NORESERVE; else if (_vmflag_match(tok, "ht")) *flags |= MAP_HUGETLB; + else if (_vmflag_match(tok, "dp")) + *flags |= MAP_DROPPABLE; /* madvise() block */ if (_vmflag_match(tok, "sr")) @@ -57,6 +67,8 @@ static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) *madv |= (1ul << MADV_HUGEPAGE); else if (_vmflag_match(tok, "nh")) *madv |= (1ul << MADV_NOHUGEPAGE); + else if (_vmflag_match(tok, "wf")) + *madv |= (1ul << MADV_WIPEONFORK); /* * Anything else is just ignored. diff --git a/test/zdtm/static/ghost_holes_large00.c b/test/zdtm/static/ghost_holes_large00.c new file mode 100644 index 000000000..1a9739f8e --- /dev/null +++ b/test/zdtm/static/ghost_holes_large00.c @@ -0,0 +1,152 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test ghost with one large hole(1GiB) in the middle"; +const char *test_author = "Liang-Chun Chen "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +/* Buffer that is suitable for data size */ +#ifdef LIMIT +#define BUFSIZE 1024 * 1024 +#else +#define BUFSIZE 4096 +#endif +static unsigned char buf[BUFSIZE]; + +#ifndef SEEK_DATA +#define SEEK_DATA 3 +#define SEEK_HOLE 4 +#endif + +#define DATA1_OFF 0 +#define HOLE_SIZE (1LL * 1 * 1024 * 1024 * 1024) +#define DATA2_OFF (BUFSIZE + HOLE_SIZE) +#define FILE_SIZE (2 * BUFSIZE + HOLE_SIZE) +#define ST_UNIT 512 + +int main(int argc, char **argv) +{ + int fd; + struct stat st; + uint32_t crc; + bool chk_hole = true; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink %s", filename); + goto failed; + } + + crc = ~0; + datagen(buf, BUFSIZE, &crc); + if (pwrite(fd, buf, BUFSIZE, DATA1_OFF) != BUFSIZE) { + pr_perror("can't write data1"); + goto failed; + } + + crc = ~0; + datagen(buf, BUFSIZE, &crc); + if (pwrite(fd, buf, BUFSIZE, DATA2_OFF) != BUFSIZE) { + pr_perror("can't write data2"); + goto failed; + } + + if (ftruncate(fd, FILE_SIZE)) { + pr_perror("Can't fixup file size"); + goto failed; + } + + if (lseek(fd, DATA1_OFF, SEEK_HOLE) != DATA1_OFF + BUFSIZE) { + test_msg("Won't check for hole\n"); + chk_hole = false; + } + + test_daemon(); + test_waitsig(); + + if (fstat(fd, &st) < 0) { + fail("can't stat after"); + goto failed; + } + + if (st.st_size != FILE_SIZE) { + fail("file size changed to %ld", (long)st.st_size); + goto failed; + } + + test_msg("file size OK\n"); + + if (st.st_blocks * ST_UNIT != 2 * BUFSIZE) { + fail("actual file size changed to %ld", (long)st.st_blocks * ST_UNIT); + goto failed; + } + + test_msg("actual file size OK\n"); + + /* Data 1 */ + if (pread(fd, buf, BUFSIZE, DATA1_OFF) != BUFSIZE) { + fail("pread1 fail"); + goto failed; + } + + crc = ~0; + if (datachk(buf, BUFSIZE, &crc)) { + fail("datachk1 fail"); + goto failed; + } + + test_msg("Data1 OK\n"); + + /* Data 2 */ + if (pread(fd, buf, BUFSIZE, DATA2_OFF) != BUFSIZE) { + fail("pread2 fail"); + goto failed; + } + + crc = ~0; + if (datachk(buf, BUFSIZE, &crc)) { + fail("datachk2 fail"); + goto failed; + } + + test_msg("Data2 OK\n"); + + /* Hole */ + if (chk_hole) { + if (lseek(fd, DATA1_OFF, SEEK_HOLE) != DATA1_OFF + BUFSIZE) { + fail("Begin of mid hole not found"); + goto failed; + } + if (lseek(fd, DATA1_OFF + BUFSIZE, SEEK_DATA) != DATA2_OFF) { + fail("End of mid hole not found"); + goto failed; + } + test_msg("Mid hole OK\n"); + } + + close(fd); + pass(); + return 0; + +failed: + close(fd); + return 1; +} diff --git a/test/zdtm/static/ghost_holes_large01.c b/test/zdtm/static/ghost_holes_large01.c new file mode 120000 index 000000000..1b90363d4 --- /dev/null +++ b/test/zdtm/static/ghost_holes_large01.c @@ -0,0 +1 @@ +ghost_holes_large00.c \ No newline at end of file diff --git a/test/zdtm/static/ghost_holes_large01.desc b/test/zdtm/static/ghost_holes_large01.desc new file mode 100644 index 000000000..8e6a476bd --- /dev/null +++ b/test/zdtm/static/ghost_holes_large01.desc @@ -0,0 +1 @@ +{'flags': 'crfail'} \ No newline at end of file diff --git a/test/zdtm/static/ghost_multi_hole00.c b/test/zdtm/static/ghost_multi_hole00.c new file mode 100644 index 000000000..0f78d4f14 --- /dev/null +++ b/test/zdtm/static/ghost_multi_hole00.c @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test ghost with a lot of holes(every 8K length contains only 4K data)"; +const char *test_author = "Liang-Chun Chen "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +/* Buffer that is suitable for hole size */ +#define BUFSIZE 4096 +static unsigned char buf4k[BUFSIZE]; + +#ifndef SEEK_DATA +#define SEEK_DATA 3 +#define SEEK_HOLE 4 +#endif + +#define FILE_SIZE (1 << 23) /* 8Mb */ + +#define FILE_INTERVAL (1 << 13) /* 8Kb */ + +int main(int argc, char **argv) +{ + int fd, off; + struct stat st; + uint32_t crc; + + test_init(argc, argv); + + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); + if (fd < 0) { + pr_perror("can't open %s", filename); + exit(1); + } + + if (unlink(filename) < 0) { + pr_perror("can't unlink %s", filename); + goto failed; + } + + for (off = 0; off < FILE_SIZE; off += FILE_INTERVAL) { + crc = ~0; + datagen(buf4k, BUFSIZE, &crc); + if (pwrite(fd, &buf4k, BUFSIZE, off) != BUFSIZE) { + perror("pwrite"); + goto failed; + } + + /* + * In some file system, such as xfs, + * only pwrite might not able to create highly sparse file, + * so we need to forcibly allocate hole inside the file. + */ + if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off + BUFSIZE, BUFSIZE)) { + perror("fallocate"); + goto failed; + } + } + + if (ftruncate(fd, FILE_SIZE)) { + pr_perror("Can't fixup file size"); + goto failed; + } + + test_daemon(); + test_waitsig(); + + if (fstat(fd, &st) < 0) { + fail("can't stat after"); + goto failed; + } + + if (st.st_size != FILE_SIZE) { + fail("file size changed to %ld", (long)st.st_size); + goto failed; + } + + test_msg("Size %u OK\n", FILE_SIZE); + + /* Data*/ + for (off = 0; off < FILE_SIZE; off += FILE_INTERVAL) { + if (pread(fd, buf4k, BUFSIZE, off) != BUFSIZE) { + fail("pread failed @ %u", off / FILE_INTERVAL); + goto failed; + } + + crc = ~0; + if (datachk(buf4k, BUFSIZE, &crc)) { + fail("datachk failed @ %u", off / FILE_INTERVAL); + goto failed; + } + + test_msg("Data @%du OK\n", off / FILE_INTERVAL); + } + + /* Hole */ + for (off = 0; off < FILE_SIZE; off += FILE_INTERVAL) { + if (lseek(fd, off, SEEK_HOLE) != off + BUFSIZE) { + fail("failed to find hole @ %u", off / FILE_SIZE); + goto failed; + } + test_msg("Hole @%du OK\n", off / FILE_INTERVAL); + } + + close(fd); + pass(); + return 0; + +failed: + close(fd); + return 1; +} diff --git a/test/zdtm/static/ghost_multi_hole00.desc b/test/zdtm/static/ghost_multi_hole00.desc new file mode 100644 index 000000000..3981e8180 --- /dev/null +++ b/test/zdtm/static/ghost_multi_hole00.desc @@ -0,0 +1 @@ +{'dopts': '--ghost-limit 8M --no-ghost-fiemap'} diff --git a/test/zdtm/static/ghost_multi_hole01.c b/test/zdtm/static/ghost_multi_hole01.c new file mode 120000 index 000000000..c75006a6b --- /dev/null +++ b/test/zdtm/static/ghost_multi_hole01.c @@ -0,0 +1 @@ +ghost_multi_hole00.c \ No newline at end of file diff --git a/test/zdtm/static/ghost_multi_hole01.desc b/test/zdtm/static/ghost_multi_hole01.desc new file mode 100644 index 000000000..d1dc68a54 --- /dev/null +++ b/test/zdtm/static/ghost_multi_hole01.desc @@ -0,0 +1 @@ +{'dopts': '--ghost-limit 8M --ghost-fiemap'} diff --git a/test/zdtm/static/macvlan.checkskip b/test/zdtm/static/macvlan.checkskip new file mode 100755 index 000000000..f4e060953 --- /dev/null +++ b/test/zdtm/static/macvlan.checkskip @@ -0,0 +1,38 @@ +#!/bin/bash + +FAIL=0 + +create_macvlan_device() { + if ! ip link add test_mvlan1 type veth >/dev/null 2>&1; then + FAIL=1 + fi + if ! ip link add mymacvlan1 link test_mvlan1 type macvlan >/dev/null 2>&1; then + FAIL=1 + fi + + return "${FAIL}" +} + +cleanup() { + ip link del test_mvlan1 >/dev/null 2>&1 + ip link del mymacvlan1 >/dev/null 2>&1 +} + +trap "cleanup" QUIT TERM INT HUP EXIT + +# Test once without loading the module +if create_macvlan_device; then + exit 0 +fi + +# Test once more with explicitly loading the module +if ! modprobe macvlan >/dev/null 2>&1; then + exit 1 +fi +create_macvlan_device + +if [ "${FAIL}" == "1" ]; then + exit 1 +fi + +exit 0 diff --git a/test/zdtm/static/maps00.c b/test/zdtm/static/maps00.c index b1e55e861..f6989f3af 100644 --- a/test/zdtm/static/maps00.c +++ b/test/zdtm/static/maps00.c @@ -137,7 +137,7 @@ static int check_map(struct map *map) } /* prot |= PROT_READ// need barrier before this line, because compiler change order commands. - I finded one method: look at next lines*/ + I found one method: look at next lines*/ } else prot &= PROT_WRITE | !PROT_READ | PROT_EXEC; diff --git a/test/zdtm/static/maps02.c b/test/zdtm/static/maps02.c index 29f1372c9..38244f020 100644 --- a/test/zdtm/static/maps02.c +++ b/test/zdtm/static/maps02.c @@ -2,11 +2,19 @@ #include "zdtmtst.h" #include "get_smaps_bits.h" +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + #ifndef MADV_DONTDUMP #define MADV_DONTDUMP 16 #endif -const char *test_doc = "Test shared memory with advises"; +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + +const char *test_doc = "Test private memory with advises"; const char *test_author = "Cyrill Gorcunov "; struct mmap_data { @@ -23,8 +31,14 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) { m->start = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, flags, -1, 0); if (m->start == MAP_FAILED) { - pr_perror("mmap failed"); - return -1; + if (errno == EINVAL) { + test_msg("mmap failed, no kernel support\n"); + *m = (struct mmap_data){}; + return 0; + } else { + pr_perror("mmap failed"); + return -1; + } } if (madvise(m->start, MEM_SIZE, adv)) { @@ -43,12 +57,12 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) int main(int argc, char **argv) { - struct mmap_data m[5] = {}; + struct mmap_data m[7] = {}; size_t i; test_init(argc, argv); - test_msg("Alloc growsdown\n"); + test_msg("Alloc dontfork\n"); if (alloc_anon_mmap(&m[0], MAP_PRIVATE | MAP_ANONYMOUS, MADV_DONTFORK)) return -1; @@ -64,10 +78,18 @@ int main(int argc, char **argv) if (alloc_anon_mmap(&m[3], MAP_PRIVATE | MAP_ANONYMOUS, MADV_HUGEPAGE)) return -1; - test_msg("Alloc dontfork/random|mergeable\n"); + test_msg("Alloc mergeable\n"); if (alloc_anon_mmap(&m[4], MAP_PRIVATE | MAP_ANONYMOUS, MADV_MERGEABLE)) return -1; + test_msg("Alloc wipeonfork\n"); + if (alloc_anon_mmap(&m[5], MAP_PRIVATE | MAP_ANONYMOUS, MADV_WIPEONFORK)) + return -1; + + test_msg("Alloc droppable\n"); + if (alloc_anon_mmap(&m[6], MAP_DROPPABLE | MAP_ANONYMOUS, MADV_NORMAL)) + return -1; + test_msg("Fetch existing flags/adv\n"); for (i = 0; i < sizeof(m) / sizeof(m[0]); i++) { if (get_smaps_bits((unsigned long)m[i].start, &m[i].orig_flags, &m[i].orig_madv)) diff --git a/test/zdtm/static/maps11.c b/test/zdtm/static/maps11.c new file mode 100644 index 000000000..df309714b --- /dev/null +++ b/test/zdtm/static/maps11.c @@ -0,0 +1,205 @@ +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +#ifndef MAP_DROPPABLE +#define MAP_DROPPABLE 0x08 +#endif + +#ifndef MADV_WIPEONFORK +#define MADV_WIPEONFORK 18 +#endif + +const char *test_doc = "Test MAP_DROPPABLE/MADV_WIPEONFORK mappings with 2 processes"; +const char *test_author = "Alexander Mikhalitsyn "; + +bool mem_is_zero(const uint8_t *buffer, size_t length) +{ + size_t i; + + for (i = 0; i < length; i++) + if (buffer[i] != 0) + return false; + + return true; +} + +int main(int argc, char **argv) +{ + uint8_t *p1, *p2; + pid_t pid; + int status; + const char data[] = "MADV_WIPEONFORK vma data"; + bool criu_was_there = false; + struct stat st1, st2; + + test_init(argc, argv); + + p1 = mmap(NULL, sizeof(data), PROT_READ | PROT_WRITE, + MAP_DROPPABLE | MAP_ANONYMOUS, 0, 0); + if (p1 == MAP_FAILED) { + if (errno == EINVAL) { + skip("mmap failed, no kernel support for MAP_DROPPABLE\n"); + goto skip; + } else { + pr_perror("mmap failed"); + return -1; + } + } + + p2 = mmap(NULL, sizeof(data), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (p2 == MAP_FAILED) { + pr_perror("mmap failed"); + return 1; + } + + if (madvise(p2, sizeof(data), MADV_WIPEONFORK)) { + pr_perror("madvise failed"); + return -1; + } + + /* contents of this mapping is supposed to be dropped after C/R */ + memcpy(p1, data, sizeof(data)); + + /* contents of this mapping is supposed to be dropped after fork() */ + memcpy(p2, data, sizeof(data)); + + /* + * Let's spawn a process before C/R so our mappings get inherited + * then, after C/R we need to ensure that CRIU memory premapping + * machinery works properly. + * + * It is important, because we restore MADV_WIPEONFORK on a later + * stages (after vma premapping happens) and we need to ensure that + * CRIU handles everything in a right way. + */ + pid = test_fork(); + if (pid < 0) { + pr_perror("fork failed"); + return 1; + } + + if (pid == 0) { + test_waitsig(); + + /* + * Both mappings have VM_WIPEONFORK flag set, + * so we expect to have it null-ified after fork(). + */ + if (!mem_is_zero(p1, sizeof(data)) || + !mem_is_zero(p2, sizeof(data))) { + pr_err("1st child: memory check failed\n"); + return 1; + } + + return 0; + } + + /* + * A simple way to detect if C/R happened is to compare st_ino + * fields of stat() on the procfs files of the current task. + * + * Hopefully, this terrible hack is never used in real-world + * applications ;-) Here, we only need this to make test + * to pass with/without --nocr option. + */ + if (stat("/proc/self/status", &st1)) { + pr_perror("stat"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* signal a child process to continue */ + if (kill(pid, SIGTERM)) { + pr_perror("kill"); + goto err; + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("1st waitpid"); + goto err; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + fail("1st process didn't exit cleanly: status=%d", status); + goto err; + } + + if (stat("/proc/self/status", &st2)) { + pr_perror("stat"); + return 1; + } + + /* detect CRIU */ + criu_was_there = st1.st_ino != st2.st_ino; + + /* + * We should mark failure if one of the following happens: + * 1. MAP_DROPPABLE memory is not zero after C/R + * 2. MAP_DROPPABLE memory somehow changed without C/R + * (kernel issue? memory pressure?) + * 3. MADV_WIPEONFORK memory is not preserved + * + * We care about 2nd case only because we would like test + * to pass even with --nocr zdtm.py option. + */ + if ((criu_was_there && !mem_is_zero(p1, sizeof(data))) || + (!criu_was_there && memcmp(p1, data, sizeof(data))) || + memcmp(p2, data, sizeof(data))) { + fail("Data mismatch"); + return 1; + } + + /* contents of these mappings is supposed to be dropped after fork() */ + memcpy(p1, data, sizeof(data)); + memcpy(p2, data, sizeof(data)); + + pid = test_fork(); + if (pid < 0) { + pr_perror("fork failed"); + return 1; + } + + if (pid == 0) { + if (!mem_is_zero(p1, sizeof(data)) || + !mem_is_zero(p2, sizeof(data))) { + pr_err("2nd child: memory check failed\n"); + return 1; + } + + return 0; + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("2nd waitpid"); + goto err; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + fail("2nd process didn't exit cleanly: status=%d", status); + goto err; + } + + pass(); + + return 0; +err: + if (waitpid(-1, NULL, WNOHANG) == 0) { + kill(pid, SIGTERM); + wait(NULL); + } + return 1; + +skip: + test_daemon(); + test_waitsig(); + pass(); + return 0; +} diff --git a/test/zdtm/static/maps12.c b/test/zdtm/static/maps12.c new file mode 100644 index 000000000..f0d6c2381 --- /dev/null +++ b/test/zdtm/static/maps12.c @@ -0,0 +1,351 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Test madvise(MADV_GUARD_INSTALL)"; +const char *test_author = "Alexander Mikhalitsyn "; +/* some parts of code were taken from Linux kernel's kselftest guard-pages.c + written by Lorenzo Stoakes */ + +char *filename; +int fd; +TEST_OPTION(filename, string, "file name", 1); + +#ifndef MADV_GUARD_INSTALL +#define MADV_GUARD_INSTALL 102 +#endif + +uint8_t *map_base; + +struct { + unsigned int pages_num; + bool filemap; +} vmas[] = { + { 2, false }, + { 2, false }, + { 2, false }, + { 2, true }, + { 2, true }, + { 2, true }, +}; + +struct { + bool guarded; + bool wipeonfork; +} pages[] = { + { false, false }, /* vmas[0] */ + { true, false }, + { true, false }, /* vmas[1] */ + { false, false }, + { false, false }, /* vmas[2] */ + { true, true }, + { true, false }, /* vmas[3] */ + { false, false }, + { true, false }, /* vmas[4] */ + { true, false }, + { false, false }, /* vmas[5] */ + { true, false }, +}; + +static volatile sig_atomic_t signal_jump_set; +static sigjmp_buf signal_jmp_buf; + +static void handle_sigsegv(int signo) +{ + if (!signal_jump_set) + return; + + siglongjmp(signal_jmp_buf, 1); +} + +static bool try_write_to_addr(uint8_t *ptr) +{ + bool failed; + + /* Tell signal handler to jump back here on fatal signal. */ + signal_jump_set = true; + /* If a fatal signal arose, we will jump back here and failed is set. */ + failed = sigsetjmp(signal_jmp_buf, 1) != 0; + + if (!failed) + *ptr = 'x'; + + signal_jump_set = false; + return !failed; +} + +static int setup_sigsegv_handler(void) +{ + uint8_t write_me; + + if (signal(SIGSEGV, handle_sigsegv) == SIG_ERR) { + pr_perror("setting SIGSEGV handler failed"); + return 1; + } + + /* ensure that try_write_to_addr() works properly */ + if (!try_write_to_addr(&write_me)) { + pr_err("Failed to write at valid addr. Buggy try_write_to_addr()?\n"); + return 1; + } + + if (try_write_to_addr(NULL)) { + pr_err("Failed to detect an invalid write. Buggy try_write_to_addr()?\n"); + return 1; + } + + return 0; +} + +static inline void *mmap_pages(void *addr_hint, unsigned int count, bool filemap) +{ + char *map; + + map = mmap(addr_hint, count * PAGE_SIZE, PROT_WRITE | PROT_READ, + MAP_PRIVATE | (filemap ? 0 : MAP_ANONYMOUS) | (addr_hint ? MAP_FIXED : 0), + filemap ? fd : -1, + filemap ? (off_t)((intptr_t)addr_hint - (intptr_t)map_base) : 0); + if (map == MAP_FAILED || (addr_hint && (map != addr_hint))) + return MAP_FAILED; + + return map; +} + +static int __check_guards(const char *when, bool in_child) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + /* + * Skip pages that were never guarded, and also those + * that were, but have MADV_WIPEONFORK which means that + * guards were removed on fork. + */ + if (!pages[i].guarded || (in_child && pages[i].wipeonfork)) + continue; + + if (try_write_to_addr(&map_base[i * PAGE_SIZE])) { + pr_err("successful write to a guarded area %d %s C/R\n", + i, when); + return 1; + } + } + + return 0; +} + +static int check_guards(const char *when) +{ + int status; + pid_t pid; + + /* + * First of all, check that guards are on their places + * in a main test process. + */ + if (__check_guards(when, false)) { + return 1; + } + + /* + * Now, check that guards are on their places + * after fork(). This allows to ensure that + * combo MADV_WIPEONFORK + MADV_GUARD_INSTALL + * is restored properly too. + */ + + pid = test_fork(); + if (pid < 0) { + pr_perror("check_guards: fork failed"); + return 1; + } + + if (pid == 0) { + if (__check_guards(when, true)) { + pr_err("check_guards(\"%s\") failed in child\n", when); + exit(1); + } + + exit(0); + } + + if (waitpid(pid, &status, 0) != pid) { + pr_perror("check_guards: waitpid"); + return 1; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + pr_err("check_guards: process didn't exit cleanly: status=%d\n", status); + return 1; + } + + return 0; +} + +static void gen_pages_data(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + uint32_t crc; + + if (pages[i].guarded) + continue; + + crc = ~0; + datagen(&map_base[i * PAGE_SIZE], PAGE_SIZE, &crc); + } +} + +static int set_pages_madvs(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + if (pages[i].guarded) { + if (madvise(&map_base[i * PAGE_SIZE], PAGE_SIZE, + MADV_GUARD_INSTALL)) { + pr_perror("MADV_GUARD_INSTALL failed on page %d", i); + return 1; + } + } + + if (pages[i].wipeonfork) { + if (madvise(&map_base[i * PAGE_SIZE], PAGE_SIZE, + MADV_WIPEONFORK)) { + pr_perror("MADV_WIPEONFORK failed on page %d", i); + return 1; + } + } + } + + return 0; +} + +static int check_pages_data(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pages); i++) { + uint32_t crc; + + if (pages[i].guarded) + continue; + + crc = ~0; + if (datachk(&map_base[i * PAGE_SIZE], PAGE_SIZE, &crc)) { + pr_err("Page %d is corrupted\n", i); + return 1; + } + } + + return 0; +} + +static int prepare_vmas(void) +{ + char *map; + int i, shift; + + shift = 0; + for (i = 0; i < ARRAY_SIZE(vmas); i++) { + map = mmap_pages(&map_base[shift * PAGE_SIZE], + vmas[i].pages_num, vmas[i].filemap); + if (map == MAP_FAILED) { + pr_err("mmap of [%d,%d] pages failed\n", + shift, shift + vmas[i].pages_num); + return 1; + } + + shift += vmas[i].pages_num; + } + + if (shift != ARRAY_SIZE(pages)) { + pr_err("Different number of pages in vmas and pages arrays.\n"); + return 1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + unsigned int pages_num = ARRAY_SIZE(pages); + + test_init(argc, argv); + + fd = open(filename, O_TRUNC | O_CREAT | O_RDWR, 0600); + if (fd < 0) { + pr_perror("Unable to create a test file"); + return -1; + } + + if (ftruncate(fd, pages_num * PAGE_SIZE)) { + pr_perror("Unable to ftruncate a test file"); + return -1; + } + + if (setup_sigsegv_handler()) { + pr_err("setup_sigsegv_handler() failed\n"); + return 1; + } + + /* let's find a large enough area in address space */ + map_base = mmap_pages(NULL, pages_num, false); + if (map_base == MAP_FAILED) { + pr_err("mmap of %d pages failed\n", pages_num); + return 1; + } + + /* + * Now we know that we have a free vm address space area + * [map_base, map_base + pages_num * PAGE_SIZE). + * We can use (map_base) as a hint for our further mmaps. + */ + if (prepare_vmas()) { + pr_err("prepare_vmas() failed\n"); + return 1; + } + + /* fill non-guarded pages with data and preserve checksums */ + gen_pages_data(); + + if (set_pages_madvs()) { + pr_err("set_pages_madvs() failed\n"); + return 1; + } + + /* ensure that madvise(MADV_GUARD_INSTALL) works like expected */ + if (check_guards("before")) { + pr_err("check_guards(\"before\") failed\n"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* ensure that guards are at their places */ + if (check_guards("after")) { + fail("check_guards(\"after\") failed"); + return 1; + } + + /* check that non-guarded pages still contain original data */ + if (check_pages_data()) { + fail("check_pages_data() failed"); + return 1; + } + + pass(); + munmap(map_base, pages_num * PAGE_SIZE); + close(fd); + return 0; +} diff --git a/test/zdtm/static/maps12.desc b/test/zdtm/static/maps12.desc new file mode 100644 index 000000000..3f7627ff3 --- /dev/null +++ b/test/zdtm/static/maps12.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'feature': 'pagemap_scan_guard_pages'} diff --git a/test/zdtm/static/membarrier.c b/test/zdtm/static/membarrier.c new file mode 100644 index 000000000..85d705ba7 --- /dev/null +++ b/test/zdtm/static/membarrier.c @@ -0,0 +1,149 @@ +#include +#include +#include +#include "zdtmtst.h" + +const char *test_doc = "Test membarrier() migration"; +const char *test_author = "Michał Mirosław "; + +/* + * Define membarrier() CMDs to avoid depending on exact kernel header version. + */ +#define MEMBARRIER_CMD_GLOBAL_EXPEDITED (1 << 1) +#define MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED (1 << 2) +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED (1 << 3) +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED (1 << 4) +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE (1 << 5) +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE (1 << 6) +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ (1 << 7) +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ (1 << 8) +#define MEMBARRIER_CMD_GET_REGISTRATIONS (1 << 9) + +static int membarrier(int cmd, unsigned int flags, int cpu_id) +{ + return syscall(__NR_membarrier, cmd, flags, cpu_id); +} + +static const struct { + const char *name_suffix; + int register_cmd; + int execute_cmd; +} membarrier_cmds[] = { + { "GLOBAL_EXPEDITED", MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, + MEMBARRIER_CMD_GLOBAL_EXPEDITED }, + { "PRIVATE_EXPEDITED", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, + MEMBARRIER_CMD_PRIVATE_EXPEDITED }, + { "PRIVATE_EXPEDITED_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, + MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, + { "PRIVATE_EXPEDITED_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, + MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, +}; +static const int n_membarrier_cmds = sizeof(membarrier_cmds) / sizeof(*membarrier_cmds); + +static int register_membarriers(void) +{ + int barriers_supported, barriers_registered; + bool all_ok = true; + + barriers_supported = membarrier(MEMBARRIER_CMD_QUERY, 0, 0); + if (barriers_supported < 0) { + fail("membarrier() not supported by running kernel"); + return -1; + } + + barriers_registered = 0; + for (int i = 0; i < n_membarrier_cmds; ++i) { + if (~barriers_supported & membarrier_cmds[i].register_cmd) + continue; + + barriers_registered |= membarrier_cmds[i].register_cmd; + + if (membarrier(membarrier_cmds[i].register_cmd, 0, 0) < 0) { + pr_perror("membarrier(REGISTER_%s)", membarrier_cmds[i].name_suffix); + all_ok = false; + } + } + + if (!all_ok) { + fail("can't register membarrier()s - tried %#x, kernel %#x", + barriers_registered, barriers_supported); + return -1; + } + + if (!barriers_registered) { + fail("no known membarrier() cmds are supported by the kernel"); + return -1; + } + + return barriers_registered; +} + +static bool check_membarriers_compat(int barriers_registered) +{ + bool all_ok = true; + + for (int i = 0; i < n_membarrier_cmds; ++i) { + if (~barriers_registered & membarrier_cmds[i].register_cmd) + continue; + if (membarrier(membarrier_cmds[i].execute_cmd, 0, 0) < 0) { + pr_perror("membarrier(%s)", membarrier_cmds[i].name_suffix); + all_ok = false; + } + } + + if (!all_ok) + fail("membarrier() check failed"); + + return all_ok; +} + +static bool check_membarriers_get_registrations(int barriers_registered) +{ + int ret = membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS, 0, 0); + if (ret < 0) { + if (errno == EINVAL) { + test_msg("membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS) not supported by running kernel"); + return true; + } + fail("membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS)"); + return false; + } + if (ret != barriers_registered) { + fail("MEMBARRIER_CMD_GET_REGISTRATIONS check failed, expected: %d, got: %d", + barriers_registered, ret); + return false; + } + + return true; +} + +static bool check_membarriers(int barriers_registered) +{ + return check_membarriers_compat(barriers_registered) && + check_membarriers_get_registrations(barriers_registered); +} + +int main(int argc, char **argv) +{ + int barriers_registered; + + test_init(argc, argv); + + barriers_registered = register_membarriers(); + if (barriers_registered < 0) + return 1; + + test_msg("Pre-migration membarriers check\n"); + if (!check_membarriers(barriers_registered)) + return 1; + + test_daemon(); + test_waitsig(); + + test_msg("Post-migration membarriers check\n"); + if (!check_membarriers(barriers_registered)) + return 1; + + pass(); + return 0; +} diff --git a/test/zdtm/static/memfd00.c b/test/zdtm/static/memfd00.c index d037f6969..8d77ed06e 100644 --- a/test/zdtm/static/memfd00.c +++ b/test/zdtm/static/memfd00.c @@ -30,8 +30,10 @@ int main(int argc, char *argv[]) { int fd, fl_flags1, fl_flags2, fd_flags1, fd_flags2; struct statfs statfs1, statfs2; + struct stat stat; off_t pos1, pos2; char buf[5]; + int fmode1, fmode2; test_init(argc, argv); @@ -58,6 +60,13 @@ int main(int argc, char *argv[]) if (lseek(fd, pos1, SEEK_SET) < 0) err(1, "seek error"); + if (fchmod(fd, 0642)) + err(1, "Can't set permission bits"); + + if (fstat(fd, &stat) < 0) + err(1, "fstat() issue"); + fmode1 = stat.st_mode; + test_daemon(); test_waitsig(); @@ -85,6 +94,15 @@ int main(int argc, char *argv[]) return 1; } + if (fstat(fd, &stat) < 0) + err(1, "fstat() issue"); + fmode2 = stat.st_mode; + + if (fmode1 != fmode2) { + fail("stat.st_mode = %#o != %#o", fmode2, fmode1); + return 1; + } + pos2 = lseek(fd, 0, SEEK_CUR); if (pos1 != pos2) { fail("position differs"); diff --git a/test/zdtm/static/memfd04.c b/test/zdtm/static/memfd04.c new file mode 100644 index 000000000..215e949d1 --- /dev/null +++ b/test/zdtm/static/memfd04.c @@ -0,0 +1,132 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "exec(memfd)"; +const char *test_author = "Michał Mirosław "; + +static int _memfd_create(const char *name, unsigned int flags) +{ + return syscall(SYS_memfd_create, name, flags); +} + +static int _execveat(int dirfd, const char *pathname, const char *const argv[], const char *const envp[], int flags) +{ + return syscall(SYS_execveat, dirfd, pathname, argv, envp, flags); +} + +static const char *const script_argv[] = { "true", NULL }; +static const char *const script_env[] = { NULL }; + +static bool test_exec_fd(int fd) +{ + int err, pid, status; + + err = fcntl(fd, F_GETFD); + if (err < 0) { + fail("fcntl(F_GETFD)"); + return false; + } + if (err) { + errno = 0; + fail("F_GETFD for the memfd returned %d but expected 0", err); + return false; + } + + pid = fork(); + if (!pid) { + _execveat(fd, "", script_argv, script_env, AT_EMPTY_PATH); + err = errno; + pr_perror("execveat()"); + _exit(err); + } + + if (pid < 0) { + fail("fork()"); + return false; + } + + while (waitpid(pid, &status, 0) != pid) { + if (errno == EINTR) + continue; + fail("waitpid(child=%d)", pid); + return false; + } + + if (status != 0) { + pr_err("child exited with status=%d\n", status); + return false; + } + + return true; +} + +static const char script[] = "#!/bin/true"; +static const size_t script_len = sizeof(script) - 1; + +int main(int argc, char *argv[]) +{ +#ifdef MEMFD05 + char path[PATH_MAX]; + char *addr_p, *addr_s; + int rofd; +#endif + int fd; + + test_init(argc, argv); + + fd = _memfd_create("somename", 0); + if (fd < 0) { + pr_perror("memfd_create()"); + return 1; + } + if (ftruncate(fd, script_len) == -1) { + pr_perror("ftruncate"); + return 1; + } + if (write(fd, script, script_len) != script_len) { + pr_perror("write(memfd)"); + return 1; + } +#ifdef MEMFD05 + snprintf(path, PATH_MAX - 1, "/proc/self/fd/%d", fd); + rofd = open(path, O_RDONLY); + if (rofd < 0) { + pr_perror("unable to open read-only memfd"); + return 1; + } + addr_p = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE, rofd, 0); + if (addr_p == MAP_FAILED) { + pr_perror("mmap"); + return 1; + } + addr_s = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); + if (addr_s == MAP_FAILED) { + pr_perror("mmap"); + return 1; + } +#endif + + if (!test_exec_fd(fd)) + return 1; + + test_msg("execveat(memfd) succeeded before C/R.\n"); + + test_daemon(); + test_waitsig(); + + if (!test_exec_fd(fd)) + return 1; + + pass(); + + return 0; +} diff --git a/test/zdtm/static/memfd04.desc b/test/zdtm/static/memfd04.desc new file mode 100644 index 000000000..bbf136d14 --- /dev/null +++ b/test/zdtm/static/memfd04.desc @@ -0,0 +1 @@ +{'deps': ['/bin/true']} diff --git a/test/zdtm/static/memfd05.c b/test/zdtm/static/memfd05.c new file mode 120000 index 000000000..6caa9556f --- /dev/null +++ b/test/zdtm/static/memfd05.c @@ -0,0 +1 @@ +memfd04.c \ No newline at end of file diff --git a/test/zdtm/static/memfd05.desc b/test/zdtm/static/memfd05.desc new file mode 120000 index 000000000..1b4963572 --- /dev/null +++ b/test/zdtm/static/memfd05.desc @@ -0,0 +1 @@ +memfd04.desc \ No newline at end of file diff --git a/test/zdtm/static/mnt_ext_file_bind_auto.c b/test/zdtm/static/mnt_ext_file_bind_auto.c new file mode 100644 index 000000000..0c3b9f5fb --- /dev/null +++ b/test/zdtm/static/mnt_ext_file_bind_auto.c @@ -0,0 +1,104 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if external file mount works"; +const char *test_author = "Pavel Tikhomirov "; + +char *filename = "mnt_ext_file_bind_auto_bind_auto.file"; +TEST_OPTION(filename, string, "file name", 1); + +char *source = "mnt_ext_file_bind_auto_bind_auto.source"; + +int create_file(const char *path) +{ + int fd; + + fd = open(path, O_CREAT | O_RDWR, 0644); + if (fd < 0) { + pr_perror("open"); + return -1; + } + + close(fd); + return 0; +} + +int main(int argc, char **argv) +{ + char *zdtm_newns = getenv("ZDTM_NEWNS"); + char *tmp = "/tmp/zdtm_ext_file_bind_auto.tmp"; + char *sourcefile = "/tmp/zdtm_ext_file_bind_auto.file"; + char *root, tmpfile[PATH_MAX], testfile[PATH_MAX]; + + root = getenv("ZDTM_ROOT"); + if (root == NULL) { + pr_perror("root"); + return 1; + } + + if (!zdtm_newns) { + pr_perror("ZDTM_NEWNS is not set"); + return 1; + } else if (strcmp(zdtm_newns, "1")) { + goto test; + } + + /* Prepare file bindmount in criu root (source for external file bindmount) */ + mkdir(tmp, 0755); + if (mount(source, tmp, "tmpfs", 0, NULL)) { + pr_perror("mount tmpfs"); + return 1; + } + if (mount(NULL, tmp, NULL, MS_PRIVATE, NULL)) { + pr_perror("make private"); + return 1; + } + + sprintf(tmpfile, "%s/%s", tmp, filename); + if (create_file(tmpfile)) + return 1; + + if (create_file(sourcefile)) + return 1; + + if (mount(tmpfile, sourcefile, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } + + umount2(tmp, MNT_DETACH); + + /* Prepare file in test root (mount point for external file bindmount) */ + sprintf(testfile, "%s/%s", root, filename); + if (create_file(testfile)) + return 1; + + /* + * Create temporary mntns, next mounts will not show up in criu mntns + * and will be inherited into test mntns + */ + if (unshare(CLONE_NEWNS)) { + pr_perror("unshare"); + return 1; + } + + if (mount(sourcefile, testfile, NULL, MS_BIND, NULL)) { + pr_perror("bind"); + return 1; + } +test: + test_init(argc, argv); + + test_daemon(); + test_waitsig(); + + pass(); + return 0; +} diff --git a/test/zdtm/static/mnt_ext_file_bind_auto.desc b/test/zdtm/static/mnt_ext_file_bind_auto.desc new file mode 100644 index 000000000..825b08127 --- /dev/null +++ b/test/zdtm/static/mnt_ext_file_bind_auto.desc @@ -0,0 +1,4 @@ +{ 'opts': '--external mnt[]', + 'feature': 'mnt_id', + 'flavor': 'ns uns', + 'flags': 'suid'} diff --git a/test/zdtm/static/mnt_ro_root.c b/test/zdtm/static/mnt_ro_root.c new file mode 100644 index 000000000..2d8370150 --- /dev/null +++ b/test/zdtm/static/mnt_ro_root.c @@ -0,0 +1,32 @@ +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check if root mount remains read-only after c/r"; +const char *test_author = "Pavel Tikhomirov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + if (mount(NULL, "/", NULL, MS_REMOUNT | MS_RDONLY | MS_BIND, NULL)) { + pr_perror("mount"); + return 1; + } + + test_daemon(); + test_waitsig(); + + /* + * Note: In zdtm.py:check_visible_state() we already check for all + * tests, that all mounts in the test's mount namespace remain the + * same, by comparing mountinfo before and after c/r. So rw/ro mount + * option inconsistency will be detected there and we don't need to + * check it in the test itself. + */ + pass(); + return 0; +} diff --git a/test/zdtm/static/mnt_ro_root.desc b/test/zdtm/static/mnt_ro_root.desc new file mode 100644 index 000000000..c9a8e4f18 --- /dev/null +++ b/test/zdtm/static/mnt_ro_root.desc @@ -0,0 +1,6 @@ +{ + 'flavor': 'ns uns', + 'flags': 'suid', + 'feature': 'mnt_id', + 'bind': 'zdtm/static', +} diff --git a/test/zdtm/static/mnt_root_ext.c b/test/zdtm/static/mnt_root_ext.c index 6a2eb068c..305e87262 100644 --- a/test/zdtm/static/mnt_root_ext.c +++ b/test/zdtm/static/mnt_root_ext.c @@ -51,6 +51,14 @@ int main(int argc, char **argv) return 1; } + /* + * Make mounts in temporary mntns slave, to prevent propagation to criu mntns + */ + if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL)) { + pr_perror("make rslave"); + return 1; + } + /* * Populate to the tests root host's rootfs subdir */ diff --git a/test/zdtm/static/mntns_open.c b/test/zdtm/static/mntns_open.c index 7d8bbbaa4..0430f5b99 100644 --- a/test/zdtm/static/mntns_open.c +++ b/test/zdtm/static/mntns_open.c @@ -17,7 +17,7 @@ #define CLONE_NEWNS 0x00020000 #endif -const char *test_doc = "Check that mnt_id is repsected"; +const char *test_doc = "Check that mnt_id is respected"; const char *test_author = "Pavel Emelianov "; #define MPTS_FILE "F" diff --git a/test/zdtm/static/mntns_root_bind.c b/test/zdtm/static/mntns_root_bind.c index 9e1ba06e6..4c0347cb2 100644 --- a/test/zdtm/static/mntns_root_bind.c +++ b/test/zdtm/static/mntns_root_bind.c @@ -71,7 +71,7 @@ int main(int argc, char **argv) task_waiter_wait4(&t, 2); if (access(bspath, F_OK)) { - fail("%s isn't accessiable", bspath); + fail("%s isn't accessible", bspath); return 1; } diff --git a/test/zdtm/static/mount_complex_sharing.c b/test/zdtm/static/mount_complex_sharing.c index b4463c41a..5f247a8e4 100644 --- a/test/zdtm/static/mount_complex_sharing.c +++ b/test/zdtm/static/mount_complex_sharing.c @@ -5,6 +5,7 @@ #include #include +#include "mountinfo.h" #include "zdtmtst.h" const char *test_doc = "Check complex sharing options for mounts"; @@ -211,6 +212,8 @@ static int mount_loop(void) int main(int argc, char **argv) { + MNTNS_ZDTM(mntns_before); + MNTNS_ZDTM(mntns_after); int ret = 1; test_init(argc, argv); @@ -223,12 +226,23 @@ int main(int argc, char **argv) if (mount_loop()) goto err; + if (mntns_parse_mountinfo(&mntns_before)) + goto err; + test_daemon(); test_waitsig(); + if (mntns_parse_mountinfo(&mntns_after)) + goto err; + + if (mntns_compare(&mntns_before, &mntns_after)) + goto err; + pass(); ret = 0; err: + mntns_free_all(&mntns_before); + mntns_free_all(&mntns_after); if (ret) fail(); return ret; diff --git a/test/zdtm/static/mtime_mmap.c b/test/zdtm/static/mtime_mmap.c index faa2d6fad..4de8438ee 100644 --- a/test/zdtm/static/mtime_mmap.c +++ b/test/zdtm/static/mtime_mmap.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -77,7 +78,7 @@ int main(int argc, char **argv) mtime_new = fst.st_mtime; /* time of last modification */ if (mtime_new <= mtime_old) { - fail("mtime %ld wasn't updated on mmapped %s file", mtime_new, filename); + fail("mtime %" PRId64 " wasn't updated on mmapped %s file", (int64_t)mtime_new, filename); goto failed; } @@ -98,7 +99,7 @@ int main(int argc, char **argv) /* time of last modification */ if (fst.st_mtime != mtime_new) { - fail("After migration, mtime changed to %ld", fst.st_mtime); + fail("After migration, mtime changed to %" PRId64, (int64_t)fst.st_mtime); goto failed; } diff --git a/test/zdtm/static/net_lock_socket_iptables.desc b/test/zdtm/static/net_lock_socket_iptables.desc index 936ff8702..cb622536f 100644 --- a/test/zdtm/static/net_lock_socket_iptables.desc +++ b/test/zdtm/static/net_lock_socket_iptables.desc @@ -1,5 +1,6 @@ { 'flavor': 'h', + 'feature': 'has_ipt_legacy', 'flags': 'suid excl reqrst', 'dopts': '--tcp-established --network-lock iptables', 'ropts': '--tcp-established', diff --git a/test/zdtm/static/net_lock_socket_iptables.hook b/test/zdtm/static/net_lock_socket_iptables.hook index 0ee147eb2..e9fcd7350 100755 --- a/test/zdtm/static/net_lock_socket_iptables.hook +++ b/test/zdtm/static/net_lock_socket_iptables.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import socket import time diff --git a/test/zdtm/static/net_lock_socket_iptables6.desc b/test/zdtm/static/net_lock_socket_iptables6.desc index 936ff8702..cb622536f 100644 --- a/test/zdtm/static/net_lock_socket_iptables6.desc +++ b/test/zdtm/static/net_lock_socket_iptables6.desc @@ -1,5 +1,6 @@ { 'flavor': 'h', + 'feature': 'has_ipt_legacy', 'flags': 'suid excl reqrst', 'dopts': '--tcp-established --network-lock iptables', 'ropts': '--tcp-established', diff --git a/test/zdtm/static/netns-dev.c b/test/zdtm/static/netns-dev.c index 1e6ee1dea..f268f2fec 100644 --- a/test/zdtm/static/netns-dev.c +++ b/test/zdtm/static/netns-dev.c @@ -414,7 +414,7 @@ static int check_stable_secret(struct test_conf *tc) return -1; } - ret = fscanf(fp, "%s", val); + ret = fscanf(fp, "%200s", val); if (ret != 1) { pr_perror("fscanf"); fclose(fp); diff --git a/test/zdtm/static/netns-nf.desc b/test/zdtm/static/netns-nf.desc index e7e73b1ae..58c23e8ba 100644 --- a/test/zdtm/static/netns-nf.desc +++ b/test/zdtm/static/netns-nf.desc @@ -1,6 +1,7 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', '/usr/bin/diff'], 'flags': 'suid', + 'feature': 'has_ipt_legacy', 'flavor': 'ns uns'} diff --git a/test/zdtm/static/netns-nft-ipt.desc b/test/zdtm/static/netns-nft-ipt.desc index 4120f74d6..6d04589b3 100644 --- a/test/zdtm/static/netns-nft-ipt.desc +++ b/test/zdtm/static/netns-nft-ipt.desc @@ -2,7 +2,7 @@ 'deps': [ '/bin/sh', '/usr/sbin/nft', '/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', '/usr/bin/diff'], 'flags': 'suid', 'flavor': 'ns uns'} diff --git a/test/zdtm/static/netns_lock_iptables.desc b/test/zdtm/static/netns_lock_iptables.desc index 69020f34e..b465706b8 100644 --- a/test/zdtm/static/netns_lock_iptables.desc +++ b/test/zdtm/static/netns_lock_iptables.desc @@ -1,6 +1,7 @@ { 'flavor': 'h', 'flags': 'suid excl reqrst', + 'feature': 'has_ipt_legacy', 'opts': '--tcp-established', 'dopts': '--network-lock iptables', 'ropts': '--join-ns net:/var/run/netns/criu-net-lock-test' diff --git a/test/zdtm/static/netns_lock_iptables.hook b/test/zdtm/static/netns_lock_iptables.hook index e7daf8a65..b51d3c2cc 100755 --- a/test/zdtm/static/netns_lock_iptables.hook +++ b/test/zdtm/static/netns_lock_iptables.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import subprocess import socket @@ -67,7 +67,7 @@ if sys.argv[1] == "--post-start": cln, addr = srv.accept() cln.sendall(str.encode("--post-restore")) cln.close() - + # Server will be closed when zdtm sends SIGKILL if sys.argv[1] == "--pre-dump": diff --git a/test/zdtm/static/netns_sub_sysctl.c b/test/zdtm/static/netns_sub_sysctl.c index 545a17308..03b478b7d 100644 --- a/test/zdtm/static/netns_sub_sysctl.c +++ b/test/zdtm/static/netns_sub_sysctl.c @@ -1,20 +1,38 @@ #include +#include +#include #include "zdtmtst.h" #include "sysctl.h" -const char *test_doc = "Check dump and restore a net.unix.max_dgram_qlen sysctl parameter in subns"; +const char *test_doc = "Check dump and restore of sysctls in subns"; const char *test_author = "Alexander Mikhalitsyn "; +#define MAX_STR_SYSCTL_LEN 200 + +enum { + SYSCTL_INT, + SYSCTL_STR, +}; + typedef struct { const char *path; + int type; int old; int new; + char s_old[MAX_STR_SYSCTL_LEN]; + char s_new[MAX_STR_SYSCTL_LEN]; + bool set; } sysctl_opt_t; #define CONF_UNIX_BASE "/proc/sys/net/unix" +#define IPV4_SYSCTL_BASE "/proc/sys/net/ipv4" -static sysctl_opt_t net_unix_params[] = { { CONF_UNIX_BASE "/max_dgram_qlen", 0, 0 }, { NULL, 0, 0 } }; +static sysctl_opt_t net_unix_params[] = { + {CONF_UNIX_BASE "/max_dgram_qlen", SYSCTL_INT}, + {IPV4_SYSCTL_BASE "/ping_group_range", SYSCTL_STR, 0, 0, "40000\t50000\n"}, + {NULL, 0, 0} +}; int main(int argc, char **argv) { @@ -23,10 +41,22 @@ int main(int argc, char **argv) test_init(argc, argv); for (p = net_unix_params; p->path != NULL; p++) { - p->old = (((unsigned)lrand48()) % 1023) + 1; - if (sysctl_write_int(p->path, p->old)) { - pr_perror("Can't change %s", p->path); - return -1; + if (access(p->path, W_OK) != 0) { + test_msg("%s doesn't exist\n", p->path); + continue; + } + p->set = true; + if (p->type == SYSCTL_INT) { + p->old = (((unsigned)lrand48()) % 1023) + 1; + if (sysctl_write_int(p->path, p->old)) { + pr_perror("Can't change %s", p->path); + return -1; + } + } else if (p->type == SYSCTL_STR) { + if (sysctl_write_str(p->path, p->s_old)) { + pr_perror("Can't change %s", p->path); + return -1; + } } } @@ -34,13 +64,27 @@ int main(int argc, char **argv) test_waitsig(); for (p = net_unix_params; p->path != NULL; p++) { - if (sysctl_read_int(p->path, &p->new)) - ret = 1; + if (!p->set) + continue; + if (p->type == SYSCTL_INT) { + if (sysctl_read_int(p->path, &p->new)) + ret = 1; - if (p->old != p->new) { - errno = EINVAL; - pr_perror("%s changed: %d ---> %d", p->path, p->old, p->new); - ret = 1; + if (p->old != p->new) { + errno = EINVAL; + pr_perror("%s changed: %d ---> %d", p->path, p->old, p->new); + ret = 1; + } + } else if (p->type == SYSCTL_STR) { + if (sysctl_read_str(p->path, p->s_new, MAX_STR_SYSCTL_LEN)) { + ret = 1; + } else { + if (strcmp(p->s_old, p->s_new)) { + errno = EINVAL; + pr_perror("%s changed: %s ---> %s", p->path, p->s_old, p->s_new); + ret = 1; + } + } } } diff --git a/test/zdtm/static/netns_sub_sysctl.desc b/test/zdtm/static/netns_sub_sysctl.desc index 535842668..0c357aefe 100644 --- a/test/zdtm/static/netns_sub_sysctl.desc +++ b/test/zdtm/static/netns_sub_sysctl.desc @@ -1,4 +1,4 @@ { - 'flavor': 'ns', + 'flavor': 'ns uns', 'flags': 'suid' } diff --git a/test/zdtm/static/ofd_file_locks.c b/test/zdtm/static/ofd_file_locks.c index 68b6f22f5..a68fa38ee 100644 --- a/test/zdtm/static/ofd_file_locks.c +++ b/test/zdtm/static/ofd_file_locks.c @@ -16,7 +16,7 @@ static int parse_ofd_lock(char *buf, struct flock *lck) if (strncmp(buf, "lock:\t", 6) != 0) return 1; /* isn't lock, skip record */ - num = sscanf(buf, "%*s %*d: %s %s %s %*d %*x:%*x:%*d %lld %s", fl_flag, fl_type, fl_option, &start, fl_end); + num = sscanf(buf, "%*s %*d: %9s %14s %9s %*d %*x:%*x:%*d %lld %31s", fl_flag, fl_type, fl_option, &start, fl_end); if (num < 4) { pr_err("Invalid lock info %s\n", buf); diff --git a/test/zdtm/static/packet_sock.c b/test/zdtm/static/packet_sock.c index 4a9078f81..c1c94ac21 100644 --- a/test/zdtm/static/packet_sock.c +++ b/test/zdtm/static/packet_sock.c @@ -5,7 +5,7 @@ const char *test_author = "Pavel Emelyanov "; /* * Description: - * Create and bind several packet sockets, check thet getname + * Create and bind several packet sockets, check that getname * reports same result before and after c/r cycle. This is enough * for _basic_ packet functionality only, but still. */ diff --git a/test/zdtm/static/pidfd_child.c b/test/zdtm/static/pidfd_child.c new file mode 100644 index 000000000..ec559605d --- /dev/null +++ b/test/zdtm/static/pidfd_child.c @@ -0,0 +1,66 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Checks pidfd sends signal to child process after restore\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +int main(int argc, char* argv[]) +{ + int pidfd, status; + pid_t child; + + test_init(argc, argv); + + child = fork(); + if (child < 0) { + pr_perror("Unable to fork a new process"); + return 1; + } else if (child == 0) { + test_waitsig(); + return 0; + } + + pidfd = pidfd_open(child, 0); + if (pidfd < 0) { + pr_perror("pidfd_open failed"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, 0)) { + fail("Could not send signal"); + goto err_close; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + goto err_close; + } + + if (status != 0) { + fail("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); + goto err_close; + } + + pass(); + close(pidfd); + return 0; +err_close: + close(pidfd); + return 1; +} diff --git a/test/zdtm/static/pidfd_dead.c b/test/zdtm/static/pidfd_dead.c new file mode 100644 index 000000000..9c825899d --- /dev/null +++ b/test/zdtm/static/pidfd_dead.c @@ -0,0 +1,244 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check C/R of pidfds that point to dead processes\n"; +const char *test_author = "Bhavik Sachdev "; + +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + +/* + * main + * `- child + * `- grandchild + * + * main opens a pidfd for both child and grandchild. + * Before C/R we kill both child and grandchild. + * We end up with two unique dead pidfds. + */ + +static long get_fs_type(int lfd) +{ + struct statfs fst; + + if (fstatfs(lfd, &fst)) { + return -1; + } + return fst.f_type; +} + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int open_pidfd_pair(int pidfd[2], int pid) +{ + pidfd[0] = pidfd_open(pid, 0); + if (pidfd[0] < 0) { + pr_perror("pidfd_open() failed"); + return 1; + } + + pidfd[1] = pidfd_open(pid, 0); + if (pidfd[1] < 0) { + close(pidfd[0]); + pr_perror("pidfd_open() failed"); + return 1; + } + return 0; +} + +static int compare_pidfds(int pidfd[2]) +{ + /* + * After linux 6.9 we can compare inode numbers + * to determine if two pidfds point to the same process. + * While the inode number may change before and after C/R + * pidfds pointing to the same pid should have the same inode number. + */ + struct statx stats[2]; + statx(pidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[0]); + statx(pidfd[1], "", AT_EMPTY_PATH, STATX_ALL, &stats[1]); + if (stats[0].stx_ino != stats[1].stx_ino) + return 1; + return 0; +} + +static int check_for_pidfs(void) +{ + long type; + int pidfd = pidfd_open(getpid(), 0); + if (pidfd < 0) { + pr_perror("pidfd open() failed"); + return -1; + } + type = get_fs_type(pidfd); + close(pidfd); + return type == PID_FS_MAGIC; +} + +int main(int argc, char* argv[]) +{ + #define READ 0 + #define WRITE 1 + + int child, ret, gchild, p[2], status; + int cpidfd[2], gpidfd[2]; + struct statx stats[2]; + + test_init(argc, argv); + + ret = check_for_pidfs(); + if (ret < 0) + return 1; + + if (ret == 0) { + test_daemon(); + test_waitsig(); + skip("Test requires pidfs. skipping..."); + pass(); + return 0; + } + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + child = test_fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } else if (child == 0) { + int gchild = test_fork(); + close(p[READ]); + if (gchild < 0) { + pr_perror("fork"); + return 1; + } else if (gchild == 0) { + close(p[WRITE]); + while(1) + sleep(1000); + } else { + if (write(p[WRITE], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + close(p[WRITE]); + if (waitpid(gchild, &status, 0) != gchild) { + pr_perror("waitpid"); + return 1; + } + + if (!WIFSIGNALED(status)) { + fail("Expected grandchild to be terminated by a signal"); + return 1; + } + + if (WTERMSIG(status) != SIGKILL) { + fail("Expected grandchild to be terminated by SIGKILL"); + return 1; + } + + return 0; + } + } + + ret = open_pidfd_pair(cpidfd, child); + if (ret) + return 1; + + close(p[WRITE]); + if (read(p[READ], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + close(p[READ]); + + ret = open_pidfd_pair(gpidfd, gchild); + if (ret) + return 1; + + /* + * We kill grandchild and child processes only after opening pidfds. + */ + if (pidfd_send_signal(gpidfd[0], SIGKILL, NULL, 0)) { + pr_perror("pidfd_send_signal"); + goto fail_close; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid"); + goto fail_close; + } + + if (!WIFEXITED(status)) { + fail("Expected child to exit normally"); + goto fail_close; + } + + if (WEXITSTATUS(status) != 0) { + fail("Expected child to exit with 0"); + goto fail_close; + } + usleep(1000); + + if (kill(gchild, 0) != -1 && errno != ESRCH) { + fail("Expected grand child to not exist"); + goto fail_close; + } + + if (kill(child, 0) != -1 && errno != ESRCH) { + fail("Expected child to not exist"); + goto fail_close; + } + + test_daemon(); + test_waitsig(); + + ret = compare_pidfds(cpidfd); + if (ret) { + fail("inodes not same for same pid"); + goto fail_close; + } + + ret = compare_pidfds(gpidfd); + if (ret) { + fail("inodes not same for same pid"); + goto fail_close; + } + + statx(cpidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[0]); + statx(gpidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[1]); + if (stats[0].stx_ino == stats[1].stx_ino) { + fail("pidfds pointing to diff pids should have diff inodes"); + goto fail_close; + } + + pass(); + close(cpidfd[0]); + close(cpidfd[1]); + close(gpidfd[0]); + close(gpidfd[1]); + return 0; + +fail_close: + close(cpidfd[0]); + close(cpidfd[1]); + close(gpidfd[0]); + close(gpidfd[1]); + return 1; +} diff --git a/test/zdtm/static/pidfd_diffdead.c b/test/zdtm/static/pidfd_diffdead.c new file mode 100644 index 000000000..5bc1911a5 --- /dev/null +++ b/test/zdtm/static/pidfd_diffdead.c @@ -0,0 +1,228 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check C/R of processes that point to a common dead pidfd\n"; +const char *test_author = "Bhavik Sachdev "; + +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + +/* + * main + * `- child + * `- grandchild + * + * main and child open a pidfd for grandchild. + * Before C/R we kill grandchild. + * We end up with two pidfds in two diff processes that point to the same dead process. + */ + +static long get_fs_type(int lfd) +{ + struct statfs fst; + + if (fstatfs(lfd, &fst)) { + return -1; + } + return fst.f_type; +} + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int check_for_pidfs(void) +{ + long type; + int pidfd = pidfd_open(getpid(), 0); + if (pidfd < 0) { + pr_perror("pidfd open() failed"); + return -1; + } + type = get_fs_type(pidfd); + close(pidfd); + return type == PID_FS_MAGIC; +} + +int main(int argc, char *argv[]) +{ +#define READ 0 +#define WRITE 1 + + int child, ret, gchild, status; + struct statx stat; + task_waiter_t t; + unsigned long long ino; + + /* + * We use the inop pipe to send the inode number of the + * pidfd opened in the child to the main process for + * comparison. + */ + int p[2]; + int pidfd; + + test_init(argc, argv); + task_waiter_init(&t); + + ret = check_for_pidfs(); + if (ret < 0) + return 1; + + if (ret == 0) { + test_daemon(); + test_waitsig(); + skip("Test requires pidfs. skipping..."); + pass(); + return 0; + } + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + child = test_fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } else if (child == 0) { + int gchild; + gchild = test_fork(); + if (gchild < 0) { + pr_perror("fork"); + return 1; + } else if (gchild == 0) { + close(p[READ]); + close(p[WRITE]); + while (1) + sleep(1000); + } else { + if (write(p[WRITE], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + + pidfd = pidfd_open(gchild, 0); + if (pidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + if (waitpid(gchild, &status, 0) != gchild) { + pr_perror("waitpid"); + return 1; + } + + if (!WIFSIGNALED(status)) { + fail("Expected grandchild to be terminated by a signal"); + return 1; + } + + if (WTERMSIG(status) != SIGKILL) { + fail("Expected grandchild to be terminated by SIGKILL"); + return 1; + } + task_waiter_complete(&t, 1); + + test_waitsig(); + + if (statx(pidfd, "", AT_EMPTY_PATH, STATX_ALL, &stat) < 0) { + pr_perror("statx"); + return 1; + } + + close(p[WRITE]); + if (read(p[READ], &ino, sizeof(ino)) != sizeof(ino)) { + pr_perror("read"); + return 1; + } + close(p[READ]); + close(pidfd); + + /* ino number should be same because both pidfds were for the same process */ + if (ino != stat.stx_ino) { + exit(1); + } + exit(0); + } + } + + if (read(p[READ], &gchild, sizeof(int)) != sizeof(int)) { + pr_perror("write"); + return 1; + } + + pidfd = pidfd_open(gchild, 0); + if (pidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + /* + * We kill grandchild process only after opening pidfd. + */ + if (pidfd_send_signal(pidfd, SIGKILL, NULL, 0)) { + pr_perror("pidfd_send_signal"); + return 1; + } + + /* Wait for child to waitpid on gchild */ + task_waiter_wait4(&t, 1); + + test_daemon(); + test_waitsig(); + + close(p[READ]); + if (statx(pidfd, "", AT_EMPTY_PATH, STATX_ALL, &stat) < 0) { + pr_perror("statx"); + goto err; + } + + /* Send inode number of pidfd to child for comparison */ + if (write(p[WRITE], &stat.stx_ino, sizeof(stat.stx_ino)) != sizeof(stat.stx_ino)) { + pr_perror("write"); + goto err; + } + close(p[WRITE]); + + if (kill(child, SIGTERM)) { + pr_perror("kill"); + goto err; + } + + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid"); + goto err; + } + + if (!WIFEXITED(status)) { + fail("Expected child to terminate normally"); + goto err; + } + + if (WEXITSTATUS(status) != 0) { + fail("Child failed"); + goto err; + } + + pass(); + close(pidfd); + return 0; +err: + close(pidfd); + return 1; +} diff --git a/test/zdtm/static/pidfd_kill.c b/test/zdtm/static/pidfd_kill.c new file mode 100644 index 000000000..6232d033a --- /dev/null +++ b/test/zdtm/static/pidfd_kill.c @@ -0,0 +1,128 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Kill child and grandchild process using pidfds\n"; +const char *test_author = "Bhavik Sachdev "; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int wait_for_child(int child) +{ + int status; + if (waitpid(child, &status, 0) != child) { + pr_perror("waitpid()"); + return 1; + } + + if (status != 0) { + test_msg("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), + WIFSIGNALED(status), WTERMSIG(status)); + } + + return 0; +} + +int main(int argc, char* argv[]) +{ + #define READ 0 + #define WRITE 1 + + int child, gchild, cpidfd, gpidfd, gchild_pid, ret; + int p[2]; + + if (pipe(p)) { + pr_perror("pipe"); + return 1; + } + + test_init(argc, argv); + + child = fork(); + if (child < 0) { + pr_perror("fork"); + return 1; + } + + if (child == 0) { + gchild = fork(); + if (gchild < 0) { + pr_perror("fork"); + return 1; + } + + if (gchild == 0) { + test_waitsig(); + return 0; + } + + close(p[READ]); + if (write(p[WRITE], &gchild, sizeof(gchild)) + != sizeof(gchild)) { + pr_perror("write"); + return 1; + } + close(p[WRITE]); + + test_waitsig(); + return wait_for_child(gchild); + } + + cpidfd = pidfd_open(child, 0); + if (cpidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + close(p[WRITE]); + if (read(p[READ], &gchild_pid, sizeof(gchild_pid)) + != sizeof(gchild_pid)) { + pr_perror("read"); + return 1; + } + close(p[READ]); + + gpidfd = pidfd_open(gchild_pid, 0); + if (gpidfd < 0) { + pr_perror("pidfd_open"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(gpidfd, SIGKILL, NULL, 0)) { + pr_perror("Could not send signal"); + goto fail_close; + } + + if (pidfd_send_signal(cpidfd, SIGKILL, NULL, 0)) { + pr_perror("Could not send signal"); + goto fail_close; + } + + ret = wait_for_child(child); + if (ret) + goto fail_close; + + pass(); + close(cpidfd); + close(gpidfd); + return 0; + +fail_close: + fail(); + close(cpidfd); + close(gpidfd); + return 1; +} diff --git a/test/zdtm/static/pidfd_of_thread.c b/test/zdtm/static/pidfd_of_thread.c new file mode 100644 index 000000000..d232c7ac1 --- /dev/null +++ b/test/zdtm/static/pidfd_of_thread.c @@ -0,0 +1,114 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check C/R of pidfds that point to threads\n"; +const char *test_author = "Bhavik Sachdev "; + +/* see also: https://codebrowser.dev/glibc/glibc/sysdeps/unix/sysv/linux/tst-clone3.c.html */ + +#ifndef PIDFD_THREAD +#define PIDFD_THREAD O_EXCL +#endif + +#ifndef PIDFD_SIGNAL_THREAD +#define PIDFD_SIGNAL_THREAD (1UL << 0) +#endif + +#ifndef PID_FS_MAGIC +#define PID_FS_MAGIC 0x50494446 +#endif + +static long get_fs_type(int lfd) +{ + struct statfs fst; + + if (fstatfs(lfd, &fst)) { + return -1; + } + return fst.f_type; +} + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static int thread_func(void *a) +{ + test_waitsig(); + return 0; +} + +#define CTID_INIT_VAL 1 + +int main(int argc, char* argv[]) +{ + char st[64 * 1024] __attribute__ ((aligned)); + pid_t tid; + int pidfd, test_pidfd; + futex_t exited; + + int clone_flags = CLONE_THREAD; + clone_flags |= CLONE_VM | CLONE_SIGHAND; + clone_flags |= CLONE_CHILD_CLEARTID; + + test_init(argc, argv); + + test_pidfd = pidfd_open(getpid(), 0); + if (test_pidfd < 0) { + pr_perror("pidfd_open() failed"); + return 1; + } + + /* PIDFD_THREAD, PIDFD_SIGNAL_THREAD are supported only with pidfs */ + if (get_fs_type(test_pidfd) != PID_FS_MAGIC) { + test_daemon(); + test_waitsig(); + skip("pidfs not supported."); + close(test_pidfd); + return 0; + } + close(test_pidfd); + + futex_set(&exited, CTID_INIT_VAL); + + tid = clone(thread_func, st + sizeof(st), clone_flags, NULL, NULL, NULL, &(exited.raw)); + if (tid == -1) { + pr_perror("clone() failed"); + return 1; + } + + test_msg("Successfully created a thread with tid: %d\n", tid); + pidfd = pidfd_open(tid, PIDFD_THREAD); + if (pidfd < 0) { + pr_perror("pidfd_open() failed"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (pidfd_send_signal(pidfd, SIGTERM, NULL, PIDFD_SIGNAL_THREAD)) { + pr_perror("pidfd_send_signal() failed"); + fail(); + close(pidfd); + return 1; + } + + test_msg("Waiting for thread to exit\n"); + futex_wait_until(&exited, 0); + + pass(); + close(pidfd); + return 0; +} diff --git a/test/zdtm/static/pidfd_of_thread.desc b/test/zdtm/static/pidfd_of_thread.desc new file mode 100644 index 000000000..802caed65 --- /dev/null +++ b/test/zdtm/static/pidfd_of_thread.desc @@ -0,0 +1 @@ +{'flags': 'noauto crfail'} diff --git a/test/zdtm/static/pidfd_self.c b/test/zdtm/static/pidfd_self.c new file mode 100644 index 000000000..2730ee123 --- /dev/null +++ b/test/zdtm/static/pidfd_self.c @@ -0,0 +1,140 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check pidfd /proc/self/fdinfo/ entry remains consistent after checkpoint/restore\n"; +const char *test_author = "Bhavik Sachdev "; + +struct pidfd_status { + unsigned int flags; + pid_t pid; +}; + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) +{ + return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); +} + +static void show_pidfd(char *prefix, struct pidfd_status *s) +{ + test_msg("\n\t%s\n\tflags: 0%o\n\tpid: %d\n", prefix, s->flags, s->pid); +} + +static int parse_self_fdinfo(int pidfd, struct pidfd_status *s) +{ + char buf[256]; + int ret = -1; + FILE *f; + + sprintf(buf, "/proc/self/fdinfo/%d", pidfd); + f = fopen(buf, "r"); + if (!f) { + perror("Can't open /proc/self/fdinfo/ to parse"); + return -1; + } + + memset(s, 0, sizeof(*s)); + + /* + * flags: file access mode (octal) 02000002 => [O_RDWR | O_CLOEXEC] + * pid: the pid to which we have pidfd open + */ + while (fgets(buf, sizeof(buf), f)) { + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + + if (sscanf(buf, "flags: 0%o", &s->flags) != 1) { + goto parse_err; + } + + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + + if (!fgets(buf, sizeof(buf), f)) + goto parse_err; + + if (sscanf(buf, "Pid: %d", &s->pid) != 1) + goto parse_err; + ret = 0; + break; + } + + if (ret) + goto parse_err; +err: + fclose(f); + return ret; + +parse_err: + pr_perror("Format error"); + goto err; +} + +static int check_pidfd(int fd, struct pidfd_status *old) +{ + struct pidfd_status new; + + if (parse_self_fdinfo(fd, &new)) + return -1; + + show_pidfd("restored", &new); + + if (old->flags != new.flags || old->pid != new.pid) + return -1; + + return 0; +} + +int main(int argc, char* argv[]) +{ + struct pidfd_status old; + int pidfd, ret; + + test_init(argc, argv); + + pidfd = pidfd_open(getpid(), 0); + if (pidfd < 0) { + pr_perror("pidfd_open failed"); + return 1; + } + + parse_self_fdinfo(pidfd, &old); + + show_pidfd("old", &old); + + if (pidfd_send_signal(pidfd, 0, NULL, 0)) { + pr_perror("Could not send signal"); + return 1; + } + + test_daemon(); + test_waitsig(); + + ret = check_pidfd(pidfd, &old); + if (ret) { + fail(); + goto err; + } + + if (pidfd_send_signal(pidfd, 0, NULL, 0)) { + pr_perror("Could not send signal"); + fail(); + goto err; + } + + pass(); + close(pidfd); + return 0; +err: + close(pidfd); + return 1; +} diff --git a/test/zdtm/static/pthread00-pac.c b/test/zdtm/static/pthread00-pac.c new file mode 120000 index 000000000..3ee8dc1f1 --- /dev/null +++ b/test/zdtm/static/pthread00-pac.c @@ -0,0 +1 @@ +pthread00.c \ No newline at end of file diff --git a/test/zdtm/static/pthread_timers.c b/test/zdtm/static/pthread_timers.c index 5246a985f..b1b2a9a23 100644 --- a/test/zdtm/static/pthread_timers.c +++ b/test/zdtm/static/pthread_timers.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -69,7 +70,8 @@ int main(int argc, char **argv) } if (itimerspec.it_interval.tv_nsec != TEST_INTERVAL_NSEC || itimerspec.it_interval.tv_sec) { - pr_perror("wrong interval: %ld:%ld", itimerspec.it_interval.tv_sec, itimerspec.it_interval.tv_nsec); + pr_perror("wrong interval: %" PRId64 ":%" PRId64, + (int64_t)itimerspec.it_interval.tv_sec, (int64_t)itimerspec.it_interval.tv_nsec); return 1; } diff --git a/test/zdtm/static/rseq00.c b/test/zdtm/static/rseq00.c index 471ad6a43..7add7801e 100644 --- a/test/zdtm/static/rseq00.c +++ b/test/zdtm/static/rseq00.c @@ -46,12 +46,15 @@ static inline void *__criu_thread_pointer(void) static inline void unregister_glibc_rseq(void) { struct rseq *rseq = (struct rseq *)((char *)__criu_thread_pointer() + __rseq_offset); + unsigned int size = __rseq_size; /* hack: mark glibc rseq structure as failed to register */ rseq->cpu_id = RSEQ_CPU_ID_REGISTRATION_FAILED; /* unregister rseq */ - syscall(__NR_rseq, (void *)rseq, __rseq_size, 1, RSEQ_SIG); + if (__rseq_size < 32) + size = 32; + syscall(__NR_rseq, (void *)rseq, size, 1, RSEQ_SIG); } #else static inline void unregister_glibc_rseq(void) diff --git a/test/zdtm/static/s390x_regs_check.c b/test/zdtm/static/s390x_regs_check.c index 40c480b3f..82dca0519 100644 --- a/test/zdtm/static/s390x_regs_check.c +++ b/test/zdtm/static/s390x_regs_check.c @@ -40,13 +40,13 @@ const char *test_author = "Michael Holzheu "; * * - Verify that "criu restore" sets the correct register sets * from "criu dump": - * $ zdtmp.py run -t zdtm/static/s390x_regs_check + * $ zdtm.py run -t zdtm/static/s390x_regs_check * * - Verify that dumpee continues running with correct registers after * parasite injection: - * $ zdtmp.py run --norst -t zdtm/static/s390x_regs_check - * $ zdtmp.py run --norst --pre 2 -t zdtm/static/s390x_regs_check - * $ zdtmp.py run --check-only -t zdtm/static/s390x_regs_check + * $ zdtm.py run --norst -t zdtm/static/s390x_regs_check + * $ zdtm.py run --norst --pre 2 -t zdtm/static/s390x_regs_check + * $ zdtm.py run --check-only -t zdtm/static/s390x_regs_check */ #define NR_THREADS 2 #define NR_THREADS_ALL (NR_THREADS + 1) diff --git a/test/zdtm/static/sched_policy00.c b/test/zdtm/static/sched_policy00.c index dc71eed94..a35135050 100644 --- a/test/zdtm/static/sched_policy00.c +++ b/test/zdtm/static/sched_policy00.c @@ -51,7 +51,7 @@ int main(int argc, char **argv) } p.sched_priority = param; - if (sched_setscheduler(pid, SCHED_RR, &p)) { + if (sched_setscheduler(pid, SCHED_RR | SCHED_RESET_ON_FORK, &p)) { pr_perror("Can't set policy"); kill(pid, SIGKILL); return -1; @@ -61,7 +61,7 @@ int main(int argc, char **argv) test_waitsig(); ret = sched_getscheduler(pid); - if (ret != SCHED_RR) { + if (ret != (SCHED_RR | SCHED_RESET_ON_FORK)) { fail("Broken/No policy"); err++; } diff --git a/test/zdtm/static/scm00.c b/test/zdtm/static/scm00.c index d66975582..670e6fd6a 100644 --- a/test/zdtm/static/scm00.c +++ b/test/zdtm/static/scm00.c @@ -105,6 +105,9 @@ int main(int argc, char **argv) p[1] = p[0]; p[0] = -1; #endif +#endif +#ifdef CLOSE_SENDER_FD + close(sk[0]); #endif test_daemon(); diff --git a/test/zdtm/static/scm06.desc b/test/zdtm/static/scm06.desc index 2eac7e654..38cc3be51 100644 --- a/test/zdtm/static/scm06.desc +++ b/test/zdtm/static/scm06.desc @@ -1 +1,4 @@ -{'flags': 'suid'} +# This test isn't executed in the host flavor (in the same network namespace, +# because the kernel releases a test socket asynchronously, so the restore +# can fail if it is executed before the kernel actually destroys the socket. +{'flags': 'suid', 'flavor': 'ns uns'} diff --git a/test/zdtm/static/scm09.c b/test/zdtm/static/scm09.c new file mode 120000 index 000000000..4cab0edd2 --- /dev/null +++ b/test/zdtm/static/scm09.c @@ -0,0 +1 @@ +scm00.c \ No newline at end of file diff --git a/test/zdtm/static/seccomp_filter_inheritance.c b/test/zdtm/static/seccomp_filter_inheritance.c index 7a86cd85e..5afcb3f84 100644 --- a/test/zdtm/static/seccomp_filter_inheritance.c +++ b/test/zdtm/static/seccomp_filter_inheritance.c @@ -100,7 +100,7 @@ int main(int argc, char **argv) if (filter_syscall(__NR_ptrace) < 0) _exit(1); - if (filter_syscall(__NR_fstat) < 0) + if (filter_syscall(__NR_statx) < 0) _exit(1); zdtm_seccomp = 1; diff --git a/test/zdtm/static/seccomp_no_new_privs.c b/test/zdtm/static/seccomp_no_new_privs.c new file mode 100644 index 000000000..95f9501ed --- /dev/null +++ b/test/zdtm/static/seccomp_no_new_privs.c @@ -0,0 +1,42 @@ +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that NO_NEW_PRIVS attribute is restored"; +const char *test_author = "Michał Mirosław "; + +int main(int argc, char **argv) +{ + int ret; + + test_init(argc, argv); + + ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Can't read NO_NEW_PRIVS attribute"); + return 1; + } + if (ret != 0) + fail("initial NO_NEW_PRIVS = %d != 0", ret); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + if (ret) { + pr_perror("Can't set NO_NEW_PRIVS attribute"); + return 1; + } + + test_daemon(); + test_waitsig(); + + ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Can't read NO_NEW_PRIVS attribute"); + return 1; + } + if (ret != 1) + fail("restored NO_NEW_PRIVS = %d != 1", ret); + + pass(); + return 0; +} diff --git a/test/zdtm/static/selinux00.checkskip b/test/zdtm/static/selinux00.checkskip index 8d946a75e..4c85647d1 100755 --- a/test/zdtm/static/selinux00.checkskip +++ b/test/zdtm/static/selinux00.checkskip @@ -2,6 +2,19 @@ test -d /sys/fs/selinux || exit 1 +# check if necessary commands are installed +if ! command -v setenforce &>/dev/null; then + exit 1 +fi + +if ! command -v setsebool &>/dev/null; then + exit 1 +fi + +if ! command -v getsebool &>/dev/null; then + exit 1 +fi + # See selinux00.hook for details getsebool unconfined_dyntrans_all > /dev/null 2>&1 diff --git a/test/zdtm/static/shm-hugetlb.checkskip b/test/zdtm/static/shm-hugetlb.checkskip new file mode 100755 index 000000000..df2370815 --- /dev/null +++ b/test/zdtm/static/shm-hugetlb.checkskip @@ -0,0 +1,4 @@ +#!/bin/bash + +# will fail with EOPNOTSUPP +cat /proc/sys/vm/nr_hugepages &> /dev/null diff --git a/test/zdtm/static/sk-unix-listen01.c b/test/zdtm/static/sk-unix-listen01.c new file mode 100644 index 000000000..5c9274acb --- /dev/null +++ b/test/zdtm/static/sk-unix-listen01.c @@ -0,0 +1,117 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test in-flight unix sockets with data in them\n"; +const char *test_author = "Andrei Vagin "; + +#define SK_DATA "packet" + +char *filename; +TEST_OPTION(filename, string, "socket file name", 1); + +#define TEST_MODE 0640 + +#ifdef ZDTM_UNIX_SEQPACKET +#define SOCK_TYPE SOCK_SEQPACKET +#else +#define SOCK_TYPE SOCK_STREAM +#endif + +int main(int argc, char *argv[]) +{ + struct sockaddr_un addr; + unsigned int addrlen; + int ssk, sk; + + char path[PATH_MAX]; + char *cwd; + int ret; + + test_init(argc, argv); + + cwd = get_current_dir_name(); + if (!cwd) + return pr_perror("get_current_dir_name"); + + snprintf(path, sizeof(path), "%s/%s", cwd, filename); + unlink(path); + + addr.sun_family = AF_UNIX; + addrlen = strlen(filename); + if (addrlen > sizeof(addr.sun_path)) + return pr_err("address is too long"); + memcpy(addr.sun_path, filename, addrlen); + addrlen += sizeof(addr.sun_family); + + ssk = socket(AF_UNIX, SOCK_TYPE, 0); + if (ssk == -1) + return pr_perror("socket"); + + sk = socket(AF_UNIX, SOCK_TYPE, 0); + if (sk < 0) + return pr_perror("socket"); + + ret = bind(ssk, (struct sockaddr *)&addr, addrlen); + if (ret) + return pr_perror("bind"); + + ret = listen(ssk, 16); + if (ret) + return pr_perror("listen"); + + if (connect(sk, (struct sockaddr *)&addr, addrlen)) + return pr_perror("connect"); + +#ifdef SK_UNIX_LISTEN02 + { + char buf[64]; + memset(buf, 0, sizeof(buf)); + write(sk, SK_DATA, sizeof(SK_DATA)); + } +#endif + +#ifdef SK_UNIX_LISTEN03 + close(sk); + sk = -1; +#endif + + test_daemon(); + test_waitsig(); + + if (sk != -1) + close(sk); + + ret = accept(ssk, NULL, NULL); + if (ret < 0) + return fail("accept"); + +#ifdef SK_UNIX_LISTEN02 + { + char buf[64]; + if (read(ret, &buf, sizeof(buf)) != sizeof(SK_DATA)) + return pr_perror("read"); + + if (strcmp(buf, SK_DATA)) + return fail("data corrupted"); + } +#endif + + close(ssk); + unlink(path); + + pass(); + return 0; +} diff --git a/test/zdtm/static/sk-unix-listen02.c b/test/zdtm/static/sk-unix-listen02.c new file mode 120000 index 000000000..1211f4666 --- /dev/null +++ b/test/zdtm/static/sk-unix-listen02.c @@ -0,0 +1 @@ +sk-unix-listen01.c \ No newline at end of file diff --git a/test/zdtm/static/unlink_largefile.desc b/test/zdtm/static/sk-unix-listen02.desc similarity index 100% rename from test/zdtm/static/unlink_largefile.desc rename to test/zdtm/static/sk-unix-listen02.desc diff --git a/test/zdtm/static/sk-unix-listen03.c b/test/zdtm/static/sk-unix-listen03.c new file mode 120000 index 000000000..1211f4666 --- /dev/null +++ b/test/zdtm/static/sk-unix-listen03.c @@ -0,0 +1 @@ +sk-unix-listen01.c \ No newline at end of file diff --git a/test/zdtm/static/sk-unix-listen03.desc b/test/zdtm/static/sk-unix-listen03.desc new file mode 100644 index 000000000..ded89879a --- /dev/null +++ b/test/zdtm/static/sk-unix-listen03.desc @@ -0,0 +1 @@ +{'flags': 'crfail'} diff --git a/test/zdtm/static/sk-unix-listen04.c b/test/zdtm/static/sk-unix-listen04.c new file mode 120000 index 000000000..1211f4666 --- /dev/null +++ b/test/zdtm/static/sk-unix-listen04.c @@ -0,0 +1 @@ +sk-unix-listen01.c \ No newline at end of file diff --git a/test/zdtm/static/sk-unix-listen04.desc b/test/zdtm/static/sk-unix-listen04.desc new file mode 100644 index 000000000..ded89879a --- /dev/null +++ b/test/zdtm/static/sk-unix-listen04.desc @@ -0,0 +1 @@ +{'flags': 'crfail'} diff --git a/test/zdtm/static/sk-unix-restore-fs-share.c b/test/zdtm/static/sk-unix-restore-fs-share.c new file mode 100644 index 000000000..d4f6dde75 --- /dev/null +++ b/test/zdtm/static/sk-unix-restore-fs-share.c @@ -0,0 +1,196 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test non-empty process group with terminated parent and unix socket"; +const char *test_author = "Qiao Ma "; + +char *filename; +TEST_OPTION(filename, string, "socket file name", 1); + +static int create_and_connect(void) +{ + struct sockaddr_un addr; + int client_fd; + + client_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (client_fd == -1) { + pr_perror("socket"); + return -1; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + if (snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", filename) >= (int)sizeof(addr.sun_path)) { + pr_err("Socket path too long\n"); + close(client_fd); + return -1; + } + + if (connect(client_fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + pr_perror("connect"); + close(client_fd); + return -1; + } + + return 0; +} + +static int child(int ready_fd) +{ + int listen_fd; + struct sockaddr_un addr; + int ret = EXIT_FAILURE; + + listen_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (listen_fd == -1) { + pr_perror("socket"); + return EXIT_FAILURE; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + if (strlen(filename) >= sizeof(addr.sun_path)) { + pr_err("Socket path too long\n"); + goto cleanup; + } + strncpy(addr.sun_path, filename, sizeof(addr.sun_path)); + + unlink(filename); /* Ignore error if file doesn't exist */ + + if (bind(listen_fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + pr_perror("bind"); + goto cleanup; + } + + if (listen(listen_fd, 5) == -1) { + pr_perror("listen"); + goto cleanup; + } + + if (create_and_connect() != 0) { + pr_err("Failed to create and connect\n"); + goto cleanup; + } + + /* Signal parent that socket is ready */ + if (write(ready_fd, "1", 1) != 1) { + pr_perror("write ready_fd"); + goto cleanup; + } + + /* Wait indefinitely */ + pause(); + + ret = EXIT_SUCCESS; +cleanup: + if (listen_fd != -1) + close(listen_fd); + unlink(filename); + + return ret; +} + +static int zombie_leader(int *cpid) +{ + char buf; + pid_t pid; + int pipefd[2]; + + if (pipe(pipefd) == -1) { + pr_perror("pipe"); + return EXIT_FAILURE; + } + + if (setpgid(0, 0) == -1) { + pr_perror("setpgid"); + return EXIT_FAILURE; + } + + pid = fork(); + if (pid < 0) { + pr_perror("Failed to fork child"); + return EXIT_FAILURE; + } + + if (pid == 0) { + /* Close read end */ + close(pipefd[0]); + exit(child(pipefd[1])); + } + + /* Close write end in parent */ + close(pipefd[1]); + + /* Wait for child to set up socket */ + if (read(pipefd[0], &buf, 1) != 1) { + pr_err("Failed to receive readiness signal from child\n"); + close(pipefd[0]); + return EXIT_FAILURE; + } + close(pipefd[0]); + + *cpid = pid; + return EXIT_SUCCESS; +} + +int main(int argc, char **argv) +{ + int ret = EXIT_FAILURE, status; + pid_t pid; + int *cpid; + + test_init(argc, argv); + + cpid = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (cpid == MAP_FAILED) { + pr_perror("mmap"); + return EXIT_FAILURE; + } + *cpid = 0; + + pid = fork(); + if (pid < 0) { + pr_perror("Failed to fork zombie"); + goto out; + } + + if (pid == 0) + exit(zombie_leader(cpid)); + + if (waitpid(pid, &status, 0) < 0) { + pr_perror("Failed to waitpid zombie"); + goto out; + } + + if (!WIFEXITED(status) || WEXITSTATUS(status) != EXIT_SUCCESS) { + pr_err("Unexpected exit code: %d\n", WEXITSTATUS(status)); + goto out; + } + + if (!*cpid) { + pr_err("Don't know grandchild's pid\n"); + goto out; + } + + test_daemon(); + test_waitsig(); + + ret = EXIT_SUCCESS; + pass(); +out: + /* Clean up */ + if (*cpid) + kill(*cpid, SIGKILL); + + munmap(cpid, sizeof(int)); + + return ret; +} diff --git a/test/zdtm/static/sk-unix-restore-fs-share.desc b/test/zdtm/static/sk-unix-restore-fs-share.desc new file mode 100644 index 000000000..6c4afe5f0 --- /dev/null +++ b/test/zdtm/static/sk-unix-restore-fs-share.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns'} diff --git a/test/zdtm/static/sock_ip_opts00.c b/test/zdtm/static/sock_ip_opts00.c new file mode 100644 index 000000000..cb464365d --- /dev/null +++ b/test/zdtm/static/sock_ip_opts00.c @@ -0,0 +1,114 @@ +#include +#include +#include + +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that different ip socket options are restored"; +const char *test_author = "Pavel Tikhomirov "; + +#ifdef ZDTM_VAL_ZERO +#define IP_OPT_VAL 0 +#else +#define IP_OPT_VAL 1 +#endif + +struct sk_opt { + int level; + int opt; + int val; +}; + +struct sk_opt sk_opts_v4[] = { + { SOL_IP, IP_FREEBIND, IP_OPT_VAL }, + { SOL_IP, IP_PKTINFO, IP_OPT_VAL }, + { SOL_IP, IP_TTL, 32 }, + { SOL_IP, IP_TOS, IPTOS_TOS(IPTOS_THROUGHPUT) }, +}; + +#ifndef IPV6_FREEBIND +#define IPV6_FREEBIND 78 +#endif + +struct sk_opt sk_opts_v6[] = { + { SOL_IPV6, IPV6_FREEBIND, IP_OPT_VAL }, + { SOL_IPV6, IPV6_RECVPKTINFO, IP_OPT_VAL }, +}; + +struct sk_conf { + int domain; + int type; + int protocol; + int sk; +} sk_confs[] = { + { AF_INET, SOCK_DGRAM, IPPROTO_UDP }, + { AF_INET, SOCK_RAW, IPPROTO_UDP }, + { AF_INET6, SOCK_DGRAM, IPPROTO_UDP }, + { AF_INET6, SOCK_RAW, IPPROTO_UDP }, +}; + +int main(int argc, char **argv) +{ + struct sk_opt *opts; + int exit_code = 1; + int i, j, val; + socklen_t len; + int n_opts; + + test_init(argc, argv); + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + sk_confs[i].sk = socket(sk_confs[i].domain, sk_confs[i].type, sk_confs[i].protocol); + if (sk_confs[i].sk == -1) { + pr_perror("socket(%d,%d,%d) failed", sk_confs[i].domain, sk_confs[i].type, + sk_confs[i].protocol); + goto close; + } + } + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + opts = sk_confs[i].domain == AF_INET ? sk_opts_v4 : sk_opts_v6; + n_opts = sk_confs[i].domain == AF_INET ? ARRAY_SIZE(sk_opts_v4) : ARRAY_SIZE(sk_opts_v6); + + for (j = 0; j < n_opts; j++) { + val = opts[j].val; + if (setsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, sizeof(int)) == -1) { + pr_perror("setsockopt(%d, %d) failed", opts[j].level, opts[j].opt); + goto close; + } + } + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + opts = sk_confs[i].domain == AF_INET ? sk_opts_v4 : sk_opts_v6; + n_opts = sk_confs[i].domain == AF_INET ? ARRAY_SIZE(sk_opts_v4) : ARRAY_SIZE(sk_opts_v6); + + for (j = 0; j < n_opts; j++) { + len = sizeof(int); + if (getsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, &len) == -1) { + pr_perror("getsockopt(%d, %d) failed", opts[j].level, opts[j].opt); + goto close; + } + + if (val != opts[j].val) { + fail("Unexpected value socket(%d,%d,%d) opts(%d,%d)", sk_confs[i].domain, + sk_confs[i].type, sk_confs[i].protocol, opts[j].level, opts[j].opt); + goto close; + } + } + } + + pass(); + exit_code = 0; +close: + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) + close(sk_confs[i].sk); + return exit_code; +} diff --git a/test/zdtm/static/sock_ip_opts00.desc b/test/zdtm/static/sock_ip_opts00.desc new file mode 100644 index 000000000..2201f0298 --- /dev/null +++ b/test/zdtm/static/sock_ip_opts00.desc @@ -0,0 +1 @@ +{'flags': 'suid', 'feature': 'ipv6_freebind'} diff --git a/test/zdtm/static/sock_ip_opts01.c b/test/zdtm/static/sock_ip_opts01.c new file mode 120000 index 000000000..15526f808 --- /dev/null +++ b/test/zdtm/static/sock_ip_opts01.c @@ -0,0 +1 @@ +sock_ip_opts00.c \ No newline at end of file diff --git a/test/zdtm/static/sock_ip_opts01.desc b/test/zdtm/static/sock_ip_opts01.desc new file mode 120000 index 000000000..e2c29ca25 --- /dev/null +++ b/test/zdtm/static/sock_ip_opts01.desc @@ -0,0 +1 @@ +sock_ip_opts00.desc \ No newline at end of file diff --git a/test/zdtm/static/sock_opts00.c b/test/zdtm/static/sock_opts00.c index 5b4624f6d..854aaa591 100644 --- a/test/zdtm/static/sock_opts00.c +++ b/test/zdtm/static/sock_opts00.c @@ -12,21 +12,27 @@ const char *test_author = "Pavel Emelyanov "; #define TEST_PORT 59687 #define TEST_ADDR INADDR_ANY -#define NOPTS 8 - int main(int argc, char **argv) { - int sock, ret = 0, vname[NOPTS], val[NOPTS], rval, i; - socklen_t len = sizeof(int); + #define OPT(x) { x, #x } + static const struct { + int opt; + const char *name; + } vname[] = { + OPT(SO_PRIORITY), + OPT(SO_RCVLOWAT), + OPT(SO_MARK), + OPT(SO_PASSCRED), + OPT(SO_PASSSEC), + OPT(SO_DONTROUTE), + OPT(SO_NO_CHECK), + OPT(SO_OOBINLINE), + }; + static const int NOPTS = sizeof(vname) / sizeof(*vname); + #undef OPT - vname[0] = SO_PRIORITY; - vname[1] = SO_RCVLOWAT; - vname[2] = SO_MARK; - vname[3] = SO_PASSCRED; - vname[4] = SO_PASSSEC; - vname[5] = SO_DONTROUTE; - vname[6] = SO_NO_CHECK; - vname[7] = SO_OOBINLINE; + int sock, usock, sk, ret = 0, val[NOPTS], rval, i; + socklen_t len = sizeof(int); test_init(argc, argv); @@ -36,30 +42,37 @@ int main(int argc, char **argv) return 1; } + usock = socket(AF_UNIX, SOCK_STREAM, 0); + if (usock < 0) { + pr_perror("can't create unix socket"); + return 1; + } + for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i], &val[i], &len); + sk = vname[i].opt == SO_PASSCRED || vname[i].opt == SO_PASSSEC ? usock : sock; + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &val[i], &len); if (ret) { - pr_perror("can't get option %d", i); + pr_perror("can't get %s", vname[i].name); return 1; } val[i]++; - ret = setsockopt(sock, SOL_SOCKET, vname[i], &val[i], len); + ret = setsockopt(sk, SOL_SOCKET, vname[i].opt, &val[i], len); if (ret) { - pr_perror("can't set option %d", i); + pr_perror("can't set %s = %d", vname[i].name, val[i]); return 1; } - ret = getsockopt(sock, SOL_SOCKET, vname[i], &rval, &len); + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { - pr_perror("can't get option %d 2", i); + pr_perror("can't re-get %s", vname[i].name); return 1; } if (rval != val[i]) { if (rval + 1 == val[i]) { - pr_perror("can't reset option %d want %d have %d", i, val[i], rval); + pr_perror("failed to set %s: want %d have %d", vname[i].name, val[i], rval); return 1; } @@ -72,20 +85,23 @@ int main(int argc, char **argv) test_waitsig(); for (i = 0; i < NOPTS; i++) { - ret = getsockopt(sock, SOL_SOCKET, vname[i], &rval, &len); + sk = vname[i].opt == SO_PASSCRED || vname[i].opt == SO_PASSSEC ? usock : sock; + ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &rval, &len); if (ret) { - pr_perror("can't get option %d again", i); + pr_perror("can't verify %s", vname[i].name); return 1; } if (val[i] != rval) { - fail("option %d changed", i); + errno = 0; + fail("%s changed: %d -> %d", vname[i].name, val[i], rval); return 1; } } pass(); close(sock); + close(usock); return 0; } diff --git a/test/zdtm/static/sock_tcp_opts00.c b/test/zdtm/static/sock_tcp_opts00.c new file mode 100644 index 000000000..8061bc9ea --- /dev/null +++ b/test/zdtm/static/sock_tcp_opts00.c @@ -0,0 +1,96 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that different tcp socket options are restored"; +const char *test_author = "Juntong Deng "; + +#ifdef ZDTM_VAL_ZERO +#define TCP_OPT_VAL 0 +#else +#define TCP_OPT_VAL 1 +#endif + +#ifndef SOL_TCP +#define SOL_TCP 6 +#endif + +struct sk_opt { + int level; + int opt; + int val; +}; + +struct sk_opt tcp_sk_opts[] = { + { SOL_TCP, TCP_CORK, TCP_OPT_VAL }, + { SOL_TCP, TCP_NODELAY, TCP_OPT_VAL }, +}; + +struct sk_conf { + int domain; + int type; + int protocol; + int sk; +} sk_confs[] = { + { AF_INET, SOCK_STREAM, IPPROTO_TCP }, + { AF_INET6, SOCK_STREAM, IPPROTO_TCP }, +}; + +int main(int argc, char **argv) +{ + struct sk_opt *opts = tcp_sk_opts; + int n_opts = ARRAY_SIZE(tcp_sk_opts); + int exit_code = 1; + int i, j, val; + socklen_t len; + + test_init(argc, argv); + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + sk_confs[i].sk = socket(sk_confs[i].domain, sk_confs[i].type, sk_confs[i].protocol); + if (sk_confs[i].sk == -1) { + pr_perror("socket(%d,%d,%d) failed", sk_confs[i].domain, sk_confs[i].type, + sk_confs[i].protocol); + goto close; + } + } + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + for (j = 0; j < n_opts; j++) { + val = opts[j].val; + if (setsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, sizeof(int)) == -1) { + pr_perror("setsockopt(%d, %d) failed", opts[j].level, opts[j].opt); + goto close; + } + } + } + + test_daemon(); + test_waitsig(); + + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { + for (j = 0; j < n_opts; j++) { + len = sizeof(int); + if (getsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, &len) == -1) { + pr_perror("getsockopt(%d, %d) failed", opts[j].level, opts[j].opt); + goto close; + } + + if (val != opts[j].val) { + fail("Unexpected value socket(%d,%d,%d) opts(%d,%d)", sk_confs[i].domain, + sk_confs[i].type, sk_confs[i].protocol, opts[j].level, opts[j].opt); + goto close; + } + } + } + + pass(); + exit_code = 0; +close: + for (i = 0; i < ARRAY_SIZE(sk_confs); i++) + close(sk_confs[i].sk); + return exit_code; +} diff --git a/test/zdtm/static/sock_tcp_opts00.desc b/test/zdtm/static/sock_tcp_opts00.desc new file mode 100644 index 000000000..2eac7e654 --- /dev/null +++ b/test/zdtm/static/sock_tcp_opts00.desc @@ -0,0 +1 @@ +{'flags': 'suid'} diff --git a/test/zdtm/static/sock_tcp_opts01.c b/test/zdtm/static/sock_tcp_opts01.c new file mode 120000 index 000000000..5219c2e98 --- /dev/null +++ b/test/zdtm/static/sock_tcp_opts01.c @@ -0,0 +1 @@ +./sock_tcp_opts00.c \ No newline at end of file diff --git a/test/zdtm/static/sock_tcp_opts01.desc b/test/zdtm/static/sock_tcp_opts01.desc new file mode 120000 index 000000000..fb1dfdcd1 --- /dev/null +++ b/test/zdtm/static/sock_tcp_opts01.desc @@ -0,0 +1 @@ +./sock_tcp_opts00.desc \ No newline at end of file diff --git a/test/zdtm/static/socket-tcp-closed-last-ack.desc b/test/zdtm/static/socket-tcp-closed-last-ack.desc index d4cfe5064..c77d58477 100644 --- a/test/zdtm/static/socket-tcp-closed-last-ack.desc +++ b/test/zdtm/static/socket-tcp-closed-last-ack.desc @@ -1,10 +1,10 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_tcp.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', - 'feature' : 'tcp_half_closed', + 'feature' : 'tcp_half_closed has_ipt_legacy', 'flavor': 'ns uns', } diff --git a/test/zdtm/static/socket-tcp-closing.c b/test/zdtm/static/socket-tcp-closing.c index 87e1d7533..df291d446 100644 --- a/test/zdtm/static/socket-tcp-closing.c +++ b/test/zdtm/static/socket-tcp-closing.c @@ -31,10 +31,13 @@ static int port = 8880; int fill_sock_buf(int fd) { + char zdtm[512]; int flags; int size; int ret; + memset(zdtm, 5, sizeof(zdtm)); + flags = fcntl(fd, F_GETFL, 0); if (flags == -1) { pr_perror("Can't get flags"); @@ -47,7 +50,6 @@ int fill_sock_buf(int fd) size = 0; while (1) { - char zdtm[] = "zdtm test packet"; ret = write(fd, zdtm, sizeof(zdtm)); if (ret == -1) { if (errno == EAGAIN) diff --git a/test/zdtm/static/socket-tcp-fin-wait1.hook b/test/zdtm/static/socket-tcp-fin-wait1.hook index 9504557da..30f8ce071 100755 --- a/test/zdtm/static/socket-tcp-fin-wait1.hook +++ b/test/zdtm/static/socket-tcp-fin-wait1.hook @@ -1,7 +1,7 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys -sys.path.append("../crit") +sys.path.append("../lib") import pycriu import os, os.path diff --git a/test/zdtm/static/socket-tcp-nfconntrack.c b/test/zdtm/static/socket-tcp-ipt-nfconntrack.c similarity index 100% rename from test/zdtm/static/socket-tcp-nfconntrack.c rename to test/zdtm/static/socket-tcp-ipt-nfconntrack.c diff --git a/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc b/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc new file mode 100644 index 000000000..53dd82285 --- /dev/null +++ b/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc @@ -0,0 +1,6 @@ +{ + 'feature': 'has_ipt_legacy', + 'flavor': 'h', + 'opts': '--tcp-established', + 'flags': 'suid' +} diff --git a/test/zdtm/static/socket-tcp-nfconntrack.desc b/test/zdtm/static/socket-tcp-nfconntrack.desc deleted file mode 100644 index add2513f8..000000000 --- a/test/zdtm/static/socket-tcp-nfconntrack.desc +++ /dev/null @@ -1 +0,0 @@ -{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'suid'} diff --git a/test/zdtm/static/socket-tcp-nft-nfconntrack.c b/test/zdtm/static/socket-tcp-nft-nfconntrack.c new file mode 120000 index 000000000..8cb60dd03 --- /dev/null +++ b/test/zdtm/static/socket-tcp-nft-nfconntrack.c @@ -0,0 +1 @@ +socket-tcp.c \ No newline at end of file diff --git a/test/zdtm/static/socket-tcp-nft-nfconntrack.desc b/test/zdtm/static/socket-tcp-nft-nfconntrack.desc new file mode 100644 index 000000000..38a4eb389 --- /dev/null +++ b/test/zdtm/static/socket-tcp-nft-nfconntrack.desc @@ -0,0 +1,7 @@ +{ + 'flavor': 'h', + 'feature': 'network_lock_nftables', + 'opts': '--tcp-established', + 'dopts': '--network-lock nftables', + 'flags': 'suid' +} diff --git a/test/zdtm/static/socket-tcp-reseted.desc b/test/zdtm/static/socket-tcp-reseted.desc index 3ebdfeef8..ff92e9f9f 100644 --- a/test/zdtm/static/socket-tcp-reseted.desc +++ b/test/zdtm/static/socket-tcp-reseted.desc @@ -1,10 +1,10 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', - '/lib/xtables/libipt_REJECT.so|/usr/lib64/xtables/libipt_REJECT.so|/usr/lib/powerpc64le-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/x86_64-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/xtables/libipt_REJECT.so|/usr/lib/s390x-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/aarch64-linux-gnu/xtables/libipt_REJECT.so', + '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_tcp.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libipt_REJECT.so|/usr/lib64/xtables/libipt_REJECT.so|/usr/lib/powerpc64le-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/x86_64-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/xtables/libipt_REJECT.so|/usr/lib/s390x-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/aarch64-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/riscv64-linux-gnu/xtables/libipt_REJECT.so', ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', - 'feature' : 'tcp_half_closed' + 'feature' : 'tcp_half_closed has_ipt_legacy' } diff --git a/test/zdtm/static/socket-tcp-syn-sent.desc b/test/zdtm/static/socket-tcp-syn-sent.desc index 4cc23c8fc..52382414b 100644 --- a/test/zdtm/static/socket-tcp-syn-sent.desc +++ b/test/zdtm/static/socket-tcp-syn-sent.desc @@ -1,9 +1,9 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_tcp.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', - 'feature' : 'tcp_half_closed' + 'feature' : 'tcp_half_closed has_ipt_legacy' } diff --git a/test/zdtm/static/socket-tcp.c b/test/zdtm/static/socket-tcp.c index f6ef47385..bc2075496 100644 --- a/test/zdtm/static/socket-tcp.c +++ b/test/zdtm/static/socket-tcp.c @@ -67,17 +67,38 @@ int main(int argc, char **argv) int val; socklen_t optlen; -#ifdef ZDTM_CONNTRACK +#ifdef ZDTM_IPT_CONNTRACK if (unshare(CLONE_NEWNET)) { pr_perror("unshare"); return 1; } if (system("ip link set up dev lo")) return 1; - if (system("iptables -w -A INPUT -i lo -p tcp -m state --state NEW,ESTABLISHED -j ACCEPT")) + + if (system("iptables-legacy -w -A INPUT -i lo -p tcp -m state --state NEW,ESTABLISHED -j ACCEPT")) return 1; - if (system("iptables -w -A INPUT -j DROP")) + if (system("iptables-legacy -w -A INPUT -j DROP")) return 1; + +#endif + +#ifdef ZDTM_NFT_CONNTRACK + if (unshare(CLONE_NEWNET)) { + pr_perror("unshare"); + return 1; + } + if (system("ip link set up dev lo")) + return 1; + + if (system("nft add table ip filter")) + return 1; + if (system("nft 'add chain ip filter INPUT { type filter hook input priority 0 ; }'")) + return 1; + if (system("nft add rule ip filter INPUT iifname \"lo\" ip protocol tcp ct state new,established counter accept")) + return 1; + if (system("nft add rule ip filter INPUT counter drop")) + return 1; + #endif #ifdef ZDTM_TCP_LOCAL diff --git a/test/zdtm/static/socket6_icmp.c b/test/zdtm/static/socket6_icmp.c new file mode 120000 index 000000000..24d8fd806 --- /dev/null +++ b/test/zdtm/static/socket6_icmp.c @@ -0,0 +1 @@ +socket_icmp.c \ No newline at end of file diff --git a/test/zdtm/static/socket_icmp.c b/test/zdtm/static/socket_icmp.c new file mode 100644 index 000000000..f72e348bf --- /dev/null +++ b/test/zdtm/static/socket_icmp.c @@ -0,0 +1,128 @@ +#include "zdtmtst.h" + +const char *test_doc = "static test for ICMP socket\n"; +const char *test_author = "समीर सिंह Sameer Singh \n"; + +/* Description: + * Send a ping to localhost using ICMP socket + */ + +#include +#include +#include +#include +#if defined(ZDTM_IPV6) +#include +#else +#include +#endif +#include +#include +#include + +#include "sysctl.h" + +#define PACKET_SIZE 64 +#define RECV_TIMEOUT 1 + +static int echo_id = 1234; + +#if defined(ZDTM_IPV6) +#define TEST_ICMP_ECHOREPLY ICMP6_ECHOREPLY +#else +#define TEST_ICMP_ECHOREPLY ICMP_ECHOREPLY +#endif +int main(int argc, char **argv) +{ + int ret, sock, seq = 0; + char packet[PACKET_SIZE], recv_packet[PACKET_SIZE]; + + struct timeval tv; +#if defined(ZDTM_IPV6) + struct sockaddr_in6 addr, recv_addr; +#else + struct icmphdr icmp_header, *icmp_reply; +#endif + struct sockaddr_in addr, recv_addr; + socklen_t addr_len; + + // Allow GIDs 0-58468 to open an unprivileged ICMP socket + if (sysctl_write_str("/proc/sys/net/ipv4/ping_group_range", "0 58468")) + return -1; + + test_init(argc, argv); + +#if defined(ZDTM_IPV6) + sock = socket(PF_INET6, SOCK_DGRAM, IPPROTO_ICMPV6); +#else + sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP); +#endif + if (sock < 0) { + pr_perror("Can't create socket"); + return 1; + } + + tv.tv_sec = RECV_TIMEOUT; + tv.tv_usec = 0; + if (setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0) { + pr_perror("Can't set socket option"); + return 1; + } + + memset(&addr, 0, sizeof(addr)); + memset(&icmp_header, 0, sizeof(icmp_header)); +#if defined(ZDTM_IPV6) + addr.sin6_family = AF_INET6; + inet_pton(AF_INET6, "::1", &addr.sin6_addr); + + icmp_header.icmp6_type = ICMP6_ECHO_REQUEST; + icmp_header.icmp6_code = 0; + icmp_header.icmp6_id = echo_id; + icmp_header.icmp6_seq = seq; +#else + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = inet_addr("127.0.0.1"); + + icmp_header.type = ICMP_ECHO; + icmp_header.code = 0; + icmp_header.un.echo.id = echo_id; + icmp_header.un.echo.sequence = seq; +#endif + + memcpy(packet, &icmp_header, sizeof(icmp_header)); + memset(packet + sizeof(icmp_header), 0xa5, + PACKET_SIZE - sizeof(icmp_header)); + + test_daemon(); + test_waitsig(); + + ret = sendto(sock, packet, PACKET_SIZE, 0, + (struct sockaddr *)&addr, sizeof(addr)); + + if (ret < 0) { + fail("Can't send"); + return 1; + } + + addr_len = sizeof(recv_addr); + + ret = recvfrom(sock, recv_packet, sizeof(recv_packet), 0, + (struct sockaddr *)&recv_addr, &addr_len); + + if (ret < 0) { + fail("Can't recv"); + return 1; + } + + icmp_reply = (struct icmphdr *)recv_packet; + + if (icmp_reply->type != ICMP_ECHOREPLY) { + fail("Got no ICMP_ECHO_REPLY"); + return 1; + } + + close(sock); + + pass(); + return 0; +} diff --git a/test/zdtm/static/socket_udp_shutdown.c b/test/zdtm/static/socket_udp_shutdown.c index 91dc8f30a..a7658b9dd 100644 --- a/test/zdtm/static/socket_udp_shutdown.c +++ b/test/zdtm/static/socket_udp_shutdown.c @@ -28,8 +28,8 @@ int main(int argc, char **argv) test_init(argc, argv); - sk1 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); - sk2 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + sk1 = socket(PF_INET, SOCK_DGRAM | SOCK_NONBLOCK, IPPROTO_UDP); + sk2 = socket(PF_INET, SOCK_DGRAM | SOCK_NONBLOCK, IPPROTO_UDP); if (sk1 < 0 || sk2 < 0) { pr_perror("Can't create socket"); exit(1); diff --git a/test/zdtm/static/stopped.c b/test/zdtm/static/stopped.c index 059a2a92a..26b0174ed 100644 --- a/test/zdtm/static/stopped.c +++ b/test/zdtm/static/stopped.c @@ -65,7 +65,7 @@ int main(int argc, char **argv) } if (WIFSTOPPED(status)) - test_msg("The procces stopped\n"); + test_msg("The process stopped\n"); else { fail("The process doesn't stopped"); goto out; diff --git a/test/zdtm/static/stopped03.c b/test/zdtm/static/stopped03.c new file mode 100644 index 000000000..9a373930f --- /dev/null +++ b/test/zdtm/static/stopped03.c @@ -0,0 +1,161 @@ +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check, that stopped by SIGTSTP tasks are restored correctly"; +const char *test_author = "Yuriy Vasiliev "; + +#define STOP_SIGNO SIGTSTP +const char *stop_sigstr = "SIGTSTP"; +enum { + FUTEX_INITIALIZED = 0, + TEST_CRIU, + TEST_CHECK, + TEST_DONE, + TEST_EXIT, + TEST_EMERGENCY_ABORT, +}; + +struct shared { + futex_t fstate; + int status; + int code; +} *sh; + +static int new_pgrp(void) +{ + siginfo_t infop; + int ret = 1; + pid_t pid; + + /* + * Set the PGID to avoid creating an orphaned process group, + * which is not to be affected by terminal-generated stop signals. + */ + setpgid(0, 0); + + pid = test_fork(); + if (pid < 0) + goto err_cr; + + if (pid == 0) { + /* wait for TEST_EXIT or TEST_EMERGENCY_ABORT*/ + futex_wait_while_lt(&sh->fstate, TEST_EXIT); + exit(0); + } + + if (kill(pid, STOP_SIGNO)) { + pr_perror("Unable to send %s", stop_sigstr); + goto err_cr; + } + + if (waitid(P_PID, pid, &infop, WNOWAIT | WSTOPPED) < 0) { + pr_perror("Unable to waitid %d", pid); + goto err_cont; + } + + sh->code = infop.si_code; + sh->status = infop.si_status; + + /* Return the control back to MAIN worker to do C/R */ + futex_set_and_wake(&sh->fstate, TEST_CRIU); + futex_wait_while_lt(&sh->fstate, TEST_CHECK); + + infop.si_code = 0; + infop.si_status = 0; + + if (waitid(P_PID, pid, &infop, WNOWAIT | WSTOPPED) < 0) { + pr_perror("Unable to waitid %d", pid); + goto err_cont; + } + + sh->code = infop.si_code; + sh->status = infop.si_status; + + futex_set_and_wake(&sh->fstate, TEST_DONE); + futex_wait_while_lt(&sh->fstate, TEST_EXIT); + + ret = 0; +err_cont: + kill(pid, SIGCONT); +err_cr: + if (ret) + futex_set_and_wake(&sh->fstate, TEST_EMERGENCY_ABORT); + if (pid > 0) + wait(NULL); + + return ret; +} + +int main(int argc, char **argv) +{ + int fail = 0; + pid_t pid; + + test_init(argc, argv); + + sh = mmap(NULL, sizeof(struct shared), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (sh == MAP_FAILED) { + pr_perror("Failed to alloc shared region"); + return 1; + } + + futex_set(&sh->fstate, FUTEX_INITIALIZED); + + pid = test_fork(); + if (pid < 0) { + fail = 1; + goto out; + } + + if (pid == 0) + exit(new_pgrp()); + + /* Wait until pgrp is ready to C/R */ + futex_wait_while_lt(&sh->fstate, TEST_CRIU); + if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) { + pr_err("Fail in child worker before C/R\n"); + fail = 1; + goto out; + } + + if (sh->code != CLD_STOPPED || sh->status != STOP_SIGNO) { + pr_err("Process is not in correct state before C/R." + " Expected stop signo: %d. Get stop signo: %d\n", + STOP_SIGNO, sh->status); + fail = 1; + goto out; + } + + test_daemon(); + test_waitsig(); + + futex_set_and_wake(&sh->fstate, TEST_CHECK); + futex_wait_while_lt(&sh->fstate, TEST_DONE); + if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) { + pr_err("Fail in child worker after C/R\n"); + goto out; + } + + if (sh->code != CLD_STOPPED || sh->status != STOP_SIGNO) { + fail = 1; + pr_err("Process is not in correct state after C/R." + " Expected stop signo: %d. Get stop signo: %d\n", + STOP_SIGNO, sh->status); + } + + if (!fail) + pass(); + + futex_set_and_wake(&sh->fstate, TEST_EXIT); +out: + if (pid > 0) + wait(NULL); + + munmap(sh, sizeof(struct shared)); + + return fail; +} diff --git a/test/zdtm/static/stopped04.c b/test/zdtm/static/stopped04.c new file mode 100644 index 000000000..9bd968aa2 --- /dev/null +++ b/test/zdtm/static/stopped04.c @@ -0,0 +1,135 @@ +#include +#include +#include + +#include "zdtmtst.h" +#include "lock.h" + +const char *test_doc = "Check, that stopped by SIGTSTP tasks are restored correctly"; +const char *test_author = "Yuriy Vasiliev "; + +const char *stop_sigstr = "SIGTSTP"; +enum { + FUTEX_INITIALIZED = 0, + TEST_CRIU, + TEST_DONE, + TEST_EXIT, + TEST_EMERGENCY_ABORT, +}; + +struct shared { + futex_t fstate; + int status; + int code; +} *sh; + +static int new_pgrp(void) +{ + sigset_t sigset; + siginfo_t infop; + int ret = 1; + pid_t pid; + + /* + * Set the PGID to avoid creating an orphaned process group, + * which is not to be affected by terminal-generated stop signals. + */ + setpgid(0, 0); + + sigemptyset(&sigset); + sigaddset(&sigset, SIGTSTP); + sigprocmask(SIG_BLOCK, &sigset, NULL); + + pid = test_fork(); + if (pid < 0) + goto err_cr; + + if (pid == 0) { + /* wait for TEST_EXIT or TEST_EMERGENCY_ABORT*/ + futex_wait_while_lt(&sh->fstate, TEST_EXIT); + exit(0); + } + + if (kill(pid, SIGSTOP)) { + pr_perror("Unable to send %s", stop_sigstr); + goto err_cr; + } + + if (waitid(P_PID, pid, &infop, WNOWAIT | WSTOPPED) < 0) { + pr_perror("Unable to waitid %d", pid); + goto err_cont; + } + + if (kill(pid, SIGTSTP)) { + pr_perror("Unable to send %s", stop_sigstr); + goto err_cr; + } + + /* Return the control back to MAIN worker to do C/R */ + futex_set_and_wake(&sh->fstate, TEST_CRIU); + futex_wait_while_lt(&sh->fstate, TEST_EXIT); + + ret = 0; +err_cont: + kill(pid, SIGCONT); +err_cr: + if (ret) + futex_set_and_wake(&sh->fstate, TEST_EMERGENCY_ABORT); + if (pid > 0) + wait(NULL); + + return ret; +} + +int main(int argc, char **argv) +{ + int fail = 0; + pid_t pid; + + test_init(argc, argv); + + sh = mmap(NULL, sizeof(struct shared), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (sh == MAP_FAILED) { + pr_perror("Failed to alloc shared region"); + return 1; + } + + futex_set(&sh->fstate, FUTEX_INITIALIZED); + + pid = test_fork(); + if (pid < 0) { + fail = 1; + goto out; + } + + if (pid == 0) + exit(new_pgrp()); + + /* Wait until pgrp is ready to C/R */ + futex_wait_while_lt(&sh->fstate, TEST_CRIU); + if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) { + pr_err("Fail in child worker before C/R\n"); + fail = 1; + goto out; + } + + test_daemon(); + test_waitsig(); + + if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) { + pr_err("Fail in child worker after C/R\n"); + goto out; + } + + if (!fail) + pass(); + + futex_set_and_wake(&sh->fstate, TEST_EXIT); +out: + if (pid > 0) + wait(NULL); + + munmap(sh, sizeof(struct shared)); + + return fail; +} diff --git a/test/zdtm/static/tempfs_subns.c b/test/zdtm/static/tempfs_subns.c index ed3ef9a3a..490fdad6e 100644 --- a/test/zdtm/static/tempfs_subns.c +++ b/test/zdtm/static/tempfs_subns.c @@ -20,7 +20,7 @@ int main(int argc, char **argv) { int fds[2], i; pid_t pid; - int fd, status; + int status, fd = -1; test_init(argc, argv); diff --git a/test/zdtm/static/thp_disable.c b/test/zdtm/static/thp_disable.c index ab88120c2..55609f260 100644 --- a/test/zdtm/static/thp_disable.c +++ b/test/zdtm/static/thp_disable.c @@ -17,6 +17,7 @@ int main(int argc, char **argv) unsigned long orig_flags = 0, new_flags = 0; unsigned long orig_madv = 0, new_madv = 0; void *area; + int ret; test_init(argc, argv); @@ -35,9 +36,46 @@ int main(int argc, char **argv) return -1; } + ret = prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Getting THP-disabled flag failed"); + return -1; + } + if (ret != 1) { + errno = 0; + fail("prctl(GET_THP_DISABLE) returned unexpected value: %d != 1", ret); + return -1; + } + + test_msg("Fetch pre-migration flags/adv\n"); + if (get_smaps_bits((unsigned long)area, &new_flags, &new_madv)) + return -1; + + errno = 0; + if (orig_flags != new_flags) { + fail("Flags changed %lx -> %lx", orig_flags, new_flags); + return -1; + } + + if (orig_madv != new_madv) { + fail("Madvs changed %lx -> %lx", orig_madv, new_madv); + return -1; + } + test_daemon(); test_waitsig(); + ret = prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); + if (ret < 0) { + pr_perror("Getting post-migration THP-disabled flag failed"); + return -1; + } + if (ret != 1) { + errno = 0; + fail("post-migration prctl(GET_THP_DISABLE) returned unexpected value: %d != 1", ret); + return -1; + } + if (prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0)) { pr_perror("Enabling THP failed"); return -1; @@ -47,15 +85,14 @@ int main(int argc, char **argv) if (get_smaps_bits((unsigned long)area, &new_flags, &new_madv)) return -1; + errno = 0; if (orig_flags != new_flags) { - pr_err("Flags are changed %lx -> %lx\n", orig_flags, new_flags); - fail(); + fail("Flags changed %lx -> %lx", orig_flags, new_flags); return -1; } if (orig_madv != new_madv) { - pr_err("Madvs are changed %lx -> %lx\n", orig_madv, new_madv); - fail(); + fail("Madvs changed %lx -> %lx", orig_madv, new_madv); return -1; } diff --git a/test/zdtm/static/thread_different_uid_gid.c b/test/zdtm/static/thread_different_uid_gid.c index 3a0b6291b..88f99659b 100644 --- a/test/zdtm/static/thread_different_uid_gid.c +++ b/test/zdtm/static/thread_different_uid_gid.c @@ -130,7 +130,7 @@ int main(int argc, char **argv) ret = syscall(SYS_setresgid, maingroup, maingroup, maingroup); if (ret >= 0) { ret = syscall(SYS_setresuid, mainuser, mainuser, mainuser); - } else if (ret < 0) { + } else { pr_perror("Failed to drop privileges"); exit(1); } diff --git a/test/zdtm/static/timers01.c b/test/zdtm/static/timers01.c new file mode 100644 index 000000000..10ecc3481 --- /dev/null +++ b/test/zdtm/static/timers01.c @@ -0,0 +1,74 @@ +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Checks non-periodic timers\n"; +const char *test_author = "Andrei Vagin "; + +static struct { + const int timer_type; + const int signal; + volatile sig_atomic_t count; +} timer_tests[] = { + /* from slowest to fastest */ + { ITIMER_VIRTUAL, SIGVTALRM }, + { ITIMER_PROF, SIGPROF }, + { ITIMER_REAL, SIGALRM }, +}; + +#define NUM_TIMERS (sizeof(timer_tests) / sizeof(timer_tests[0])) +#define TIMER_TIMEOUT 3600 +#define TIMER_ALLOWED_DELTA 300 + +static void setup_timers(void) +{ + int i; + struct itimerval tv = { + .it_interval = { .tv_sec = 0, .tv_usec = 0 }, + .it_value = { .tv_sec = TIMER_TIMEOUT, .tv_usec = 0 }, + }; + + for (i = 0; i < NUM_TIMERS; i++) { + if (setitimer(timer_tests[i].timer_type, &tv, NULL) < 0) { + pr_perror("can't set timer %d", i); + exit(1); + } + } +} + +static void check_timers(void) +{ + int i; + + for (i = 0; i < NUM_TIMERS; i++) { + struct itimerval tv = {}; + + if (getitimer(timer_tests[i].timer_type, &tv)) { + pr_perror("gettimer"); + exit(1); + } + if (tv.it_value.tv_sec > TIMER_TIMEOUT || + tv.it_value.tv_sec < TIMER_TIMEOUT - TIMER_ALLOWED_DELTA) { + fail("%ld isn't in [%d, %d]", (long)tv.it_value.tv_sec, + TIMER_TIMEOUT, + TIMER_TIMEOUT - TIMER_ALLOWED_DELTA); + exit(1); + } + } + pass(); +} + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + setup_timers(); + + test_daemon(); + test_waitsig(); + + check_timers(); + return 0; +} diff --git a/test/zdtm/static/uprobes.c b/test/zdtm/static/uprobes.c new file mode 100644 index 000000000..6ef9a56bc --- /dev/null +++ b/test/zdtm/static/uprobes.c @@ -0,0 +1,295 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test the --allow-uprobes option"; +const char *test_author = "Shashank Balaji "; + +#define UPROBE_GROUP_NAME "zdtm" +#define UPROBE_EVENT_NAME "uprobes_test" +#define UPROBED_FUNCTION uprobe_target + +/* + * A uprobe can be set at the start of a function, but not all instructions + * will trigger the creation of a uprobes vma. + * + * Examples: + * - aarch64: if the function is a single `ret`, then no vma creation + * - x64: if the function is `nop; ret`, then no vma creation + * + * So to guarantee vma creation, create a volatile dummy variable (to prevent + * compiler optimization) and use it (to prevent "unused variable" warning) + */ +void UPROBED_FUNCTION(void) { + volatile int dummy __maybe_unused = 0; + dummy += 1; +} +/* Calling via volatile function pointer ensures noinline at callsite */ +typedef void (*func_ptr)(void); +volatile func_ptr uprobe_target_alias = UPROBED_FUNCTION; + +struct uprobe_context { + struct tracefs_instance *instance; + struct tracefs_dynevent *uprobe; +}; + +volatile bool got_sigtrap = false; + +/* + * Returns the file offset of a symbol in the executable of this program + * Returns 0 on failure +*/ +uint64_t calc_sym_offset(const char *sym_name) +{ + GElf_Shdr section_header; + Elf_Scn *section = NULL; + Elf_Data *symtab_data; + uint64_t offset = 0; + char buf[PATH_MAX]; + GElf_Sym symbol; + ssize_t n_bytes; + int n_entries; + Elf *elf; + int fd; + int i; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_err("ELF version of libelf is lower than that of the program\n"); + return 0; + } + + n_bytes = readlink("/proc/self/exe", buf, sizeof(buf)); + if (n_bytes < 0) { + pr_perror("Failed to readlink /proc/self/exe"); + return 0; + } + buf[n_bytes] = '\0'; + + fd = open(buf, O_RDONLY); + if (fd < 0) { + pr_perror("Failed to open self-executable"); + return 0; + } + + elf = elf_begin(fd, ELF_C_READ, NULL); + if (!elf) { + pr_err("%s\n", elf_errmsg(elf_errno())); + goto out_fd; + } + + /* Look for the symbol table section and its header */ + while ((section = elf_nextscn(elf, section)) != NULL) { + gelf_getshdr(section, §ion_header); + if (section_header.sh_type == SHT_SYMTAB) + break; + } + if (!section) { + pr_err("Failed to find symbol table\n"); + goto out_elf; + } + symtab_data = elf_getdata(section, NULL); + n_entries = section_header.sh_size / section_header.sh_entsize; + + /* Look for a symbol with the required name */ + for (i = 0; i < n_entries; i++) { + gelf_getsym(symtab_data, i, &symbol); + /* Symbol table's sh_link is the index of the string table section header */ + if (!strcmp(sym_name, + elf_strptr(elf, section_header.sh_link, symbol.st_name))) + break; + } + if (i == n_entries) { + pr_err("Failed to find symbol \"%s\"\n", sym_name); + goto out_elf; + } + + /* Get the section the symbol belongs to (mostly .text) */ + section = elf_getscn(elf, symbol.st_shndx); + gelf_getshdr(section, §ion_header); + offset = symbol.st_value - section_header.sh_addr + section_header.sh_offset; + +out_elf: + elf_end(elf); +out_fd: + close(fd); + return offset; +} + +/* + * Set and enable a uprobe on the file at the given offset + * Returns struct uprobe_context with members set to NULL on failure +*/ +struct uprobe_context enable_uprobe(const char *file, uint64_t offset) +{ + struct tracefs_instance *trace_instance; + struct tracefs_dynevent *uprobe; + struct uprobe_context context = {}; + + trace_instance = tracefs_instance_create("zdtm_uprobes_test"); + if (!trace_instance) { + pr_perror("Failed to create tracefs instance"); + return context; + } + tracefs_instance_reset(trace_instance); + + uprobe = tracefs_uprobe_alloc(UPROBE_GROUP_NAME, UPROBE_EVENT_NAME, file, offset, NULL); + if (!uprobe) { + pr_perror("Failed to allocate uprobe"); + goto instance_destroy; + } + + if (tracefs_dynevent_create(uprobe)) { + pr_perror("Failed to create uprobe"); + goto uprobe_free; + } + + if (tracefs_event_enable(trace_instance, UPROBE_GROUP_NAME, UPROBE_EVENT_NAME)) { + pr_perror("Failed to enable uprobe"); + goto uprobe_destroy; + } + + context.instance = trace_instance; + context.uprobe = uprobe; + return context; + +uprobe_destroy: + tracefs_dynevent_destroy(uprobe, false); +uprobe_free: + tracefs_dynevent_free(uprobe); +instance_destroy: + tracefs_instance_destroy(trace_instance); + tracefs_instance_free(trace_instance); + return context; +} + +void destroy_uprobe(struct uprobe_context context) +{ + tracefs_dynevent_destroy(context.uprobe, true); + tracefs_dynevent_free(context.uprobe); + tracefs_instance_destroy(context.instance); + tracefs_instance_free(context.instance); +} + +/* + * Check for the existence of the "[uprobes]" vma in /proc/self/maps + * Returns -1 on failure, 0 if not found, 1 if found +*/ +int uprobes_vma_exists(void) +{ + FILE *f; + char buf[LINE_MAX]; + int ret = 0; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + pr_perror("Failed to open /proc/self/maps"); + return -1; + } + + while (fgets(buf, sizeof(buf), f)) { + if (strstr(buf, "[uprobes]")) { + ret = 1; + break; + } + } + if (ret == 0 && !feof(f)) { + pr_err("Failed to finish reading /proc/self/maps\n"); + ret = -1; + } + + fclose(f); + return ret; +} + +/* + * SIGTRAP is sent if execution reaches a previously set uprobed location, and + * the corresponding uprobe is not active. We don't want this to happen on restore +*/ +void sigtrap_handler(int signo, siginfo_t *info, void* context) +{ + if (info->si_code == SI_KERNEL) { + got_sigtrap = true; + fail("SIGTRAP on attempting to call uprobed function"); + } +} + +int main(int argc, char **argv) +{ + struct uprobe_context context; + struct sigaction sa; + char buf[PATH_MAX]; + uint64_t offset; + int n_bytes; + int ret = 1; + + test_init(argc, argv); + + offset = calc_sym_offset(__stringify(UPROBED_FUNCTION)); + if (!offset) + return 1; + + n_bytes = readlink("/proc/self/exe", buf, sizeof(buf)); + if (n_bytes < 0) { + pr_perror("Failed to readlink /proc/self/exe"); + return 1; + } + buf[n_bytes] = '\0'; + + sa.sa_flags = SA_SIGINFO; + sa.sa_sigaction = sigtrap_handler; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGTRAP, &sa, NULL)) { + pr_perror("Failed to set SIGTRAP handler"); + return 1; + } + + context = enable_uprobe(buf, offset); + if (!context.instance) + return 1; + + /* + * Execution must reach the uprobed location at least once + * for the kernel to create the uprobes vma + */ + uprobe_target_alias(); + + switch (uprobes_vma_exists()) { + case -1: + goto out_uprobe; + break; + case 0: + pr_err("uprobes vma does not exist\n"); + goto out_uprobe; + break; + case 1: + test_msg("Found uprobes vma\n"); + break; + } + + test_daemon(); + test_waitsig(); + + /* + * Calling the uprobed function after restore should not cause + * a SIGTRAP, since the uprobe is still active + */ + uprobe_target_alias(); + if (!got_sigtrap) { + pass(); + ret = 0; + } + +out_uprobe: + destroy_uprobe(context); + return ret; +} diff --git a/test/zdtm/static/uprobes.desc b/test/zdtm/static/uprobes.desc new file mode 100644 index 000000000..6eab1f498 --- /dev/null +++ b/test/zdtm/static/uprobes.desc @@ -0,0 +1,6 @@ +{ + 'feature': 'cgroupns', + 'flags': 'suid nouser', + 'flavor': 'h', + 'opts': '--allow-uprobes' +} diff --git a/test/zdtm/static/vdso-proxy.c b/test/zdtm/static/vdso-proxy.c index 43334974f..a53e6cdc0 100644 --- a/test/zdtm/static/vdso-proxy.c +++ b/test/zdtm/static/vdso-proxy.c @@ -70,6 +70,7 @@ static int parse_maps(struct vm_area *vmas) #endif v->is_vvar_or_vdso |= strstr(buf, "[vdso]") != NULL; v->is_vvar_or_vdso |= strstr(buf, "[vvar]") != NULL; + v->is_vvar_or_vdso |= strstr(buf, "[vvar_vclock]") != NULL; test_msg("[NOTE]\tVMA: [%#" PRIx64 ", %#" PRIx64 "]\n", v->start, v->end); } @@ -86,42 +87,35 @@ static int parse_maps(struct vm_area *vmas) return i; } -int compare_vmas(struct vm_area *vmax, struct vm_area *vmay) -{ - if (vmax->start > vmay->start) - return 1; - if (vmax->start < vmay->start) - return -1; - if (vmax->end > vmay->end) - return 1; - if (vmax->end < vmay->end) - return -1; - - return 0; -} - -static int check_vvar_vdso(struct vm_area *before, struct vm_area *after) +static int check_vvar_vdso(struct vm_area *before, int nr_before, struct vm_area *after, int nr_after) { int i, j = 0; - for (i = 0; i < MAX_VMAS && j < MAX_VMAS; i++, j++) { - int cmp = compare_vmas(&before[i], &after[j]); - - if (cmp == 0) - continue; - - if (cmp < 0) { /* Lost mapping */ + for (i = 0, j = 0; i < nr_before || j < nr_after;) { + if (j == nr_after || before[i].start < after[j].start) { test_msg("[NOTE]\tLost mapping: %#" PRIx64 "-%#" PRIx64 "\n", before[i].start, before[i].end); - j--; if (before[i].is_vvar_or_vdso) { fail("Lost vvar/vdso mapping"); return -1; } + i++; continue; } - - test_msg("[NOTE]\tNew mapping appeared: %#" PRIx64 "-%#" PRIx64 "\n", after[j].start, after[j].end); - i--; + if (i == nr_before || before[i].start > after[j].start) { + test_msg("[NOTE]\tNew mapping appeared: %#" PRIx64 "-%#" PRIx64 "\n", after[j].start, after[j].end); + j++; + continue; + } + if (before[i].end == after[j].end) { + i++; + j++; + } else if (before[i].end > after[j].end) { + before[i].start = after[j].end; + j++; + } else { + after[j].start = before[i].end; + i++; + } } return 0; @@ -129,11 +123,10 @@ static int check_vvar_vdso(struct vm_area *before, struct vm_area *after) static struct vm_area vmas_before[MAX_VMAS]; static struct vm_area vmas_after[MAX_VMAS]; +static int nr_before, nr_after; int main(int argc, char *argv[]) { - int nr_before, nr_after; - test_init(argc, argv); test_msg("[NOTE]\tMappings before:\n"); @@ -154,7 +147,7 @@ int main(int argc, char *argv[]) } /* After restore vDSO/VVAR blobs must remain in the old place. */ - if (check_vvar_vdso(vmas_before, vmas_after)) + if (check_vvar_vdso(vmas_before, nr_before, vmas_after, nr_after)) return -1; if (nr_before + 2 < nr_after) { diff --git a/test/zdtm/static/vdso00.c b/test/zdtm/static/vdso00.c index a9bef4dbd..69123a203 100644 --- a/test/zdtm/static/vdso00.c +++ b/test/zdtm/static/vdso00.c @@ -1,6 +1,6 @@ #include #include - +#include #include #include @@ -19,14 +19,14 @@ int main(int argc, char *argv[]) test_msg("%s pid %d\n", argv[0], getpid()); gettimeofday(&tv, &tz); - test_msg("%d time: %10li\n", getpid(), tv.tv_sec); + test_msg("%d time: %10" PRId64 "\n", getpid(), (int64_t)tv.tv_sec); test_daemon(); test_waitsig(); /* this call will fail if vDSO is corrupted */ gettimeofday(&tv, &tz); - test_msg("%d time: %10li\n", getpid(), tv.tv_sec); + test_msg("%d time: %10" PRId64 "\n", getpid(), (int64_t)tv.tv_sec); pass(); diff --git a/test/zdtm/static/vdso01.c b/test/zdtm/static/vdso01.c index d8d64155a..d8b3c94d5 100644 --- a/test/zdtm/static/vdso01.c +++ b/test/zdtm/static/vdso01.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -324,7 +325,8 @@ static int vdso_clock_gettime_handler(void *func) clock_gettime(CLOCK_REALTIME, &ts1); vdso_clock_gettime(CLOCK_REALTIME, &ts2); - test_msg("clock_gettime: tv_sec %li vdso_clock_gettime: tv_sec %li\n", ts1.tv_sec, ts2.tv_sec); + test_msg("clock_gettime: tv_sec %" PRId64 " vdso_clock_gettime: tv_sec %" PRId64 "\n", + (int64_t)ts1.tv_sec, (int64_t)ts2.tv_sec); if (labs(ts1.tv_sec - ts2.tv_sec) > TIME_DELTA_SEC) { pr_perror("Delta is too big"); @@ -354,7 +356,8 @@ static int vdso_gettimeofday_handler(void *func) gettimeofday(&tv1, &tz); vdso_gettimeofday(&tv2, &tz); - test_msg("gettimeofday: tv_sec %li vdso_gettimeofday: tv_sec %li\n", tv1.tv_sec, tv2.tv_sec); + test_msg("gettimeofday: tv_sec %" PRId64 " vdso_gettimeofday: tv_sec %" PRId64 "\n", + (int64_t)tv1.tv_sec, (int64_t)tv2.tv_sec); if (labs(tv1.tv_sec - tv2.tv_sec) > TIME_DELTA_SEC) { pr_perror("Delta is too big"); @@ -372,7 +375,7 @@ static int vdso_time_handler(void *func) t1 = time(NULL); t2 = vdso_time(NULL); - test_msg("time: %li vdso_time: %li\n", (long)t1, (long)t1); + test_msg("time: %li vdso_time: %li\n", (long)t1, (long)t2); if (labs(t1 - t2) > TIME_DELTA_SEC) { pr_perror("Delta is too big"); diff --git a/test/zdtm/static/vdso02.c b/test/zdtm/static/vdso02.c index 2050bca71..5779b7fd6 100644 --- a/test/zdtm/static/vdso02.c +++ b/test/zdtm/static/vdso02.c @@ -29,7 +29,8 @@ static int parse_vm_area(char *buf, struct vm_area *vma) return -1; } -static int find_blobs(pid_t pid, struct vm_area *vdso, struct vm_area *vvar) +static int find_blobs(pid_t pid, struct vm_area *vdso, + struct vm_area *vvar, struct vm_area *vvar_vclock) { char buf[BUF_SZ]; int ret = -1; @@ -39,6 +40,8 @@ static int find_blobs(pid_t pid, struct vm_area *vdso, struct vm_area *vvar) vdso->end = VDSO_BAD_ADDR; vvar->start = VVAR_BAD_ADDR; vvar->end = VVAR_BAD_ADDR; + vvar_vclock->start = VVAR_BAD_ADDR; + vvar_vclock->end = VVAR_BAD_ADDR; if (snprintf(buf, BUF_SZ, "/proc/%d/maps", pid) < 0) { pr_perror("snprintf() failure for path"); @@ -57,12 +60,18 @@ static int find_blobs(pid_t pid, struct vm_area *vdso, struct vm_area *vvar) if (strstr(buf, "[vvar]") && parse_vm_area(buf, vvar)) goto err; + if (strstr(buf, "[vvar_vclock]") && + parse_vm_area(buf, vvar_vclock)) + goto err; } if (vdso->start != VDSO_BAD_ADDR) test_msg("[vdso] %lx-%lx\n", vdso->start, vdso->end); if (vvar->start != VVAR_BAD_ADDR) test_msg("[vvar] %lx-%lx\n", vvar->start, vvar->end); + if (vvar_vclock->start != VVAR_BAD_ADDR) + test_msg("[vvar_vclock] %lx-%lx\n", + vvar_vclock->start, vvar_vclock->end); ret = 0; err: fclose(maps); @@ -143,10 +152,10 @@ void sys_exit(int status) static int unmap_blobs(void) { - struct vm_area vdso, vvar; + struct vm_area vdso, vvar, vvar_vclock; int ret; - if (find_blobs(getpid(), &vdso, &vvar)) + if (find_blobs(getpid(), &vdso, &vvar, &vvar_vclock)) return -1; if (vdso.start != VDSO_BAD_ADDR) { @@ -159,13 +168,19 @@ static int unmap_blobs(void) if (ret) return ret; } + if (vvar_vclock.start != VVAR_BAD_ADDR) { + ret = sys_munmap((void *)vvar_vclock.start, + vvar_vclock.end - vvar_vclock.start); + if (ret) + return ret; + } return 0; } int main(int argc, char *argv[]) { - struct vm_area vdso, vvar; + struct vm_area vdso, vvar, vvar_vclock; pid_t child; int status, ret = -1; @@ -201,9 +216,11 @@ int main(int argc, char *argv[]) goto out_kill; } - if (find_blobs(child, &vdso, &vvar)) + if (find_blobs(child, &vdso, &vvar, &vvar_vclock)) goto out_kill; - if (vdso.start != VDSO_BAD_ADDR || vvar.start != VVAR_BAD_ADDR) { + if (vdso.start != VDSO_BAD_ADDR || + vvar.start != VVAR_BAD_ADDR || + vvar_vclock.start != VVAR_BAD_ADDR) { pr_err("Found vvar or vdso blob(s) in child, which should have unmapped them\n"); goto out_kill; } @@ -211,7 +228,7 @@ int main(int argc, char *argv[]) test_daemon(); test_waitsig(); - if (find_blobs(child, &vdso, &vvar)) + if (find_blobs(child, &vdso, &vvar, &vvar_vclock)) goto out_kill; if (vdso.start != VDSO_BAD_ADDR || vvar.start != VVAR_BAD_ADDR) { pr_err("Child without vdso got it after C/R\n"); diff --git a/test/zdtm/transition/Makefile b/test/zdtm/transition/Makefile index 98440f4e2..ddf2faaad 100644 --- a/test/zdtm/transition/Makefile +++ b/test/zdtm/transition/Makefile @@ -25,6 +25,7 @@ TST_NOFILE = \ pidfd_store_sk \ rseq01 \ rseq02 \ + stack \ TST_FILE = \ @@ -83,7 +84,9 @@ ptrace: LDFLAGS += -pthread fork2: CFLAGS += -D FORK2 thread-bomb.o: CFLAGS += -pthread thread-bomb: LDFLAGS += -pthread +rseq01: LDLIBS += -pthread rseq02: CFLAGS += -D NORESTART +rseq02: LDLIBS += -pthread %: %.sh cp $< $@ diff --git a/test/zdtm/transition/epoll.c b/test/zdtm/transition/epoll.c index fdd492ab2..803e50541 100644 --- a/test/zdtm/transition/epoll.c +++ b/test/zdtm/transition/epoll.c @@ -158,9 +158,11 @@ int main(int argc, char **argv) exit(1); } for (i = 0; i < rv; i++) { - while (read(events[i].data.fd, buf, buf_size) > 0) + int ret; + + while ((ret = read(events[i].data.fd, buf, buf_size)) > 0) ; - if (errno != EAGAIN && errno != 0 && errno) { + if (ret < 0 && errno != EAGAIN) { pr_perror("read error"); killall(); exit(1); diff --git a/test/zdtm/transition/ipc.c b/test/zdtm/transition/ipc.c index 0f16dbc68..7660f70af 100644 --- a/test/zdtm/transition/ipc.c +++ b/test/zdtm/transition/ipc.c @@ -178,7 +178,7 @@ int main(int argc, char **argv) pr_perror("Child 2 was killed"); } else if (WEXITSTATUS(ret)) { fail_count++; - pr_perror("Child 2 couldn't inititalise"); + pr_perror("Child 2 couldn't initialise"); } out_child: kill(pid1, SIGTERM); @@ -188,7 +188,7 @@ out_child: pr_perror("Child 1 was killed"); } else if (WEXITSTATUS(ret)) { fail_count++; - pr_perror("Child 1 couldn't inititalise"); + pr_perror("Child 1 couldn't initialise"); } out_shdt: shmdt(mem); diff --git a/test/zdtm/transition/lazy-thp.c b/test/zdtm/transition/lazy-thp.c index 2bf99dc4c..2e9722b96 100644 --- a/test/zdtm/transition/lazy-thp.c +++ b/test/zdtm/transition/lazy-thp.c @@ -25,7 +25,7 @@ int main(int argc, char **argv) test_init(argc, argv); - /* we presume that malloc returns not page aliged address */ + /* we presume that malloc returns not page aligned address */ mem = malloc(PAGE_SIZE * N_PAGES); org = malloc(PAGE_SIZE); if (!mem || !org) { diff --git a/test/zdtm/transition/maps007.c b/test/zdtm/transition/maps007.c index 8a605cfe0..35c196bc4 100644 --- a/test/zdtm/transition/maps007.c +++ b/test/zdtm/transition/maps007.c @@ -38,7 +38,7 @@ int main(int argc, char **argv) struct { futex_t delta; futex_t stop; - } * shm; + } *shm; uint32_t v; unsigned long long count = 0; int i; diff --git a/test/zdtm/transition/rseq01.c b/test/zdtm/transition/rseq01.c index b6d470785..08a7a8e1a 100644 --- a/test/zdtm/transition/rseq01.c +++ b/test/zdtm/transition/rseq01.c @@ -33,7 +33,10 @@ static inline void *thread_pointer(void) static inline void unregister_old_rseq(void) { /* unregister rseq */ - syscall(__NR_rseq, (void *)((char *)thread_pointer() + __rseq_offset), __rseq_size, 1, RSEQ_SIG); + unsigned int size = __rseq_size; + if (__rseq_size < 32) + size = 32; + syscall(__NR_rseq, (void *)((char *)thread_pointer() + __rseq_offset), size, 1, RSEQ_SIG); } #else static inline void unregister_old_rseq(void) @@ -86,7 +89,7 @@ struct rseq { #endif /* EOF */ -static volatile struct rseq *rseq_ptr; +static __thread volatile struct rseq *rseq_ptr; static __thread volatile struct rseq __rseq_abi; static int sys_rseq(volatile struct rseq *rseq_abi, uint32_t rseq_len, int flags, uint32_t sig) @@ -119,7 +122,7 @@ static void check_thread(void) #define rseq_after_asm_goto() asm volatile("" : : : "memory") -static int rseq_addv(intptr_t *v, intptr_t count, int cpu) +static int rseq_addv(intptr_t *v, intptr_t count, int cpu, bool ignore_abort, const char *id) { double a = 10000000000000000.0; double b = -1; @@ -177,7 +180,7 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu) ); /* clang-format on */ rseq_after_asm_goto(); - test_msg("exit %lx %lx %f %f\n", rseq_cs1, rseq_cs2, a, b); + test_msg("exit %s, %lx %lx %f %f\n", id, rseq_cs1, rseq_cs2, a, b); if (rseq_cs1 != rseq_cs2) { /* * It means that we finished critical section @@ -192,16 +195,45 @@ static int rseq_addv(intptr_t *v, intptr_t count, int cpu) return 0; abort: rseq_after_asm_goto(); - test_msg("abort %lx %lx %f %f\n", rseq_cs1, rseq_cs2, a, b); + test_msg("abort %s, %lx %lx %f %f\n", id, rseq_cs1, rseq_cs2, a, b); + if (ignore_abort) + return 0; return -1; } +static task_waiter_t waiter; +static intptr_t *cpu_data; +bool ignore_abort = true; +int thread_ret; + +void *thread_routine(void *args) +{ + int cpu; + + rseq_ptr = &__rseq_abi; + memset((void *)rseq_ptr, 0, sizeof(struct rseq)); + register_thread(); + task_waiter_complete(&waiter, 1); + task_waiter_wait4(&waiter, 2); + + while (test_go()) { + cpu = RSEQ_ACCESS_ONCE(rseq_ptr->cpu_id_start); + thread_ret = rseq_addv(&cpu_data[cpu], 2, cpu, ignore_abort, "thread"); + + if (thread_ret) + break; + } + + check_thread(); + return NULL; +} + int main(int argc, char *argv[]) { int cpu = 0; int ret; - intptr_t *cpu_data; long nr_cpus; + pthread_t thread; rseq_ptr = &__rseq_abi; memset((void *)rseq_ptr, 0, sizeof(struct rseq)); @@ -225,31 +257,37 @@ int main(int argc, char *argv[]) * https://github.com/torvalds/linux/blob/ce522ba9/kernel/rseq.c#L192 */ #ifdef NORESTART + ignore_abort = false; rseq_ptr->flags = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE; #endif + task_waiter_init(&waiter); + if (pthread_create(&thread, NULL, thread_routine, NULL)) { + fail("pthread_create"); + exit(EXIT_FAILURE); + } + task_waiter_wait4(&waiter, 1); + test_daemon(); + task_waiter_complete(&waiter, 2); while (test_go()) { cpu = RSEQ_ACCESS_ONCE(rseq_ptr->cpu_id_start); - ret = rseq_addv(&cpu_data[cpu], 2, cpu); - -/* NORESTART is NOT set */ -#ifndef NORESTART - /* just ignore abort */ - ret = 0; -#endif + ret = rseq_addv(&cpu_data[cpu], 2, cpu, ignore_abort, "task"); if (ret) break; } - test_waitsig(); - check_thread(); - if (ret) + if (pthread_join(thread, NULL)) { + fail("pthread_join"); + exit(EXIT_FAILURE); + } + + if (ret || thread_ret) fail(); else pass(); diff --git a/test/zdtm/transition/stack.c b/test/zdtm/transition/stack.c new file mode 100644 index 000000000..9548b9182 --- /dev/null +++ b/test/zdtm/transition/stack.c @@ -0,0 +1,16 @@ +#include "zdtmtst.h" + +const char *test_doc = "Tests that parasite code does not write past the start of the stack"; +const char *test_author = "Younes Manton "; + +int main(int argc, char **argv) +{ + test_init(argc, argv); + + test_daemon(); + test_waitsig(); + + pass(); + + return 0; +} diff --git a/test/zdtm_ct.c b/test/zdtm_ct.c index 0e8eeff8a..44316893d 100644 --- a/test/zdtm_ct.c +++ b/test/zdtm_ct.c @@ -93,44 +93,50 @@ static int create_timens(void) int main(int argc, char **argv) { + uid_t uid; pid_t pid; int status; + uid = getuid(); + /* * pidns is used to avoid conflicts * mntns is used to mount /proc - * net is used to avoid conflicts of parasite sockets + * net is used to avoid conflicts between network tests */ - if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC)) - return 1; + if (!uid) + if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC)) + return 1; pid = fork(); if (pid == 0) { - if (create_timens()) - exit(1); - if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL)) { - fprintf(stderr, "mount(/, S_REC | MS_SLAVE)): %m"); - return 1; + if (!uid) { + if (create_timens()) + exit(1); + if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL)) { + fprintf(stderr, "mount(/, S_REC | MS_SLAVE)): %m"); + return 1; + } + umount2("/proc", MNT_DETACH); + umount2("/dev/pts", MNT_DETACH); + if (mount("zdtm_proc", "/proc", "proc", 0, NULL)) { + fprintf(stderr, "mount(/proc): %m"); + return 1; + } + if (mount("zdtm_devpts", "/dev/pts", "devpts", 0, "newinstance,ptmxmode=0666")) { + fprintf(stderr, "mount(pts): %m"); + return 1; + } + if (mount("zdtm_binfmt", "/proc/sys/fs/binfmt_misc", "binfmt_misc", 0, NULL)) { + fprintf(stderr, "mount(binfmt_misc): %m"); + return 1; + } + if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL)) { + fprintf(stderr, "mount(ptmx): %m"); + return 1; + } + if (system("ip link set up dev lo")) + return 1; } - umount2("/proc", MNT_DETACH); - umount2("/dev/pts", MNT_DETACH); - if (mount("zdtm_proc", "/proc", "proc", 0, NULL)) { - fprintf(stderr, "mount(/proc): %m"); - return 1; - } - if (mount("zdtm_devpts", "/dev/pts", "devpts", 0, "newinstance,ptmxmode=0666")) { - fprintf(stderr, "mount(pts): %m"); - return 1; - } - if (mount("zdtm_binfmt", "/proc/sys/fs/binfmt_misc", "binfmt_misc", 0, NULL)) { - fprintf(stderr, "mount(binfmt_misc): %m"); - return 1; - } - if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL)) { - fprintf(stderr, "mount(ptmx): %m"); - return 1; - } - if (system("ip link set up dev lo")) - return 1; execv(argv[1], argv + 1); fprintf(stderr, "execve: %m"); return 1;