diff --git a/.circleci/config.yml b/.circleci/config.yml index 785b383e1..47f7ad9b1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2.1 jobs: test-local-gcc: machine: - image: default + image: ubuntu-2004:202010-01 working_directory: ~/criu steps: - checkout @@ -11,7 +11,7 @@ jobs: command: sudo -E make -C scripts/ci local test-local-clang: machine: - image: default + image: ubuntu-2004:202010-01 working_directory: ~/criu steps: - checkout diff --git a/.cirrus.yml b/.cirrus.yml index 72dbb3898..671178d8b 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -13,89 +13,67 @@ task: nested_virtualization: true setup_script: | - contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker + scripts/ci/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker sudo kvm-ok + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto build_script: | make -C scripts/ci vagrant-fedora-no-vdso task: - name: CentOS Stream 9 based test + name: CentOS 8 based test environment: HOME: "/root" CIRRUS_WORKING_DIR: "/tmp/criu" compute_engine_instance: image_project: centos-cloud - image: family/centos-stream-9 + image: family/centos-8 platform: linux cpu: 4 memory: 8G setup_script: | - dnf config-manager --set-enabled crb # Same as CentOS 8 powertools - dnf -y install epel-release epel-next-release - contrib/dependencies/dnf-packages.sh - # The image has a too old version of nettle which does not work with gnutls. - # Just upgrade to the latest to make the error go away. 
- dnf -y upgrade nettle nettle-devel + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto + yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm dnf-plugins-core + yum config-manager --set-enabled powertools + yum install -y --allowerasing asciidoc gcc git gnutls-devel libaio-devel libasan libcap-devel libnet-devel libnl3-devel libselinux-devel make protobuf-c-devel protobuf-devel python3-devel python3-flake8 python3-PyYAML python3-future python3-protobuf xmlto + alternatives --set python /usr/bin/python3 systemctl stop sssd - # Even with selinux in permissive mode the selinux tests will be executed. + # Even with selinux in permissive mode the selinux tests will be executed # The Cirrus CI user runs as a service from selinux point of view and is - # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0). - # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode. 
+ # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0) + # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode setenforce 0 + pip3 install junit_xml build_script: | - make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_OPTS="-x zdtm/static/socket-raw" + make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 task: - name: Vagrant Fedora Rawhide based test + name: CentOS 7 based test environment: HOME: "/root" CIRRUS_WORKING_DIR: "/tmp/criu" compute_engine_instance: - image_project: cirrus-images - image: family/docker-kvm + image_project: centos-cloud + image: family/centos-7 platform: linux cpu: 4 - memory: 16G - nested_virtualization: true + memory: 8G setup_script: | - contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker - sudo kvm-ok + ln -sf /usr/include/google/protobuf/descriptor.proto images/google/protobuf/descriptor.proto + yum install -y findutils gcc git gnutls-devel iproute iptables libaio-devel libasan libcap-devel libnet-devel libnl3-devel make procps-ng protobuf-c-devel protobuf-devel protobuf-python python python-flake8 python-ipaddress python2-future python2-junit_xml python-yaml python-six sudo tar which e2fsprogs python2-pip rubygem-asciidoctor libselinux-devel + # Even with selinux in permissive mode the selinux tests will be executed + # The Cirrus CI user runs as a service from selinux point of view and is + # much more restricted than a normal shell (system_u:system_r:unconfined_service_t:s0) + # The test case above (vagrant-fedora-no-vdso) should run selinux tests in enforcing mode + setenforce 0 + # Enable user namespaces on CentOS 7 + echo 10000 > /proc/sys/user/max_user_namespaces + # Adapt sudoers to our needs + echo 'root ALL=(ALL:ALL) ALL' | EDITOR='tee -a' visudo + build_script: | - make -C scripts/ci vagrant-fedora-rawhide - -task: - name: Vagrant Fedora based test (non-root) - environment: - HOME: "/root" - 
CIRRUS_WORKING_DIR: "/tmp/criu" - - compute_engine_instance: - image_project: cirrus-images - image: family/docker-kvm - platform: linux - cpu: 4 - memory: 16G - nested_virtualization: true - - setup_script: | - contrib/apt-install make gcc pkg-config git perl-modules iproute2 kmod wget cpu-checker - sudo kvm-ok - build_script: | - make -C scripts/ci vagrant-fedora-non-root - -task: - name: aarch64 Fedora Rawhide - arm_container: - image: registry.fedoraproject.org/fedora:rawhide - cpu: 4 - memory: 4G - script: uname -a - build_script: | - scripts/ci/prepare-for-fedora-rawhide.sh - make -C scripts/ci/ local CC=gcc SKIP_CI_PREP=1 SKIP_CI_TEST=1 CD_TO_TOP=1 - make -C test/zdtm -j 4 + make -C scripts/ci local SKIP_CI_PREP=1 CC=gcc CD_TO_TOP=1 ZDTM_IGNORE_TAINT=1 ZDTM_OPTS="-x zdtm/static/socket-raw -x zdtm/static/child_subreaper_existing_child -x zdtm/static/fifo_upon_unix_socket01 -x zdtm/static/overmount_sock -x zdtm/static/tempfs_overmounted" diff --git a/.clang-format b/.clang-format index fb40bc613..dd4ade370 100644 --- a/.clang-format +++ b/.clang-format @@ -15,7 +15,7 @@ AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false AlignEscapedNewlines: Left # Unknown to clang-format-4.0 AlignOperands: true -AlignTrailingComments: true +AlignTrailingComments: false AlignConsecutiveMacros: true AllowAllParametersOfDeclarationOnNextLine: false AllowShortBlocksOnASingleLine: false @@ -53,7 +53,7 @@ BreakConstructorInitializersBeforeComma: false BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0 BreakAfterJavaFieldAnnotations: false BreakStringLiterals: false -ColumnLimit: 0 +ColumnLimit: 120 CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false # Unknown to clang-format-4.0 ConstructorInitializerAllOnOneLineOrOnePerLine: false @@ -71,7 +71,6 @@ FixNamespaceComments: false # Unknown to clang-format-4.0 # | sort | uniq ForEachMacros: - 'for_each_pstree_item' - - 'for_each_bit' - 'apei_estatus_for_each_section' - 'ata_for_each_dev' - 
'ata_for_each_link' @@ -516,7 +515,6 @@ IncludeCategories: Priority: 1 IncludeIsMainRegex: '(Test)?$' IndentCaseLabels: false -IndentGotoLabels: false IndentPPDirectives: None # Unknown to clang-format-5.0 IndentWidth: 8 IndentWrappedFunctionNames: false diff --git a/.codespellrc b/.codespellrc deleted file mode 100644 index 5def594b2..000000000 --- a/.codespellrc +++ /dev/null @@ -1,3 +0,0 @@ -[codespell] -skip = ./.git,./test/pki,./tags,./plugins/amdgpu/amdgpu_drm.h,./plugins/amdgpu/drm.h,./plugins/amdgpu/drm_mode.h -ignore-words-list = creat,fpr,fle,ue,bord,parms,nd,te,testng,inh,wronly,renderd,bui,clen,sems diff --git a/.drone.yml b/.drone.yml new file mode 100644 index 000000000..07eb8be65 --- /dev/null +++ b/.drone.yml @@ -0,0 +1,82 @@ +--- +kind: pipeline +type: docker +name: aarch64 build GCC (native) + +platform: + os: linux + arch: arm64 + +steps: +- name: build + image: ubuntu:focal + commands: + - scripts/ci/apt-install make + - make -C scripts/ci local + +--- +kind: pipeline +type: docker +name: aarch64 build CLANG (native) + +platform: + os: linux + arch: arm64 + +steps: +- name: build + image: ubuntu:focal + commands: + - scripts/ci/apt-install make + - make -C scripts/ci local CLANG=1 + +--- +kind: pipeline +type: docker +name: armhf build GCC (native) + +platform: + os: linux + arch: arm + +steps: +- name: build + # At the time of setting up focal did not work + image: ubuntu:bionic + commands: + - scripts/ci/apt-install make + - make -C scripts/ci local + +--- +kind: pipeline +type: docker +name: armhf build CLANG (native) + +platform: + os: linux + arch: arm + +steps: +- name: build + # At the time of setting up focal did not work + image: ubuntu:bionic + commands: + - scripts/ci/apt-install make + - make -C scripts/ci local CLANG=1 + +--- +kind: pipeline +type: docker +name: aarch64 Fedora Rawhide + +platform: + os: linux + arch: arm64 + +steps: +- name: build + image: registry.fedoraproject.org/fedora:rawhide + commands: + - 
scripts/ci/prepare-for-fedora-rawhide.sh + - make -C scripts/ci/ local CC=gcc SKIP_CI_PREP=1 SKIP_CI_TEST=1 CD_TO_TOP=1 + - make -C test/zdtm -j 4 diff --git a/.github/workflows/aarch64-test.yaml b/.github/workflows/aarch64-test.yaml deleted file mode 100644 index ebbecadb3..000000000 --- a/.github/workflows/aarch64-test.yaml +++ /dev/null @@ -1,34 +0,0 @@ -name: aarch64 test - -on: [push, pull_request] - -# Cancel any preceding run on the pull request. -concurrency: - group: aarch64-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - -jobs: - build: - strategy: - matrix: - os: [ubuntu-24.04-arm, ubuntu-22.04-arm] - target: [GCC=1, CLANG=1] - - runs-on: ${{ matrix.os }} - - steps: - - uses: actions/checkout@v4 - - name: Run Tests ${{ matrix.target }} on ${{ matrix.os }} - # Following tests are failing on the VMs: - # ./change_mnt_context --pidfile=change_mnt_context.pid --outfile=change_mnt_context.out - # 45: ERR: change_mnt_context.c:23: mount (errno = 22 (Invalid argument)) - # - # In combination with '--remote-lazy-pages' following error occurs: - # 138: FAIL: maps05.c:84: Data corrupted at page 1639 (errno = 11 (Resource temporarily unavailable)) - run: | - # The 'sched_policy00' needs the following: - sudo sysctl -w kernel.sched_rt_runtime_us=-1 - # etc/hosts entry is needed for netns_lock_iptables - echo "127.0.0.1 localhost" | sudo tee -a /etc/hosts - sudo -E make -C scripts/ci local ${{ matrix.target }} RUN_TESTS=1 \ - ZDTM_OPTS="-x zdtm/static/change_mnt_context -x zdtm/static/maps05" diff --git a/.github/workflows/alpine-test.yml b/.github/workflows/alpine-test.yml index 0f5c20f48..6fc546ff5 100644 --- a/.github/workflows/alpine-test.yml +++ b/.github/workflows/alpine-test.yml @@ -2,20 +2,14 @@ name: Alpine Test on: [push, pull_request] -# Cancel any preceding run on the pull request. 
-concurrency: - group: alpine-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - jobs: build: + runs-on: ubuntu-20.04 strategy: matrix: - os: [ubuntu-22.04, ubuntu-22.04-arm] target: [GCC=1, CLANG=1] - runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v2 - name: Run Alpine ${{ matrix.target }} Test run: sudo -E make -C scripts/ci alpine ${{ matrix.target }} diff --git a/.github/workflows/archlinux-test.yml b/.github/workflows/archlinux-test.yml index 425f0662b..bb98623a8 100644 --- a/.github/workflows/archlinux-test.yml +++ b/.github/workflows/archlinux-test.yml @@ -2,15 +2,10 @@ name: Arch Linux Test on: [push, pull_request] -# Cancel any preceding run on the pull request. -concurrency: - group: archlinux-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - jobs: build: - runs-on: ubuntu-22.04 + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v2 - name: Run Arch Linux Test run: sudo -E make -C scripts/ci archlinux diff --git a/.github/workflows/check-commits.yml b/.github/workflows/check-commits.yml deleted file mode 100644 index bf7d06697..000000000 --- a/.github/workflows/check-commits.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: Verify self-contained commits - -on: pull_request - -# Cancel any preceding run on the pull request -concurrency: - group: commit-test-${{ github.event.pull_request.number }} - -jobs: - build: - runs-on: ubuntu-latest - # Check if pull request does not have label "not-selfcontained-ok" - if: "!contains(github.event.pull_request.labels.*.name, 'not-selfcontained-ok')" - steps: - - uses: actions/checkout@v4 - with: - # Needed to rebase against the base branch - fetch-depth: 0 - # Checkout pull request HEAD commit instead of merge commit - ref: ${{ github.event.pull_request.head.sha }} - - name: Install dependencies - 
run: sudo contrib/apt-install libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnl-3-dev libnet-dev libcap-dev uuid-dev - - name: Configure git user details - run: | - git config --global user.email "checkpoint-restore@users.noreply.github.com" - git config --global user.name "checkpoint-restore" - - name: Configure base branch without switching current branch - run: git fetch origin ${{ github.base_ref }}:${{ github.base_ref }} - - name: Build each commit - run: git rebase ${{ github.base_ref }} -x "make -C scripts/ci check-commit" diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml deleted file mode 100644 index 9c9e46c1b..000000000 --- a/.github/workflows/codeql.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: "CodeQL" - -on: - push: - branches: [ "criu-dev", "master" ] - pull_request: - branches: [ "criu-dev" ] - schedule: - - cron: "11 6 * * 3" - -# Cancel any preceding run on the pull request. -concurrency: - group: codeql-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - -jobs: - analyze: - name: Analyze - runs-on: ubuntu-latest - permissions: - actions: read - contents: read - security-events: write - - strategy: - fail-fast: false - matrix: - language: [ python, cpp ] - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Install Packages (cpp) - if: ${{ matrix.language == 'cpp' }} - run: | - sudo contrib/apt-install protobuf-c-compiler libprotobuf-c-dev libprotobuf-dev build-essential libprotobuf-dev libprotobuf-c-dev protobuf-c-compiler protobuf-compiler python3-protobuf libnet-dev pkg-config libnl-3-dev libbsd0 libbsd-dev iproute2 libcap-dev libaio-dev libbsd-dev python3-yaml libnl-route-3-dev gnutls-dev - - name: Initialize CodeQL - uses: github/codeql-action/init@v3 - with: - languages: ${{ matrix.language }} - queries: +security-and-quality - - - name: Autobuild - uses: github/codeql-action/autobuild@v3 - 
- - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v3 - with: - category: "/language:${{ matrix.language }}" diff --git a/.github/workflows/compat-test.yml b/.github/workflows/compat-test.yml index 8a64ce185..5ae25fb73 100644 --- a/.github/workflows/compat-test.yml +++ b/.github/workflows/compat-test.yml @@ -2,20 +2,15 @@ name: Compat Tests on: [push, pull_request] -# Cancel any preceding run on the pull request. -concurrency: - group: compat-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - jobs: build: - runs-on: ubuntu-22.04 + runs-on: ubuntu-20.04 strategy: matrix: target: [GCC, CLANG] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v2 - name: Run Compat Tests (${{ matrix.target }}) run: sudo -E make -C scripts/ci local COMPAT_TEST=y ${{ matrix.target }}=1 diff --git a/.github/workflows/cross-compile-daily.yml b/.github/workflows/cross-compile-daily.yml index c709cca00..701213276 100644 --- a/.github/workflows/cross-compile-daily.yml +++ b/.github/workflows/cross-compile-daily.yml @@ -10,11 +10,11 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - target: [armv7-stable-cross, aarch64-stable-cross, ppc64-stable-cross, mips64el-stable-cross, riscv64-stable-cross] + target: [armv7-cross, aarch64-cross, ppc64-cross, mips64el-cross] branches: [criu-dev, master] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v2 with: ref: ${{ matrix.branches }} - name: Run Cross Compilation Targets diff --git a/.github/workflows/cross-compile.yml b/.github/workflows/cross-compile.yml index 96672b294..90862e7ab 100644 --- a/.github/workflows/cross-compile.yml +++ b/.github/workflows/cross-compile.yml @@ -2,39 +2,16 @@ name: Cross Compile Tests on: [push, pull_request] -# Cancel any preceding run on the pull request. 
-concurrency: - group: cross-compile-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - jobs: build: runs-on: ubuntu-latest - continue-on-error: ${{ matrix.experimental }} strategy: - fail-fast: false matrix: - experimental: [false] - target: [ - armv7-stable-cross, - aarch64-stable-cross, - ppc64-stable-cross, - mips64el-stable-cross, - riscv64-stable-cross, - ] - include: - - experimental: true - target: armv7-unstable-cross - - experimental: true - target: aarch64-unstable-cross - - experimental: true - target: ppc64-unstable-cross - - experimental: true - target: mips64el-unstable-cross + target: [armv7-cross, aarch64-cross, ppc64-cross, mips64el-cross] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v2 - name: Run Cross Compilation Targets run: > sudo make -C scripts/ci ${{ matrix.target }} diff --git a/.github/workflows/docker-test.yml b/.github/workflows/docker-test.yml index 23696905a..564691449 100644 --- a/.github/workflows/docker-test.yml +++ b/.github/workflows/docker-test.yml @@ -2,18 +2,13 @@ name: Docker Test on: [push, pull_request] -# Cancel any preceding run on the pull request. -concurrency: - group: docker-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - jobs: build: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-22.04] + os: [ubuntu-20.04] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v2 - name: Run Docker Test (${{ matrix.os }}) run: sudo make -C scripts/ci docker-test diff --git a/.github/workflows/fedora-asan-test.yml b/.github/workflows/fedora-asan-test.yml index 02dc9a1b3..44b0f16d6 100644 --- a/.github/workflows/fedora-asan-test.yml +++ b/.github/workflows/fedora-asan-test.yml @@ -2,16 +2,11 @@ name: Fedora ASAN Test on: [push, pull_request] -# Cancel any preceding run on the pull request. 
-concurrency: - group: fedora-asan-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - jobs: build: - runs-on: ubuntu-22.04 + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v2 - name: Run Fedora ASAN Test run: sudo -E make -C scripts/ci fedora-asan diff --git a/.github/workflows/fedora-rawhide-test.yml b/.github/workflows/fedora-rawhide-test.yml index 83e2ead82..00bc3b2bd 100644 --- a/.github/workflows/fedora-rawhide-test.yml +++ b/.github/workflows/fedora-rawhide-test.yml @@ -2,20 +2,11 @@ name: Fedora Rawhide Test on: [push, pull_request] -# Cancel any preceding run on the pull request. -concurrency: - group: fedora-rawhide-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - jobs: build: - runs-on: ubuntu-22.04 + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v2 - name: Run Fedora Rawhide Test - # We need to pass environment variables from the CI environment to - # distinguish between CI environments. However, we need to make sure that - # XDG_RUNTIME_DIR environment variable is not set due to a bug in Podman. - # FIXME: https://github.com/containers/podman/issues/14920 - run: sudo -E XDG_RUNTIME_DIR= make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined" + run: sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined" diff --git a/.github/workflows/gcov-test.yml b/.github/workflows/gcov-test.yml index cc4e1d44a..f1b38e77e 100644 --- a/.github/workflows/gcov-test.yml +++ b/.github/workflows/gcov-test.yml @@ -2,20 +2,13 @@ name: Coverage Tests on: [push, pull_request] -# Cancel any preceding run on the pull request. 
-concurrency: - group: gcov-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - jobs: build: - runs-on: ubuntu-22.04 + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v2 - name: Run Coverage Tests run: sudo -E make -C scripts/ci local GCOV=1 - - name: Run gcov - run: sudo -E find . -name '*gcda' -type f -print0 | sudo -E xargs --null --max-args 128 --max-procs 4 gcov - name: Run Coverage Analysis run: sudo -E make codecov diff --git a/.github/workflows/java-test.yml b/.github/workflows/java-test.yml deleted file mode 100644 index cbd3c1f23..000000000 --- a/.github/workflows/java-test.yml +++ /dev/null @@ -1,16 +0,0 @@ -name: Java Test - -on: [push, pull_request] - -# Cancel any preceding run on the pull request. -concurrency: - group: java-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - -jobs: - build: - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v4 - - name: Run Java Test - run: sudo make -C scripts/ci java-test diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index f7da4f6f6..50b241e9f 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -2,11 +2,6 @@ name: Run code linter on: [push, pull_request] -# Cancel any preceding run on the pull request. 
-concurrency: - group: lint-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - jobs: build: runs-on: ubuntu-latest @@ -14,27 +9,18 @@ jobs: image: registry.fedoraproject.org/fedora:latest steps: - name: Install tools - run: sudo dnf -y install git make ruff xz clang-tools-extra codespell git-clang-format ShellCheck - - - uses: actions/checkout@v4 - - - name: Set git safe directory - # https://github.com/actions/checkout/issues/760 - run: git config --global --add safe.directory "$GITHUB_WORKSPACE" - + run: sudo dnf -y install git make python3-flake8 ShellCheck clang-tools-extra which findutils + - uses: actions/checkout@v2 - name: Run make lint run: make lint - - name: Run make indent - continue-on-error: true - run: | - if [ -z "${{github.base_ref}}" ]; then - git fetch --deepen=1 - make indent - else - git fetch origin ${{github.base_ref}} - make indent BASE=origin/${{github.base_ref}} + run: > + make indent && + STATUS=$(git status --porcelain) && + if [ ! -z "$STATUS" ]; then + echo "FAIL: some files are not correctly formatted."; + echo "$STATUS" + git diff + echo "FAIL: please run 'make indent'"; + exit 1; fi - - name: Raise in-line make indent warnings - run: | - git diff | ./scripts/github-indent-warnings.py diff --git a/.github/workflows/loongarch64-qemu-test.yml b/.github/workflows/loongarch64-qemu-test.yml deleted file mode 100644 index d7c554c87..000000000 --- a/.github/workflows/loongarch64-qemu-test.yml +++ /dev/null @@ -1,15 +0,0 @@ -name: LoongArch64 Qemu Test - -on: [push, pull_request] - -# Cancel any preceding run on the pull request. 
-concurrency: - group: loongarch64-qemu-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - -jobs: - build: - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v4 - - run: sudo make -C scripts/ci loongarch64-qemu-test diff --git a/.github/workflows/manage-labels.yml b/.github/workflows/manage-labels.yml deleted file mode 100644 index a2bcd8860..000000000 --- a/.github/workflows/manage-labels.yml +++ /dev/null @@ -1,14 +0,0 @@ -name: Remove labels -on: [issue_comment, pull_request_review_comment] -jobs: - remove-labels-on-comments: - name: Remove labels on comments - if: github.event_name == 'issue_comment' - runs-on: ubuntu-latest - steps: - - uses: mondeja/remove-labels-gh-action@v1 - with: - token: ${{ secrets.GITHUB_TOKEN }} - labels: | - changes requested - awaiting reply diff --git a/.github/workflows/nftables-test.yml b/.github/workflows/nftables-test.yml deleted file mode 100644 index 7a7d8bd30..000000000 --- a/.github/workflows/nftables-test.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: Nftables bases testing - -on: [push, pull_request] - -# Cancel any preceding run on the pull request. -concurrency: - group: nftables-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - -jobs: - build: - runs-on: ubuntu-24.04 - steps: - - uses: actions/checkout@v4 - - name: Remove iptables - run: sudo apt remove -y iptables - - name: Install libnftables-dev - run: sudo contrib/apt-install libnftables-dev - - name: chmod 755 /home/runner - # CRIU's tests are sometimes running as some random user and need - # to be able to access the test files. 
- run: sudo chmod 755 /home/runner - - name: Build with nftables network locking backend - run: sudo make -C scripts/ci local COMPILE_FLAGS="NETWORK_LOCK_DEFAULT=NETWORK_LOCK_NFTABLES" diff --git a/.github/workflows/openj9-test.yml b/.github/workflows/openj9-test.yml new file mode 100644 index 000000000..1d7a1eb6b --- /dev/null +++ b/.github/workflows/openj9-test.yml @@ -0,0 +1,11 @@ +name: OpenJ9 Test + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + - name: Run OpenJ9 Test + run: sudo make -C scripts/ci openj9-test diff --git a/.github/workflows/podman-test.yml b/.github/workflows/podman-test.yml index a07edbe5b..447cbf0b6 100644 --- a/.github/workflows/podman-test.yml +++ b/.github/workflows/podman-test.yml @@ -2,15 +2,10 @@ name: Podman Test on: [push, pull_request] -# Cancel any preceding run on the pull request. -concurrency: - group: podman-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - jobs: build: - runs-on: ubuntu-22.04 + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v2 - name: Run Podman Test run: sudo make -C scripts/ci podman-test diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 76d55d4c9..beb6774e4 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/stale@v5 + - uses: actions/stale@v1 with: repo-token: ${{ secrets.GITHUB_TOKEN }} stale-issue-message: 'A friendly reminder that this issue had no activity for 30 days.' diff --git a/.github/workflows/stream-test.yml b/.github/workflows/stream-test.yml index 76bd96edf..ecdd81e0a 100644 --- a/.github/workflows/stream-test.yml +++ b/.github/workflows/stream-test.yml @@ -2,16 +2,11 @@ name: CRIU Image Streamer Test on: [push, pull_request] -# Cancel any preceding run on the pull request. 
-concurrency: - group: stream-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - jobs: build: - runs-on: ubuntu-22.04 + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v2 - name: Run CRIU Image Streamer Test run: sudo -E make -C scripts/ci local STREAM_TEST=1 diff --git a/.github/workflows/x86-64-clang-test.yml b/.github/workflows/x86-64-clang-test.yml index 1f0a469bd..e6e84ef52 100644 --- a/.github/workflows/x86-64-clang-test.yml +++ b/.github/workflows/x86-64-clang-test.yml @@ -2,15 +2,10 @@ name: X86_64 CLANG Test on: [push, pull_request] -# Cancel any preceding run on the pull request. -concurrency: - group: clang-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - jobs: build: - runs-on: ubuntu-22.04 + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v2 - name: Run X86_64 CLANG Test run: sudo make -C scripts/ci x86_64 CLANG=1 diff --git a/.github/workflows/x86-64-gcc-test.yml b/.github/workflows/x86-64-gcc-test.yml index 15e84a0df..b8b81ef15 100644 --- a/.github/workflows/x86-64-gcc-test.yml +++ b/.github/workflows/x86-64-gcc-test.yml @@ -2,15 +2,10 @@ name: X86_64 GCC Test on: [push, pull_request] -# Cancel any preceding run on the pull request. 
-concurrency: - group: gcc-test-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/criu-dev' }} - jobs: build: - runs-on: ubuntu-22.04 + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v2 - name: Run X86_64 GCC Test run: sudo make -C scripts/ci x86_64 diff --git a/.gitignore b/.gitignore index 94daa13ea..d5135f5f8 100644 --- a/.gitignore +++ b/.gitignore @@ -20,16 +20,26 @@ compel/compel compel/compel-host-bin images/*.c images/*.h +images/google/protobuf/*.c +images/google/protobuf/*.h .gitid criu/criu criu/unittest/unittest +crit/crit +criu/arch/*/sys-exec-tbl*.c +# x86 syscalls-table is not generated +!criu/arch/x86/sys-exec-tbl.c +criu/arch/*/syscalls*.S +criu/include/syscall-codes*.h +criu/include/syscall*.h criu/include/version.h criu/pie/restorer-blob.h criu/pie/parasite-blob.h criu/protobuf-desc-gen.h lib/build/ lib/c/criu.pc +lib/.crit-setup.files compel/include/asm include/common/asm include/common/config.h -build/** +build/ diff --git a/.lgtm.yml b/.lgtm.yml deleted file mode 100644 index 4beadcc63..000000000 --- a/.lgtm.yml +++ /dev/null @@ -1,25 +0,0 @@ -extraction: - cpp: - prepare: - packages: - - "protobuf-c-compiler" - - "libprotobuf-c-dev" - - "libprotobuf-dev" - - "build-essential" - - "libprotobuf-dev" - - "libprotobuf-c-dev" - - "protobuf-c-compiler" - - "protobuf-compiler" - - "python3-protobuf" - - "libnet-dev" - - "pkg-config" - - "libnl-3-dev" - - "libbsd0" - - "libbsd-dev" - - "iproute2" - - "libcap-dev" - - "libaio-dev" - - "libbsd-dev" - - "python3-yaml" - - "libnl-route-3-dev" - - "gnutls-dev" diff --git a/.mailmap b/.mailmap index 8076f0bc9..6f046b972 100644 --- a/.mailmap +++ b/.mailmap @@ -6,5 +6,3 @@ Andrei Vagin Andrei Vagin Andrei Vagin Cyrill Gorcunov -Alexander Mikhalitsyn -Alexander Mikhalitsyn diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..94841b3f3 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,35 @@ 
+language: c +os: linux +dist: bionic +services: + - docker +jobs: + include: + - os: linux + arch: ppc64le + env: TR_ARCH=local + dist: bionic + - os: linux + arch: ppc64le + env: TR_ARCH=local CLANG=1 + dist: bionic + - os: linux + arch: s390x + env: TR_ARCH=local + dist: bionic + - os: linux + arch: arm64-graviton2 + env: TR_ARCH=local RUN_TESTS=1 + dist: focal + group: edge + virt: vm + - os: linux + arch: arm64-graviton2 + env: TR_ARCH=local CLANG=1 RUN_TESTS=1 + group: edge + virt: vm + dist: bionic +script: + - sudo make -C scripts/ci $TR_ARCH +after_success: + - make -C scripts/ci after_success diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 120000 index e3c5a92d9..000000000 --- a/CLAUDE.md +++ /dev/null @@ -1 +0,0 @@ -GEMINI.md \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 03875639d..96972296e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,3 +1,8 @@ +[![master](https://travis-ci.org/checkpoint-restore/criu.svg?branch=master)](https://travis-ci.org/checkpoint-restore/criu) +[![development](https://travis-ci.org/checkpoint-restore/criu.svg?branch=criu-dev)](https://travis-ci.org/checkpoint-restore/criu) +[![Codacy Badge](https://api.codacy.com/project/badge/Grade/55251ec7db28421da4481fc7c1cb0cee)](https://www.codacy.com/app/xemul/criu?utm_source=github.com&utm_medium=referral&utm_content=xemul/criu&utm_campaign=Badge_Grade) +

+ ## How to contribute to CRIU CRIU project is (almost) the never-ending story, because we have to always keep up with the @@ -8,8 +13,8 @@ Here are some useful hints to get involved. * We have both -- [very simple](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement) and [more sophisticated](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3A%22new+feature%22) coding tasks; * CRIU does need [extensive testing](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Atesting); * Documentation is always hard, we have [some information](https://criu.org/Category:Empty_articles) that is to be extracted from people's heads into wiki pages as well as [some texts](https://criu.org/Category:Editor_help_needed) that all need to be converted into useful articles; -* Feedback is expected on the GitHub issues page and on the [mailing list](https://lore.kernel.org/criu); -* We accept GitHub pull requests and this is the preferred way to contribute to CRIU. If you prefer to send patches by email, you are welcome to send them to [CRIU development mailing list](https://lore.kernel.org/criu). +* Feedback is expected on the GitHub issues page and on the [mailing list](https://lists.openvz.org/mailman/listinfo/criu); +* We accept GitHub pull requests and this is the preferred way to contribute to CRIU. If you prefer to send patches by email, you are welcome to send them to [CRIU development mailing list](https://lists.openvz.org/mailman/listinfo/criu). Below we describe in more detail recommend practices for CRIU development. * Spread the word about CRIU in [social networks](http://criu.org/Contacts); * If you're giving a talk about CRIU -- let us know, we'll mention it on the [wiki main page](https://criu.org/News/events); @@ -27,137 +32,54 @@ The repository may contain multiple branches. 
Development happens in the **criu- To clone CRIU repo and switch to the proper branch, run: ``` -git clone https://github.com/checkpoint-restore/criu criu -cd criu -git checkout criu-dev + git clone https://github.com/checkpoint-restore/criu criu + cd criu + git checkout criu-dev ``` -### Building from source +### Compile -Follow these steps to compile CRIU from source code. +First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. -#### Installing build dependencies - -First, you need to install the required build dependencies. We provide scripts to simplify this process for several Linux distributions in [contrib/dependencies](contrib/dependencies). For a complete list of dependencies, please refer to the [installation guide](https://criu.org/Installation). - -##### On Ubuntu/Debian-based systems: +To compile CRIU, run: ``` -./contrib/dependencies/apt-packages.sh -``` - -##### On Fedora/CentOS-based systems: - -``` -./contrib/dependencies/dnf-packages.sh -``` - -##### Using Nix: - -``` -nix develop -``` - -#### Compiling CRIU - -Once the dependencies are installed, you can compile CRIU by running the `make` command from the root of the source directory: - -``` -make + make ``` This should create the `./criu/criu` executable. 
## Edit the source code +If you use ctags, you can generate the ctags file by running + +``` + make tags +``` + When you change the source code, please keep in mind the following code conventions: -* code is written to be read, so the code readability is the most important thing you need to have in mind when preparing patches * we prefer tabs and indentations to be 8 characters width -* we prefer line length of 80 characters or less, more is allowed if it helps with code readability -* CRIU mostly follows [Linux kernel coding style](https://www.kernel.org/doc/Documentation/process/coding-style.rst), but we are less strict than the kernel community +* CRIU mostly follows [Linux kernel coding style](https://www.kernel.org/doc/Documentation/process/coding-style.rst), but we are less strict than the kernel community. -Other conventions can be learned from the source code itself. In short, make sure your new code looks similar to what is already there. - -## Automatic tools to fix coding-style - -Important: These tools are there to advise you, but should not be considered as a "source of truth", as tools also make nasty mistakes from time to time which can completely break code readability. - -The following command can be used to automatically run a code linter for Python files (ruff), Shell scripts (shellcheck), -text spelling (codespell), and a number of CRIU-specific checks (usage of print macros and EOL whitespace for C files). - -``` -make lint -``` - -In addition, we have adopted a [clang-format configuration file](https://www.kernel.org/doc/Documentation/process/clang-format.rst) -based on the kernel source tree. However, compliance with the clang-format autoformat rules is optional. If the automatic code formatting -results in decreased readability, we may choose to ignore these errors. 
- -Run the following command to check if your changes are compliant with the clang-format rules: - -``` -make indent -``` - -This command is built upon the `git-clang-format` tool and supports two options `BASE` and `OPTS`. The `BASE` option allows you to -specify a range of commits to check for coding style issues. By default, it is set to `HEAD~1`, so that only the last commit is checked. -If you are developing on top of the criu-dev branch and want to check all your commits for compliance with the clang-format rules, you -can use `BASE=origin/criu-dev`. The `OPTS` option can be used to pass additional options to `git-clang-format`. For example, if you want -to check the last *N* commits for formatting errors, without applying the changes to the codebase you can use the following command. - -``` -make indent OPTS=--diff BASE=HEAD~N -``` - -Note that for pull requests, the "Run code linter" workflow runs these checks for all commits. If a clang-format error is detected -we need to review the suggested changes and decide if they should be fixed before merging. 
- -Here are some bad examples of clang-format-ing: - -* if clang-format tries to force 120 characters and breaks readability - it is wrong: - -``` -@@ -58,8 +59,7 @@ static int register_membarriers(void) - } - - if (!all_ok) { -- fail("can't register membarrier()s - tried %#x, kernel %#x", -- barriers_registered, barriers_supported); -+ fail("can't register membarrier()s - tried %#x, kernel %#x", barriers_registered, barriers_supported); - return -1; - } -``` - -* if clang-format breaks your beautiful readability friendly alignment in structures, comments or defines - it is wrong: - -``` ---- a/test/zdtm/static/membarrier.c -+++ b/test/zdtm/static/membarrier.c -@@ -27,9 +27,10 @@ static const struct { - int register_cmd; - int execute_cmd; - } membarrier_cmds[] = { -- { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, -- { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, -- { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, -+ { "", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, MEMBARRIER_CMD_PRIVATE_EXPEDITED }, -+ { "_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, -+ MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, -+ { "_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, - }; -``` +Other conventions can be learned from the source code itself. In short, make sure your new code +looks similar to what is already there. ## Test your changes CRIU comes with an extensive test suite. To check whether your changes introduce any regressions, run ``` -make test + make test ``` The command runs [ZDTM Test Suite](https://criu.org/ZDTM_Test_Suite). Check for any error messages produced by it. +In case you'd rather have someone else run the tests, you can use travis-ci for your +own GitHub fork of CRIU. 
It will check the compilation for various supported platforms, +as well as run most of the tests from the suite. See https://travis-ci.org/checkpoint-restore/criu +for more details. + ## Describe your changes Describe your problem. Whether your change is a one-line bug fix or @@ -185,21 +107,21 @@ If your change fixes a bug in a specific commit, e.g. you found an issue using the SHA-1 ID, and the one line summary. For example: ``` -Fixes: 9433b7b9db3e ("make: use cflags/ldflags for config.h detection mechanism") + Fixes: 9433b7b9db3e ("make: use cflags/ldflags for config.h detection mechanism") ``` The following `git config` settings can be used to add a pretty format for outputting the above style in the `git log` or `git show` commands: ``` -[pretty] - fixes = Fixes: %h (\"%s\") + [pretty] + fixes = Fixes: %h (\"%s\") ``` If your change address an issue listed in GitHub, please use `Fixes:` tag with the number of the issue. For instance: ``` -Fixes: #339 + Fixes: #339 ``` The `Fixes:` tags should be put at the end of the detailed description. @@ -282,7 +204,7 @@ can certify the below: then you just add a line saying ``` -Signed-off-by: Random J Developer + Signed-off-by: Random J Developer ``` using your real name (please, no pseudonyms or anonymous contributions if @@ -294,14 +216,14 @@ commit message. To append such line to a commit you already made, use ``` From: Random J Developer -Subject: [PATCH] component: Short patch description + Subject: [PATCH] component: Short patch description -Long patch description (could be skipped if patch -is trivial enough) + Long patch description (could be skipped if patch + is trivial enough) -Signed-off-by: Random J Developer ---- -Patch body here + Signed-off-by: Random J Developer + --- + Patch body here ``` ## Submit your work upstream @@ -335,8 +257,8 @@ contains the following: revisions should be listed. 
For example: ``` -v3: rebase on the current criu-dev -v2: add commit to foo() and update bar() coding style + v3: rebase on the current criu-dev + v2: add commit to foo() and update bar() coding style ``` If there are only minor updates to the commits in a pull request, it is @@ -354,7 +276,7 @@ Historically, CRIU worked with mailing lists and patches so if you still prefer To create a patch, run ``` -git format-patch --signoff origin/criu-dev + git format-patch --signoff origin/criu-dev ``` You might need to read GIT documentation on how to prepare patches @@ -365,8 +287,8 @@ at all. We recommend to post patches using `git send-email` ``` -git send-email --cover-letter --no-chain-reply-to --annotate \ - --confirm=always --to=criu@lists.linux.dev criu-dev + git send-email --cover-letter --no-chain-reply-to --annotate \ + --confirm=always --to=criu@openvz.org criu-dev ``` Note that the `git send-email` subcommand may not be in @@ -378,14 +300,14 @@ If this is your first time using git send-email, you might need to configure it to point it to your SMTP server with something like: ``` -git config --global sendemail.smtpServer stmp.example.net + git config --global sendemail.smtpServer stmp.example.net ``` -If you get tired of typing `--to=criu@lists.linux.dev` all the time, +If you get tired of typing `--to=criu@openvz.org` all the time, you can configure that to be automatically handled as well: ``` -git config sendemail.to criu@lists.linux.dev + git config sendemail.to criu@openvz.org ``` If a developer is sending another version of the patch (e.g. to address @@ -398,7 +320,7 @@ version if needed though). ### Mail patches -The patches should be sent to CRIU development mailing list, `criu AT lists.linux.dev`. Note that you need to be subscribed first in order to post. The list web interface is available at https://lore.kernel.org/criu; you can also use standard mailman aliases to work with it. 
+The patches should be sent to CRIU development mailing list, `criu AT openvz.org`. Note that you need to be subscribed first in order to post. The list web interface is available at https://openvz.org/mailman/listinfo/criu; you can also use standard mailman aliases to work with it. Please make sure the email client you're using doesn't screw your patch (line wrapping and so on). @@ -415,3 +337,5 @@ sometimes a patch may fly around a week before it gets reviewed. Wiki article: [Continuous integration](https://criu.org/Continuous_integration) CRIU tests are run for each series sent to the mailing list. If you get a message from our patchwork that patches failed to pass the tests, you have to investigate what is wrong. + +We also recommend you to [enable Travis CI for your repo](https://criu.org/Continuous_integration#Enable_Travis_CI_for_your_repo) to check patches in your git branch, before sending them to the mailing list. diff --git a/Documentation/Makefile b/Documentation/Makefile index de0cc448d..edadfb81c 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -14,7 +14,6 @@ FOOTER := footer.txt SRC1 += crit.txt SRC1 += criu-ns.txt SRC1 += compel.txt -SRC1 += criu-amdgpu-plugin.txt SRC8 += criu.txt SRC := $(SRC1) $(SRC8) XMLS := $(patsubst %.txt,%.xml,$(SRC)) diff --git a/Documentation/compel.txt b/Documentation/compel.txt index 506228f59..6ccd20861 100644 --- a/Documentation/compel.txt +++ b/Documentation/compel.txt @@ -90,17 +90,14 @@ The parasitic code is compiled and converted to a header using *compel*, and inc *#include "parasite.h"* -Following steps are performed to infect the victim process: +Following steps are perfomed to infect the victim process: - stop the task: *int compel_stop_task(int pid);* - prepare infection handler: *struct parasite_ctl *compel_prepare(int pid);* - execute system call: *int compel_syscall(ctl, int syscall_nr, long *ret, int arg ...);* - infect victim: *int compel_infect(ctl, nr_thread, size_of_args_area);* - 
cure the victim: *int compel_cure(ctl);* //ctl pointer is freed by this call - - Resume victim: *int compel_resume_task(pid, orig_state, state)* or - *int compel_resume_task_sig(pid, orig_state, state, stop_signo).* - //compel_resume_task_sig() could be used in case when victim is in stopped state. - stop_signo could be read by calling compel_parse_stop_signo(). + - Resume victim: *int compel_resume_task(pid, orig_state, state);* *ctl* must be configured with blob information by calling *PREFIX_setup_c_header()*, with ctl as its argument. *PREFIX* is the argument given to *-p* when calling hgen, else it is deduced from file name. diff --git a/Documentation/criu-amdgpu-plugin.txt b/Documentation/criu-amdgpu-plugin.txt deleted file mode 100644 index fe76fc3bc..000000000 --- a/Documentation/criu-amdgpu-plugin.txt +++ /dev/null @@ -1,114 +0,0 @@ -ROCM Support(1) -=============== - -NAME ----- -criu-amdgpu-plugin - A plugin extension to CRIU to support checkpoint/restore in -userspace for AMD GPUs. - - -CURRENT SUPPORT ---------------- -Single and Multi GPU systems (Gfx9) -Checkpoint / Restore on different system -Checkpoint / Restore inside a docker container -Pytorch -Tensorflow -Using CRIU Image Streamer -Parallel Restore - -DESCRIPTION ------------ -Though *criu* is a great tool for checkpointing and restoring running -applications, it has certain limitations such as it cannot handle -applications that have device files open. In order to support *ROCm* based -workloads with *criu* we need to augment criu's core functionality with a -plugin based extension mechanism. *criu-amdgpu-plugin* provides the necessary support -to criu to allow Checkpoint / Restore with ROCm. - - -Dependencies ------------- -*amdkfd support*:: - In order to snapshot the *VRAM* and other *GPU* device states, we require - an updated version of amdkfd(amdgpu) driver. - -OPTIONS -------- -Optional parameters can be passed in as environment variables before -executing criu command. 
- -*KFD_FW_VER_CHECK*:: - Enable or disable firmware version check. - If enabled, firmware version on restored gpu needs to be greater than or - equal firmware version on checkpointed GPU. Default:Enabled - - E.g: - KFD_FW_VER_CHECK=0 - -*KFD_SDMA_FW_VER_CHECK*:: - Enable or disable SDMA firmware version check. - If enabled, SDMA firmware version on restored gpu needs to be greater than or - equal firmware version on checkpointed GPU. Default:Enabled - - E.g: - KFD_SDMA_FW_VER_CHECK=0 - -*KFD_CACHES_COUNT_CHECK*:: - Enable or disable caches count check. If enabled, the caches count on - restored GPU needs to be greater than or equal caches count on checkpointed - GPU. Default:Enabled - - E.g: - KFD_CACHES_COUNT_CHECK=0 - -*KFD_NUM_GWS_CHECK*:: - Enable or disable num_gws check. If enabled, the num_gws on - restored GPU needs to be greater than or equal num_gws on checkpointed - GPU. Default:Enabled - - E.g: - KFD_NUM_GWS_CHECK=0 - -*KFD_VRAM_SIZE_CHECK*:: - Enable or disable VRAM size check. If enabled, the VRAM size on - restored GPU needs to be greater than or equal VRAM size on checkpointed - GPU. Default:Enabled - - E.g: - KFD_VRAM_SIZE_CHECK=0 - -*KFD_NUMA_CHECK*:: - Enable or disable NUMA CPU region check. If enabled, the plugin will restore - GPUs that belong to one CPU NUMA region to the same CPU NUMA region. - Default:Enabled - - E.g: - KFD_NUMA_CHECK=1 - -*KFD_CAPABILITY_CHECK*:: - Enable or disable capability check. If enabled, the capability on - restored GPU needs to be equal to the capability on the checkpointed GPU. - Default:Enabled - - E.g: - KFD_CAPABILITY_CHECK=1 - -*KFD_MAX_BUFFER_SIZE*:: - On some systems, VRAM sizes may exceed RAM sizes, and so buffers for dumping - and restoring VRAM may be unable to fit. Set to a nonzero value (in bytes) - to set a limit on the plugin's memory usage. - Default:0 (Disabled) - - E.g: - KFD_MAX_BUFFER_SIZE="2G" - - -AUTHOR ------- -The AMDKFD team. 
- - -COPYRIGHT ---------- -Copyright \(C) 2020-2021, Advanced Micro Devices, Inc. (AMD) diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 0c9a9e527..690f61e14 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -106,7 +106,7 @@ be restored with *mount(src, mountpoint, flags, options)*. When used, *dump* is expected to always succeed if a mountpoint is to be auto-detected, however *restore* may fail (or do something wrong) if the assumption for restore logic is incorrect. This option is -not compatible with *--external* *dev*. +not compatable with *--external* *dev*. *--action-script* 'script':: Add an external action script to be executed at certain stages. @@ -155,17 +155,6 @@ not compatible with *--external* *dev*. notification message contains a file descriptor for the master pty - *query-ext-files*::: - called after the process tree is stopped and network is locked. - This hook is used only in the RPC mode. The notification reply - contains file ids to be added to external file list (may be empty). - -*--unprivileged*:: - This option tells *criu* to accept the limitations when running - as non-root. Running as non-root requires *criu* at least to have - *CAP_SYS_ADMIN* or *CAP_CHECKPOINT_RESTORE*. For details about running - *criu* as non-root please consult the *NON-ROOT* section. - *-V*, *--version*:: Print program version and exit. @@ -253,12 +242,6 @@ In other words, do not use it unless really needed. Tell *criu* that one end of a pair of UNIX sockets (created by *socketpair*(2)) with the given _id_ is OK to be disconnected. -*--external* **net[**__inode__**]:**__name__:: - Mark a network namespace as external and do not include it in the - checkpoint. The label 'name' can be used with *--inherit-fd* during - restore to specify a file descriptor to a preconfigured network - namespace. - *--external* **pid[**__inode__**]:**__name__:: Mark a PID namespace as external. 
This can be later used to restore a process into an existing PID namespace. The label 'name' can be @@ -345,8 +328,7 @@ mount -t cgroup -o devices,freezer none devices,freezer Checkpoint established TCP connections. *--tcp-close*:: - Don't dump the state of, or block, established tcp connections - (including the connection is once established but now closed). + Don't dump the state of, or block, established tcp connections. This is useful when tcp connections are not going to be restored. *--skip-in-flight*:: @@ -378,10 +360,6 @@ mount -t cgroup -o devices,freezer none devices,freezer Allows to link unlinked files back, if possible (modifies filesystem during *restore*). -*--timeout* 'number':: - Set a time limit in seconds for collecting tasks during the - dump operation. The timeout is 10 seconds by default. - *--ghost-limit* 'size':: Set the maximum size of deleted file to be carried inside image. By default, up to 1M file is allowed. Using this @@ -389,13 +367,6 @@ mount -t cgroup -o devices,freezer none devices,freezer 'size' may be postfixed with a *K*, *M* or *G*, which stands for kilo-, mega, and gigabytes, accordingly. -*--ghost-fiemap*:: - Enable an optimization based on fiemap ioctl that can reduce the - number of system calls used when checkpointing highly sparse ghost - files. This option is enabled by default, and it can be disabled - with *--no-ghost-fiemap*. An automatic fallback to SEEK_HOLE/SEEK_DATA - is used when fiemap is not supported. - *-j*, *--shell-job*:: Allow one to dump shell jobs. This implies the restored task will inherit session and process group ID from the *criu* itself. @@ -437,7 +408,7 @@ By default the option is set to *fpu* and *ins*. Set the method to be used to validate open files. Validation is done to ensure that the version of the file being restored is the same version when it was dumped. 
-+ + The 'mode' may be one of the following: *filesize*::: @@ -462,33 +433,6 @@ The 'mode' may be one of the following: *nftables*::: Use nftables rules to drop the packets. - *skip*::: Don't lock the network. If *--tcp-close* is not used, the network - must be locked externally to allow CRIU to dump TCP connections. - -*--allow-uprobes*:: - Allow dumping when uprobes vma is present. When used on dump, this option is - required on restore as well. - - A uprobes vma is automatically created by the kernel once a uprobe is - triggered. This mapping is not removed even once the uprobe is deleted. So, - even if a process once had uprobes attached to it, and they're removed by - the time the process is dumped, this option is still required because criu - has no way of knowing whether there are active uprobes or not. - - When using this option on restore, make sure the uprobes (if any) active on - the dumped processes are still active. Otherwise, when execution reaches - a uprobe'd location in any of the restored processes, that process will be - sent a SIGTRAP. - - As an example, say a uprobe is set at function foo in the executable of the - process p_bar. Whenever execution in p_bar reaches function foo, the uprobe - is triggered. If the uprobe has been triggered at least once, then the kernel - will have created the uprobes vma. To dump p_bar, this option is - necessary. After dumping, say the uprobe is deleted. Now, on restoring with - this option, once execution reaches function foo, SIGTRAP will be sent to - the restored p_bar. Unless it has a signal handler installed for SIGTRAP, - it will be terminated and core dumped. - *restore* ~~~~~~~~~ Restores previously checkpointed processes. @@ -502,8 +446,8 @@ Restores previously checkpointed processes. 
The 'resource' argument can be one of the following: + - **tty[**__rdev__**:**__dev__**]** - - **pipe:[**__inode__**]** - - **socket:[**__inode__*]* + - **pipe[**__inode__**]** + - **socket[**__inode__*]* - **file[**__mnt_id__**:**__inode__**]** - 'path/to/file' @@ -577,7 +521,7 @@ usually need to be escaped from shell. Restore cgroups configuration associated with a task from the image. Controllers are always restored in an optimistic way -- if already present in system, *criu* reuses it, otherwise it will be created. -+ + The 'mode' may be one of the following: *none*::: Do not restore cgroup properties but require cgroup to @@ -701,7 +645,7 @@ are not adequate, but this can be suppressed by using *--cpu-cap=none*. Set the method to be used to validate open files. Validation is done to ensure that the version of the file being restored is the same version when it was dumped. -+ + The 'mode' may be one of the following: *filesize*::: @@ -713,13 +657,6 @@ The 'mode' may be one of the following: build-ID cannot be obtained, 'chksm-first' method will be used. This is the default if mode is unspecified. -*--skip-file-rwx-check*:: - Skip checking file permissions (r/w/x for u/g/o) on restore. - -*--allow-uprobes*:: - Required when dumped with this option. Refer to this option in the section - on dumping for more details. - *check* ~~~~~~~ Checks whether the kernel supports the features needed by *criu* to @@ -926,42 +863,6 @@ configuration file will overwrite all other configuration file settings or RPC options. *This can lead to undesired behavior of criu and should only be used carefully.* -NON-ROOT --------- -*criu* can be used as non-root with either the *CAP_SYS_ADMIN* capability -or with the *CAP_CHECKPOINT_RESTORE* capability introduces in Linux kernel 5.9. -*CAP_CHECKPOINT_RESTORE* is the minimum that is required. 
- -*criu* also needs either *CAP_SYS_PTRACE* or a value of 0 in -*/proc/sys/kernel/yama/ptrace_scope* (see *ptrace*(2)) to be able to interrupt -the process for dumping. - -Running *criu* as non-root has many limitations and depending on the process -to checkpoint and restore it may not be possible. - -In addition to *CAP_CHECKPOINT_RESTORE* it is possible to give *criu* additional -capabilities to enable additional features in non-root mode. - -Currently *criu* can benefit from the following additional capabilities: - - - *CAP_NET_ADMIN* - - *CAP_SYS_CHROOT* - - *CAP_SETUID* - - *CAP_SYS_RESOURCE* - -Note that for some operations, having a capability in a namespace other than -the init namespace (i.e. the default/root namespace) is not sufficient. For -example, in order to read symlinks in proc/[pid]/map_files CRIU requires -CAP_CHECKPOINT_RESTORE in the init namespace; having CAP_CHECKPOINT_RESTORE -while running in another user namespace (e.g. in a container) does not allow -CRIU to read symlinks in /proc/[pid]/map_files. - -Without access to /proc/[pid]/map_files checkpointing/restoring processes -that have mapped deleted files may not be possible. - -Independent of the capabilities it is always necessary to use "*--unprivileged*" to -accept *criu*'s limitation in non-root mode. - EXAMPLES -------- To checkpoint a program with pid of *1234* and write all image files into diff --git a/Documentation/logo.svg b/Documentation/logo.svg deleted file mode 100644 index f713e72b7..000000000 --- a/Documentation/logo.svg +++ /dev/null @@ -1,136 +0,0 @@ - - - - - - - diff --git a/GEMINI.md b/GEMINI.md deleted file mode 100644 index e56c1de12..000000000 --- a/GEMINI.md +++ /dev/null @@ -1,136 +0,0 @@ -# CRIU (Checkpoint/Restore In User-space) - -CRIU is a tool for saving the state of a running application to a set of files -(checkpointing) and restoring it back to a live state. 
It is primarily used for -live migration of containers, in-place updates, and fast application startup. - -It is implemented as a command-line tool called `criu`. The two primary commands -are `dump` and `restore`. - -- `dump`: Saves a process tree and all its related resources (file - descriptors, IPC, sockets, namespaces, etc.) into a collection of image - files. -- `restore`: Restores processes from image files to the same state they were - in before the dump. - -## Quick Start - -To get a feel for `criu`, you can try checkpointing and restoring a simple -process. - -1. **Run a simple process:** - Open a terminal and run a command that will run for a while. Find its PID. - ```bash - sleep 1000 & - [1] 12345 - ``` - -2. **Dump the process:** - As root, use `criu dump` with the process ID (`-t`) and a directory for the - image files (`-D`). - ```bash - sudo criu dump -t 12345 -D /tmp/sleep_images -v4 --shell-job - ``` - The `sleep` process will no longer be running. - -3. **Restore the process:** - Use `criu restore` to bring the process back to life from the images. - ```bash - sudo criu restore -D /tmp/sleep_images -v4 --shell-job - ``` - The `sleep` process will be running again as if nothing happened. - -# For Developers and Contributors - -This section contains more technical details about CRIU's internals and -development process. - -## Dump Process - -On dump, CRIU uses available kernel interfaces to collect information about -processes. For properties that can only be retrieved from within the process -itself, CRIU injects a binary blob (called a "parasite") into the process's -address space and executes it in the context of one of the process's threads. -This injection is handled by a subproject called **Compel**. - -## Restore Process - -On restore, CRIU reads the image files to reconstruct the processes. The goal is -to restore them to the exact state they were in before the dump. 
The restore -process is divided into several stages (defined as `CR_STATE_*` in -`./criu/include/restorer.h`). - -The main `criu` process acts as a coordinator. It first restores resources with -inter-process dependencies (file descriptors, sockets, shared memory, -namespaces, etc.). It then forks the process tree and sets up namespaces. -Finally, it restores process-specific resources like file descriptors and memory -mappings. - -A key step involves a small, self-contained binary called the "restorer". All -restored processes switch to executing this code, which unmaps the CRIU-specific -memory and restores the application's original memory mappings. On the final -step, the restorer calls `sigreturn` on a prepared signal frame to resume the -process with the state it had at the moment of the dump. - -## Compel - -Compel is a subproject responsible for generating the binary blobs used for the -parasite code (for dumping) and the restorer code (for restoring). It provides a -library for injecting and executing this code within the target process's -address space. It is a separate project because the logic for generating and -injecting Position-Independent Executable (PIE) code is complex and -self-contained. - -## Coding Style - -The C code in the CRIU project follows the -[Linux Kernel Coding Style](https://www.kernel.org/doc/html/latest/process/coding-style.html). -Here are some of the main points: - -- **Indentation**: Use tabs, which are set to 8 characters. -- **Line Length**: The preferred line limit is 80 characters, but it can be - extended to 120 if it improves code readability. -- **Braces**: - - The opening brace for a function goes on a new line. - - The opening brace for a block (like `if`, `for`, `while`, `switch`) goes - on the same line. -- **Spaces**: Use spaces around operators (`+`, `-`, `*`, `/`, `%`, `<`, `>`, - `=`, etc.). -- **Naming**: Use descriptive names for functions and variables. -- **Comments**: Use C-style comments (`/* ... */`). 
For multi-line comments, - the preferred format is: - ```c - /* - * This is a multi-line - * comment. - */ - ``` - -## Code Layout - -The code is organized into the following directories: - -- `./compel`: The Compel sub-project. -- `./criu`: The main `criu` tool source code. -- `./images`: Protobuf descriptions for the image files. -- `./test`: All tests. -- `./test/zdtm`: The Zero-Downtime Migration (ZDTM) test suite. -- `./test/zdtm.py`: The executor script for ZDTM tests. -- `./scripts`: Helper scripts. -- `./scripts/build`: Docker image files used for CI and cross-compilation - checks. -- `./crit`: A tool to inspect and manipulate CRIU image files. -- `./soccr`: A library for TCP socket checkpoint/restore. - -## Tests - -The main test suite is ZDTM. Here is an example of how to run a single test: - -```bash -sudo ./test/zdtm.py run -t zdtm/static/env00 -``` - -Each ZDTM test has three stages: preparation, C/R, and results checks. During -the test, a process calls `test_daemon()` to signal it is ready for C/R, then -calls `test_waitsig()` to wait for the C/R stage to complete. After being -restored, the test checks that all its resources are still in a valid state. diff --git a/INSTALL.md b/INSTALL.md index af0702518..d786d06eb 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,31 +1,11 @@ -## Building CRIU from source code - -First, you need to install compile-time dependencies. Check [Installation dependencies](https://criu.org/Installation#Dependencies) for more info. - -To compile CRIU, run: -``` -make -``` -This should create the `./criu/criu` executable. - -To change the default behaviour of CRIU, the following variables can be passed -to the make command: - - * **NETWORK_LOCK_DEFAULT**, can be set to one of the following - values: `NETWORK_LOCK_IPTABLES`, `NETWORK_LOCK_NFTABLES`, - `NETWORK_LOCK_SKIP`. CRIU defaults to `NETWORK_LOCK_IPTABLES` - if nothing is specified. 
If another network locking backend is - needed, `make` can be called like this: - `make NETWORK_LOCK_DEFAULT=NETWORK_LOCK_NFTABLES` - ## Installing CRIU from source code Once CRIU is built one can easily setup the complete CRIU package (which includes executable itself, CRIT tool, libraries, manual and etc) simply typing -``` -make install -``` + + make install + this command accepts the following variables: * **DESTDIR**, to specify global root where all components will be placed under (empty by default); @@ -36,17 +16,17 @@ this command accepts the following variables: * **LIBDIR**, to specify directory where to put libraries (guess the correct path by default). Thus one can type -``` -make DESTDIR=/some/new/place install -``` + + make DESTDIR=/some/new/place install + and get everything installed under `/some/new/place`. ## Uninstalling CRIU To clean up previously installed CRIU instance one can type -``` -make uninstall -``` + + make uninstall + and everything should be removed. Note though that if some variable (**DESTDIR**, **BINDIR** and such) has been used during installation procedure, the same *must* be passed with uninstall action. diff --git a/MAINTAINERS b/MAINTAINERS index 8fee8e571..bb153f1ab 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4,5 +4,3 @@ Mike Rapoport Dmitry Safonov <0x7f454c46@gmail.com> Adrian Reber Pavel Tikhomirov -Radostin Stoyanov -Alexander Mikhalitsyn diff --git a/MAINTAINERS_GUIDE.md b/MAINTAINERS_GUIDE.md index 5de8e6cb6..2830a3caa 100644 --- a/MAINTAINERS_GUIDE.md +++ b/MAINTAINERS_GUIDE.md @@ -103,7 +103,7 @@ architecture changes should be reviewed by the chief maintainer. Also the chief maintainer has the veto power on any change submitted to any branch. Naturally, a change in the criu-dev branch can be reverted after a chief maintainer veto, a change in the master branch must be -carefully reviewed by the chief maintainer and vetoed in advance. +carefully reviwed by the chief maintainer and vetoed in advance. 
### How are maintainers added (and removed)? diff --git a/Makefile b/Makefile index e26807158..17e40bbf4 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ endif # # Supported Architectures -ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips loongarch64 riscv64,$(ARCH)),) +ifneq ($(filter-out x86 arm aarch64 ppc64 s390 mips,$(ARCH)),) $(error "The architecture $(ARCH) isn't supported") endif @@ -35,18 +35,18 @@ ifeq ($(ARCH),arm) ARMV := $(shell echo $(SUBARCH) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') ifeq ($(ARMV),6) - ARCHCFLAGS += -march=armv6 + USERCFLAGS += -march=armv6 endif ifeq ($(ARMV),7) - ARCHCFLAGS += -march=armv7-a+fp + USERCFLAGS += -march=armv7-a endif ifeq ($(ARMV),8) - # Running 'setarch linux32 uname -m' returns armv8l on aarch64. + # Running 'setarch linux32 uname -m' returns armv8l on travis aarch64. # This tells CRIU to handle armv8l just as armv7hf. Right now this is # only used for compile testing. No further verification of armv8l exists. - ARCHCFLAGS += -march=armv7-a + USERCFLAGS += -march=armv7-a ARMV := 7 endif @@ -64,8 +64,6 @@ endif ifeq ($(ARCH),aarch64) DEFINES := -DCONFIG_AARCH64 - CC_MBRANCH_PROT := $(shell $(CC) -c -x c /dev/null -mbranch-protection=none -o /dev/null >/dev/null 2>&1 && echo "-mbranch-protection=none") - CFLAGS_PIE := $(CC_MBRANCH_PROT) endif ifeq ($(ARCH),ppc64) @@ -82,14 +80,6 @@ ifeq ($(ARCH),mips) DEFINES := -DCONFIG_MIPS endif -ifeq ($(ARCH),loongarch64) - DEFINES := -DCONFIG_LOONGARCH64 -endif - -ifeq ($(ARCH),riscv64) - DEFINES := -DCONFIG_RISCV64 -endif - # # CFLAGS_PIE: # @@ -112,20 +102,10 @@ export PROTOUFIX DEFINES # # Independent options for all tools. DEFINES += -D_FILE_OFFSET_BITS=64 -DEFINES += -D_LARGEFILE64_SOURCE DEFINES += -D_GNU_SOURCE WARNINGS := -Wall -Wformat-security -Wdeclaration-after-statement -Wstrict-prototypes -# -Wdangling-pointer results in false warning when we add a list element to -# local list head variable. 
It is false positive because before leaving the -# function we always check that local list head variable is empty, thus -# insuring that pointer to it is not dangling anywhere, but gcc can't -# understand it. -# Note: There is similar problem with kernel list, where this warning is also -# disabled: https://github.com/torvalds/linux/commit/49beadbd47c2 -WARNINGS += -Wno-dangling-pointer -Wno-unknown-warning-option - CFLAGS-GCOV := --coverage -fno-exceptions -fno-inline -fprofile-update=atomic export CFLAGS-GCOV @@ -133,19 +113,11 @@ ifeq ($(ARCH),mips) WARNINGS := -rdynamic endif -ifeq ($(ARCH),loongarch64) -WARNINGS += -Wno-implicit-function-declaration -endif - ifneq ($(GCOV),) LDFLAGS += -lgcov CFLAGS += $(CFLAGS-GCOV) endif -ifneq ($(NETWORK_LOCK_DEFAULT),) - CFLAGS += -DNETWORK_LOCK_DEFAULT=$(NETWORK_LOCK_DEFAULT) -endif - ifeq ($(ASAN),1) CFLAGS-ASAN := -fsanitize=address export CFLAGS-ASAN @@ -170,12 +142,12 @@ export GMON GMONLDOPT endif AFLAGS += -D__ASSEMBLY__ -CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) $(WARNINGS) $(DEFINES) -iquote include/ +CFLAGS += $(USERCFLAGS) $(WARNINGS) $(DEFINES) -iquote include/ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/ export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS # Default target -all: criu lib crit cuda_plugin +all: criu lib crit .PHONY: all # @@ -278,19 +250,26 @@ criu: $(criu-deps) $(Q) $(MAKE) $(build)=criu all .PHONY: criu +crit/Makefile: ; +crit/%: criu .FORCE + $(Q) $(MAKE) $(build)=crit $@ +crit: criu + $(Q) $(MAKE) $(build)=crit all +.PHONY: crit + unittest: $(criu-deps) $(Q) $(MAKE) $(build)=criu unittest .PHONY: unittest # -# Libraries next once criu is ready +# Libraries next once crit it ready # (we might generate headers and such # when building criu itself). 
lib/Makefile: ; -lib/%: criu .FORCE +lib/%: crit .FORCE $(Q) $(MAKE) $(build)=lib $@ -lib: criu +lib: crit $(Q) $(MAKE) $(build)=lib all .PHONY: lib @@ -299,28 +278,21 @@ clean mrproper: $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=soccr $@ $(Q) $(MAKE) $(build)=lib $@ - $(Q) $(MAKE) $(build)=crit $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ + $(Q) $(MAKE) $(build)=lib $@ + $(Q) $(MAKE) $(build)=crit $@ .PHONY: clean mrproper -clean-amdgpu_plugin: - $(Q) $(MAKE) -C plugins/amdgpu clean -.PHONY: clean-amdgpu_plugin - -clean-cuda_plugin: - $(Q) $(MAKE) -C plugins/cuda clean -.PHONY: clean-cuda_plugin - clean-top: $(Q) $(MAKE) -C Documentation clean $(Q) $(MAKE) $(build)=test/compel clean $(Q) $(RM) .gitid .PHONY: clean-top -clean: clean-top clean-amdgpu_plugin clean-cuda_plugin +clean: clean-top -mrproper-top: clean-top clean-amdgpu_plugin clean-cuda_plugin +mrproper-top: clean-top $(Q) $(RM) $(CONFIG_HEADER) $(Q) $(RM) $(VERSION_HEADER) $(Q) $(RM) $(COMPEL_VERSION_HEADER) @@ -348,18 +320,6 @@ test: zdtm $(Q) $(MAKE) -C test .PHONY: test -amdgpu_plugin: criu - $(Q) $(MAKE) -C plugins/amdgpu all -.PHONY: amdgpu_plugin - -cuda_plugin: criu - $(Q) $(MAKE) -C plugins/cuda all -.PHONY: cuda_plugin - -crit: lib - $(Q) $(MAKE) -C crit -.PHONY: crit - # # Generating tar requires tag matched CRIU_VERSION. 
# If not found then simply use GIT's describe with @@ -425,7 +385,6 @@ help: @echo ' Targets:' @echo ' all - Build all [*] targets' @echo ' * criu - Build criu' - @echo ' * crit - Build crit' @echo ' zdtm - Build zdtm test-suite' @echo ' docs - Build documentation' @echo ' install - Install CRIU (see INSTALL.md)' @@ -441,72 +400,43 @@ help: @echo ' unittest - Run unit tests' @echo ' lint - Run code linters' @echo ' indent - Indent C code' - @echo ' amdgpu_plugin - Make AMD GPU plugin' - @echo ' cuda_plugin - Make NVIDIA CUDA plugin' .PHONY: help -ruff: - @ruff --version - ruff check ${RUFF_FLAGS} --config=scripts/ruff.toml \ - test/zdtm.py \ - test/inhfd/*.py \ - test/others/rpc/config_file.py \ - test/others/action-script/check_actions.py \ - test/others/pycriu/*.py \ - lib/pycriu/criu.py \ - lib/pycriu/__init__.py \ - lib/pycriu/images/pb2dict.py \ - lib/pycriu/images/images.py \ - scripts/criu-ns \ - test/others/criu-ns/run.py \ - crit/*.py \ - crit/crit/*.py \ - scripts/uninstall_module.py \ - coredump/ coredump/coredump \ - scripts/github-indent-warnings.py - -shellcheck: +lint: + flake8 --version + flake8 --config=scripts/flake8.cfg test/zdtm.py + flake8 --config=scripts/flake8.cfg test/inhfd/*.py + flake8 --config=scripts/flake8.cfg test/others/rpc/config_file.py + flake8 --config=scripts/flake8.cfg lib/py/images/pb2dict.py + flake8 --config=scripts/flake8.cfg scripts/criu-ns shellcheck --version shellcheck scripts/*.sh - shellcheck scripts/ci/*.sh - shellcheck contrib/apt-install contrib/dependencies/*.sh - shellcheck -x test/others/crit/*.sh - shellcheck -x test/others/libcriu/*.sh - shellcheck -x test/others/crit/*.sh test/others/criu-coredump/*.sh - shellcheck -x test/others/config-file/*.sh - shellcheck -x test/others/action-script/*.sh - -codespell: - codespell - -lint: ruff shellcheck codespell - # Do not append \n to pr_perror, pr_pwarn or fail - ! 
git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>.*\\n"' - # Do not use %m with pr_* or fail - ! git --no-pager grep -E '^\s*\<(pr_(err|perror|warn|pwarn|debug|info|msg)|fail)\>.*%m' - # Do not use errno with pr_perror, pr_pwarn or fail - ! git --no-pager grep -E '^\s*\<(pr_perror|pr_pwarn|fail)\>\(".*".*errno' + shellcheck scripts/ci/*.sh scripts/ci/apt-install + shellcheck test/others/crit/*.sh + shellcheck test/others/config-file/*.sh + # Do not append \n to pr_perror or fail + ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>.*\\n"' + # Do not use %m with pr_perror or fail + ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>.*%m' + # Do not use errno with pr_perror or fail + ! git --no-pager grep -E '^\s*\<(pr_perror|fail)\>\(".*".*errno' # End pr_(err|warn|msg|info|debug) with \n ! git --no-pager grep -En '^\s*\.*);$$' | grep -v '\\n' # No EOL whitespace for C files ! git --no-pager grep -E '\s+$$' \*.c \*.h -.PHONY: lint ruff shellcheck codespell +.PHONY: lint -codecov: SHELL := $(shell command -v bash) +codecov: SHELL := $(shell which bash) codecov: - curl -Os https://uploader.codecov.io/latest/linux/codecov - chmod +x codecov - ./codecov + bash <(curl -s https://codecov.io/bash) .PHONY: codecov fetch-clang-format: .FORCE $(E) ".clang-format" $(Q) scripts/fetch-clang-format.sh -BASE ?= "HEAD~1" -OPTS ?= "--quiet" indent: - git clang-format --style file --extensions c,h $(OPTS) $(BASE) + find . -name '*.[ch]' -type f -print0 | xargs --null --max-args 128 --max-procs 4 clang-format -i .PHONY: indent include Makefile.install diff --git a/Makefile.compel b/Makefile.compel index a4209edc5..764afadc8 100644 --- a/Makefile.compel +++ b/Makefile.compel @@ -50,8 +50,8 @@ compel/plugins/%: $(compel-deps) .FORCE # # GNU make 4.x supports targets matching via wide -# match targeting, where GNU make 3.x series is not, -# so we have to write them here explicitly. 
+# match targeting, where GNU make 3.x series (used on +# Travis) is not, so we have to write them here explicitly. compel/plugins/std.lib.a: $(compel-deps) .FORCE $(Q) $(MAKE) $(build)=compel/plugins $@ diff --git a/Makefile.config b/Makefile.config index 5cf4b8216..6e3e1b062 100644 --- a/Makefile.config +++ b/Makefile.config @@ -2,15 +2,12 @@ include $(__nmk_dir)utils.mk include $(__nmk_dir)msg.mk include scripts/feature-tests.mak -# This is a kludge for $(info ...) to not eat spaces. -S := - ifeq ($(call try-cc,$(FEATURE_TEST_LIBBSD_DEV),-lbsd),true) LIBS_FEATURES += -lbsd FEATURE_DEFINES += -DCONFIG_HAS_LIBBSD else - $(info Note: Building without setproctitle() support.) - $(info $S Install libbsd-devel (RPM) / libbsd-dev (DEB) to fix.) + $(info Note: Building without setproctitle() and strlcpy() support.) + $(info $(info) To enable these features, please install libbsd-devel (RPM) / libbsd-dev (DEB).) endif ifeq ($(call pkg-config-check,libselinux),y) @@ -24,21 +21,12 @@ ifeq ($(call pkg-config-check,libbpf),y) export CONFIG_HAS_LIBBPF := y endif -ifeq ($(call pkg-config-check,libdrm),y) - export CONFIG_AMDGPU := y - $(info Note: Building with amdgpu_plugin.) -else - $(info Note: Building without amdgpu_plugin.) - $(info $S Install libdrm-devel (RPM) or libdrm-dev (DEB) to fix.) -endif - ifeq ($(NO_GNUTLS)x$(call pkg-config-check,gnutls),xy) LIBS_FEATURES += -lgnutls export CONFIG_GNUTLS := y FEATURE_DEFINES += -DCONFIG_GNUTLS else - $(info Note: Building without GnuTLS support.) - $(info $S Install gnutls-devel (RPM) or gnutls-dev (DEB) to fix.) + $(info Note: Building without GnuTLS support) endif ifeq ($(call pkg-config-check,libnftables),y) @@ -50,19 +38,16 @@ ifeq ($(call pkg-config-check,libnftables),y) LIBS_FEATURES += $(LIB_NFTABLES) FEATURE_DEFINES += -DCONFIG_HAS_NFTABLES_LIB_API_1 else - $(info Warn: Building without nftables support (incompatible API version).) 
+ $(warning Warn: you have libnftables installed but it has incompatible API) + $(warning Warn: Building without nftables support) endif else - $(info Warn: Building without nftables support.) - $(info $S Install nftables-devel (RPM) or libnftables-dev (DEB) to fix.) + $(warning Warn: you have no libnftables installed) + $(warning Warn: Building without nftables support) endif export LIBS += $(LIBS_FEATURES) -ifneq ($(PLUGINDIR),) - FEATURE_DEFINES += -DCR_PLUGIN_DEFAULT="\"$(PLUGINDIR)\"" -endif - CONFIG_FILE = .config $(CONFIG_FILE): @@ -74,26 +59,24 @@ ifeq ($(call try-asm,$(FEATURE_TEST_X86_COMPAT)),true) export CONFIG_COMPAT := y FEATURE_DEFINES += -DCONFIG_COMPAT else - $(info Note: Building without ia32 C/R, missing ia32 support in gcc.) - $(info $S It may be related to missing gcc-multilib in your) - $(info $S distribution, or you may have Debian with buggy toolchain.) - $(info $S See https://github.com/checkpoint-restore/criu/issues/315.) + $(info Note: Building without ia32 C/R, missed ia32 support in gcc) + $(info $(info) That may be related to missing gcc-multilib in your) + $(info $(info) distribution or you may have Debian with buggy toolchain) + $(info $(info) (issue https://github.com/checkpoint-restore/criu/issues/315)) endif endif export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) -FEATURES_LIST := TCP_REPAIR PTRACE_PEEKSIGINFO \ - SETPROCTITLE_INIT TCP_REPAIR_WINDOW MEMFD_CREATE \ - OPENAT2 NO_LIBC_RSEQ_DEFS +FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ + SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW FSCONFIG MEMFD_CREATE # $1 - config name define gen-feature-test ifeq ($$(call try-cc,$$(FEATURE_TEST_$(1)),$$(LIBS_FEATURES),$$(DEFINES)),true) $(Q) echo '#define CONFIG_HAS_$(1)' >> $$@ -else - $(Q) echo '// CONFIG_HAS_$(1) is not set' >> $$@ + $(Q) echo '' >> $$@ endif endef diff --git a/Makefile.install b/Makefile.install index 70c607ec6..3987bcc6f 100644 --- a/Makefile.install +++ b/Makefile.install @@ -7,7 
+7,6 @@ MANDIR ?= $(PREFIX)/share/man INCLUDEDIR ?= $(PREFIX)/include LIBEXECDIR ?= $(PREFIX)/libexec RUNDIR ?= /run -PLUGINDIR ?= $(PREFIX)/lib/criu # # For recent Debian/Ubuntu with multiarch support. @@ -27,34 +26,7 @@ endif LIBDIR ?= $(PREFIX)/lib export PREFIX BINDIR SBINDIR MANDIR RUNDIR -export LIBDIR INCLUDEDIR LIBEXECDIR PLUGINDIR - -# Detect externally managed Python environment (PEP 668). -PYTHON_EXTERNALLY_MANAGED := $(shell $(PYTHON) -c 'import os, sysconfig; print(int(os.path.isfile(os.path.join(sysconfig.get_path("stdlib"), "EXTERNALLY-MANAGED"))))') -PIP_BREAK_SYSTEM_PACKAGES ?= 0 - -# If Python environment is externally managed and PIP_BREAK_SYSTEM_PACKAGES is not set, skip pip install. -SKIP_PIP_INSTALL := 0 -ifeq ($(PYTHON_EXTERNALLY_MANAGED),1) -ifeq ($(PIP_BREAK_SYSTEM_PACKAGES),0) - -SKIP_PIP_INSTALL := 1 -$(info Warn: Externally managed python environment) -$(info Consider using PIP_BREAK_SYSTEM_PACKAGES=1) - -endif -endif - -# Default flags for pip install: -# --ignore-installed: Overwrite already installed pycriu/crit packages -# --no-build-isolation: Use current Python environment to build pycriu/crit packages -# --no-deps: Don't install any dependencies -# --no-index: Don't use PyPI index to find packages -# --progress-bar: Cleaner output -# --upgrade: Treat the install as an upgrade when replacing the installed version -PIPFLAGS ?= --ignore-installed --no-build-isolation --no-deps --no-index --progress-bar off --upgrade - -export SKIP_PIP_INSTALL PIPFLAGS +export LIBDIR INCLUDEDIR LIBEXECDIR install-man: $(Q) $(MAKE) -C Documentation install @@ -64,37 +36,22 @@ install-lib: lib $(Q) $(MAKE) $(build)=lib install .PHONY: install-lib -install-crit: lib - $(Q) $(MAKE) $(build)=crit install -.PHONY: install-crit - install-criu: criu $(Q) $(MAKE) $(build)=criu install .PHONY: install-criu -install-amdgpu_plugin: amdgpu_plugin - $(Q) $(MAKE) -C plugins/amdgpu install -.PHONY: install-amdgpu_plugin - -install-cuda_plugin: cuda_plugin - $(Q) 
$(MAKE) -C plugins/cuda install -.PHONY: install-cuda_plugin - install-compel: $(compel-install-targets) $(Q) $(MAKE) $(build)=compel install $(Q) $(MAKE) $(build)=compel/plugins install .PHONY: install-compel -install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin install-cuda_plugin ; +install: install-man install-lib install-criu install-compel ; .PHONY: install uninstall: $(Q) $(MAKE) -C Documentation $@ $(Q) $(MAKE) $(build)=lib $@ - $(Q) $(MAKE) $(build)=crit $@ $(Q) $(MAKE) $(build)=criu $@ $(Q) $(MAKE) $(build)=compel $@ $(Q) $(MAKE) $(build)=compel/plugins $@ - $(Q) $(MAKE) -C plugins/amdgpu $@ - $(Q) $(MAKE) -C plugins/cuda $@ .PHONY: uninstall diff --git a/Makefile.versions b/Makefile.versions index 3e6c9ed22..32f2e95ce 100644 --- a/Makefile.versions +++ b/Makefile.versions @@ -1,10 +1,10 @@ # # CRIU version. -CRIU_VERSION_MAJOR := 4 -CRIU_VERSION_MINOR := 2 +CRIU_VERSION_MAJOR := 3 +CRIU_VERSION_MINOR := 16 CRIU_VERSION_SUBLEVEL := CRIU_VERSION_EXTRA := -CRIU_VERSION_NAME := CRIUTIBILITY +CRIU_VERSION_NAME := Petrified Puffin CRIU_VERSION := $(CRIU_VERSION_MAJOR)$(if $(CRIU_VERSION_MINOR),.$(CRIU_VERSION_MINOR))$(if $(CRIU_VERSION_SUBLEVEL),.$(CRIU_VERSION_SUBLEVEL))$(if $(CRIU_VERSION_EXTRA),.$(CRIU_VERSION_EXTRA)) export CRIU_VERSION_MAJOR CRIU_VERSION_MINOR CRIU_VERSION_SUBLEVEL diff --git a/README.md b/README.md index 6e2a0de9e..fd86b2c15 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,8 @@ -[![X86_64 GCC Test](https://github.com/checkpoint-restore/criu/workflows/X86_64%20GCC%20Test/badge.svg)]( - https://github.com/checkpoint-restore/criu/actions/workflows/x86-64-gcc-test.yml) -[![Docker Test](https://github.com/checkpoint-restore/criu/actions/workflows/docker-test.yml/badge.svg)]( - https://github.com/checkpoint-restore/criu/actions/workflows/docker-test.yml) -[![Podman Test](https://github.com/checkpoint-restore/criu/actions/workflows/podman-test.yml/badge.svg)]( - 
https://github.com/checkpoint-restore/criu/actions/workflows/podman-test.yml) -[![CircleCI](https://circleci.com/gh/checkpoint-restore/criu.svg?style=svg)]( - https://circleci.com/gh/checkpoint-restore/criu) +![X86_64 GCC Test](https://github.com/checkpoint-restore/criu/workflows/X86_64%20GCC%20Test/badge.svg) +![Podman Test](https://github.com/checkpoint-restore/criu/workflows/Podman%20Test/badge.svg) +[![CircleCI](https://circleci.com/gh/checkpoint-restore/criu.svg?style=svg)](https://circleci.com/gh/checkpoint-restore/criu) -

+

## CRIU -- A project to implement checkpoint/restore functionality for Linux @@ -35,10 +30,10 @@ Pages worth starting with are: - [Installation instructions](http://criu.org/Installation) - [A simple example of usage](http://criu.org/Simple_loop) - [Examples of more advanced usage](https://criu.org/Category:HOWTO) -- Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/index.php?title=FAQ) +- Troubleshooting can be hard, some help can be found [here](https://criu.org/When_C/R_fails), [here](https://criu.org/What_cannot_be_checkpointed) and [here](https://criu.org/FAQ) ### Checkpoint and restore of simple loop process -

+[

](https://asciinema.org/a/232445) ## Advanced features diff --git a/compel/.gitignore b/compel/.gitignore index 5e770a86c..eab3337d6 100644 --- a/compel/.gitignore +++ b/compel/.gitignore @@ -4,9 +4,6 @@ arch/arm/plugins/std/syscalls/syscalls.S arch/aarch64/plugins/std/syscalls/syscalls.S arch/s390/plugins/std/syscalls/syscalls.S arch/ppc64/plugins/std/syscalls/syscalls.S -arch/mips/plugins/std/syscalls/syscalls-64.S -arch/loongarch64/plugins/std/syscalls/syscalls-64.S -arch/riscv64/plugins/std/syscalls/syscalls.S include/version.h plugins/include/uapi/std/asm/syscall-types.h plugins/include/uapi/std/syscall-64.h diff --git a/compel/Makefile b/compel/Makefile index c0b8a82a0..b79aee687 100644 --- a/compel/Makefile +++ b/compel/Makefile @@ -32,8 +32,8 @@ ifeq ($(ARCH),x86) lib-y += arch/$(ARCH)/src/lib/thread_area.o endif -# handle_elf() has no support of ELF relocations on ARM and RISCV64 (yet?) -ifneq ($(filter arm aarch64 loongarch64 riscv64,$(ARCH)),) +# handle_elf() has no support of ELF relocations on ARM (yet?) 
+ifneq ($(filter arm aarch64,$(ARCH)),) CFLAGS += -DNO_RELOCS HOSTCFLAGS += -DNO_RELOCS endif diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h index 8a61b268f..5f090490d 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/breakpoints.h @@ -2,41 +2,14 @@ #define __COMPEL_BREAKPOINTS_H__ #define ARCH_SI_TRAP TRAP_BRKPT -#include -#include +static inline int ptrace_set_breakpoint(pid_t pid, void *addr) +{ + return 0; +} -struct hwbp_cap { - char arch; - char bp_count; -}; - -/* copied from `linux/arch/arm64/include/asm/hw_breakpoint.h` */ -/* Lengths */ -#define ARM_BREAKPOINT_LEN_1 0x1 -#define ARM_BREAKPOINT_LEN_2 0x3 -#define ARM_BREAKPOINT_LEN_3 0x7 -#define ARM_BREAKPOINT_LEN_4 0xf -#define ARM_BREAKPOINT_LEN_5 0x1f -#define ARM_BREAKPOINT_LEN_6 0x3f -#define ARM_BREAKPOINT_LEN_7 0x7f -#define ARM_BREAKPOINT_LEN_8 0xff - -/* Privilege Levels */ -#define AARCH64_BREAKPOINT_EL1 1 -#define AARCH64_BREAKPOINT_EL0 2 - -/* Breakpoint */ -#define ARM_BREAKPOINT_EXECUTE 0 - -/* Watchpoints */ -#define ARM_BREAKPOINT_LOAD 1 -#define ARM_BREAKPOINT_STORE 2 -#define AARCH64_ESR_ACCESS_MASK (1 << 6) - -#define DISABLE_HBP 0 -#define ENABLE_HBP 1 - -int ptrace_set_breakpoint(pid_t pid, void *addr); -int ptrace_flush_breakpoints(pid_t pid); +static inline int ptrace_flush_breakpoints(pid_t pid) +{ + return 0; +} #endif diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/gcs-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/gcs-types.h deleted file mode 100644 index 9f9655e3b..000000000 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/gcs-types.h +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef __UAPI_ASM_GCS_TYPES_H__ -#define __UAPI_ASM_GCS_TYPES_H__ - -#ifndef NT_ARM_GCS -#define NT_ARM_GCS 0x410 /* ARM GCS state */ -#endif - -/* Shadow Stack/Guarded Control Stack interface */ -#define PR_GET_SHADOW_STACK_STATUS 74 
-#define PR_SET_SHADOW_STACK_STATUS 75 -#define PR_LOCK_SHADOW_STACK_STATUS 76 - -/* When set PR_SHADOW_STACK_ENABLE flag allocates a Guarded Control Stack */ -#ifndef PR_SHADOW_STACK_ENABLE -#define PR_SHADOW_STACK_ENABLE (1UL << 0) -#endif - -/* Allows explicit GCS stores (eg. using GCSSTR) */ -#ifndef PR_SHADOW_STACK_WRITE -#define PR_SHADOW_STACK_WRITE (1UL << 1) -#endif - -/* Allows explicit GCS pushes (eg. using GCSPUSHM) */ -#ifndef PR_SHADOW_STACK_PUSH -#define PR_SHADOW_STACK_PUSH (1UL << 2) -#endif - -#ifndef SHADOW_STACK_SET_TOKEN -#define SHADOW_STACK_SET_TOKEN 0x1 /* Set up a restore token in the shadow stack */ -#endif - -#define PR_SHADOW_STACK_ALL_MODES \ - PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE | PR_SHADOW_STACK_PUSH - -/* copied from: arch/arm64/include/asm/sysreg.h */ -#define GCS_CAP_VALID_TOKEN 0x1 -#define GCS_CAP_ADDR_MASK 0xFFFFFFFFFFFFF000ULL -#define GCS_CAP(x) ((((unsigned long)x) & GCS_CAP_ADDR_MASK) | GCS_CAP_VALID_TOKEN) -#define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) - -#include - -#ifndef HWCAP_GCS -#define HWCAP_GCS (1UL << 32) -#endif - -#endif /* __UAPI_ASM_GCS_TYPES_H__ */ \ No newline at end of file diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h index 606c92ffe..f91e73dc4 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h @@ -2,7 +2,6 @@ #define UAPI_COMPEL_ASM_TYPES_H__ #include -#include #include #include #include @@ -17,35 +16,17 @@ */ typedef struct user_pt_regs user_regs_struct_t; - -/* - * GCS (Guarded Control Stack) - * - * This mirrors the kernel definition but renamed to cr_user_gcs - * to avoid conflict with kernel headers (/usr/include/asm/ptrace.h). 
- */ -struct cr_user_gcs { - __u64 features_enabled; - __u64 features_locked; - __u64 gcspr_el0; -}; - -struct user_fpregs_struct { - struct user_fpsimd_state fpstate; - struct cr_user_gcs gcs; -}; -typedef struct user_fpregs_struct user_fpregs_struct_t; +typedef struct user_fpsimd_state user_fpregs_struct_t; #define __compel_arch_fetch_thread_area(tid, th) 0 #define compel_arch_fetch_thread_area(tctl) 0 #define compel_arch_get_tls_task(ctl, tls) #define compel_arch_get_tls_thread(tctl, tls) -#define REG_RES(r) ((uint64_t)(r).regs[0]) -#define REG_IP(r) ((uint64_t)(r).pc) -#define SET_REG_IP(r, val) ((r).pc = (val)) -#define REG_SP(r) ((uint64_t)((r).sp)) -#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[8]) +#define REG_RES(r) ((uint64_t)(r).regs[0]) +#define REG_IP(r) ((uint64_t)(r).pc) +#define REG_SP(r) ((uint64_t)((r).sp)) +#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[8]) #define user_regs_native(pregs) true @@ -57,12 +38,4 @@ typedef struct user_fpregs_struct user_fpregs_struct_t; __NR_##syscall; \ }) -extern bool __compel_host_supports_gcs(void); -#define compel_host_supports_gcs __compel_host_supports_gcs - -struct parasite_ctl; -extern int __parasite_setup_shstk(struct parasite_ctl *ctl, - user_fpregs_struct_t *ext_regs); -#define parasite_setup_shstk __parasite_setup_shstk - #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h index 7efee528f..f8ec55d6c 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/sigframe.h @@ -1,34 +1,24 @@ #ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ #define UAPI_COMPEL_ASM_SIGFRAME_H__ -#include +#include #include #include -#include /* Copied from the kernel header arch/arm64/include/uapi/asm/sigcontext.h */ #define FPSIMD_MAGIC 0x46508001 -#define GCS_MAGIC 0x47435300 typedef struct fpsimd_context fpu_state_t; -struct gcs_context { - struct _aarch64_ctx head; 
- __u64 gcspr; - __u64 features_enabled; - __u64 reserved; -}; - struct aux_context { struct fpsimd_context fpsimd; - struct gcs_context gcs; /* additional context to be added before "end" */ struct _aarch64_ctx end; }; -// XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code +// XXX: the idetifier rt_sigcontext is expected to be struct by the CRIU code #define rt_sigcontext sigcontext #include @@ -72,7 +62,6 @@ struct cr_sigcontext { #define RT_SIGFRAME_AUX_CONTEXT(rt_sigframe) ((struct aux_context *)&(RT_SIGFRAME_SIGCONTEXT(rt_sigframe)->__reserved)) #define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->fpsimd) #define RT_SIGFRAME_OFFSET(rt_sigframe) 0 -#define RT_SIGFRAME_GCS(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->gcs) #define rt_sigframe_erase_sigset(sigframe) memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) #define rt_sigframe_copy_sigset(sigframe, from) memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) diff --git a/compel/arch/aarch64/src/lib/infect.c b/compel/arch/aarch64/src/lib/infect.c index 42f593c79..7cfa637eb 100644 --- a/compel/arch/aarch64/src/lib/infect.c +++ b/compel/arch/aarch64/src/lib/infect.c @@ -2,9 +2,7 @@ #include #include #include -#include -#include - +#include #include #include "common/page.h" #include "uapi/compel/asm/infect-types.h" @@ -12,9 +10,6 @@ #include "errno.h" #include "infect.h" #include "infect-priv.h" -#include "asm/breakpoints.h" -#include "asm/gcs-types.h" -#include unsigned __page_size = 0; unsigned __page_shift = 0; @@ -24,7 +19,7 @@ unsigned __page_shift = 0; */ const char code_syscall[] = { 0x01, 0x00, 0x00, 0xd4, /* SVC #0 */ - 0x00, 0x00, 0x20, 0xd4 /* BRK #0 */ + 0x00, 0x00, 0x20, 0xd4 /* BRK #0 */ }; static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); @@ -35,54 +30,24 @@ static inline void __always_unused __check_code_syscall(void) BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); } -bool 
__compel_host_supports_gcs(void) -{ - unsigned long hwcap = getauxval(AT_HWCAP); - return (hwcap & HWCAP_GCS) != 0; -} - -static bool __compel_gcs_enabled(struct cr_user_gcs *gcs) -{ - if (!compel_host_supports_gcs()) - return false; - - return gcs && (gcs->features_enabled & PR_SHADOW_STACK_ENABLE) != 0; -} - int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { struct fpsimd_context *fpsimd = RT_SIGFRAME_FPU(sigframe); - struct gcs_context *gcs = RT_SIGFRAME_GCS(sigframe); memcpy(sigframe->uc.uc_mcontext.regs, regs->regs, sizeof(regs->regs)); - pr_debug("sigreturn_prep_regs_plain: sp %lx pc %lx\n", (long)regs->sp, (long)regs->pc); - sigframe->uc.uc_mcontext.sp = regs->sp; sigframe->uc.uc_mcontext.pc = regs->pc; sigframe->uc.uc_mcontext.pstate = regs->pstate; - memcpy(fpsimd->vregs, fpregs->fpstate.vregs, 32 * sizeof(__uint128_t)); + memcpy(fpsimd->vregs, fpregs->vregs, 32 * sizeof(__uint128_t)); - fpsimd->fpsr = fpregs->fpstate.fpsr; - fpsimd->fpcr = fpregs->fpstate.fpcr; + fpsimd->fpsr = fpregs->fpsr; + fpsimd->fpcr = fpregs->fpcr; fpsimd->head.magic = FPSIMD_MAGIC; fpsimd->head.size = sizeof(*fpsimd); - if (__compel_gcs_enabled(&fpregs->gcs)) { - gcs->head.magic = GCS_MAGIC; - gcs->head.size = sizeof(*gcs); - gcs->reserved = 0; - gcs->gcspr = fpregs->gcs.gcspr_el0 - 8; - gcs->features_enabled = fpregs->gcs.features_enabled; - - pr_debug("sigframe gcspr=%llx features_enabled=%llx\n", fpregs->gcs.gcspr_el0 - 8, fpregs->gcs.features_enabled); - } else { - pr_debug("sigframe gcspr=[disabled]\n"); - memset(gcs, 0, sizeof(*gcs)); - } - return 0; } @@ -94,6 +59,7 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { + user_fpregs_struct_t tmp, *fpsimd = ext_regs ? 
ext_regs : &tmp; struct iovec iov; int ret; @@ -106,28 +72,14 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct goto err; } - iov.iov_base = &ext_regs->fpstate; - iov.iov_len = sizeof(ext_regs->fpstate); + iov.iov_base = fpsimd; + iov.iov_len = sizeof(*fpsimd); if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { pr_perror("Failed to obtain FPU registers for %d", pid); goto err; } - memset(&ext_regs->gcs, 0, sizeof(ext_regs->gcs)); - - iov.iov_base = &ext_regs->gcs; - iov.iov_len = sizeof(ext_regs->gcs); - if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &iov) == 0) { - pr_info("gcs: GCSPR_EL0 for %d: 0x%llx, features: 0x%llx\n", - pid, ext_regs->gcs.gcspr_el0, ext_regs->gcs.features_enabled); - - if (!__compel_gcs_enabled(&ext_regs->gcs)) - pr_info("gcs: GCS is NOT enabled\n"); - } else { - pr_info("gcs: GCS state not available for %d\n", pid); - } - - ret = save(pid, arg, regs, ext_regs); + ret = save(arg, regs, fpsimd); err: return ret; } @@ -136,44 +88,14 @@ int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) { struct iovec iov; - struct cr_user_gcs gcs; - struct iovec gcs_iov = { .iov_base = &gcs, .iov_len = sizeof(gcs) }; - pr_info("Restoring GP/FPU registers for %d\n", pid); - iov.iov_base = &ext_regs->fpstate; - iov.iov_len = sizeof(ext_regs->fpstate); + iov.iov_base = ext_regs; + iov.iov_len = sizeof(*ext_regs); if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { pr_perror("Failed to set FPU registers for %d", pid); return -1; } - - if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &gcs_iov) < 0) { - pr_warn("gcs: Failed to get GCS for %d\n", pid); - } else { - ext_regs->gcs = gcs; - compel_set_task_gcs_regs(pid, ext_regs); - } - - return 0; -} - -int compel_set_task_gcs_regs(pid_t pid, user_fpregs_struct_t *ext_regs) -{ - struct iovec iov; - - pr_info("gcs: restoring GCS registers for %d\n", pid); - pr_info("gcs: restoring GCS: gcspr=%llx features=%llx\n", - ext_regs->gcs.gcspr_el0, 
ext_regs->gcs.features_enabled); - - iov.iov_base = &ext_regs->gcs; - iov.iov_len = sizeof(ext_regs->gcs); - - if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_GCS, &iov)) { - pr_perror("gcs: Failed to set GCS registers for %d", pid); - return -1; - } - return 0; } @@ -254,176 +176,3 @@ unsigned long compel_task_size(void) break; return task_size; } - -static struct hwbp_cap *ptrace_get_hwbp_cap(pid_t pid) -{ - static struct hwbp_cap info; - static int available = -1; - - if (available == -1) { - unsigned int val; - struct iovec iovec = { - .iov_base = &val, - .iov_len = sizeof(val), - }; - - if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_HW_BREAK, &iovec) < 0) - available = 0; - else { - info.arch = (char)((val >> 8) & 0xff); - info.bp_count = (char)(val & 0xff); - - available = (info.arch != 0); - } - } - - return available == 1 ? &info : NULL; -} - -int ptrace_set_breakpoint(pid_t pid, void *addr) -{ - k_rtsigset_t block; - struct hwbp_cap *info = ptrace_get_hwbp_cap(pid); - struct user_hwdebug_state regs = {}; - unsigned int ctrl = 0; - struct iovec iovec; - - if (info == NULL || info->bp_count == 0) - return 0; - - /* - * The struct is copied from `arch/arm64/include/asm/hw_breakpoint.h` in - * linux kernel: - * struct arch_hw_breakpoint_ctrl { - * __u32 __reserved : 19, - * len : 8, - * type : 2, - * privilege : 2, - * enabled : 1; - * }; - * - * The part of `struct arch_hw_breakpoint_ctrl` bits meaning is defined - * in <>, - * D13.3.2 DBGBCR_EL1, Debug Breakpoint Control Registers. 
- */ - ctrl = ARM_BREAKPOINT_LEN_4; - ctrl = (ctrl << 2) | ARM_BREAKPOINT_EXECUTE; - ctrl = (ctrl << 2) | AARCH64_BREAKPOINT_EL0; - ctrl = (ctrl << 1) | ENABLE_HBP; - regs.dbg_regs[0].addr = (__u64)addr; - regs.dbg_regs[0].ctrl = ctrl; - iovec.iov_base = ®s; - iovec.iov_len = (offsetof(struct user_hwdebug_state, dbg_regs) + sizeof(regs.dbg_regs[0])); - - if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_HW_BREAK, &iovec)) - return -1; - - /* - * FIXME(issues/1429): SIGTRAP can't be blocked, otherwise its handler - * will be reset to the default one. - */ - ksigfillset(&block); - ksigdelset(&block, SIGTRAP); - if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { - pr_perror("Can't block signals for %d", pid); - return -1; - } - - if (ptrace(PTRACE_CONT, pid, NULL, NULL) != 0) { - pr_perror("Unable to restart the stopped tracee process %d", pid); - return -1; - } - - return 1; -} - -int ptrace_flush_breakpoints(pid_t pid) -{ - struct hwbp_cap *info = ptrace_get_hwbp_cap(pid); - struct user_hwdebug_state regs = {}; - unsigned int ctrl = 0; - struct iovec iovec; - - if (info == NULL || info->bp_count == 0) - return 0; - - ctrl = ARM_BREAKPOINT_LEN_4; - ctrl = (ctrl << 2) | ARM_BREAKPOINT_EXECUTE; - ctrl = (ctrl << 2) | AARCH64_BREAKPOINT_EL0; - ctrl = (ctrl << 1) | DISABLE_HBP; - regs.dbg_regs[0].addr = 0ul; - regs.dbg_regs[0].ctrl = ctrl; - - iovec.iov_base = ®s; - iovec.iov_len = (offsetof(struct user_hwdebug_state, dbg_regs) + sizeof(regs.dbg_regs[0])); - - if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_HW_BREAK, &iovec)) - return -1; - - return 0; -} - -int inject_gcs_cap_token(struct parasite_ctl *ctl, pid_t pid, struct cr_user_gcs *gcs) -{ - struct iovec gcs_iov = { .iov_base = gcs, .iov_len = sizeof(*gcs) }; - - uint64_t token_addr = gcs->gcspr_el0 - 8; - uint64_t sigtramp_addr = gcs->gcspr_el0 - 16; - - uint64_t cap_token = ALIGN_DOWN(GCS_SIGNAL_CAP(token_addr), 8); - unsigned long restorer_addr; - - pr_info("gcs: (setup) CAP token: 0x%lx at addr: 0x%lx\n", 
cap_token, token_addr); - - /* Inject capability token at gcspr_el0 - 8 */ - if (ptrace(PTRACE_POKEDATA, pid, (void *)token_addr, cap_token)) { - pr_perror("gcs: (setup) Inject GCS cap token failed"); - return -1; - } - - /* Inject restorer trampoline address (gcspr_el0 - 16) */ - restorer_addr = ctl->parasite_ip; - if (ptrace(PTRACE_POKEDATA, pid, (void *)sigtramp_addr, restorer_addr)) { - pr_perror("gcs: (setup) Inject GCS restorer failed"); - return -1; - } - - /* Update GCSPR_EL0 */ - gcs->gcspr_el0 = token_addr; - if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_GCS, &gcs_iov)) { - pr_perror("gcs: PTRACE_SETREGS FAILED"); - return -1; - } - - pr_debug("gcs: parasite_ip=%#lx sp=%#llx gcspr_el0=%#llx\n", - ctl->parasite_ip, ctl->orig.regs.sp, gcs->gcspr_el0); - - return 0; -} - -int parasite_setup_shstk(struct parasite_ctl *ctl, user_fpregs_struct_t *ext_regs) -{ - struct cr_user_gcs gcs; - struct iovec gcs_iov = { .iov_base = &gcs, .iov_len = sizeof(gcs) }; - pid_t pid = ctl->rpid; - - if(!__compel_host_supports_gcs()) - return 0; - - if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &gcs_iov) != 0) { - pr_perror("GCS state not available for %d", pid); - return -1; - } - - if (!__compel_gcs_enabled(&gcs)) - return 0; - - if (inject_gcs_cap_token(ctl, pid, &gcs)) { - pr_perror("Failed to inject GCS cap token for %d", pid); - return -1; - } - - pr_info("gcs: GCS enabled for %d\n", pid); - - return 0; -} diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index f4deb02b2..e6508ed9f 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -39,7 +39,7 @@ recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, str sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags) recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags) shutdown 210 293 (int sockfd, int how) -bind 200 282 (int sockfd, const struct sockaddr *addr, int 
addrlen) +bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen) setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen) getsockopt 209 295 (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) @@ -85,7 +85,7 @@ timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimer timer_gettime 108 259 (int timer_id, const struct itimerspec *setting) timer_getoverrun 109 260 (int timer_id) timer_delete 111 261 (kernel_timer_t timer_id) -clock_gettime 113 263 (clockid_t which_clock, struct timespec *tp) +clock_gettime 113 263 (const clockid_t which_clock, const struct timespec *tp) exit_group 94 248 (int error_code) set_robust_list 99 338 (struct robust_list_head *head, size_t len) get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) @@ -112,16 +112,9 @@ userfaultfd 282 388 (int flags) fallocate 47 352 (int fd, int mode, loff_t offset, loff_t len) cacheflush ! 
983042 (void *start, void *end, int flags) ppoll 73 336 (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) -open_tree 428 428 (int dirfd, const char *pathname, unsigned int flags) -move_mount 429 429 (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) fsopen 430 430 (char *fsname, unsigned int flags) fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) clone3 435 435 (struct clone_args *uargs, size_t size) -close_range 436 436 (unsigned int fd, unsigned int max_fd, unsigned int flags) pidfd_open 434 434 (pid_t pid, unsigned int flags) -openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size) pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) -rseq 293 398 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) -membarrier 283 389 (int cmd, unsigned int flags, int cpu_id) -map_shadow_stack 453 ! 
(unsigned long addr, unsigned long size, unsigned int flags) \ No newline at end of file diff --git a/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h b/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h index 8d328252e..159b6a9fb 100644 --- a/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h @@ -56,11 +56,10 @@ struct user_vfp_exc { unsigned long fpinst2; }; -#define REG_RES(regs) ((regs).ARM_r0) -#define REG_IP(regs) ((regs).ARM_pc) -#define SET_REG_IP(regs, val) ((regs).ARM_pc = (val)) -#define REG_SP(regs) ((regs).ARM_sp) -#define REG_SYSCALL_NR(regs) ((regs).ARM_r7) +#define REG_RES(regs) ((regs).ARM_r0) +#define REG_IP(regs) ((regs).ARM_pc) +#define REG_SP(regs) ((regs).ARM_sp) +#define REG_SYSCALL_NR(regs) ((regs).ARM_r7) #define user_regs_native(pregs) true diff --git a/compel/arch/arm/src/lib/infect.c b/compel/arch/arm/src/lib/infect.c index a9fb639e2..6715afdb3 100644 --- a/compel/arch/arm/src/lib/infect.c +++ b/compel/arch/arm/src/lib/infect.c @@ -18,7 +18,7 @@ */ const char code_syscall[] = { 0x00, 0x00, 0x00, 0xef, /* SVC #0 */ - 0xf0, 0x01, 0xf0, 0xe7 /* UDF #32 */ + 0xf0, 0x01, 0xf0, 0xe7 /* UDF #32 */ }; static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); @@ -65,9 +65,10 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr } #define PTRACE_GETVFPREGS 27 -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *vfp, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { + user_fpregs_struct_t tmp, *vfp = ext_regs ? 
ext_regs : &tmp; int ret = -1; pr_info("Dumping GP/FPU registers for %d\n", pid); @@ -94,7 +95,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct } } - ret = save(pid, arg, regs, vfp); + ret = save(arg, regs, vfp); err: return ret; } diff --git a/compel/arch/loongarch64/plugins/include/asm/prologue.h b/compel/arch/loongarch64/plugins/include/asm/prologue.h deleted file mode 100644 index c19ce54d7..000000000 --- a/compel/arch/loongarch64/plugins/include/asm/prologue.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef __ASM_PROLOGUE_H__ -#define __ASM_PROLOGUE_H__ - -#ifndef __ASSEMBLY__ - -#include -#include -#include - -#include - -#define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) - -typedef struct prologue_init_args { - struct sockaddr_un ctl_sock_addr; - unsigned int ctl_sock_addr_len; - - unsigned int arg_s; - void *arg_p; - - void *sigframe; -} prologue_init_args_t; - -#endif /* __ASSEMBLY__ */ - -/* - * Reserve enough space for sigframe. - * - * FIXME It is rather should be taken from sigframe header. 
- */ -#define PROLOGUE_SGFRAME_SIZE 4096 - -#define PROLOGUE_INIT_ARGS_SIZE 1024 - -#endif /* __ASM_PROLOGUE_H__ */ diff --git a/compel/arch/loongarch64/plugins/include/asm/syscall-types.h b/compel/arch/loongarch64/plugins/include/asm/syscall-types.h deleted file mode 100644 index b883bd8be..000000000 --- a/compel/arch/loongarch64/plugins/include/asm/syscall-types.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ -#define COMPEL_ARCH_SYSCALL_TYPES_H__ - -#include -/* Types for sigaction, sigprocmask syscalls */ -typedef void rt_signalfn_t(int, siginfo_t *, void *); -typedef rt_signalfn_t *rt_sighandler_t; - -typedef void rt_restorefn_t(void); -typedef rt_restorefn_t *rt_sigrestore_t; - -/* refer to arch/loongarch/include/uapi/asm/signal.h */ -#define _KNSIG 64 -#define _NSIG_BPW BITS_PER_LONG -#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) - -typedef struct { - uint64_t sig[_KNSIG_WORDS]; -} k_rtsigset_t; - -typedef struct { - rt_sighandler_t rt_sa_handler; - unsigned long rt_sa_flags; - rt_sigrestore_t rt_sa_restorer; - k_rtsigset_t rt_sa_mask; -} rt_sigaction_t; - -#define SA_RESTORER 0x04000000 - -#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ diff --git a/compel/arch/loongarch64/plugins/include/features.h b/compel/arch/loongarch64/plugins/include/features.h deleted file mode 100644 index b4a3cded2..000000000 --- a/compel/arch/loongarch64/plugins/include/features.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef __COMPEL_ARCH_FEATURES_H -#define __COMPEL_ARCH_FEATURES_H - -#endif /* __COMPEL_ARCH_FEATURES_H */ diff --git a/compel/arch/loongarch64/plugins/std/parasite-head.S b/compel/arch/loongarch64/plugins/std/parasite-head.S deleted file mode 100644 index 3a960490e..000000000 --- a/compel/arch/loongarch64/plugins/std/parasite-head.S +++ /dev/null @@ -1,9 +0,0 @@ - -#include "common/asm/linkage.h" - - .section .head.text, "ax" -ENTRY(__export_parasite_head_start) - bl parasite_service; - break 0; -END(__export_parasite_head_start) - diff --git 
a/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls b/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls deleted file mode 100644 index 0d08f34e1..000000000 --- a/compel/arch/loongarch64/plugins/std/syscalls/Makefile.syscalls +++ /dev/null @@ -1,117 +0,0 @@ -std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/syscalls-64.o -sys-proto-types := $(obj)/include/uapi/std/syscall-types.h -sys-proto-generic := $(obj)/include/uapi/std/syscall.h -sys-codes-generic := $(obj)/include/uapi/std/syscall-codes.h -sys-codes = $(obj)/include/uapi/std/syscall-codes-$(1).h -sys-proto = $(obj)/include/uapi/std/syscall-$(1).h -sys-def = $(PLUGIN_ARCH_DIR)/std/syscalls/syscall_$(1).tbl -sys-asm = $(PLUGIN_ARCH_DIR)/std/syscalls-$(1).S -sys-asm-common-name = std/syscalls/syscall-common-loongarch-$(1).S -sys-asm-common = $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) -sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h -sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl-$(1).c - -sys-bits := 64 - -AV := $$$$ - -define gen-rule-sys-codes -$(sys-codes): $(sys-def) $(sys-proto-types) - $(call msg-gen, $$@) - $(Q) echo "/* Autogenerated, don't edit */" > $$@ - $(Q) echo "#ifndef ASM_SYSCALL_CODES_H_$(1)__" >> $$@ - $(Q) echo "#define ASM_SYSCALL_CODES_H_$(1)__" >> $$@ - $(Q) cat $$< | awk '/^__NR/{SYSN=$(AV)1; \ - sub("^__NR", "SYS", SYSN); \ - print "\n#ifndef ", $(AV)1; \ - print "#define", $(AV)1, $(AV)2; \ - print "#endif"; \ - print "\n#ifndef ", SYSN; \ - print "#define ", SYSN, $(AV)1; \ - print "#endif";}' >> $$@ - $(Q) echo "#endif /* ASM_SYSCALL_CODES_H_$(1)__ */" >> $$@ -endef - -define gen-rule-sys-proto -$(sys-proto): $(sys-def) $(sys-proto-types) - $(call msg-gen, $$@) - $(Q) echo "/* Autogenerated, don't edit */" > $$@ - $(Q) echo "#ifndef ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ - $(Q) echo "#define ASM_SYSCALL_PROTO_H_$(1)__" >> $$@ - $(Q) echo '#include ' >> $$@ - $(Q) echo '#include ' >> $$@ -ifeq ($(1),32) - $(Q) echo '#include "asm/syscall32.h"' >> $$@ -endif - 
$(Q) cat $$< | awk '/^__NR/{print "extern long", $(AV)3, \ - substr($(AV)0, index($(AV)0,$(AV)4)), ";"}' >> $$@ - $(Q) echo "#endif /* ASM_SYSCALL_PROTO_H_$(1)__ */" >> $$@ -endef - -define gen-rule-sys-asm -$(sys-asm): $(sys-def) $(sys-asm-common) $(sys-codes) $(sys-proto) $(sys-proto-types) - $(call msg-gen, $$@) - $(Q) echo "/* Autogenerated, don't edit */" > $$@ - $(Q) echo '#include ' >> $$@ - $(Q) echo '#include "$(sys-asm-common-name)"' >> $$@ - $(Q) cat $$< | awk '/^__NR/{print "SYSCALL(", $(AV)3, ",", $(AV)2, ")"}' >> $$@ -endef - -define gen-rule-sys-exec-tbl -$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) $(sys-proto-types) - $(call msg-gen, $$@) - $(Q) echo "/* Autogenerated, don't edit */" > $$@ - $(Q) cat $$< | awk '/^__NR/{print \ - "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ -endef - -$(sys-codes-generic): $(sys-proto-types) - $(call msg-gen, $@) - $(Q) echo "/* Autogenerated, don't edit */" > $@ - $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@ - $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@ - $(Q) echo '#include ' >> $@ - $(Q) cat $< | awk '/^__NR/{NR32=$$1; \ - sub("^__NR", "__NR32", NR32); \ - print "\n#ifndef ", NR32; \ - print "#define ", NR32, $$2; \ - print "#endif";}' >> $@ - $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@ -mrproper-y += $(sys-codes-generic) - -$(sys-proto-generic): $(strip $(call map,sys-proto,$(sys-bits))) $(sys-proto-types) - $(call msg-gen, $@) - $(Q) echo "/* Autogenerated, don't edit */" > $@ - $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@ - $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@ - $(Q) echo "" >> $@ - $(Q) echo '#include ' >> $@ - $(Q) echo "" >> $@ - $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@ -mrproper-y += $(sys-proto-generic) - -define gen-rule-sys-exec-tbl -$(sys-exec-tbl): $(sys-def) $(sys-codes) $(sys-proto) $(sys-proto-generic) - $(call msg-gen, $$@) - $(Q) echo "/* Autogenerated, don't edit */" > $$@ - $(Q) cat 
$$< | awk '/^__NR/{print \ - "SYSCALL(", substr($(AV)3, 5), ",", $(AV)2, ")"}' >> $$@ -endef - -$(eval $(call map,gen-rule-sys-codes,$(sys-bits))) -$(eval $(call map,gen-rule-sys-proto,$(sys-bits))) -$(eval $(call map,gen-rule-sys-asm,$(sys-bits))) -$(eval $(call map,gen-rule-sys-exec-tbl,$(sys-bits))) - -$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h - $(call msg-gen, $@) - $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) - -std-headers-deps += $(call sys-codes,$(sys-bits)) -std-headers-deps += $(call sys-proto,$(sys-bits)) -std-headers-deps += $(call sys-asm,$(sys-bits)) -std-headers-deps += $(call sys-exec-tbl,$(sys-bits)) -std-headers-deps += $(sys-codes-generic) -std-headers-deps += $(sys-proto-generic) -std-headers-deps += $(sys-asm-types) -mrproper-y += $(std-headers-deps) diff --git a/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S b/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S deleted file mode 100644 index fff894466..000000000 --- a/compel/arch/loongarch64/plugins/std/syscalls/syscall-common-loongarch-64.S +++ /dev/null @@ -1,44 +0,0 @@ -#include "common/asm/linkage.h" - -#define SYSCALL(name, opcode) \ -ENTRY(name); \ - addi.d $a7, $zero, opcode; \ - syscall 0; \ - jirl $r0, $r1, 0; \ -END(name) - -#ifndef AT_FDCWD -#define AT_FDCWD -100 -#endif - -#ifndef AT_REMOVEDIR -#define AT_REMOVEDIR 0x200 -#endif - -ENTRY(sys_open) - or $a3, $zero, $a2 - or $a2, $zero, $a1 - or $a1, $zero, $a0 - addi.d $a0, $zero, AT_FDCWD - b sys_openat -END(sys_open) - -ENTRY(sys_mkdir) - or $a3, $zero, $a2 - or $a2, $zero, $a1 - or $a1, $zero, $a0 - addi.d $a0, $zero, AT_FDCWD - b sys_mkdirat -END(sys_mkdir) - -ENTRY(sys_rmdir) - addi.d $a2, $zero, AT_REMOVEDIR - or $a1, $zero, $a0 - addi.d $a0, $zero, AT_FDCWD - b sys_unlinkat -END(sys_rmdir) - -ENTRY(__cr_restore_rt) - addi.d $a7, $zero, __NR_rt_sigreturn - syscall 0 -END(__cr_restore_rt) diff --git 
a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl b/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl deleted file mode 100644 index 83dcdab4a..000000000 --- a/compel/arch/loongarch64/plugins/std/syscalls/syscall_64.tbl +++ /dev/null @@ -1,122 +0,0 @@ -# -# System calls table, please make sure the table consist only the syscalls -# really used somewhere in project. -# from kernel/linux-3.10.84/arch/mips/include/uapi/asm/unistd.h Linux 64-bit syscalls are in the range from 5000 to 5999. -# -# __NR_name code name arguments -# ------------------------------------------------------------------------------------------------------------------------------------------------------------- -__NR_io_setup 0 sys_io_setup (unsigned nr_events, aio_context_t *ctx) -__NR_io_submit 2 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp) -__NR_io_getevents 4 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) -__NR_fcntl 25 sys_fcntl (int fd, int type, long arg) -__NR_ioctl 29 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg) -__NR_flock 32 sys_flock (int fd, unsigned long cmd) -__NR_mkdirat 34 sys_mkdirat (int dfd, const char *pathname, int flag) -__NR_unlinkat 35 sys_unlinkat (int dfd, const char *pathname, int flag) -__NR_umount2 39 sys_umount2 (char *name, int flags) -__NR_mount 40 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) -__NR_fallocate 47 sys_fallocate (int fd, int mode, loff_t offset, loff_t len) -__NR_close 57 sys_close (int fd) -__NR_openat 56 sys_openat (int dfd, const char *filename, int flags, int mode) -__NR_lseek 62 sys_lseek (int fd, unsigned long offset, unsigned long origin) -__NR_read 63 sys_read (int fd, void *buf, unsigned long count) -__NR_write 64 sys_write (int fd, const void *buf, unsigned long count) -__NR_pread64 67 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos) -__NR_preadv 69 sys_preadv_raw (int 
fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) -__NR_ppoll 73 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) -__NR_signalfd4 74 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) -__NR_vmsplice 75 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) -__NR_readlinkat 78 sys_readlinkat (int fd, const char *path, char *buf, int bufsize) -__NR_timerfd_settime 86 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) -__NR_capget 90 sys_capget (struct cap_header *h, struct cap_data *d) -__NR_capset 91 sys_capset (struct cap_header *h, struct cap_data *d) -__NR_personality 92 sys_personality (unsigned int personality) -__NR_exit 93 sys_exit (unsigned long error_code) -__NR_exit_group 94 sys_exit_group (int error_code) -__NR_waitid 95 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) -__NR_set_tid_address 96 sys_set_tid_address (int *tid_addr) -__NR_futex 98 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) -__NR_set_robust_list 99 sys_set_robust_list (struct robust_list_head *head, size_t len) -__NR_get_robust_list 100 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) -__NR_nanosleep 101 sys_nanosleep (struct timespec *req, struct timespec *rem) -__NR_getitimer 102 sys_getitimer (int which, const struct itimerval *val) -__NR_setitimer 103 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old) -__NR_sys_timer_create 107 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) -__NR_sys_timer_gettime 108 sys_timer_gettime (int timer_id, const struct itimerspec *setting) -__NR_sys_timer_getoverrun 109 sys_timer_getoverrun (int timer_id) -__NR_sys_timer_settime 110 
sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) -__NR_sys_timer_delete 111 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 113 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) -__NR_sched_setscheduler 119 sys_sched_setscheduler (int pid, int policy, struct sched_param *p) -__NR_restart_syscall 128 sys_restart_syscall (void) -__NR_kill 129 sys_kill (long pid, int sig) -__NR_sigaltstack 132 sys_sigaltstack (const void *uss, void *uoss) -__NR_rt_sigaction 134 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) -__NR_rt_sigprocmask 135 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) -__NR_rt_sigqueueinfo 138 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info) -__NR_rt_sigreturn 139 sys_rt_sigreturn (void) -__NR_setpriority 140 sys_setpriority (int which, int who, int nice) -__NR_setresuid 147 sys_setresuid (int uid, int euid, int suid) -__NR_getresuid 148 sys_getresuid (int *uid, int *euid, int *suid) -__NR_setresgid 149 sys_setresgid (int gid, int egid, int sgid) -__NR_getresgid 150 sys_getresgid (int *gid, int *egid, int *sgid) -__NR_getpgid 155 sys_getpgid (pid_t pid) -__NR_setfsuid 151 sys_setfsuid (int fsuid) -__NR_setfsgid 152 sys_setfsgid (int fsgid) -__NR_getsid 156 sys_getsid (void) -__NR_getgroups 158 sys_getgroups (int gsize, unsigned int *groups) -__NR_setgroups 159 sys_setgroups (int gsize, unsigned int *groups) -__NR_setrlimit 164 sys_setrlimit (int resource, struct krlimit *rlim) -__NR_umask 166 sys_umask (int mask) -__NR_prctl 167 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) -__NR_gettimeofday 169 sys_gettimeofday (struct timeval *tv, struct timezone *tz) -__NR_getpid 172 sys_getpid (void) -__NR_ptrace 177 sys_ptrace (long request, pid_t pid, void *addr, void *data) -__NR_gettid 178 sys_gettid (void) 
-__NR_shmat 196 sys_shmat (int shmid, void *shmaddr, int shmflag) -__NR_socket 198 sys_socket (int domain, int type, int protocol) -__NR_bind 200 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen) -__NR_connect 203 sys_connect (int sockfd, struct sockaddr *addr, int addrlen) -__NR_sendto 206 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) -__NR_recvfrom 207 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) -__NR_setsockopt 208 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen) -__NR_getsockopt 209 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen) -__NR_shutdown 210 sys_shutdown (int sockfd, int how) -__NR_sendmsg 211 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags) -__NR_recvmsg 212 sys_recvmsg (int sockfd, struct msghdr *msg, int flags) -__NR_brk 214 sys_brk (void *addr) -__NR_munmap 215 sys_munmap (void *addr, unsigned long len) -__NR_mremap 216 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) -__NR_clone 220 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) -__NR_mmap 222 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) -__NR_mprotect 226 sys_mprotect (const void *addr, unsigned long len, unsigned long prot) -__NR_mincore 232 sys_mincore (void *addr, unsigned long size, unsigned char *vec) -__NR_madvise 233 sys_madvise (unsigned long start, size_t len, int behavior) -__NR_rt_tgsigqueueinfo 240 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) -__NR_wait4 260 sys_wait4 (int pid, int *status, int options, struct rusage *ru) -__NR_fanotify_init 262 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags) 
-__NR_fanotify_mark 263 sys_fanotify_mark (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) -__NR_open_by_handle_at 265 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags) -__NR_setns 268 sys_setns (int fd, int nstype) -__NR_kcmp 272 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) -__NR_seccomp 277 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) -__NR_memfd_create 279 sys_memfd_create (const char *name, unsigned int flags) -__NR_userfaultfd 282 sys_userfaultfd (int flags) -__NR_membarrier 283 sys_membarrier (int cmd, unsigned int flags, int cpu_id) -__NR_rseq 293 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) -__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) -__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) -__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) -__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) -__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) -__NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) -__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) -__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) -__NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) -#__NR_dup2 ! sys_dup2 (int oldfd, int newfd) -#__NR_rmdir ! sys_rmdir (const char *name) -#__NR_unlink ! sys_unlink (char *pathname) -#__NR_cacheflush ! sys_cacheflush (char *addr, int nbytes, int cache) -#__NR_set_thread_area ! sys_set_thread_area (unsigned long *addr) -#__NR_mkdir ! sys_mkdir (const char *name, int mode) -#__NR_open ! 
sys_open (const char *filename, unsigned long flags, unsigned long mode) diff --git a/compel/arch/loongarch64/scripts/compel-pack.lds.S b/compel/arch/loongarch64/scripts/compel-pack.lds.S deleted file mode 100644 index cfb7a2fb3..000000000 --- a/compel/arch/loongarch64/scripts/compel-pack.lds.S +++ /dev/null @@ -1,32 +0,0 @@ -OUTPUT_ARCH(loongarch) -EXTERN(__export_parasite_head_start) - -SECTIONS -{ - .crblob 0x0 : { - *(.head.text) - ASSERT(DEFINED(__export_parasite_head_start), - "Symbol __export_parasite_head_start is missing"); - *(.text*) - . = ALIGN(32); - *(.data*) - . = ALIGN(32); - *(.rodata*) - . = ALIGN(32); - *(.bss*) - . = ALIGN(32); - *(.got*) - . = ALIGN(32); - *(.toc*) - . = ALIGN(32); - } =0x00000000, - - /DISCARD/ : { - *(.debug*) - *(.comment*) - *(.note*) - *(.group*) - *(.eh_frame*) - *(*) - } -} diff --git a/compel/arch/loongarch64/src/lib/cpu.c b/compel/arch/loongarch64/src/lib/cpu.c deleted file mode 100644 index 172b90e27..000000000 --- a/compel/arch/loongarch64/src/lib/cpu.c +++ /dev/null @@ -1,41 +0,0 @@ -#include -#include - -#include "compel-cpu.h" -#include "common/bitops.h" -#include "common/compiler.h" -#include "log.h" - -#undef LOG_PREFIX -#define LOG_PREFIX "cpu: " - -static compel_cpuinfo_t rt_info; -static bool rt_info_done = false; - -void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) -{ -} - -void compel_clear_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) -{ -} - -int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature) -{ - return 0; -} - -int compel_cpuid(compel_cpuinfo_t *c) -{ - return 0; -} - -bool compel_cpu_has_feature(unsigned int feature) -{ - if (!rt_info_done) { - compel_cpuid(&rt_info); - rt_info_done = true; - } - - return compel_test_cpu_cap(&rt_info, feature); -} diff --git a/compel/arch/loongarch64/src/lib/handle-elf-host.c b/compel/arch/loongarch64/src/lib/handle-elf-host.c deleted file mode 100644 index a605a5a45..000000000 --- 
a/compel/arch/loongarch64/src/lib/handle-elf-host.c +++ /dev/null @@ -1,22 +0,0 @@ -#include -#include - -#include "handle-elf.h" -#include "piegen.h" -#include "log.h" - -static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -}; - -extern int __handle_elf(void *mem, size_t size); - -int handle_binary(void *mem, size_t size) -{ - if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) - return __handle_elf(mem, size); - - pr_err("Unsupported Elf format detected\n"); - return -EINVAL; -} diff --git a/compel/arch/loongarch64/src/lib/handle-elf.c b/compel/arch/loongarch64/src/lib/handle-elf.c deleted file mode 100644 index a605a5a45..000000000 --- a/compel/arch/loongarch64/src/lib/handle-elf.c +++ /dev/null @@ -1,22 +0,0 @@ -#include -#include - -#include "handle-elf.h" -#include "piegen.h" -#include "log.h" - -static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -}; - -extern int __handle_elf(void *mem, size_t size); - -int handle_binary(void *mem, size_t size) -{ - if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) - return __handle_elf(mem, size); - - pr_err("Unsupported Elf format detected\n"); - return -EINVAL; -} diff --git a/compel/arch/loongarch64/src/lib/include/handle-elf.h b/compel/arch/loongarch64/src/lib/include/handle-elf.h deleted file mode 100644 index b0a66ef87..000000000 --- a/compel/arch/loongarch64/src/lib/include/handle-elf.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef COMPEL_HANDLE_ELF_H__ -#define COMPEL_HANDLE_ELF_H__ - -#include "elf64-types.h" - -#define arch_is_machine_supported(e_machine) (e_machine == EM_LOONGARCH) - -#endif /* COMPEL_HANDLE_ELF_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/syscall.h 
b/compel/arch/loongarch64/src/lib/include/syscall.h deleted file mode 100644 index ac3e2799a..000000000 --- a/compel/arch/loongarch64/src/lib/include/syscall.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __COMPEL_SYSCALL_H__ -#define __COMPEL_SYSCALL_H__ - -#ifndef SIGSTKFLT -#define SIGSTKFLT 16 -#endif - -#endif diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h deleted file mode 100644 index 21eb1309f..000000000 --- a/compel/arch/loongarch64/src/lib/include/uapi/asm/breakpoints.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __COMPEL_BREAKPOINTS_H__ -#define __COMPEL_BREAKPOINTS_H__ -#define ARCH_SI_TRAP TRAP_BRKPT -extern int ptrace_set_breakpoint(pid_t pid, void *addr); -extern int ptrace_flush_breakpoints(pid_t pid); -#endif diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h deleted file mode 100644 index e568df789..000000000 --- a/compel/arch/loongarch64/src/lib/include/uapi/asm/cpu.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __CR_ASM_CPU_H__ -#define __CR_ASM_CPU_H__ - -typedef struct { -} compel_cpuinfo_t; -#endif /* __CR_ASM_CPU_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h deleted file mode 100644 index 7f476d541..000000000 --- a/compel/arch/loongarch64/src/lib/include/uapi/asm/fpu.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef __CR_ASM_FPU_H__ -#define __CR_ASM_FPU_H__ - -#endif /* __CR_ASM_FPU_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h deleted file mode 100644 index 0b047a5b0..000000000 --- a/compel/arch/loongarch64/src/lib/include/uapi/asm/infect-types.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef UAPI_COMPEL_ASM_TYPES_H__ -#define UAPI_COMPEL_ASM_TYPES_H__ - -#include - -#define SIGMAX 64 -#define SIGMAX_OLD 31 - -/* - * 
From the Linux kernel header arch/loongarch/include/uapi/asm/ptrace.h - * - * A thread LoongArch CPU context - * - * struct user_fp_state { - * uint64_t fpr[32]; - * uint64_t fcc; - * uint32_t fcsr; - * }; - * - * struct user_pt_regs { - * unsigned long regs[32]; - * unsigned long csr_era; - * unsigned long csr_badv; - * unsigned long reserved[11]; - * }; - */ - -struct user_gp_regs { - uint64_t regs[32]; - uint64_t orig_a0; - uint64_t pc; - uint64_t csr_badv; - uint64_t reserved[10]; -} __attribute__((aligned(8))); - -struct user_fp_regs { - uint64_t regs[32]; - uint64_t fcc; - uint32_t fcsr; -}; - -typedef struct user_gp_regs user_regs_struct_t; -typedef struct user_fp_regs user_fpregs_struct_t; - -#define user_regs_native(regs) true - -#define __compel_arch_fetch_thread_area(tid, th) 0 -#define compel_arch_fetch_thread_area(tctl) 0 -#define compel_arch_get_tls_task(ctl, tls) -#define compel_arch_get_tls_thread(tctl, tls) - -#define REG_RES(r) ((uint64_t)(r).regs[4]) -#define REG_IP(r) ((uint64_t)(r).pc) -#define REG_SP(r) ((uint64_t)(r).regs[3]) -#define REG_SYSCALL_NR(r) ((uint64_t)(r).regs[11]) -#define SET_REG_IP(r, val) ((r).pc = (val)) - -#define GPR_NUM 32 -#define FPR_NUM 32 - -#define __NR(syscall, compat) \ - ({ \ - (void)compat; \ - __NR_##syscall; \ - }) - -#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h deleted file mode 100644 index fcb545a1d..000000000 --- a/compel/arch/loongarch64/src/lib/include/uapi/asm/sigframe.h +++ /dev/null @@ -1,86 +0,0 @@ -#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ -#define UAPI_COMPEL_ASM_SIGFRAME_H__ - -#include -#include -#include - -#include -#include - -#include - -#define rt_sigcontext sigcontext -/* sigcontext defined in usr/include/uapi/asm/sigcontext.h*/ -#include -typedef __u32 u32; - -typedef struct sigcontext_t { - __u64 pc; - __u64 regs[32]; - __u32 flags; - __u64 extcontext[0] 
__attribute__((__aligned__(16))); -} sigcontext_t; - -typedef struct context_info_t { - __u32 magic; - __u32 size; - __u64 padding; -} context_info_t; - -#define FPU_CTX_MAGIC 0x46505501 -#define FPU_CTX_ALIGN 8 -typedef struct fpu_context_t { - __u64 regs[32]; - __u64 fcc; - __u64 fcsr; -} fpu_context_t; - -typedef struct ucontext { - unsigned long uc_flags; - struct ucontext *uc_link; - stack_t uc_stack; - sigset_t uc_sigmask; - __u8 __unused[1024 / 8 - sizeof(sigset_t)]; - sigcontext_t uc_mcontext; -} ucontext; - -/* Copy from the kernel source arch/loongarch/kernel/signal.c */ -struct rt_sigframe { - rt_siginfo_t rs_info; - ucontext rs_uc; -}; - -#define RT_SIGFRAME_UC(rt_sigframe) (&(rt_sigframe->rs_uc)) -#define RT_SIGFRAME_SIGMASK(rt_sigframe) ((k_rtsigset_t *)&RT_SIGFRAME_UC(rt_sigframe)->uc_sigmask) -#define RT_SIGFRAME_SIGCTX(rt_sigframe) (&(RT_SIGFRAME_UC(rt_sigframe)->uc_mcontext)) -#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(RT_SIGFRAME_SIGCTX(rt_sigframe)->pc)) -#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1) - -#define RT_SIGFRAME_FPU(rt_sigframe) \ - ({ \ - context_info_t *ctx = (context_info_t *)RT_SIGFRAME_SIGCTX(rt_sigframe)->extcontext; \ - ctx->magic = FPU_CTX_MAGIC; \ - ctx->size = sizeof(context_info_t) + sizeof(fpu_context_t); \ - (fpu_context_t *)((char *)ctx + sizeof(context_info_t)); \ - }) - -#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 - -/* clang-format off */ -#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ - asm volatile( \ - "addi.d $sp, %0, 0 \n" \ - "addi.d $a7, $zero, "__stringify(__NR_rt_sigreturn)" \n" \ - "syscall 0" \ - : \ - :"r"(new_sp) \ - : "$a7", "memory") -/* clang-format on */ - -int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe); - -#define rt_sigframe_erase_sigset(sigframe) memset(RT_SIGFRAME_SIGMASK(sigframe), 0, sizeof(k_rtsigset_t)) -#define rt_sigframe_copy_sigset(sigframe, from) memcpy(RT_SIGFRAME_SIGMASK(sigframe), from, sizeof(k_rtsigset_t)) - -#endif /* 
UAPI_COMPEL_ASM_SIGFRAME_H__ */ diff --git a/compel/arch/loongarch64/src/lib/infect.c b/compel/arch/loongarch64/src/lib/infect.c deleted file mode 100644 index 190c39227..000000000 --- a/compel/arch/loongarch64/src/lib/infect.c +++ /dev/null @@ -1,204 +0,0 @@ -#include -#include -#include -#include -#include - -#include -#include -#include "errno.h" -#include -#include -#include "common/err.h" -#include "common/page.h" -#include "asm/infect-types.h" -#include "ptrace.h" -#include "infect.h" -#include "infect-priv.h" -#include "log.h" -#include "common/bug.h" - -/* - * Injected syscall instruction - * loongarch64 is Little Endian - */ -const char code_syscall[] = { - 0x00, 0x00, 0x2b, 0x00, /* syscall */ - 0x00, 0x00, 0x2a, 0x00 /* break */ -}; - -int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) -{ - sigcontext_t *sc; - fpu_context_t *fpu; - - sc = RT_SIGFRAME_SIGCTX(sigframe); - memcpy(sc->regs, regs->regs, sizeof(regs->regs)); - sc->pc = regs->pc; - - fpu = RT_SIGFRAME_FPU(sigframe); - memcpy(fpu->regs, fpregs->regs, sizeof(fpregs->regs)); - fpu->fcc = fpregs->fcc; - fpu->fcsr = fpregs->fcsr; - return 0; -} - -int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) -{ - return 0; -} - -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, - void *arg, __maybe_unused unsigned long flags) -{ - user_fpregs_struct_t tmp, *fpregs = ext_regs ? 
ext_regs : &tmp; - struct iovec iov; - int ret; - - pr_info("Dumping GP/FPU registers for %d\n", pid); - - iov.iov_base = regs; - iov.iov_len = sizeof(user_regs_struct_t); - if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))) { - pr_perror("Failed to obtain CPU registers for %d", pid); - goto err; - } - - /* - * Refer to Linux kernel arch/loongarch/kernel/signal.c - */ - if (regs->regs[0]) { - switch (regs->regs[4]) { - case -ERESTARTNOHAND: - case -ERESTARTSYS: - case -ERESTARTNOINTR: - regs->regs[4] = regs->orig_a0; - regs->pc -= 4; - break; - case -ERESTART_RESTARTBLOCK: - regs->regs[4] = regs->orig_a0; - regs->regs[11] = __NR_restart_syscall; - regs->pc -= 4; - break; - } - regs->regs[0] = 0; /* Don't deal with this again. */ - } - - iov.iov_base = fpregs; - iov.iov_len = sizeof(user_fpregs_struct_t); - if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { - pr_perror("Failed to obtain FPU registers for %d", pid); - goto err; - } - - ret = save(pid, arg, regs, fpregs); -err: - return 0; -} - -int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) -{ - struct iovec iov; - - pr_info("Restoring GP/FPU registers for %d\n", pid); - - iov.iov_base = ext_regs; - iov.iov_len = sizeof(*ext_regs); - if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { - pr_perror("Failed to set FPU registers for %d", pid); - return -1; - } - return 0; -} - -/* - * Registers $4 ~ $11 represents arguments a0 ~ a7, especially a7 is - * used as syscall number. 
- */ -int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, - unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) -{ - int err; - user_regs_struct_t regs = ctl->orig.regs; - - regs.regs[11] = (unsigned long)nr; - regs.regs[4] = arg1; - regs.regs[5] = arg2; - regs.regs[6] = arg3; - regs.regs[7] = arg4; - regs.regs[8] = arg5; - regs.regs[9] = arg6; - err = compel_execute_syscall(ctl, ®s, code_syscall); - - *ret = regs.regs[4]; - - return err; -} - -void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) -{ - long map; - int err; - - err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset >> PAGE_SHIFT); - - if (err < 0 || IS_ERR_VALUE(map)) { - pr_err("remote mmap() failed: %s\n", strerror(-map)); - return NULL; - } - - return (void *)map; -} - -/* - * regs must be inited when calling this function from original context - */ -void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) -{ - regs->pc = new_ip; - if (stack) - regs->regs[4] = (unsigned long)stack; -} - -bool arch_can_dump_task(struct parasite_ctl *ctl) -{ - return true; -} - -int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) -{ - long ret; - int err; - - err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->rs_uc.uc_stack, 0, 0, 0, 0); - return err ? 
err : ret; -} - -/* - * TODO: add feature - */ -int ptrace_set_breakpoint(pid_t pid, void *addr) -{ - return 0; -} - -int ptrace_flush_breakpoints(pid_t pid) -{ - return 0; -} - -/* - * Refer to Linux kernel arch/loongarch/include/asm/processor.h - */ -#define TASK_SIZE32 (1UL) << 31 -#define TASK_SIZE64_MIN (1UL) << 40 -#define TASK_SIZE64_MAX (1UL) << 48 - -unsigned long compel_task_size(void) -{ - unsigned long task_size; - for (task_size = TASK_SIZE64_MIN; task_size < TASK_SIZE64_MAX; task_size <<= 1) - if (munmap((void *)task_size, page_size())) - break; - return task_size; -} diff --git a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl index ad3d44634..7a6db192c 100644 --- a/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/mips/plugins/std/syscalls/syscall_64.tbl @@ -84,7 +84,7 @@ __NR_sys_timer_settime 5217 sys_timer_settime (kernel_timer_t timer_id, int fl __NR_sys_timer_gettime 5218 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 5219 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 5220 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 5222 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) +__NR_clock_gettime 5222 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) __NR_exit_group 5205 sys_exit_group (int error_code) __NR_set_thread_area 5242 sys_set_thread_area (unsigned long *addr) __NR_openat 5247 sys_openat (int dfd, const char *filename, int flags, int mode) @@ -109,15 +109,9 @@ __NR_memfd_create 5314 sys_memfd_create (const char *name, unsigned int flags) __NR_userfaultfd 5317 sys_userfaultfd (int flags) ##TODO for kernel -__NR_open_tree 5428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) -__NR_move_mount 5429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) __NR_fsopen 5430 sys_fsopen (char 
*fsname, unsigned int flags) __NR_fsconfig 5431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 5432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 5435 sys_clone3 (struct clone_args *uargs, size_t size) -__NR_close_range 5436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 5434 sys_pidfd_open (pid_t pid, unsigned int flags) -__NR_openat2 5437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 5438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) -__NR_rseq 5327 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) -__NR_membarrier 5318 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/mips/src/lib/handle-elf.c b/compel/arch/mips/src/lib/handle-elf.c index e086761c2..a605a5a45 100644 --- a/compel/arch/mips/src/lib/handle-elf.c +++ b/compel/arch/mips/src/lib/handle-elf.c @@ -5,31 +5,18 @@ #include "piegen.h" #include "log.h" +static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + extern int __handle_elf(void *mem, size_t size); int handle_binary(void *mem, size_t size) { - Elf64_Ehdr *ehdr = (Elf64_Ehdr *)mem; + if (memcmp(mem, elf_ident_64_le, sizeof(elf_ident_64_le)) == 0) + return __handle_elf(mem, size); - /* check ELF magic */ - if (ehdr->e_ident[EI_MAG0] != ELFMAG0 || - ehdr->e_ident[EI_MAG1] != ELFMAG1 || - ehdr->e_ident[EI_MAG2] != ELFMAG2 || - ehdr->e_ident[EI_MAG3] != ELFMAG3) { - pr_err("Invalid ELF magic\n"); - return -EINVAL; - } - - /* check ELF class and data encoding */ - if (ehdr->e_ident[EI_CLASS] != ELFCLASS64 || - ehdr->e_ident[EI_DATA] != ELFDATA2LSB) { - pr_err("Unsupported ELF class or data encoding\n"); - return -EINVAL; - } - - if (ehdr->e_ident[EI_ABIVERSION] != 0) { - 
pr_warn("Unusual ABI version: %d\n", ehdr->e_ident[EI_ABIVERSION]); - } - - return __handle_elf(mem, size); + pr_err("Unsupported Elf format detected\n"); + return -EINVAL; } diff --git a/compel/arch/mips/src/lib/include/ldsodefs.h b/compel/arch/mips/src/lib/include/ldsodefs.h index 97e79755d..8cfde2496 100644 --- a/compel/arch/mips/src/lib/include/ldsodefs.h +++ b/compel/arch/mips/src/lib/include/ldsodefs.h @@ -69,8 +69,8 @@ struct La_mips_64_retval; /* An entry in a 64 bit SHT_REL section. */ typedef struct { - Elf32_Word r_sym; /* Symbol index */ - unsigned char r_ssym; /* Special symbol for 2nd relocation */ + Elf32_Word r_sym; /* Symbol index */ + unsigned char r_ssym; /* Special symbol for 2nd relocation */ unsigned char r_type3; /* 3rd relocation type */ unsigned char r_type2; /* 2nd relocation type */ unsigned char r_type1; /* 1st relocation type */ @@ -82,14 +82,14 @@ typedef union { } _Elf64_Mips_R_Info_union; typedef struct { - Elf64_Addr r_offset; /* Address */ + Elf64_Addr r_offset; /* Address */ _Elf64_Mips_R_Info_union r_info; /* Relocation type and symbol index */ } Elf64_Mips_Rel; typedef struct { - Elf64_Addr r_offset; /* Address */ + Elf64_Addr r_offset; /* Address */ _Elf64_Mips_R_Info_union r_info; /* Relocation type and symbol index */ - Elf64_Sxword r_addend; /* Addend */ + Elf64_Sxword r_addend; /* Addend */ } Elf64_Mips_Rela; #define ELF64_MIPS_R_SYM(i) ((__extension__(_Elf64_Mips_R_Info_union)(i)).r_info_fields.r_sym) diff --git a/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h b/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h index 481566a12..70b3f85a5 100644 --- a/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/mips/src/lib/include/uapi/asm/infect-types.h @@ -56,11 +56,10 @@ static inline bool user_regs_native(user_regs_struct_t *pregs) #define compel_arch_get_tls_task(ctl, tls) #define compel_arch_get_tls_thread(tctl, tls) -#define REG_RES(regs) ((regs).MIPS_v0) -#define REG_IP(regs) 
((regs).cp0_epc) -#define SET_REG_IP(regs, val) ((regs).cp0_epc = (val)) -#define REG_SP(regs) ((regs).MIPS_sp) -#define REG_SYSCALL_NR(regs) ((regs).MIPS_v0) +#define REG_RES(regs) ((regs).MIPS_v0) +#define REG_IP(regs) ((regs).cp0_epc) +#define REG_SP(regs) ((regs).MIPS_sp) +#define REG_SYSCALL_NR(regs) ((regs).MIPS_v0) //#define __NR(syscall, compat) ((compat) ? __NR32_##syscall : __NR_##syscall) #define __NR(syscall, compat) __NR_##syscall diff --git a/compel/arch/mips/src/lib/include/uapi/asm/siginfo.h b/compel/arch/mips/src/lib/include/uapi/asm/siginfo.h index 6db1ddbd3..82ae6096b 100644 --- a/compel/arch/mips/src/lib/include/uapi/asm/siginfo.h +++ b/compel/arch/mips/src/lib/include/uapi/asm/siginfo.h @@ -52,14 +52,14 @@ typedef struct siginfo { /* kill() */ struct { - __kernel_pid_t _pid; /* sender's pid */ + __kernel_pid_t _pid; /* sender's pid */ __ARCH_SI_UID_T _uid; /* sender's uid */ } _kill; /* POSIX.1b timers */ struct { __kernel_timer_t _tid; /* timer id */ - int _overrun; /* overrun count */ + int _overrun; /* overrun count */ char _pad[sizeof(__ARCH_SI_UID_T) - sizeof(int)]; sigval_t _sigval; /* same as below */ int _sys_private; /* not to be passed to user */ @@ -67,16 +67,16 @@ typedef struct siginfo { /* POSIX.1b signals */ struct { - __kernel_pid_t _pid; /* sender's pid */ + __kernel_pid_t _pid; /* sender's pid */ __ARCH_SI_UID_T _uid; /* sender's uid */ sigval_t _sigval; } _rt; /* SIGCHLD */ struct { - __kernel_pid_t _pid; /* which child */ + __kernel_pid_t _pid; /* which child */ __ARCH_SI_UID_T _uid; /* sender's uid */ - int _status; /* exit code */ + int _status; /* exit code */ __ARCH_SI_CLOCK_T _utime; __ARCH_SI_CLOCK_T _stime; } _sigchld; @@ -104,8 +104,8 @@ typedef struct siginfo { /* SIGSYS */ struct { - void *_call_addr; /* calling user insn */ - int _syscall; /* triggering system call number */ + void *_call_addr; /* calling user insn */ + int _syscall; /* triggering system call number */ unsigned int _arch; /* AUDIT_ARCH_* of 
syscall */ } _sigsys; } _sifields; diff --git a/compel/arch/mips/src/lib/infect.c b/compel/arch/mips/src/lib/infect.c index a1d4865cc..68d0a2728 100644 --- a/compel/arch/mips/src/lib/infect.c +++ b/compel/arch/mips/src/lib/infect.c @@ -24,7 +24,7 @@ */ const char code_syscall[] = { 0x0c, 0x00, 0x00, 0x00, /* syscall */ - 0x0d, 0x00, 0x00, 0x00 /* break */ + 0x0d, 0x00, 0x00, 0x00 /* break */ }; /* 10-byte legacy floating point register */ @@ -119,9 +119,10 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *xs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { + user_fpregs_struct_t xsave = {}, *xs = ext_regs ? ext_regs : &xsave; int ret = -1; pr_info("Dumping GP/FPU registers for %d\n", pid); @@ -149,7 +150,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct regs->regs[0] = 0; } - ret = save(pid, arg, regs, xs); + ret = save(arg, regs, xs); return ret; } diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl index 3deb41cf7..1bb626bc5 100644 --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl @@ -82,7 +82,7 @@ __NR_sys_timer_settime 241 sys_timer_settime (kernel_timer_t timer_id, int flag __NR_sys_timer_gettime 242 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 243 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 244 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 246 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) +__NR_clock_gettime 246 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) __NR_exit_group 234 sys_exit_group 
(int error_code) __NR_waitid 272 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_set_robust_list 300 sys_set_robust_list (struct robust_list_head *head, size_t len) @@ -108,15 +108,9 @@ __NR_gettimeofday 78 sys_gettimeofday (struct timeval *tv, struct timezone *tz) __NR_preadv 320 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) __NR_userfaultfd 364 sys_userfaultfd (int flags) __NR_ppoll 281 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) -__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) -__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) -__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) -__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) -__NR_rseq 387 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) -__NR_membarrier 365 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h index 25fc747e2..fe6192e20 100644 --- a/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h @@ -21,13 +21,13 @@ typedef struct { unsigned long xer; 
unsigned long ccr; unsigned long softe; /* Soft enabled/disabled */ - unsigned long trap; /* Reason for being here */ + unsigned long trap; /* Reason for being here */ /* * N.B. for critical exceptions on 4xx, the dar and dsisr * fields are overloaded to hold srr0 and srr1. */ - unsigned long dar; /* Fault registers */ - unsigned long dsisr; /* on 4xx/Book-E used for ESR */ + unsigned long dar; /* Fault registers */ + unsigned long dsisr; /* on 4xx/Book-E used for ESR */ unsigned long result; /* Result of a system call */ } user_regs_struct_t; @@ -72,11 +72,10 @@ typedef struct { } tm; } user_fpregs_struct_t; -#define REG_RES(regs) ((uint64_t)(regs).gpr[3]) -#define REG_IP(regs) ((uint64_t)(regs).nip) -#define SET_REG_IP(regs, val) ((regs).nip = (val)) -#define REG_SP(regs) ((uint64_t)(regs).gpr[1]) -#define REG_SYSCALL_NR(regs) ((uint64_t)(regs).gpr[0]) +#define REG_RES(regs) ((uint64_t)(regs).gpr[3]) +#define REG_IP(regs) ((uint64_t)(regs).nip) +#define REG_SP(regs) ((uint64_t)(regs).gpr[1]) +#define REG_SYSCALL_NR(regs) ((uint64_t)(regs).gpr[0]) #define user_regs_native(pregs) true diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h index 0c4ccb648..eb12c9f7c 100644 --- a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h @@ -14,7 +14,7 @@ */ #include -// XXX: the identifier rt_sigcontext is expected to be struct by the CRIU code +// XXX: the idetifier rt_sigcontext is expected to be struct by the CRIU code #define rt_sigcontext sigcontext #include @@ -23,11 +23,6 @@ /* Copied from the Linux kernel header arch/powerpc/include/asm/ptrace.h */ #define USER_REDZONE_SIZE 512 -#if _CALL_ELF != 2 -#error Only supporting ABIv2. 
-#else -#define STACK_FRAME_MIN_SIZE 32 -#endif /* Copied from the Linux kernel source file arch/powerpc/kernel/signal_64.c */ #define TRAMP_SIZE 6 diff --git a/compel/arch/ppc64/src/lib/infect.c b/compel/arch/ppc64/src/lib/infect.c index 54abd48a4..fc174d0dd 100644 --- a/compel/arch/ppc64/src/lib/infect.c +++ b/compel/arch/ppc64/src/lib/infect.c @@ -11,7 +11,6 @@ #include "log.h" #include "common/bug.h" #include "common/page.h" -#include "common/err.h" #include "infect.h" #include "infect-priv.h" @@ -31,7 +30,7 @@ unsigned __page_shift = 0; */ const uint32_t code_syscall[] = { 0x44000002, /* sc */ - 0x0fe00000 /* twi 31,0,0 */ + 0x0fe00000 /* twi 31,0,0 */ }; static inline __always_unused void __check_code_syscall(void) @@ -304,58 +303,33 @@ out_free: return -1; /* still failing the checkpoint */ } -/* - * This is inspired by kernel function check_syscall_restart in - * arch/powerpc/kernel/signal.c - */ - -#ifndef TRAP -#define TRAP(r) ((r).trap & ~0xF) -#endif - -static bool trap_is_scv(user_regs_struct_t *regs) -{ - return TRAP(*regs) == 0x3000; -} - -static bool trap_is_syscall(user_regs_struct_t *regs) -{ - return trap_is_scv(regs) || TRAP(*regs) == 0x0C00; -} - -static void handle_syscall(pid_t pid, user_regs_struct_t *regs) -{ - unsigned long ret = regs->gpr[3]; - - if (trap_is_scv(regs)) { - if (!IS_ERR_VALUE(ret)) - return; - ret = -ret; - } else if (!(regs->ccr & 0x10000000)) { - return; - } - - /* Restart or interrupt the system call */ - switch (ret) { - case ERESTARTNOHAND: - case ERESTARTSYS: - case ERESTARTNOINTR: - regs->gpr[3] = regs->orig_gpr3; - regs->nip -= 4; - break; - case ERESTART_RESTARTBLOCK: - pr_warn("Will restore %d with interrupted system call\n", pid); - regs->gpr[3] = trap_is_scv(regs) ? 
-EINTR : EINTR; - break; - } -} - static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { pr_info("Dumping GP/FPU registers for %d\n", pid); - if (trap_is_syscall(regs)) - handle_syscall(pid, regs); + /* + * This is inspired by kernel function check_syscall_restart in + * arch/powerpc/kernel/signal.c + */ +#ifndef TRAP +#define TRAP(r) ((r).trap & ~0xF) +#endif + + if (TRAP(*regs) == 0x0C00 && regs->ccr & 0x10000000) { + /* Restart the system call */ + switch (regs->gpr[3]) { + case ERESTARTNOHAND: + case ERESTARTSYS: + case ERESTARTNOINTR: + regs->gpr[3] = regs->orig_gpr3; + regs->nip -= 4; + break; + case ERESTART_RESTARTBLOCK: + pr_warn("Will restore %d with interrupted system call\n", pid); + regs->gpr[3] = EINTR; + break; + } + } /* Resetting trap since we are now coming from user space. */ regs->trap = 0; @@ -391,16 +365,17 @@ static int __get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_stru return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { + user_fpregs_struct_t tmp, *fpregs = ext_regs ? ext_regs : &tmp; int ret; ret = __get_task_regs(pid, regs, fpregs); if (ret) return ret; - return save(pid, arg, regs, fpregs); + return save(arg, regs, fpregs); } int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) @@ -466,13 +441,13 @@ void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) { /* - * OpenPOWER ABI requires that r12 is set to the calling function address + * OpenPOWER ABI requires that r12 is set to the calling function addressi * to compute the TOC pointer. 
*/ regs->gpr[12] = new_ip; regs->nip = new_ip; if (stack) - regs->gpr[1] = (unsigned long)stack - STACK_FRAME_MIN_SIZE; + regs->gpr[1] = (unsigned long)stack; regs->trap = 0; } diff --git a/compel/arch/riscv64/plugins/include/asm/prologue.h b/compel/arch/riscv64/plugins/include/asm/prologue.h deleted file mode 100644 index 5c22b7b06..000000000 --- a/compel/arch/riscv64/plugins/include/asm/prologue.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef __ASM_PROLOGUE_H__ -#define __ASM_PROLOGUE_H__ - -#ifndef __ASSEMBLY__ - -#include -#include -#include - -#include - -#define sys_recv(sockfd, ubuf, size, flags) sys_recvfrom(sockfd, ubuf, size, flags, NULL, NULL) - -typedef struct prologue_init_args { - struct sockaddr_un ctl_sock_addr; - unsigned int ctl_sock_addr_len; - - unsigned int arg_s; - void *arg_p; - - void *sigframe; -} prologue_init_args_t; - -#endif /* __ASSEMBLY__ */ - -/* - * Reserve enough space for sigframe. - * - * FIXME It is rather should be taken from sigframe header. - */ -#define PROLOGUE_SGFRAME_SIZE 4096 - -#define PROLOGUE_INIT_ARGS_SIZE 1024 - -#endif /* __ASM_PROLOGUE_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/include/asm/syscall-types.h b/compel/arch/riscv64/plugins/include/asm/syscall-types.h deleted file mode 100644 index b9740a9ee..000000000 --- a/compel/arch/riscv64/plugins/include/asm/syscall-types.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef COMPEL_ARCH_SYSCALL_TYPES_H__ -#define COMPEL_ARCH_SYSCALL_TYPES_H__ - -#define SA_RESTORER 0x04000000 - -typedef void rt_signalfn_t(int, siginfo_t *, void *); -typedef rt_signalfn_t *rt_sighandler_t; - -typedef void rt_restorefn_t(void); -typedef rt_restorefn_t *rt_sigrestore_t; - -#define _KNSIG 64 // number of signals -#define _NSIG_BPW 64 // number of signals per word - -#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW) - -typedef struct { - unsigned long sig[_KNSIG_WORDS]; -} k_rtsigset_t; - -typedef struct { - rt_sighandler_t rt_sa_handler; - unsigned long rt_sa_flags; - 
rt_sigrestore_t rt_sa_restorer; - k_rtsigset_t rt_sa_mask; -} rt_sigaction_t; - -#endif /* COMPEL_ARCH_SYSCALL_TYPES_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/include/features.h b/compel/arch/riscv64/plugins/include/features.h deleted file mode 100644 index 274cee52a..000000000 --- a/compel/arch/riscv64/plugins/include/features.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef __COMPEL_ARCH_FEATURES_H -#define __COMPEL_ARCH_FEATURES_H - -#endif /* __COMPEL_ARCH_FEATURES_H */ \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/parasite-head.S b/compel/arch/riscv64/plugins/std/parasite-head.S deleted file mode 100644 index 3e9d272e3..000000000 --- a/compel/arch/riscv64/plugins/std/parasite-head.S +++ /dev/null @@ -1,7 +0,0 @@ -#include "common/asm/linkage.h" - - .section .head.text, "ax" -ENTRY(__export_parasite_head_start) - jal parasite_service - ebreak -END(__export_parasite_head_start) \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls b/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls deleted file mode 100644 index 5af35bcb4..000000000 --- a/compel/arch/riscv64/plugins/std/syscalls/Makefile.syscalls +++ /dev/null @@ -1,59 +0,0 @@ -ccflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ -asflags-y += -iquote $(PLUGIN_ARCH_DIR)/std/syscalls/ - -sys-types := $(obj)/include/uapi/std/syscall-types.h -sys-codes := $(obj)/include/uapi/std/syscall-codes.h -sys-proto := $(obj)/include/uapi/std/syscall.h - -sys-def := $(PLUGIN_ARCH_DIR)/std/syscalls/syscall.def -sys-asm-common-name := std/syscalls/syscall-common.S -sys-asm-common := $(PLUGIN_ARCH_DIR)/$(sys-asm-common-name) -sys-asm-types := $(obj)/include/uapi/std/asm/syscall-types.h -sys-exec-tbl = $(PLUGIN_ARCH_DIR)/std/sys-exec-tbl.c - -sys-gen := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-syscalls.pl -sys-gen-tbl := $(PLUGIN_ARCH_DIR)/std/syscalls/gen-sys-exec-tbl.pl - -sys-asm := 
./$(PLUGIN_ARCH_DIR)/std/syscalls/syscalls.S -std-lib-y += $(sys-asm:.S=).o - -ifeq ($(ARCH),arm) -arch_bits := 32 -else -arch_bits := 64 -endif - -sys-exec-tbl := sys-exec-tbl.c - -$(sys-asm) $(sys-types) $(sys-codes) $(sys-proto): $(sys-gen) $(sys-def) $(sys-asm-common) $(sys-asm-types) - $(E) " GEN " $@ - $(Q) perl \ - $(sys-gen) \ - $(sys-def) \ - $(sys-codes) \ - $(sys-proto) \ - $(sys-asm) \ - $(sys-asm-common-name) \ - $(sys-types) \ - $(arch_bits) - -$(sys-asm:.S=).o: $(sys-asm) - -$(sys-exec-tbl): $(sys-gen-tbl) $(sys-def) - $(E) " GEN " $@ - $(Q) perl \ - $(sys-gen-tbl) \ - $(sys-def) \ - $(sys-exec-tbl) \ - $(arch_bits) - -$(sys-asm-types): $(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h - $(call msg-gen, $@) - $(Q) ln -s ../../../../../../$(PLUGIN_ARCH_DIR)/include/asm/syscall-types.h $(sys-asm-types) - $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.S $(obj)/include/uapi/std/syscall-aux.S - $(Q) ln -s ../../../../../$(PLUGIN_ARCH_DIR)/std/syscalls/syscall-aux.h $(obj)/include/uapi/std/syscall-aux.h - -std-headers-deps += $(sys-asm) $(sys-codes) $(sys-proto) $(sys-asm-types) $(sys-codes) -mrproper-y += $(std-headers-deps) -mrproper-y += $(obj)/include/uapi/std/syscall-aux.S -mrproper-y += $(obj)/include/uapi/std/syscall-aux.h \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl b/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl deleted file mode 100755 index 61a807eb6..000000000 --- a/compel/arch/riscv64/plugins/std/syscalls/gen-sys-exec-tbl.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; - -my $in = $ARGV[0]; -my $tblout = $ARGV[1]; -my $bits = $ARGV[2]; - -my $code = "code$bits"; - -open TBLOUT, ">", $tblout or die $!; -open IN, "<", $in or die $!; - -print TBLOUT "/* Autogenerated, don't edit */\n"; -print TBLOUT "static struct syscall_exec_desc sc_exec_table[] = {\n"; - -for () { - if ($_ =~ /\#/) { - next; - } - - my $sys_name; - my 
$sys_num; - - if (/(?\S+)\s+(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { - $sys_name = $+{alias}; - } elsif (/(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { - $sys_name = $+{name}; - } else { - unlink $tblout; - die "Invalid syscall definition file: invalid entry $_\n"; - } - - $sys_num = $+{$code}; - - if ($sys_num ne "!") { - print TBLOUT "SYSCALL($sys_name, $sys_num)\n"; - } -} - -print TBLOUT " { }, /* terminator */"; -print TBLOUT "};" \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl b/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl deleted file mode 100755 index a53f1962f..000000000 --- a/compel/arch/riscv64/plugins/std/syscalls/gen-syscalls.pl +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; - -my $in = $ARGV[0]; -my $codesout = $ARGV[1]; -my $codes = $ARGV[1]; -$codes =~ s/.*include\/uapi\//compel\/plugins\//g; -my $protosout = $ARGV[2]; -my $protos = $ARGV[2]; -$protos =~ s/.*include\/uapi\//compel\/plugins\//g; -my $asmout = $ARGV[3]; -my $asmcommon = $ARGV[4]; -my $prototypes = $ARGV[5]; -$prototypes =~ s/.*include\/uapi\//compel\/plugins\//g; -my $bits = $ARGV[6]; - -my $codesdef = $codes; -$codesdef =~ tr/.\-\//_/; -my $protosdef = $protos; -$protosdef =~ tr/.\-\//_/; -my $code = "code$bits"; -my $need_aux = 0; - -unlink $codesout; -unlink $protosout; -unlink $asmout; - -open CODESOUT, ">", $codesout or die $!; -open PROTOSOUT, ">", $protosout or die $!; -open ASMOUT, ">", $asmout or die $!; -open IN, "<", $in or die $!; - -print CODESOUT <<"END"; -/* Autogenerated, don't edit */ -#ifndef $codesdef -#define $codesdef -END - -print PROTOSOUT <<"END"; -/* Autogenerated, don't edit */ -#ifndef $protosdef -#define $protosdef -#include <$prototypes> -#include <$codes> -END - -print ASMOUT <<"END"; -/* Autogenerated, don't edit */ -#include <$codes> -#include "$asmcommon" -END - - -for () { - if ($_ =~ /\#/) { - next; - } - - my $code_macro; - my $sys_macro; - my 
$sys_name; - - if (/(?\S+)\s+(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { - $code_macro = "__NR_$+{name}"; - $sys_macro = "SYS_$+{name}"; - $sys_name = "sys_$+{alias}"; - } elsif (/(?\S+)\s+(?\d+|\!)\s+(?(?:\d+|\!))\s+\((?.+)\)/) { - $code_macro = "__NR_$+{name}"; - $sys_macro = "SYS_$+{name}"; - $sys_name = "sys_$+{name}"; - } else { - unlink $codesout; - unlink $protosout; - unlink $asmout; - - die "Invalid syscall definition file: invalid entry $_\n"; - } - - if ($+{$code} ne "!") { - print CODESOUT "#ifndef $code_macro\n#define $code_macro $+{$code}\n#endif\n"; - print CODESOUT "#ifndef $sys_macro\n#define $sys_macro $code_macro\n#endif\n"; - print ASMOUT "syscall $sys_name, $code_macro\n"; - - } else { - $need_aux = 1; - } - - print PROTOSOUT "extern long $sys_name($+{args});\n"; -} - -if ($need_aux == 1) { - print ASMOUT "#include \n"; - print CODESOUT "#include \n"; -} - -print CODESOUT "#endif /* $codesdef */"; -print PROTOSOUT "#endif /* $protosdef */"; \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S deleted file mode 100644 index 04160b7ac..000000000 --- a/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.S +++ /dev/null @@ -1,37 +0,0 @@ -/** - * This source contains emulation of syscalls - * that are not implemented in the riscv64 Linux kernel - */ - -ENTRY(sys_open) - add a3, x0, a2 - add a2, x0, a1 - add a1, x0, a0 - addi a0, x0, -100 - j sys_openat -END(sys_open) - - -ENTRY(sys_mkdir) - add a3,x0, a2 - add a2, x0, a1 - add a1, x0, a0 - addi a0, x0, -100 - j sys_mkdirat -END(sys_mkdir) - - -ENTRY(sys_rmdir) - addi a2, x0, 0x200 // flags = AT_REMOVEDIR - add a1, x0, a0 - addi a0, x0, -100 - j sys_unlinkat -END(sys_rmdir) - - -ENTRY(sys_unlink) - addi a2, x0, 0 // flags = 0 - add a1, x0, a0 - addi a0, x0, -100 - j sys_unlinkat -END(sys_unlink) \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h 
b/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h deleted file mode 100644 index 881765bbb..000000000 --- a/compel/arch/riscv64/plugins/std/syscalls/syscall-aux.h +++ /dev/null @@ -1,3 +0,0 @@ -#ifndef __NR_openat -#define __NR_openat 56 -#endif \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S b/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S deleted file mode 100644 index fdef3b47a..000000000 --- a/compel/arch/riscv64/plugins/std/syscalls/syscall-common.S +++ /dev/null @@ -1,17 +0,0 @@ -#include "common/asm/linkage.h" - -syscall_common: - ecall - ret - -.macro syscall name, nr - ENTRY(\name) - li a7, \nr - j syscall_common - END(\name) -.endm - -ENTRY(__cr_restore_rt) - li a7, __NR_rt_sigreturn - ecall -END(__cr_restore_rt) \ No newline at end of file diff --git a/compel/arch/riscv64/plugins/std/syscalls/syscall.def b/compel/arch/riscv64/plugins/std/syscalls/syscall.def deleted file mode 100644 index 967f097f9..000000000 --- a/compel/arch/riscv64/plugins/std/syscalls/syscall.def +++ /dev/null @@ -1,125 +0,0 @@ -# -# System calls table, please make sure the table consists of only the syscalls -# really used somewhere in the project. -# -# The template is (name and arguments are optional if you need only __NR_x -# defined, but no real entry point in syscalls lib). -# -# name/alias code64 code32 arguments -# ----------------------------------------------------------------------- -# -read 63 3 (int fd, void *buf, unsigned long count) -write 64 4 (int fd, const void *buf, unsigned long count) -open ! 5 (const char *filename, unsigned long flags, unsigned long mode) -close 57 6 (int fd) -lseek 62 19 (int fd, unsigned long offset, unsigned long origin) -mmap 222 ! 
(void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) -mprotect 226 125 (const void *addr, unsigned long len, unsigned long prot) -munmap 215 91 (void *addr, unsigned long len) -brk 214 45 (void *addr) -rt_sigaction sigaction 134 174 (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize) -rt_sigprocmask sigprocmask 135 175 (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize) -rt_sigreturn 139 173 (void) -ioctl 29 54 (unsigned int fd, unsigned int cmd, unsigned long arg) -pread64 67 180 (unsigned int fd, char *buf, size_t count, loff_t pos) -ptrace 117 26 (long request, pid_t pid, void *addr, void *data) -mremap 216 163 (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flag, unsigned long new_addr) -mincore 232 219 (void *addr, unsigned long size, unsigned char *vec) -madvise 233 220 (unsigned long start, size_t len, int behavior) -shmat 196 305 (int shmid, void *shmaddr, int shmflag) -pause 1061 29 (void) -nanosleep 101 162 (struct timespec *req, struct timespec *rem) -getitimer 102 105 (int which, const struct itimerval *val) -setitimer 103 104 (int which, const struct itimerval *val, struct itimerval *old) -getpid 172 20 (void) -socket 198 281 (int domain, int type, int protocol) -connect 203 283 (int sockfd, struct sockaddr *addr, int addrlen) -sendto 206 290 (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len) -recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len) -sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags) -recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags) -shutdown 210 293 (int sockfd, int how) -bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen) -setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen) -getsockopt 209 295 (int sockfd, int level, 
int optname, const void *optval, socklen_t *optlen) -clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, unsigned long newtls, void *child_tid) -exit 93 1 (unsigned long error_code) -wait4 260 114 (int pid, int *status, int options, struct rusage *ru) -waitid 95 280 (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) -kill 129 37 (long pid, int sig) -fcntl 25 55 (int fd, int type, long arg) -flock 32 143 (int fd, unsigned long cmd) -mkdir ! 39 (const char *name, int mode) -rmdir ! 40 (const char *name) -unlink ! 10 (char *pathname) -readlinkat 78 332 (int fd, const char *path, char *buf, int bufsize) -umask 166 60 (int mask) -getgroups 158 205 (int gsize, unsigned int *groups) -setgroups 159 206 (int gsize, unsigned int *groups) -setresuid 147 164 (int uid, int euid, int suid) -getresuid 148 165 (int *uid, int *euid, int *suid) -setresgid 149 170 (int gid, int egid, int sgid) -getresgid 150 171 (int *gid, int *egid, int *sgid) -getpgid 155 132 (pid_t pid) -setfsuid 151 138 (int fsuid) -setfsgid 152 139 (int fsgid) -getsid 156 147 (void) -capget 90 184 (struct cap_header *h, struct cap_data *d) -capset 91 185 (struct cap_header *h, struct cap_data *d) -rt_sigqueueinfo 138 178 (pid_t pid, int sig, siginfo_t *info) -setpriority 140 97 (int which, int who, int nice) -sched_setscheduler 119 156 (int pid, int policy, struct sched_param *p) -sigaltstack 132 186 (const void *uss, void *uoss) -personality 92 136 (unsigned int personality) -prctl 167 172 (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) -arch_prctl ! 
17 (int option, unsigned long addr) -setrlimit 164 75 (int resource, struct krlimit *rlim) -mount 40 21 (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data) -umount2 39 52 (char *name, int flags) -gettid 178 224 (void) -futex 98 240 (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) -set_tid_address 96 256 (int *tid_addr) -restart_syscall 128 0 (void) -timer_create 107 257 (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) -timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting) -timer_gettime 108 259 (int timer_id, const struct itimerspec *setting) -timer_getoverrun 109 260 (int timer_id) -timer_delete 111 261 (kernel_timer_t timer_id) -clock_gettime 113 263 (clockid_t which_clock, struct timespec *tp) -exit_group 94 248 (int error_code) -set_robust_list 99 338 (struct robust_list_head *head, size_t len) -get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) -signalfd4 74 355 (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) -rt_tgsigqueueinfo 240 363 (pid_t tgid, pid_t pid, int sig, siginfo_t *info) -vmsplice 75 343 (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) -timerfd_settime 86 353 (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) -fanotify_init 262 367 (unsigned int flags, unsigned int event_f_flags) -fanotify_mark 263 368 (int fanotify_fd, unsigned int flags, uint64_t mask, int dfd, const char *pathname) -open_by_handle_at 265 371 (int mountdirfd, struct file_handle *handle, int flags) -setns 268 375 (int fd, int nstype) -kcmp 272 378 (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) -openat 56 322 (int dirfd, const char *pathname, int flags, mode_t mode) -mkdirat 34 323 (int dirfd, const char *pathname, mode_t mode) -unlinkat 35 328 (int dirfd, const 
char *pathname, int flags) -memfd_create 279 385 (const char *name, unsigned int flags) -io_setup 0 243 (unsigned nr_events, aio_context_t *ctx) -io_submit 2 246 (aio_context_t ctx_id, long nr, struct iocb **iocbpp) -io_getevents 4 245 (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) -seccomp 277 383 (unsigned int op, unsigned int flags, const char *uargs) -gettimeofday 169 78 (struct timeval *tv, struct timezone *tz) -preadv_raw 69 361 (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) -userfaultfd 282 388 (int flags) -fallocate 47 352 (int fd, int mode, loff_t offset, loff_t len) -cacheflush ! 983042 (void *start, void *end, int flags) -ppoll 73 336 (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) -fsopen 430 430 (char *fsname, unsigned int flags) -fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) -fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) -clone3 435 435 (struct clone_args *uargs, size_t size) -pidfd_open 434 434 (pid_t pid, unsigned int flags) -pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) -rseq 293 293 (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) -move_mount 429 429 (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) -open_tree 428 428 (int dirfd, const char *pathname, unsigned int flags) -openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size) -membarrier 283 283 (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/riscv64/scripts/compel-pack.lds.S b/compel/arch/riscv64/scripts/compel-pack.lds.S deleted file mode 100644 index a61235b44..000000000 --- a/compel/arch/riscv64/scripts/compel-pack.lds.S +++ /dev/null @@ -1,32 +0,0 @@ -OUTPUT_ARCH(riscv) -EXTERN(__export_parasite_head_start) - -SECTIONS -{ - .crblob 0x0 : { - *(.head.text) - 
ASSERT(DEFINED(__export_parasite_head_start), - "Symbol __export_parasite_head_start is missing"); - *(.text*) - . = ALIGN(32); - *(.data*) - . = ALIGN(32); - *(.rodata*) - . = ALIGN(32); - *(.bss*) - . = ALIGN(32); - *(.got*) - . = ALIGN(32); - *(.toc*) - . = ALIGN(32); - } =0x00000000, - - /DISCARD/ : { - *(.debug*) - *(.comment*) - *(.note*) - *(.group*) - *(.eh_frame*) - *(*) - } -} \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/cpu.c b/compel/arch/riscv64/src/lib/cpu.c deleted file mode 100644 index 9a0291f70..000000000 --- a/compel/arch/riscv64/src/lib/cpu.c +++ /dev/null @@ -1,78 +0,0 @@ -#include -#include - -#include "compel-cpu.h" - -#include "common/bitops.h" - -#include "log.h" - -#undef LOG_PREFIX -#define LOG_PREFIX "cpu: " - -static compel_cpuinfo_t rt_info; - -static void fetch_rt_cpuinfo(void) -{ - static bool rt_info_done = false; - - if (!rt_info_done) { - compel_cpuid(&rt_info); - rt_info_done = true; - } -} - -void compel_set_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) -{ -} -void compel_clear_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) -{ -} -int compel_test_cpu_cap(compel_cpuinfo_t *info, unsigned int feature) -{ - return 0; -} -int compel_test_fpu_cap(compel_cpuinfo_t *info, unsigned int feature) -{ - return 0; -} -int compel_cpuid(compel_cpuinfo_t *info) -{ - return 0; -} - -bool compel_cpu_has_feature(unsigned int feature) -{ - fetch_rt_cpuinfo(); - return compel_test_cpu_cap(&rt_info, feature); -} - -bool compel_fpu_has_feature(unsigned int feature) -{ - fetch_rt_cpuinfo(); - return compel_test_fpu_cap(&rt_info, feature); -} - -uint32_t compel_fpu_feature_size(unsigned int feature) -{ - fetch_rt_cpuinfo(); - return 0; -} - -uint32_t compel_fpu_feature_offset(unsigned int feature) -{ - fetch_rt_cpuinfo(); - return 0; -} - -void compel_cpu_clear_feature(unsigned int feature) -{ - fetch_rt_cpuinfo(); - return compel_clear_cpu_cap(&rt_info, feature); -} - -void compel_cpu_copy_cpuinfo(compel_cpuinfo_t 
*c) -{ - fetch_rt_cpuinfo(); - memcpy(c, &rt_info, sizeof(rt_info)); -} \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/handle-elf-host.c b/compel/arch/riscv64/src/lib/handle-elf-host.c deleted file mode 120000 index fe4611886..000000000 --- a/compel/arch/riscv64/src/lib/handle-elf-host.c +++ /dev/null @@ -1 +0,0 @@ -handle-elf.c \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/handle-elf.c b/compel/arch/riscv64/src/lib/handle-elf.c deleted file mode 100644 index 22420bc78..000000000 --- a/compel/arch/riscv64/src/lib/handle-elf.c +++ /dev/null @@ -1,32 +0,0 @@ -#include -#include - -#include "handle-elf.h" -#include "piegen.h" -#include "log.h" - -static const unsigned char __maybe_unused elf_ident_64_le[EI_NIDENT] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, /* clang-format */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -}; - -static const unsigned char __maybe_unused elf_ident_64_be[EI_NIDENT] = { - 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00, /* clang-format */ - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, -}; - -int handle_binary(void *mem, size_t size) -{ - const unsigned char *elf_ident = -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - elf_ident_64_le; -#else - elf_ident_64_be; -#endif - - if (memcmp(mem, elf_ident, sizeof(elf_ident_64_le)) == 0) - return handle_elf_riscv64(mem, size); - - pr_err("Unsupported Elf format detected\n"); - return -EINVAL; -} \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/handle-elf.h b/compel/arch/riscv64/src/lib/include/handle-elf.h deleted file mode 100644 index 582770583..000000000 --- a/compel/arch/riscv64/src/lib/include/handle-elf.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef COMPEL_HANDLE_ELF_H__ -#define COMPEL_HANDLE_ELF_H__ - -#include "elf64-types.h" - -#define __handle_elf handle_elf_riscv64 -#define ELF_RISCV -#define arch_is_machine_supported(e_machine) (e_machine == EM_RISCV) - -extern int handle_elf_riscv64(void *mem, size_t 
size); - -#endif /* COMPEL_HANDLE_ELF_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/syscall.h b/compel/arch/riscv64/src/lib/include/syscall.h deleted file mode 100644 index 53f10525d..000000000 --- a/compel/arch/riscv64/src/lib/include/syscall.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __COMPEL_SYSCALL_H__ -#define __COMPEL_SYSCALL_H__ -#define __NR(syscall, compat) \ - ({ \ - (void)compat; \ - __NR_##syscall; \ - }) -#endif \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h b/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h deleted file mode 100644 index f2ba799cb..000000000 --- a/compel/arch/riscv64/src/lib/include/uapi/asm/breakpoints.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef __COMPEL_BREAKPOINTS_H__ -#define __COMPEL_BREAKPOINTS_H__ -#define ARCH_SI_TRAP TRAP_BRKPT - -static inline int ptrace_set_breakpoint(pid_t pid, void *addr) -{ - return 0; -} - -static inline int ptrace_flush_breakpoints(pid_t pid) -{ - return 0; -} - -#endif \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h b/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h deleted file mode 100644 index ac58567e3..000000000 --- a/compel/arch/riscv64/src/lib/include/uapi/asm/cpu.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef UAPI_COMPEL_ASM_CPU_H__ -#define UAPI_COMPEL_ASM_CPU_H__ - -typedef struct { -} compel_cpuinfo_t; - -#endif /* UAPI_COMPEL_ASM_CPU_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h b/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h deleted file mode 100644 index a74decc23..000000000 --- a/compel/arch/riscv64/src/lib/include/uapi/asm/fpu.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef __CR_ASM_FPU_H__ -#define __CR_ASM_FPU_H__ - -#endif /* __CR_ASM_FPU_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h 
b/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h deleted file mode 100644 index 192810cac..000000000 --- a/compel/arch/riscv64/src/lib/include/uapi/asm/infect-types.h +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef UAPI_COMPEL_ASM_TYPES_H__ -#define UAPI_COMPEL_ASM_TYPES_H__ - -#include -#include -#include -#include - -#define SIGMAX 64 -#define SIGMAX_OLD 31 - -/* - * Copied from the Linux kernel header arch/riscv/include/uapi/asm/ptrace.h - * - * A thread RISC-V CPU context - */ -typedef struct user_regs_struct user_regs_struct_t; -typedef struct __riscv_d_ext_state user_fpregs_struct_t; - -#define __compel_arch_fetch_thread_area(tid, th) 0 -#define compel_arch_fetch_thread_area(tctl) 0 -#define compel_arch_get_tls_task(ctl, tls) -#define compel_arch_get_tls_thread(tctl, tls) - -#define REG_RES(registers) ((uint64_t)(registers).a0) -#define REG_IP(registers) ((uint64_t)(registers).pc) -#define SET_REG_IP(registers, val) ((registers).pc = (val)) - -/* - * REG_SP is also defined in riscv64-linux-gnu/include/sys/ucontext.h - * with a different meaning, and it's not used in CRIU. So we have to - * undefine it here. 
- */ -#ifdef REG_SP -#undef REG_SP -#endif - -#define REG_SP(registers) ((uint64_t)((registers).sp)) - -#define REG_SYSCALL_NR(registers) ((uint64_t)(registers).a7) - -#define user_regs_native(pregs) true - -#define ARCH_SI_TRAP TRAP_BRKPT - -#define __NR(syscall, compat) \ - ({ \ - (void)compat; \ - __NR_##syscall; \ - }) - -#endif /* UAPI_COMPEL_ASM_TYPES_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h b/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h deleted file mode 100644 index e231d0465..000000000 --- a/compel/arch/riscv64/src/lib/include/uapi/asm/instruction_formats.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef COMPEL_RELOCATIONS_H__ -#define COMPEL_RELOCATIONS_H__ - -#include - -static inline uint32_t riscv_b_imm(uint32_t val) -{ - return (val & 0x00001000) << 19 | (val & 0x000007e0) << 20 | (val & 0x0000001e) << 7 | (val & 0x00000800) >> 4; -} - -static inline uint32_t riscv_i_imm(uint32_t val) -{ - return val << 20; -} - -static inline uint32_t riscv_u_imm(uint32_t val) -{ - return val & 0xfffff000; -} - -static inline uint32_t riscv_j_imm(uint32_t val) -{ - return (val & 0x00100000) << 11 | (val & 0x000007fe) << 20 | (val & 0x00000800) << 9 | (val & 0x000ff000); -} - -#endif /* COMPEL_RELOCATIONS_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h b/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h deleted file mode 100644 index e40fb6fce..000000000 --- a/compel/arch/riscv64/src/lib/include/uapi/asm/processor-flags.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ -#define UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ - -#endif /* UAPI_COMPEL_ASM_PROCESSOR_FLAGS_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h deleted file mode 100644 index 761a08f62..000000000 --- 
a/compel/arch/riscv64/src/lib/include/uapi/asm/sigframe.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef UAPI_COMPEL_ASM_SIGFRAME_H__ -#define UAPI_COMPEL_ASM_SIGFRAME_H__ - -#include - -#include - -#include - -/* Copied from the kernel header arch/riscv/include/uapi/asm/sigcontext.h */ -/* - * Signal context structure - * - * This contains the context saved before a signal handler is invoked; - * it is restored by sys_sigreturn / sys_rt_sigreturn. - */ -// struct sigcontext { -// struct user_regs_struct sc_regs; -// union __riscv_fp_state sc_fpregs; -// /* -// * 4K + 128 reserved for vector state and future expansion. -// * This space is enough to store the vector context whose VLENB -// * is less or equal to 128. -// * (The size of the vector context is 4144 byte as VLENB is 128) -// */ -// __u8 __reserved[4224] __attribute__((__aligned__(16))); -// }; - -#define rt_sigcontext sigcontext - -#include - -/* Copied from the kernel source arch/riscv/kernel/signal.c */ -struct rt_sigframe { - siginfo_t info; - ucontext_t uc; //ucontext_t structure holds the user context, e.g., the signal mask, GP regs -}; - -/* - generates inline assembly code for triggering the rt_sigreturn system call. - used to return from a signal handler back to the normal execution flow of the process. 
-*/ -/* clang-format off */ -#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ - asm volatile( \ - "mv sp, %0\n" \ - "li a7, "__stringify(__NR_rt_sigreturn)" \n" \ - "ecall\n" \ - : \ - : "r"(new_sp) \ - : "a7", "memory") -/* clang-format on */ - -#define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) -#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.__gregs[REG_PC]) -#define RT_SIGFRAME_HAS_FPU(rt_sigframe) 1 -#define RT_SIGFRAME_OFFSET(rt_sigframe) 0 - -// #define RT_SIGFRAME_SIGCONTEXT(rt_sigframe) ((struct cr_sigcontext *)&(rt_sigframe)->uc.uc_mcontext) -// #define RT_SIGFRAME_AUX_CONTEXT(rt_sigframe) ((struct sigcontext *)&(RT_SIGFRAME_SIGCONTEXT(rt_sigframe)->__reserved)) -// #define RT_SIGFRAME_FPU(rt_sigframe) (&RT_SIGFRAME_AUX_CONTEXT(rt_sigframe)->fpsimd) - -#define rt_sigframe_erase_sigset(sigframe) \ - memset(&sigframe->uc.uc_sigmask, 0, sizeof(k_rtsigset_t)) // erase the signal mask -#define rt_sigframe_copy_sigset(sigframe, from) \ - memcpy(&sigframe->uc.uc_sigmask, from, sizeof(k_rtsigset_t)) // copy the signal mask - -#endif /* UAPI_COMPEL_ASM_SIGFRAME_H__ */ \ No newline at end of file diff --git a/compel/arch/riscv64/src/lib/infect.c b/compel/arch/riscv64/src/lib/infect.c deleted file mode 100644 index 3f3a4b7ec..000000000 --- a/compel/arch/riscv64/src/lib/infect.c +++ /dev/null @@ -1,224 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "common/page.h" -#include "uapi/compel/asm/infect-types.h" -#include "log.h" -#include "errno.h" -#include "infect.h" -#include "infect-priv.h" - -unsigned __page_size = 0; -unsigned __page_shift = 0; - -/* - * Injected syscall instruction - */ -const char code_syscall[] = { - 0x73, 0x00, 0x00, 0x00, /* ecall */ - 0x73, 0x00, 0x10, 0x00 /* ebreak */ -}; - -static const int code_syscall_aligned = round_up(sizeof(code_syscall), sizeof(long)); - -static inline void __always_unused __check_code_syscall(void) -{ - BUILD_BUG_ON(code_syscall_aligned != 
BUILTIN_SYSCALL_SIZE); - BUILD_BUG_ON(!is_log2(sizeof(code_syscall))); -} - -int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) -{ - sigframe->uc.uc_mcontext.__gregs[0] = regs->pc; - sigframe->uc.uc_mcontext.__gregs[1] = regs->ra; - sigframe->uc.uc_mcontext.__gregs[2] = regs->sp; - sigframe->uc.uc_mcontext.__gregs[3] = regs->gp; - sigframe->uc.uc_mcontext.__gregs[4] = regs->tp; - sigframe->uc.uc_mcontext.__gregs[5] = regs->t0; - sigframe->uc.uc_mcontext.__gregs[6] = regs->t1; - sigframe->uc.uc_mcontext.__gregs[7] = regs->t2; - sigframe->uc.uc_mcontext.__gregs[8] = regs->s0; - sigframe->uc.uc_mcontext.__gregs[9] = regs->s1; - sigframe->uc.uc_mcontext.__gregs[10] = regs->a0; - sigframe->uc.uc_mcontext.__gregs[11] = regs->a1; - sigframe->uc.uc_mcontext.__gregs[12] = regs->a2; - sigframe->uc.uc_mcontext.__gregs[13] = regs->a3; - sigframe->uc.uc_mcontext.__gregs[14] = regs->a4; - sigframe->uc.uc_mcontext.__gregs[15] = regs->a5; - sigframe->uc.uc_mcontext.__gregs[16] = regs->a6; - sigframe->uc.uc_mcontext.__gregs[17] = regs->a7; - sigframe->uc.uc_mcontext.__gregs[18] = regs->s2; - sigframe->uc.uc_mcontext.__gregs[19] = regs->s3; - sigframe->uc.uc_mcontext.__gregs[20] = regs->s4; - sigframe->uc.uc_mcontext.__gregs[21] = regs->s5; - sigframe->uc.uc_mcontext.__gregs[22] = regs->s6; - sigframe->uc.uc_mcontext.__gregs[23] = regs->s7; - sigframe->uc.uc_mcontext.__gregs[24] = regs->s8; - sigframe->uc.uc_mcontext.__gregs[25] = regs->s9; - sigframe->uc.uc_mcontext.__gregs[26] = regs->s10; - sigframe->uc.uc_mcontext.__gregs[27] = regs->s11; - sigframe->uc.uc_mcontext.__gregs[28] = regs->t3; - sigframe->uc.uc_mcontext.__gregs[29] = regs->t4; - sigframe->uc.uc_mcontext.__gregs[30] = regs->t5; - sigframe->uc.uc_mcontext.__gregs[31] = regs->t6; - - memcpy(sigframe->uc.uc_mcontext.__fpregs.__d.__f, fpregs->f, sizeof(fpregs->f)); - sigframe->uc.uc_mcontext.__fpregs.__d.__fcsr = fpregs->fcsr; - - return 0; -} - -int 
sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) -{ - return 0; -} - -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, - void *arg, __maybe_unused unsigned long flags) -{ - user_fpregs_struct_t tmp, *fpsimd = ext_regs ? ext_regs : &tmp; - struct iovec iov; - int ret = -1; - - pr_info("Dumping FPU registers for %d\n", pid); - - iov.iov_base = fpsimd; - iov.iov_len = sizeof(*fpsimd); - if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) { - pr_perror("Failed to obtain FPU registers for %d", pid); - return -1; - } - - ret = save(pid, arg, regs, fpsimd); - return ret; -} - -int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) -{ - struct iovec iov; - - pr_info("Restoring GP/FPU registers for %d\n", pid); - - iov.iov_base = ext_regs; - iov.iov_len = sizeof(*ext_regs); - if (ptrace(PTRACE_SETREGSET, pid, NT_PRFPREG, &iov)) { - pr_perror("Failed to set FPU registers for %d", pid); - return -1; - } - return 0; -} - -int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, unsigned long arg1, unsigned long arg2, - unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6) -{ - user_regs_struct_t regs = ctl->orig.regs; - int err; - - regs.a7 = (unsigned long)nr; - regs.a0 = arg1; - regs.a1 = arg2; - regs.a2 = arg3; - regs.a3 = arg4; - regs.a4 = arg5; - regs.a5 = arg6; - regs.a6 = 0; - - err = compel_execute_syscall(ctl, ®s, code_syscall); - - *ret = regs.a0; - return err; -} - -/* - * Calling the mmap system call in the context of the target (victim) process using the compel_syscall function. - * Used during the infection process to allocate memory for the parasite code. 
-*/ -void *remote_mmap(struct parasite_ctl *ctl, void *addr, size_t length, int prot, int flags, int fd, off_t offset) -{ - long map; - int err; - - err = compel_syscall(ctl, __NR_mmap, &map, (unsigned long)addr, length, prot, flags, fd, offset); - if (err < 0 || (long)map < 0) - map = 0; - - return (void *)map; -} - -void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs) -{ - regs->pc = new_ip; - if (stack) - regs->sp = (unsigned long)stack; -} - -bool arch_can_dump_task(struct parasite_ctl *ctl) -{ - /* - * TODO: Add proper check here. - */ - return true; -} - -/* - * Fetch the signal alternate stack (sigaltstack), - * sas is a separate memory area for the signal handler to run on, - * avoiding potential issues with the main process stack -*/ -int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) -{ - long ret; - int err; - - err = compel_syscall(ctl, __NR_sigaltstack, &ret, 0, (unsigned long)&s->uc.uc_stack, 0, 0, 0, 0); - return err ? err : ret; -} - -/* - * Task size is the maximum virtual address space size that a process can occupy in the memory - * Refer to linux kernel arch/riscv/include/asm/pgtable.h, - * task size is: - * - 0x9fc00000 (~2.5GB) for RV32. 
- * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu - * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu - * - 0x100000000000000 ( 64PB) for RV64 using SV57 mmu - */ -#define TASK_SIZE_MIN (1UL << 38) -#define TASK_SIZE_MAX (1UL << 56) - -unsigned long compel_task_size(void) -{ - unsigned long task_size; - - for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size <<= 1) - if (munmap((void *)task_size, page_size())) - break; - return task_size; -} - -/* - * Get task registers (overwrites weak function) - */ -int ptrace_get_regs(int pid, user_regs_struct_t *regs) -{ - struct iovec iov; - - iov.iov_base = regs; - iov.iov_len = sizeof(user_regs_struct_t); - return ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov); -} - -/* - * Set task registers (overwrites weak function) - */ -int ptrace_set_regs(int pid, user_regs_struct_t *regs) -{ - struct iovec iov; - - iov.iov_base = regs; - iov.iov_len = sizeof(user_regs_struct_t); - return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); -} diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl index ff2f33006..7178bf483 100644 --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl @@ -82,7 +82,7 @@ __NR_sys_timer_settime 255 sys_timer_settime (kernel_timer_t timer_id, int flag __NR_sys_timer_gettime 256 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 257 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 258 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 260 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) +__NR_clock_gettime 260 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) __NR_exit_group 248 sys_exit_group (int error_code) __NR_waitid 281 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) __NR_set_robust_list 304 
sys_set_robust_list (struct robust_list_head *head, size_t len) @@ -108,15 +108,9 @@ __NR_userfaultfd 355 sys_userfaultfd (int flags) __NR_preadv 328 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) __NR_gettimeofday 78 sys_gettimeofday (struct timeval *tv, struct timezone *tz) __NR_ppoll 302 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) -__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) -__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) -__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) -__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) -__NR_rseq 383 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) -__NR_membarrier 356 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h b/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h index 87283bc6b..896d70ed1 100644 --- a/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h @@ -62,10 +62,9 @@ typedef struct { uint32_t system_call; } user_regs_struct_t; -#define REG_RES(r) ((uint64_t)(r).prstatus.gprs[2]) -#define REG_IP(r) ((uint64_t)(r).prstatus.psw.addr) -#define SET_REG_IP(r, 
val) ((r).prstatus.psw.addr = (val)) -#define REG_SP(r) ((uint64_t)(r).prstatus.gprs[15]) +#define REG_RES(r) ((uint64_t)(r).prstatus.gprs[2]) +#define REG_IP(r) ((uint64_t)(r).prstatus.psw.addr) +#define REG_SP(r) ((uint64_t)(r).prstatus.gprs[15]) /* * We assume that REG_SYSCALL_NR() is only used for pie code where we * always use svc 0 with opcode in %r1. diff --git a/compel/arch/s390/src/lib/infect.c b/compel/arch/s390/src/lib/infect.c index a77b38917..77ace713a 100644 --- a/compel/arch/s390/src/lib/infect.c +++ b/compel/arch/s390/src/lib/infect.c @@ -293,9 +293,10 @@ static int s390_disable_ri_bit(pid_t pid, user_regs_struct_t *regs) /* * Prepare task registers for restart */ -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, __maybe_unused unsigned long flags) { + user_fpregs_struct_t tmp, *fpregs = ext_regs ? 
ext_regs : &tmp; struct iovec iov; int rewind; @@ -348,7 +349,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct } } /* Call save_task_regs() */ - return save(pid, arg, regs, fpregs); + return save(arg, regs, fpregs); } int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs) @@ -623,8 +624,8 @@ enum kernel_ts_level { }; /* See arch/s390/include/asm/processor.h */ -#define TASK_SIZE_LEVEL_3 0x40000000000UL /* 4 TB */ -#define TASK_SIZE_LEVEL_4 0x20000000000000UL /* 8 PB */ +#define TASK_SIZE_LEVEL_3 0x40000000000UL /* 4 TB */ +#define TASK_SIZE_LEVEL_4 0x20000000000000UL /* 8 PB */ #define TASK_SIZE_LEVEL_5 0xffffffffffffefffUL /* 16 EB - 0x1000 */ /* diff --git a/compel/arch/x86/plugins/include/asm/syscall-types.h b/compel/arch/x86/plugins/include/asm/syscall-types.h index 6987aad16..101d1eb6e 100644 --- a/compel/arch/x86/plugins/include/asm/syscall-types.h +++ b/compel/arch/x86/plugins/include/asm/syscall-types.h @@ -53,7 +53,7 @@ typedef struct { unsigned int read_exec_only : 1; unsigned int limit_in_pages : 1; unsigned int seg_not_present : 1; - unsigned int usable : 1; + unsigned int useable : 1; unsigned int lm : 1; } user_desc_t; diff --git a/compel/arch/x86/plugins/std/parasite-head.S b/compel/arch/x86/plugins/std/parasite-head.S index 42cad4808..4fb38d1f1 100644 --- a/compel/arch/x86/plugins/std/parasite-head.S +++ b/compel/arch/x86/plugins/std/parasite-head.S @@ -34,21 +34,7 @@ END(__export_parasite_head_start_compat) .code64 #endif -/* - * When parasite_service() runs in the daemon mode it will return the stack - * pointer for the sigreturn frame in %rax and we call sigreturn directly - * from here. - * Since a valid stack pointer is positive, it is safe to presume that - * return value <= 0 means that parasite_service() called parasite_trap_cmd() - * in non-daemon mode, and the parasite should stop at int3. 
- */ ENTRY(__export_parasite_head_start) call parasite_service - cmp $0, %rax - jle 1f - movq %rax, %rsp - movq $15, %rax - syscall -1: int $0x03 END(__export_parasite_head_start) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall32.c b/compel/arch/x86/plugins/std/syscalls/syscall32.c index d09fd38c7..0f2fec3ff 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall32.c +++ b/compel/arch/x86/plugins/std/syscalls/syscall32.c @@ -1,9 +1,9 @@ #include "asm/types.h" #include "syscall-32.h" -#define SYS_SOCKET 1 /* sys_socket(2) */ -#define SYS_BIND 2 /* sys_bind(2) */ -#define SYS_CONNECT 3 /* sys_connect(2) */ +#define SYS_SOCKET 1 /* sys_socket(2) */ +#define SYS_BIND 2 /* sys_bind(2) */ +#define SYS_CONNECT 3 /* sys_connect(2) */ #define SYS_SENDTO 11 /* sys_sendto(2) */ #define SYS_RECVFROM 12 /* sys_recvfrom(2) */ #define SYS_SHUTDOWN 13 /* sys_shutdown(2) */ diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl index cc23dc3f3..7e456cdb7 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl @@ -96,15 +96,9 @@ __NR_seccomp 354 sys_seccomp (unsigned int op, unsigned int flags, const char __NR_memfd_create 356 sys_memfd_create (const char *name, unsigned int flags) __NR_userfaultfd 374 sys_userfaultfd (int flags) __NR_ppoll 309 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) -__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, unsigned int flags) -__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 
sys_clone3 (struct clone_args *uargs, size_t size) -__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) -__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) -__NR_rseq 386 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) -__NR_membarrier 375 sys_membarrier (int cmd, unsigned int flags, int cpu_id) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl index 8c3620c2a..2dfcc6eee 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -85,7 +85,7 @@ __NR_sys_timer_settime 223 sys_timer_settime (kernel_timer_t timer_id, int fla __NR_sys_timer_gettime 224 sys_timer_gettime (int timer_id, const struct itimerspec *setting) __NR_sys_timer_getoverrun 225 sys_timer_getoverrun (int timer_id) __NR_sys_timer_delete 226 sys_timer_delete (kernel_timer_t timer_id) -__NR_clock_gettime 228 sys_clock_gettime (clockid_t which_clock, struct timespec *tp) +__NR_clock_gettime 228 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp) __NR_exit_group 231 sys_exit_group (int error_code) __NR_openat 257 sys_openat (int dfd, const char *filename, int flags, int mode) __NR_waitid 247 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru) @@ -107,16 +107,9 @@ __NR_kcmp 312 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1 __NR_memfd_create 319 sys_memfd_create (const char *name, unsigned int flags) __NR_userfaultfd 323 sys_userfaultfd (int flags) __NR_ppoll 271 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) -__NR_open_tree 428 sys_open_tree (int dirfd, const char *pathname, 
unsigned int flags) -__NR_move_mount 429 sys_move_mount (int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, int flags) __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) -__NR_close_range 436 sys_close_range (unsigned int fd, unsigned int max_fd, unsigned int flags) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) -__NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) -__NR_rseq 334 sys_rseq (void *rseq, uint32_t rseq_len, int flags, uint32_t sig) -__NR_membarrier 324 sys_membarrier (int cmd, unsigned int flags, int cpu_id) -__NR_map_shadow_stack 453 sys_map_shadow_stack (unsigned long addr, unsigned long size, unsigned int flags) diff --git a/compel/arch/x86/src/lib/cpu.c b/compel/arch/x86/src/lib/cpu.c index f57fb3152..5ca794ea0 100644 --- a/compel/arch/x86/src/lib/cpu.c +++ b/compel/arch/x86/src/lib/cpu.c @@ -140,7 +140,7 @@ static int compel_fpuid(compel_cpuinfo_t *c) memset(c->xstate_comp_offsets, 0xff, sizeof(c->xstate_comp_offsets)); memset(c->xstate_comp_sizes, 0xff, sizeof(c->xstate_comp_sizes)); - /* start at the beginning of the "extended state" */ + /* start at the beginnning of the "extended state" */ last_good_offset = offsetof(struct xsave_struct, extended_state_area); /* diff --git a/compel/arch/x86/src/lib/include/uapi/asm/cpu.h b/compel/arch/x86/src/lib/include/uapi/asm/cpu.h index 11c50e0e5..8d54516af 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/cpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/cpu.h @@ -43,16 +43,16 @@ enum cpuid_leafs { #define NCAPINTS_BITS (NCAPINTS * 32) /* Intel-defined CPU features, CPUID 
level 0x00000001 (EDX), word 0 */ -#define X86_FEATURE_FPU (0 * 32 + 0) /* Onboard FPU */ -#define X86_FEATURE_VME (0 * 32 + 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE (0 * 32 + 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE (0 * 32 + 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC (0 * 32 + 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR (0 * 32 + 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE (0 * 32 + 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE (0 * 32 + 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 (0 * 32 + 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC (0 * 32 + 9) /* Onboard APIC */ +#define X86_FEATURE_FPU (0 * 32 + 0) /* Onboard FPU */ +#define X86_FEATURE_VME (0 * 32 + 1) /* Virtual Mode Extensions */ +#define X86_FEATURE_DE (0 * 32 + 2) /* Debugging Extensions */ +#define X86_FEATURE_PSE (0 * 32 + 3) /* Page Size Extensions */ +#define X86_FEATURE_TSC (0 * 32 + 4) /* Time Stamp Counter */ +#define X86_FEATURE_MSR (0 * 32 + 5) /* Model-Specific Registers */ +#define X86_FEATURE_PAE (0 * 32 + 6) /* Physical Address Extensions */ +#define X86_FEATURE_MCE (0 * 32 + 7) /* Machine Check Exception */ +#define X86_FEATURE_CX8 (0 * 32 + 8) /* CMPXCHG8 instruction */ +#define X86_FEATURE_APIC (0 * 32 + 9) /* Onboard APIC */ #define X86_FEATURE_SEP (0 * 32 + 11) /* SYSENTER/SYSEXIT */ #define X86_FEATURE_MTRR (0 * 32 + 12) /* Memory Type Range Registers */ #define X86_FEATURE_PGE (0 * 32 + 13) /* Page Global Enable */ @@ -100,12 +100,12 @@ enum cpuid_leafs { #define X86_FEATURE_CENTAUR_MCR (3 * 32 + 3) /* Centaur MCRs (= MTRRs) */ /* CPU types for specific tunings: */ -#define X86_FEATURE_K8 (3 * 32 + 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_K7 (3 * 32 + 5) /* "" Athlon */ -#define X86_FEATURE_P3 (3 * 32 + 6) /* "" P3 */ -#define X86_FEATURE_P4 (3 * 32 + 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC (3 * 32 + 8) /* TSC ticks at a constant rate */ -#define 
X86_FEATURE_UP (3 * 32 + 9) /* SMP kernel running on UP */ +#define X86_FEATURE_K8 (3 * 32 + 4) /* "" Opteron, Athlon64 */ +#define X86_FEATURE_K7 (3 * 32 + 5) /* "" Athlon */ +#define X86_FEATURE_P3 (3 * 32 + 6) /* "" P3 */ +#define X86_FEATURE_P4 (3 * 32 + 7) /* "" P4 */ +#define X86_FEATURE_CONSTANT_TSC (3 * 32 + 8) /* TSC ticks at a constant rate */ +#define X86_FEATURE_UP (3 * 32 + 9) /* SMP kernel running on UP */ #define X86_FEATURE_ART (3 * 32 + 10) /* Always running timer (ART) */ #define X86_FEATURE_ARCH_PERFMON (3 * 32 + 11) /* Intel Architectural PerfMon */ #define X86_FEATURE_PEBS (3 * 32 + 12) /* Precise-Event Based Sampling */ @@ -129,16 +129,16 @@ enum cpuid_leafs { #define X86_FEATURE_TSC_KNOWN_FREQ (3 * 32 + 31) /* TSC has known frequency */ /* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */ -#define X86_FEATURE_XMM3 (4 * 32 + 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ (4 * 32 + 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 (4 * 32 + 2) /* 64-bit Debug Store */ -#define X86_FEATURE_MWAIT (4 * 32 + 3) /* "monitor" MONITOR/MWAIT support */ -#define X86_FEATURE_DSCPL (4 * 32 + 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ -#define X86_FEATURE_VMX (4 * 32 + 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX (4 * 32 + 6) /* Safer Mode eXtensions */ -#define X86_FEATURE_EST (4 * 32 + 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 (4 * 32 + 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 (4 * 32 + 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_XMM3 (4 * 32 + 0) /* "pni" SSE-3 */ +#define X86_FEATURE_PCLMULQDQ (4 * 32 + 1) /* PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 (4 * 32 + 2) /* 64-bit Debug Store */ +#define X86_FEATURE_MWAIT (4 * 32 + 3) /* "monitor" MONITOR/MWAIT support */ +#define X86_FEATURE_DSCPL (4 * 32 + 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ +#define X86_FEATURE_VMX (4 * 32 + 5) /* Hardware virtualization */ +#define X86_FEATURE_SMX (4 * 32 + 
6) /* Safer Mode eXtensions */ +#define X86_FEATURE_EST (4 * 32 + 7) /* Enhanced SpeedStep */ +#define X86_FEATURE_TM2 (4 * 32 + 8) /* Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 (4 * 32 + 9) /* Supplemental SSE-3 */ #define X86_FEATURE_CID (4 * 32 + 10) /* Context ID */ #define X86_FEATURE_SDBG (4 * 32 + 11) /* Silicon Debug */ #define X86_FEATURE_FMA (4 * 32 + 12) /* Fused multiply-add */ @@ -162,28 +162,28 @@ enum cpuid_leafs { #define X86_FEATURE_HYPERVISOR (4 * 32 + 31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ -#define X86_FEATURE_XSTORE (5 * 32 + 2) /* "rng" RNG present (xstore) */ -#define X86_FEATURE_XSTORE_EN (5 * 32 + 3) /* "rng_en" RNG enabled */ -#define X86_FEATURE_XCRYPT (5 * 32 + 6) /* "ace" on-CPU crypto (xcrypt) */ -#define X86_FEATURE_XCRYPT_EN (5 * 32 + 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 (5 * 32 + 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN (5 * 32 + 9) /* ACE v2 enabled */ +#define X86_FEATURE_XSTORE (5 * 32 + 2) /* "rng" RNG present (xstore) */ +#define X86_FEATURE_XSTORE_EN (5 * 32 + 3) /* "rng_en" RNG enabled */ +#define X86_FEATURE_XCRYPT (5 * 32 + 6) /* "ace" on-CPU crypto (xcrypt) */ +#define X86_FEATURE_XCRYPT_EN (5 * 32 + 7) /* "ace_en" on-CPU crypto enabled */ +#define X86_FEATURE_ACE2 (5 * 32 + 8) /* Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN (5 * 32 + 9) /* ACE v2 enabled */ #define X86_FEATURE_PHE (5 * 32 + 10) /* PadLock Hash Engine */ #define X86_FEATURE_PHE_EN (5 * 32 + 11) /* PHE enabled */ #define X86_FEATURE_PMM (5 * 32 + 12) /* PadLock Montgomery Multiplier */ #define X86_FEATURE_PMM_EN (5 * 32 + 13) /* PMM enabled */ /* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */ -#define X86_FEATURE_LAHF_LM (6 * 32 + 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY (6 * 32 + 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM (6 * 32 + 2) /* Secure 
Virtual Machine */ -#define X86_FEATURE_EXTAPIC (6 * 32 + 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY (6 * 32 + 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM (6 * 32 + 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A (6 * 32 + 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE (6 * 32 + 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH (6 * 32 + 8) /* 3DNow prefetch instructions */ -#define X86_FEATURE_OSVW (6 * 32 + 9) /* OS Visible Workaround */ +#define X86_FEATURE_LAHF_LM (6 * 32 + 0) /* LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY (6 * 32 + 1) /* If yes HyperThreading not valid */ +#define X86_FEATURE_SVM (6 * 32 + 2) /* Secure Virtual Machine */ +#define X86_FEATURE_EXTAPIC (6 * 32 + 3) /* Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY (6 * 32 + 4) /* CR8 in 32-bit mode */ +#define X86_FEATURE_ABM (6 * 32 + 5) /* Advanced bit manipulation */ +#define X86_FEATURE_SSE4A (6 * 32 + 6) /* SSE-4A */ +#define X86_FEATURE_MISALIGNSSE (6 * 32 + 7) /* Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH (6 * 32 + 8) /* 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW (6 * 32 + 9) /* OS Visible Workaround */ #define X86_FEATURE_IBS (6 * 32 + 10) /* Instruction Based Sampling */ #define X86_FEATURE_XOP (6 * 32 + 11) /* extended AVX instructions */ #define X86_FEATURE_SKINIT (6 * 32 + 12) /* SKINIT/STGI instructions */ @@ -202,14 +202,14 @@ enum cpuid_leafs { #define X86_FEATURE_MWAITX (6 * 32 + 29) /* MWAIT extension (MONITORX/MWAITX instructions) */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ -#define X86_FEATURE_FSGSBASE (9 * 32 + 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ -#define X86_FEATURE_TSC_ADJUST (9 * 32 + 1) /* TSC adjustment MSR 0x3B */ -#define X86_FEATURE_BMI1 (9 * 32 + 3) /* 1st group bit manipulation extensions */ -#define X86_FEATURE_HLE (9 * 32 + 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 (9 * 32 + 5) /* AVX2 
instructions */ -#define X86_FEATURE_SMEP (9 * 32 + 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 (9 * 32 + 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS (9 * 32 + 9) /* Enhanced REP MOVSB/STOSB instructions */ +#define X86_FEATURE_FSGSBASE (9 * 32 + 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ +#define X86_FEATURE_TSC_ADJUST (9 * 32 + 1) /* TSC adjustment MSR 0x3B */ +#define X86_FEATURE_BMI1 (9 * 32 + 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE (9 * 32 + 4) /* Hardware Lock Elision */ +#define X86_FEATURE_AVX2 (9 * 32 + 5) /* AVX2 instructions */ +#define X86_FEATURE_SMEP (9 * 32 + 7) /* Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 (9 * 32 + 8) /* 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS (9 * 32 + 9) /* Enhanced REP MOVSB/STOSB instructions */ #define X86_FEATURE_INVPCID (9 * 32 + 10) /* Invalidate Processor Context ID */ #define X86_FEATURE_RTM (9 * 32 + 11) /* Restricted Transactional Memory */ #define X86_FEATURE_CQM (9 * 32 + 12) /* Cache QoS Monitoring */ @@ -238,15 +238,14 @@ enum cpuid_leafs { #define X86_FEATURE_XSAVES (10 * 32 + 3) /* XSAVES/XRSTORS instructions */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 11 */ -#define X86_FEATURE_PREFETCHWT1 (11 * 32 + 0) /* PREFETCHWT1 Intel® Xeon PhiTM only */ -#define X86_FEATURE_AVX512VBMI (11 * 32 + 1) /* AVX512 Vector Bit Manipulation instructions*/ -#define X86_FEATURE_UMIP (11 * 32 + 2) /* User Mode Instruction Protection */ -#define X86_FEATURE_PKU (11 * 32 + 3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (11 * 32 + 4) /* OS Protection Keys Enable */ -#define X86_FEATURE_AVX512_VBMI2 (11 * 32 + 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ -#define X86_FEATURE_SHSTK (11 * 32 + 7) /* Shadow Stack */ -#define X86_FEATURE_GFNI (11 * 32 + 8) /* Galois Field New Instructions */ -#define X86_FEATURE_VAES (11 * 32 
+ 9) /* Vector AES */ +#define X86_FEATURE_PREFETCHWT1 (11 * 32 + 0) /* PREFETCHWT1 Intel® Xeon PhiTM only */ +#define X86_FEATURE_AVX512VBMI (11 * 32 + 1) /* AVX512 Vector Bit Manipulation instructions*/ +#define X86_FEATURE_UMIP (11 * 32 + 2) /* User Mode Instruction Protection */ +#define X86_FEATURE_PKU (11 * 32 + 3) /* Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (11 * 32 + 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_AVX512_VBMI2 (11 * 32 + 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ +#define X86_FEATURE_GFNI (11 * 32 + 8) /* Galois Field New Instructions */ +#define X86_FEATURE_VAES (11 * 32 + 9) /* Vector AES */ #define X86_FEATURE_VPCLMULQDQ (11 * 32 + 10) /* Carry-Less Multiplication Double Quadword */ #define X86_FEATURE_AVX512_VNNI (11 * 32 + 11) /* Vector Neural Network Instructions */ #define X86_FEATURE_AVX512_BITALG (11 * 32 + 12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ @@ -262,35 +261,35 @@ enum cpuid_leafs { #define X86_FEATURE_CQM_MBM_LOCAL (12 * 32 + 2) /* LLC Local MBM monitoring */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ -#define X86_FEATURE_CLZERO (13 * 32 + 0) /* CLZERO instruction */ -#define X86_FEATURE_IRPERF (13 * 32 + 1) /* Instructions Retired Count */ -#define X86_FEATURE_XSAVEERPTR (13 * 32 + 2) /* Always save/restore FP error pointers */ +#define X86_FEATURE_CLZERO (13 * 32 + 0) /* CLZERO instruction */ +#define X86_FEATURE_IRPERF (13 * 32 + 1) /* Instructions Retired Count */ +#define X86_FEATURE_XSAVEERPTR (13 * 32 + 2) /* Always save/restore FP error pointers */ #define X86_FEATURE_IBPB (13 * 32 + 12) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_IBRS (13 * 32 + 14) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_STIBP (13 * 32 + 15) /* Single Thread Indirect Branch Predictors */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ -#define X86_FEATURE_DTHERM (14 * 32 + 0) /* 
Digital Thermal Sensor */ -#define X86_FEATURE_IDA (14 * 32 + 1) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (14 * 32 + 2) /* Always Running APIC Timer */ -#define X86_FEATURE_PLN (14 * 32 + 4) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (14 * 32 + 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_HWP (14 * 32 + 7) /* Intel Hardware P-states */ -#define X86_FEATURE_HWP_NOTIFY (14 * 32 + 8) /* HWP Notification */ -#define X86_FEATURE_HWP_ACT_WINDOW (14 * 32 + 9) /* HWP Activity Window */ +#define X86_FEATURE_DTHERM (14 * 32 + 0) /* Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14 * 32 + 1) /* Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14 * 32 + 2) /* Always Running APIC Timer */ +#define X86_FEATURE_PLN (14 * 32 + 4) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14 * 32 + 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14 * 32 + 7) /* Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14 * 32 + 8) /* HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14 * 32 + 9) /* HWP Activity Window */ #define X86_FEATURE_HWP_EPP (14 * 32 + 10) /* HWP Energy Perf. 
Preference */ #define X86_FEATURE_HWP_PKG_REQ (14 * 32 + 11) /* HWP Package Level Request */ #define X86_FEATURE_HDC (14 * 32 + 13) /* HDC base registers present */ /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */ -#define X86_FEATURE_NPT (15 * 32 + 0) /* Nested Page Table support */ -#define X86_FEATURE_LBRV (15 * 32 + 1) /* LBR Virtualization support */ -#define X86_FEATURE_SVML (15 * 32 + 2) /* "svm_lock" SVM locking MSR */ -#define X86_FEATURE_NRIPS (15 * 32 + 3) /* "nrip_save" SVM next_rip save */ -#define X86_FEATURE_TSCRATEMSR (15 * 32 + 4) /* "tsc_scale" TSC scaling support */ -#define X86_FEATURE_VMCBCLEAN (15 * 32 + 5) /* "vmcb_clean" VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID (15 * 32 + 6) /* flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS (15 * 32 + 7) /* Decode Assists support */ +#define X86_FEATURE_NPT (15 * 32 + 0) /* Nested Page Table support */ +#define X86_FEATURE_LBRV (15 * 32 + 1) /* LBR Virtualization support */ +#define X86_FEATURE_SVML (15 * 32 + 2) /* "svm_lock" SVM locking MSR */ +#define X86_FEATURE_NRIPS (15 * 32 + 3) /* "nrip_save" SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (15 * 32 + 4) /* "tsc_scale" TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (15 * 32 + 5) /* "vmcb_clean" VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (15 * 32 + 6) /* flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15 * 32 + 7) /* Decode Assists support */ #define X86_FEATURE_PAUSEFILTER (15 * 32 + 10) /* filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD (15 * 32 + 12) /* pause filter threshold */ #define X86_FEATURE_AVIC (15 * 32 + 13) /* Virtual Interrupt Controller */ @@ -306,8 +305,8 @@ enum cpuid_leafs { #define X86_FEATURE_SMCA (17 * 32 + 3) /* Scalable MCA */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ -#define X86_FEATURE_AVX512_4VNNIW (18 * 32 + 2) /* AVX-512 Neural Network Instructions */ -#define 
X86_FEATURE_AVX512_4FMAPS (18 * 32 + 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_AVX512_4VNNIW (18 * 32 + 2) /* AVX-512 Neural Network Instructions */ +#define X86_FEATURE_AVX512_4FMAPS (18 * 32 + 3) /* AVX-512 Multiply Accumulation Single precision */ #define X86_FEATURE_PCONFIG (18 * 32 + 18) /* Intel PCONFIG */ #define X86_FEATURE_SPEC_CTRL (18 * 32 + 26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18 * 32 + 27) /* "" Single Thread Indirect Branch Predictors */ diff --git a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h index d595a68fc..d740e3c04 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h @@ -21,28 +21,7 @@ #define XSTATE_YMM 0x4 #define FXSAVE_SIZE 512 -/* - * This used to be 4096 (one page). There is a comment below concerning - * this size: - * "One page should be enough for the whole xsave state ;-)" - * Which is kind of funny as it is no longer enough ;-) - * - * Older CPUs: - * # cpuid -1 -l 0xd -s 0 - * ... - * bytes required by XSAVE/XRSTOR area = 0x00000988 (2440) - * - * Newer CPUs (Sapphire Rapids): - * # cpuid -1 -l 0xd -s 0 - * ... - * bytes required by XSAVE/XRSTOR area = 0x00002b00 (11008) - * - * So one page is no longer enough... 
But: - * - * Four pages should be enough for the whole xsave state ;-) - */ - -#define XSAVE_SIZE 4*4096 +#define XSAVE_SIZE 4096 #define XSAVE_HDR_SIZE 64 #define XSAVE_HDR_OFFSET FXSAVE_SIZE @@ -101,11 +80,6 @@ enum xfeature { (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_OPMASK | XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM | XFEATURE_MASK_PKRU | XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR) -/* xsave structure features which is safe to fill with garbage (see validate_random_xstate()) */ -#define XFEATURE_MASK_FAULTINJ \ - (XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_OPMASK | XFEATURE_MASK_ZMM_Hi256 | \ - XFEATURE_MASK_Hi16_ZMM) - struct fpx_sw_bytes { uint32_t magic1; uint32_t extended_size; @@ -131,7 +105,7 @@ struct i387_fxsave_struct { uint32_t fos; /* FPU Operand Selector */ }; }; - uint32_t mxcsr; /* MXCSR Register State */ + uint32_t mxcsr; /* MXCSR Register State */ uint32_t mxcsr_mask; /* MXCSR Mask */ /* 8*16 bytes for each FP-reg = 128 bytes */ @@ -245,14 +219,6 @@ struct pkru_state { uint32_t pad; } __packed; -/* - * State component 11 is Control-flow Enforcement user states - */ -struct cet_user_state { - uint64_t cet; /* user control-flow settings */ - uint64_t ssp; /* user shadow stack pointer */ -}; - /* * This is our most modern FPU state format, as saved by the XSAVE * and restored by the XRSTOR instructions. @@ -264,11 +230,8 @@ struct cet_user_state { * * * One page should be enough for the whole xsave state ;-) - * - * Of course it was not ;-) Now using four pages... 
- * */ -#define EXTENDED_STATE_AREA_SIZE (XSAVE_SIZE - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct) - sizeof(struct cet_user_state)) +#define EXTENDED_STATE_AREA_SIZE (4096 - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct)) /* * cpu requires it to be 64 byte aligned @@ -284,7 +247,6 @@ struct xsave_struct { struct ymmh_struct ymmh; uint8_t extended_state_area[EXTENDED_STATE_AREA_SIZE]; }; - struct cet_user_state cet; } __aligned(FP_MIN_ALIGN_BYTES) __packed; struct xsave_struct_ia32 { @@ -302,7 +264,7 @@ struct xsave_struct_ia32 { typedef struct { /* - * The FPU xsave area must be continuous and FP_MIN_ALIGN_BYTES + * The FPU xsave area must be continious and FP_MIN_ALIGN_BYTES * aligned, thus make sure the compiler won't insert any hole here. */ @@ -315,13 +277,13 @@ typedef struct { } fpu_state_64_t; struct user_i387_ia32_struct { - uint32_t cwd; /* FPU Control Word */ - uint32_t swd; /* FPU Status Word */ - uint32_t twd; /* FPU Tag Word */ - uint32_t fip; /* FPU IP Offset */ - uint32_t fcs; /* FPU IP Selector */ - uint32_t foo; /* FPU Operand Pointer Offset */ - uint32_t fos; /* FPU Operand Pointer Selector */ + uint32_t cwd; /* FPU Control Word */ + uint32_t swd; /* FPU Status Word */ + uint32_t twd; /* FPU Tag Word */ + uint32_t fip; /* FPU IP Offset */ + uint32_t fcs; /* FPU IP Selector */ + uint32_t foo; /* FPU Operand Pointer Offset */ + uint32_t fos; /* FPU Operand Pointer Selector */ uint32_t st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ }; diff --git a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h index b998c488c..34b3ad061 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/infect-types.h @@ -127,11 +127,10 @@ typedef struct { typedef struct xsave_struct user_fpregs_struct_t; -#define REG_RES(regs) get_user_reg(®s, ax) -#define REG_IP(regs) get_user_reg(®s, ip) -#define 
SET_REG_IP(regs, val) set_user_reg(®s, ip, val) -#define REG_SP(regs) get_user_reg(®s, sp) -#define REG_SYSCALL_NR(regs) get_user_reg(®s, orig_ax) +#define REG_RES(regs) get_user_reg(®s, ax) +#define REG_IP(regs) get_user_reg(®s, ip) +#define REG_SP(regs) get_user_reg(®s, sp) +#define REG_SYSCALL_NR(regs) get_user_reg(®s, orig_ax) #define __NR(syscall, compat) ((compat) ? __NR32_##syscall : __NR_##syscall) @@ -143,11 +142,4 @@ typedef struct xsave_struct user_fpregs_struct_t; */ #define __NR32_mmap __NR32_mmap2 -extern bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs); -#define compel_shstk_enabled __compel_shstk_enabled - -extern int __parasite_setup_shstk(struct parasite_ctl *ctl, - user_fpregs_struct_t *ext_regs); -#define parasite_setup_shstk __parasite_setup_shstk - #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h index 4a2e67559..ec8c156fa 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h @@ -177,24 +177,6 @@ static inline void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) #define USER32_CS 0x23 /* clang-format off */ -/* - * rst_sigreturn in resorer is noninline call which adds an entry to the - * shadow stack above the sigframe token; - * if shadow stack is enabled, increment the shadow stack pointer to remove - * that entry - */ -#define ARCH_SHSTK_POP() \ - asm volatile( \ - "xor %%rax, %%rax\n" \ - "rdsspq %%rax\n" \ - "cmpq $0, %%rax\n" \ - "jz 1f\n" \ - "movq $1, %%rax\n" \ - "incsspq %%rax\n" \ - "1:\n" \ - : : \ - : "rax") - #define ARCH_RT_SIGRETURN_NATIVE(new_sp) \ asm volatile( \ "movq %0, %%rax \n" \ @@ -221,19 +203,10 @@ static inline void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) : "rdi"(new_sp) \ : "eax", "r8", "r9", "r10", "r11", "memory") -#define ARCH_RT_SIGRETURN_RST(new_sp, rt_sigframe) \ -do { \ - if ((rt_sigframe)->is_native) 
{ \ - ARCH_SHSTK_POP(); \ - ARCH_RT_SIGRETURN_NATIVE(new_sp); \ - } else \ - ARCH_RT_SIGRETURN_COMPAT(new_sp); \ -} while (0) - -#define ARCH_RT_SIGRETURN_DUMP(new_sp, rt_sigframe) \ +#define ARCH_RT_SIGRETURN(new_sp, rt_sigframe) \ do { \ if ((rt_sigframe)->is_native) \ - return new_sp; \ + ARCH_RT_SIGRETURN_NATIVE(new_sp); \ else \ ARCH_RT_SIGRETURN_COMPAT(new_sp); \ } while (0) diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index afcf2c53b..1e344bf3a 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -26,16 +26,6 @@ #ifndef NT_X86_XSTATE #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ #endif - -#ifndef NT_X86_SHSTK -#define NT_X86_SHSTK 0x204 /* x86 shstk state */ -#endif - -#ifndef ARCH_SHSTK_STATUS -#define ARCH_SHSTK_STATUS 0x5005 -#define ARCH_SHSTK_SHSTK (1ULL << 0) -#endif - #ifndef NT_PRSTATUS #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ #endif @@ -44,12 +34,12 @@ * Injected syscall instruction */ const char code_syscall[] = { - 0x0f, 0x05, /* syscall */ + 0x0f, 0x05, /* syscall */ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... */ }; const char code_int_80[] = { - 0xcd, 0x80, /* int $0x80 */ + 0xcd, 0x80, /* int $0x80 */ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... */ }; @@ -230,16 +220,6 @@ int sigreturn_prep_fpu_frame_plain(struct rt_sigframe *sigframe, struct rt_sigfr #define get_signed_user_reg(pregs, name) \ ((user_regs_native(pregs)) ? 
(int64_t)((pregs)->native.name) : (int32_t)((pregs)->compat.name)) -static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) -{ - if (ptrace(PTRACE_GETFPREGS, pid, NULL, xsave)) { - pr_perror("Can't obtain FPU registers for %d", pid); - return -1; - } - - return 0; -} - static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) { struct iovec iov; @@ -252,73 +232,17 @@ static int get_task_xsave(pid_t pid, user_fpregs_struct_t *xsave) return -1; } - if ((xsave->xsave_hdr.xstate_bv & 3) != 3) { - // Due to init-optimisation [1] x87 FPU or SSE state may not be filled in. - // Since those are restored unconditionally, make sure the init values are - // filled by retrying with old PTRACE_GETFPREGS. - // - // [1] Intel® 64 and IA-32 Architectures Software Developer's - // Manual Volume 1: Basic Architecture - // Section 13.6: Processor tracking of XSAVE-managed state - if (get_task_fpregs(pid, xsave)) - return -1; - } - - /* - * xsave may be on stack, if we don't clear it explicitly we get - * funky shadow stack state - */ - memset(&xsave->cet, 0, sizeof(xsave->cet)); - if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { - unsigned long ssp = 0; - unsigned long features = 0; - - if (ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long)&features, ARCH_SHSTK_STATUS)) { - /* - * kernels that don't support shadow stack return - * -EINVAL - */ - if (errno == EINVAL) - return 0; - - pr_perror("shstk: can't get shadow stack status for %d", pid); - return -1; - } - - if (!(features & ARCH_SHSTK_SHSTK)) - return 0; - - iov.iov_base = &ssp; - iov.iov_len = sizeof(ssp); - - if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { - /* ENODEV means CET is not supported by the CPU */ - if (errno != ENODEV) { - pr_perror("shstk: can't get SSP for %d", pid); - return -1; - } - } - - xsave->cet.cet = features; - xsave->cet.ssp = ssp; - - pr_debug("%d: shstk: cet: %lx ssp: %lx\n", pid, xsave->cet.cet, xsave->cet.ssp); - } - return 0; } -static inline void 
fixup_mxcsr(struct xsave_struct *xsave) +static int get_task_fpregs(pid_t pid, user_fpregs_struct_t *xsave) { - /* - * Right now xsave->i387.mxcsr filled with the random garbage, - * let's make it valid by applying mask which allows all - * features, except the denormals-are-zero feature bit. - * - * See also fpu__init_system_mxcsr function: - * https://github.com/torvalds/linux/blob/8cb1ae19/arch/x86/kernel/fpu/init.c#L117 - */ - xsave->i387.mxcsr &= 0x0000ffbf; + if (ptrace(PTRACE_GETFPREGS, pid, NULL, xsave)) { + pr_perror("Can't obtain FPU registers for %d", pid); + return -1; + } + + return 0; } /* See arch/x86/kernel/fpu/xstate.c */ @@ -330,7 +254,6 @@ static void validate_random_xstate(struct xsave_struct *xsave) /* No unknown or supervisor features may be set */ hdr->xstate_bv &= XFEATURE_MASK_USER; hdr->xstate_bv &= ~XFEATURE_MASK_SUPERVISOR; - hdr->xstate_bv &= XFEATURE_MASK_FAULTINJ; for (i = 0; i < XFEATURE_MAX; i++) { if (!compel_fpu_has_feature(i)) @@ -359,22 +282,20 @@ static int corrupt_extregs(pid_t pid) bool use_xsave = compel_cpu_has_feature(X86_FEATURE_OSXSAVE); user_fpregs_struct_t ext_regs; int *rand_to = (int *)&ext_regs; - unsigned int seed, init_seed; + unsigned int seed; size_t i; - init_seed = seed = time(NULL); + seed = time(NULL); for (i = 0; i < sizeof(ext_regs) / sizeof(int); i++) *rand_to++ = rand_r(&seed); /* * Error log-level as: - * - not intended to be used outside of testing; + * - not intended to be used outside of testing, * - zdtm.py will grep it auto-magically from logs - * (and the seed will be known from automatic testing). + * (and the seed will be known from an automatical testing) */ - pr_err("Corrupting %s for %d, seed %u\n", use_xsave ? "xsave" : "fpuregs", pid, init_seed); - - fixup_mxcsr(&ext_regs); + pr_err("Corrupting %s for %d, seed %u\n", use_xsave ? 
"xsave" : "fpuregs", pid, seed); if (!use_xsave) { if (ptrace(PTRACE_SETFPREGS, pid, NULL, &ext_regs)) { @@ -397,9 +318,10 @@ static int corrupt_extregs(pid_t pid) return 0; } -int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *xs, save_regs_t save, +int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, unsigned long flags) { + user_fpregs_struct_t xsave = {}, *xs = ext_regs ? ext_regs : &xsave; int ret = -1; pr_info("Dumping general registers for %d in %s mode\n", pid, user_regs_native(regs) ? "native" : "compat"); @@ -426,7 +348,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct /* * FPU fetched either via fxsave or via xsave, - * thus decode it accordingly. + * thus decode it accrodingly. */ pr_info("Dumping GP/FPU registers for %d\n", pid); @@ -453,7 +375,7 @@ int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct goto err; out: - ret = save(pid, arg, regs, xs); + ret = save(arg, regs, xs); err: return ret; } @@ -650,7 +572,6 @@ int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s) int ptrace_set_breakpoint(pid_t pid, void *addr) { - k_rtsigset_t block; int ret; /* Set a breakpoint */ @@ -666,16 +587,6 @@ int ptrace_set_breakpoint(pid_t pid, void *addr) return -1; } - /* - * FIXME(issues/1429): SIGTRAP can't be blocked, otherwise its handler - * will be reset to the default one. 
- */ - ksigfillset(&block); - ksigdelset(&block, SIGTRAP); - if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { - pr_perror("Can't block signals for %d", pid); - return -1; - } ret = ptrace(PTRACE_CONT, pid, NULL, NULL); if (ret) { pr_perror("Unable to restart the stopped tracee process %d", pid); @@ -749,59 +660,3 @@ unsigned long compel_task_size(void) { return TASK_SIZE; } - -bool __compel_shstk_enabled(user_fpregs_struct_t *ext_regs) -{ - if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) - return false; - - if (ext_regs->cet.cet & ARCH_SHSTK_SHSTK) - return true; - - return false; -} - -int parasite_setup_shstk(struct parasite_ctl *ctl, __maybe_unused user_fpregs_struct_t *ext_regs) -{ - pid_t pid = ctl->rpid; - unsigned long sa_restorer = ctl->parasite_ip; - unsigned long long ssp; - unsigned long token; - struct iovec iov; - - if (!compel_shstk_enabled(ext_regs)) - return 0; - - iov.iov_base = &ssp; - iov.iov_len = sizeof(ssp); - if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { - /* ENODEV means CET is not supported by the CPU */ - if (errno != ENODEV) { - pr_perror("shstk: %d: cannot get SSP", pid); - return -1; - } - } - - /* The token is for 64-bit */ - token = ALIGN_DOWN(ssp, 8); - token |= (1UL << 63); - ssp = ALIGN_DOWN(ssp, 8) - 8; - if (ptrace(PTRACE_POKEDATA, pid, (void *)ssp, token)) { - pr_perror("shstk: %d: failed to inject shadow stack token", pid); - return -1; - } - - ssp = ssp - sizeof(uint64_t); - if (ptrace(PTRACE_POKEDATA, pid, (void *)ssp, sa_restorer)) { - pr_perror("shstk: %d: failed to inject restorer address", pid); - return -1; - } - - ssp = ssp + sizeof(uint64_t); - if (ptrace(PTRACE_SETREGSET, pid, (unsigned int)NT_X86_SHSTK, &iov) < 0) { - pr_perror("shstk: %d: cannot write SSP", pid); - return -1; - } - - return 0; -} diff --git a/compel/arch/x86/src/lib/thread_area.c b/compel/arch/x86/src/lib/thread_area.c index 271d89dcd..4750c6cdd 100644 --- a/compel/arch/x86/src/lib/thread_area.c +++ 
b/compel/arch/x86/src/lib/thread_area.c @@ -53,16 +53,15 @@ int __compel_arch_fetch_thread_area(int tid, struct thread_ctx *th) user_desc_t *d = &ptls->desc[i]; err = ptrace(PTRACE_GET_THREAD_AREA, tid, GDT_ENTRY_TLS_MIN + i, d); + /* + * Ignoring absent syscall on !CONFIG_IA32_EMULATION + * where such mixed code can't run. + * XXX: Add compile CONFIG_X86_IGNORE_64BIT_TLS + * (for x86_64 systems with CONFIG_IA32_EMULATION) + */ + if (err == -EIO && native_mode) + return 0; if (err) { - /* - * Ignoring absent syscall on !CONFIG_IA32_EMULATION - * where such mixed code can't run. - * XXX: Add compile CONFIG_X86_IGNORE_64BIT_TLS - * (for x86_64 systems with CONFIG_IA32_EMULATION) - */ - if (errno == EIO && native_mode) - return 0; - pr_perror("get_thread_area failed for %d", tid); return err; } diff --git a/compel/include/infect-priv.h b/compel/include/infect-priv.h index 8e78a7f6c..1c03f4486 100644 --- a/compel/include/infect-priv.h +++ b/compel/include/infect-priv.h @@ -38,7 +38,7 @@ struct parasite_ctl { unsigned long parasite_ip; /* service routine start ip */ unsigned int *cmd; /* address for command */ - void *args; /* address for arguments */ + void *args; /* address for arguments */ unsigned long args_size; int tsock; /* transport socket for transferring fds */ @@ -72,7 +72,6 @@ extern bool arch_can_dump_task(struct parasite_ctl *ctl); extern int compel_get_task_regs(pid_t pid, user_regs_struct_t *regs, user_fpregs_struct_t *ext_regs, save_regs_t save, void *arg, unsigned long flags); extern int compel_set_task_ext_regs(pid_t pid, user_fpregs_struct_t *ext_regs); -extern int compel_set_task_gcs_regs(pid_t pid, user_fpregs_struct_t *ext_regs); extern int arch_fetch_sas(struct parasite_ctl *ctl, struct rt_sigframe *s); extern int sigreturn_prep_regs_plain(struct rt_sigframe *sigframe, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs); diff --git a/compel/include/log.h b/compel/include/log.h index 5250622c8..0e33976b1 100644 --- a/compel/include/log.h +++ 
b/compel/include/log.h @@ -1,9 +1,6 @@ #ifndef COMPEL_LOG_H__ #define COMPEL_LOG_H__ -#include -#include - #include "uapi/compel/log.h" #ifndef LOG_PREFIX @@ -48,6 +45,6 @@ extern void compel_print_on_level(unsigned int loglevel, const char *format, ... #define pr_debug(fmt, ...) compel_print_on_level(COMPEL_LOG_DEBUG, LOG_PREFIX fmt, ##__VA_ARGS__) -#define pr_perror(fmt, ...) pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) +#define pr_perror(fmt, ...) pr_err(fmt ": %m\n", ##__VA_ARGS__) #endif /* COMPEL_LOG_H__ */ diff --git a/compel/include/ptrace.h b/compel/include/ptrace.h index 00013f937..bf2701e63 100644 --- a/compel/include/ptrace.h +++ b/compel/include/ptrace.h @@ -5,8 +5,6 @@ #include #include -#define PTRACE_SYSCALL_TRAP 0x80 - #define PTRACE_SI_EVENT(_si_code) (((_si_code)&0xFFFF) >> 8) extern int ptrace_get_regs(pid_t pid, user_regs_struct_t *regs); diff --git a/compel/include/rpc-pie-priv.h b/compel/include/rpc-pie-priv.h index 5a6b337b2..2a239c613 100644 --- a/compel/include/rpc-pie-priv.h +++ b/compel/include/rpc-pie-priv.h @@ -3,7 +3,7 @@ struct ctl_msg { uint32_t cmd; /* command itself */ uint32_t ack; /* ack on command */ - int32_t err; /* error code on reply */ + int32_t err; /* error code on reply */ }; #define ctl_msg_cmd(_cmd) \ diff --git a/compel/include/uapi/infect-util.h b/compel/include/uapi/infect-util.h index 658df9393..4e32d13dc 100644 --- a/compel/include/uapi/infect-util.h +++ b/compel/include/uapi/infect-util.h @@ -3,21 +3,6 @@ #include "common/compiler.h" -/** - * The length of the hash is based on what libuuid provides. - * According to the manpage this is: - * - * The uuid_unparse() function converts the supplied UUID uu from the binary - * representation into a 36-byte string (plus trailing '\0') - */ -#define RUN_ID_HASH_LENGTH 37 - -/* - * compel_run_id is a unique value of the current run. It can be used to - * generate resource ID-s to avoid conflicts with other processes. 
- */ -extern char compel_run_id[RUN_ID_HASH_LENGTH]; - struct parasite_ctl; extern int __must_check compel_util_send_fd(struct parasite_ctl *ctl, int fd); extern int compel_util_recv_fd(struct parasite_ctl *ctl, int *pfd); diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index d21c261b7..c3d2ee6a6 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -13,21 +13,11 @@ #define PARASITE_START_AREA_MIN (4096) -#define PARASITE_STACK_SIZE (16 << 10) -/* - * A stack redzone is a small, protected region of memory located immediately - * after a parasite stack. It is intended to remain unchanged. While it can be - * implemented as a guard page, we want to avoid the overhead of additional - * remote system calls. - */ -#define PARASITE_STACK_REDZONE 128 - extern int __must_check compel_interrupt_task(int pid); struct seize_task_status { unsigned long long sigpnd; unsigned long long shdpnd; - unsigned long long sigblk; char state; int vpid; int ppid; @@ -40,9 +30,7 @@ extern int __must_check compel_wait_task(int pid, int ppid, struct seize_task_status *st, void *data); extern int __must_check compel_stop_task(int pid); -extern int __must_check compel_parse_stop_signo(int pid); extern int compel_resume_task(pid_t pid, int orig_state, int state); -extern int compel_resume_task_sig(pid_t pid, int orig_state, int state, int stop_signo); struct parasite_ctl; struct parasite_thread_ctl; @@ -50,12 +38,9 @@ struct parasite_thread_ctl; extern struct parasite_ctl __must_check *compel_prepare(int pid); extern struct parasite_ctl __must_check *compel_prepare_noctx(int pid); extern int __must_check compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size); -extern int __must_check compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, - unsigned long args_size); extern struct parasite_thread_ctl __must_check *compel_prepare_thread(struct parasite_ctl *ctl, int pid); extern void 
compel_release_thread(struct parasite_thread_ctl *); -extern int __must_check compel_start_daemon(struct parasite_ctl *ctl); extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl); extern int __must_check compel_cure_remote(struct parasite_ctl *ctl); extern int __must_check compel_cure_local(struct parasite_ctl *ctl); @@ -92,9 +77,9 @@ enum trace_flags { TRACE_EXIT, }; -extern int __must_check compel_stop_on_syscall(int tasks, int sys_nr, int sys_nr_compat); +extern int __must_check compel_stop_on_syscall(int tasks, int sys_nr, int sys_nr_compat, enum trace_flags trace); -extern int __must_check compel_stop_pie(pid_t pid, void *addr, bool no_bp); +extern int __must_check compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp); extern int __must_check compel_unmap(struct parasite_ctl *ctl, unsigned long addr); @@ -106,7 +91,7 @@ extern k_rtsigset_t *compel_thread_sigmask(struct parasite_thread_ctl *tctl); struct rt_sigframe; typedef int (*open_proc_fn)(int pid, int mode, const char *fmt, ...) __attribute__((__format__(__printf__, 3, 4))); -typedef int (*save_regs_t)(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); +typedef int (*save_regs_t)(void *, user_regs_struct_t *, user_fpregs_struct_t *); typedef int (*make_sigframe_t)(void *, struct rt_sigframe *, struct rt_sigframe *, k_rtsigset_t *); struct infect_ctx { @@ -121,15 +106,14 @@ struct infect_ctx { unsigned long task_size; unsigned long syscall_ip; /* entry point of infection */ - unsigned long flags; /* fine-tune (e.g. faults) */ + unsigned long flags; /* fine-tune (e.g. 
faults) */ - void (*child_handler)(int, siginfo_t *, void *); /* handler for SIGCHLD deaths */ + void (*child_handler)(int, siginfo_t *, void *); /* hander for SIGCHLD deaths */ struct sigaction orig_handler; open_proc_fn open_proc; int log_fd; /* fd for parasite code to send messages to */ - unsigned long remote_map_addr; /* User-specified address where to mmap parasitic code, default not set */ }; extern struct infect_ctx *compel_infect_ctx(struct parasite_ctl *); @@ -184,37 +168,4 @@ extern unsigned long compel_task_size(void); extern uint64_t compel_get_leader_sp(struct parasite_ctl *ctl); extern uint64_t compel_get_thread_sp(struct parasite_thread_ctl *tctl); -extern uint64_t compel_get_leader_ip(struct parasite_ctl *ctl); -extern uint64_t compel_get_thread_ip(struct parasite_thread_ctl *tctl); - -void compel_set_leader_ip(struct parasite_ctl *ctl, uint64_t v); -void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v); - -extern void compel_get_stack(struct parasite_ctl *ctl, void **rstack, void **r_thread_stack); - -#ifndef compel_host_supports_gcs -static inline bool compel_host_supports_gcs(void) -{ - return false; -} -#define compel_host_supports_gcs -#endif - -#ifndef compel_shstk_enabled -static inline bool compel_shstk_enabled(user_fpregs_struct_t *ext_regs) -{ - return false; -} -#define compel_shstk_enabled -#endif - -#ifndef parasite_setup_shstk -static inline int parasite_setup_shstk(struct parasite_ctl *ctl, - user_fpregs_struct_t *ext_regs) -{ - return 0; -} -#define parasite_setup_shstk parasite_setup_shstk -#endif - #endif diff --git a/compel/include/uapi/loglevels.h b/compel/include/uapi/loglevels.h index 7a49825d2..e76c15657 100644 --- a/compel/include/uapi/loglevels.h +++ b/compel/include/uapi/loglevels.h @@ -7,10 +7,10 @@ */ enum __compel_log_levels { - COMPEL_LOG_MSG, /* Print message regardless of log level */ + COMPEL_LOG_MSG, /* Print message regardless of log level */ COMPEL_LOG_ERROR, /* Errors only, when we're in 
trouble */ - COMPEL_LOG_WARN, /* Warnings */ - COMPEL_LOG_INFO, /* Informative, everything is fine */ + COMPEL_LOG_WARN, /* Warnings */ + COMPEL_LOG_INFO, /* Informative, everything is fine */ COMPEL_LOG_DEBUG, /* Debug only */ COMPEL_DEFAULT_LOGLEVEL = COMPEL_LOG_WARN diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h index 558124fbd..c5291d20d 100644 --- a/compel/include/uapi/ptrace.h +++ b/compel/include/uapi/ptrace.h @@ -62,21 +62,9 @@ */ typedef struct { uint64_t filter_off; /* Input: which filter */ - uint64_t flags; /* Output: filter's flags */ + uint64_t flags; /* Output: filter's flags */ } seccomp_metadata_t; -#ifndef PTRACE_GET_RSEQ_CONFIGURATION -#define PTRACE_GET_RSEQ_CONFIGURATION 0x420f - -struct __ptrace_rseq_configuration { - uint64_t rseq_abi_pointer; - uint32_t rseq_abi_size; - uint32_t signature; - uint32_t flags; - uint32_t pad; -}; -#endif - #ifdef PTRACE_EVENT_STOP #if PTRACE_EVENT_STOP == 7 /* Bad value from Linux 3.1-3.3, fixed in 3.4 */ #undef PTRACE_EVENT_STOP @@ -86,19 +74,6 @@ struct __ptrace_rseq_configuration { #define PTRACE_EVENT_STOP 128 #endif -/* - * Amazon Linux 2 uses glibc 2.26. PTRACE_ARCH_PRCTL was added in glibc 2.27. - * This allows CRIU to build on Amazon Linux 2. - * - * Note that in sys/ptrace.h, PTRACE_ARCH_PRCTL is an enum value so the - * preprocessor doesn't know about it. PT_ARCH_PRCTL is the preprocessor symbol - * that matches the value of PTRACE_ARCH_PRCTL. So look for PT_ARCH_PRCTL to - * decide if PTRACE_ARCH_PRCTL is available or not. - */ -#if defined(__x86_64__) && !defined(PT_ARCH_PRCTL) -#define PTRACE_ARCH_PRCTL 30 /* From asm/ptrace-abi.h. 
*/ -#endif - extern int ptrace_suspend_seccomp(pid_t pid); extern int __must_check ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); diff --git a/compel/plugins/include/uapi/std/infect.h b/compel/plugins/include/uapi/std/infect.h index a729abbd2..08a5a7a80 100644 --- a/compel/plugins/include/uapi/std/infect.h +++ b/compel/plugins/include/uapi/std/infect.h @@ -7,7 +7,7 @@ extern int parasite_get_rpc_sock(void); extern unsigned int __export_parasite_service_cmd; extern void *__export_parasite_service_args_ptr; -extern unsigned long __must_check parasite_service(void); +extern int __must_check parasite_service(void); /* * Must be supplied by user plugins. diff --git a/compel/plugins/include/uapi/std/syscall-types.h b/compel/plugins/include/uapi/std/syscall-types.h index 1eea99daa..cc1969c01 100644 --- a/compel/plugins/include/uapi/std/syscall-types.h +++ b/compel/plugins/include/uapi/std/syscall-types.h @@ -40,7 +40,6 @@ struct rusage; struct iocb; struct pollfd; struct clone_args; -struct open_how; typedef unsigned long aio_context_t; diff --git a/compel/plugins/std/infect.c b/compel/plugins/std/infect.c index 034201320..abecc140f 100644 --- a/compel/plugins/std/infect.c +++ b/compel/plugins/std/infect.c @@ -16,10 +16,6 @@ #include "rpc-pie-priv.h" -#ifndef ARCH_RT_SIGRETURN_DUMP -#define ARCH_RT_SIGRETURN_DUMP ARCH_RT_SIGRETURN -#endif - static int tsock = -1; static struct rt_sigframe *sigframe; @@ -31,7 +27,7 @@ static struct rt_sigframe *sigframe; */ static unsigned __page_size; -unsigned long __attribute((weak)) page_size(void) +unsigned __attribute((weak)) page_size(void) { return __page_size; } @@ -83,13 +79,12 @@ static int __parasite_daemon_wait_msg(struct ctl_msg *m) /* Core infect code */ -static noinline unsigned long fini_sigreturn(unsigned long new_sp) +static noinline void fini_sigreturn(unsigned long new_sp) { - ARCH_RT_SIGRETURN_DUMP(new_sp, sigframe); - return new_sp; + ARCH_RT_SIGRETURN(new_sp, sigframe); } -static unsigned long 
fini(void) +static int fini(void) { unsigned long new_sp; @@ -101,14 +96,14 @@ static unsigned long fini(void) sys_close(tsock); std_log_set_fd(-1); - return fini_sigreturn(new_sp); + fini_sigreturn(new_sp); BUG(); return -1; } -static noinline __used unsigned long parasite_daemon(void *args) +static noinline __used int noinline parasite_daemon(void *args) { struct ctl_msg m; int ret = -1; @@ -145,10 +140,12 @@ static noinline __used unsigned long parasite_daemon(void *args) } out: - return fini(); + fini(); + + return 0; } -static noinline __used unsigned long parasite_init_daemon(void *data) +static noinline __used int parasite_init_daemon(void *data) { struct parasite_init_args *args = data; int ret; @@ -181,11 +178,14 @@ static noinline __used unsigned long parasite_init_daemon(void *data) } else goto err; - return parasite_daemon(data); + parasite_daemon(data); err: futex_set_and_wake(&args->daemon_connected, ret); - return fini(); + fini(); + BUG(); + + return -1; } #ifndef __parasite_entry @@ -203,7 +203,7 @@ err: unsigned int __export_parasite_service_cmd = 0; void *__export_parasite_service_args_ptr = NULL; -unsigned long __used __parasite_entry parasite_service(void) +int __used __parasite_entry parasite_service(void) { unsigned int cmd = __export_parasite_service_cmd; void *args = __export_parasite_service_args_ptr; diff --git a/compel/plugins/std/string.c b/compel/plugins/std/string.c index d67e0d1a9..bde1bc68b 100644 --- a/compel/plugins/std/string.c +++ b/compel/plugins/std/string.c @@ -151,12 +151,7 @@ static unsigned int __conv_val(unsigned char c) if (__isdigit(c)) return c - '0'; else if (__isalpha(c)) - /** - * If we want the value of something which __isalpha() == true - * it has to be base > 10. 'A' = 10, 'B' = 11 ... 
'Z' = 35 - */ - return __tolower(c) - 'a' + 10; - + return &conv_tab[__tolower(c)] - conv_tab; return -1u; } diff --git a/compel/src/lib/handle-elf.c b/compel/src/lib/handle-elf.c index e4b8728ce..9662751e0 100644 --- a/compel/src/lib/handle-elf.c +++ b/compel/src/lib/handle-elf.c @@ -554,7 +554,7 @@ int __handle_elf(void *mem, size_t size) #endif /* ELF_PPC64 */ #ifdef ELF_X86_64 - case R_X86_64_32: /* Symbol + Addend (4 bytes) */ + case R_X86_64_32: /* Symbol + Addend (4 bytes) */ case R_X86_64_32S: /* Symbol + Addend (4 bytes) */ pr_debug("\t\t\t\tR_X86_64_32 at 0x%-4lx val 0x%x\n", place, value32); pr_out(" { .offset = 0x%-8x, .type = COMPEL_TYPE_INT, " @@ -614,7 +614,7 @@ int __handle_elf(void *mem, size_t size) #ifdef ELF_S390 /* * See also arch/s390/kernel/module.c/apply_rela(): - * A PLT reads the GOT (global offset table). We can handle it like + * A PLT reads the GOT (global offest table). We can handle it like * R_390_PC32DBL because we have linked statically. */ case R_390_PLT32DBL: /* PC relative on a PLT (predure link table) */ diff --git a/compel/src/lib/infect-util.c b/compel/src/lib/infect-util.c index dc57e28f7..5d6d0ddd8 100644 --- a/compel/src/lib/infect-util.c +++ b/compel/src/lib/infect-util.c @@ -7,8 +7,6 @@ #include "infect-rpc.h" #include "infect-util.h" -char compel_run_id[RUN_ID_HASH_LENGTH]; - int compel_util_send_fd(struct parasite_ctl *ctl, int fd) { int sk; diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index 22fcf24fa..0fb9e715c 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -1,4 +1,3 @@ -#include #include #include #include @@ -38,6 +37,8 @@ #define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - (size_t)((struct sockaddr_un *)0)->sun_path) #endif +#define PARASITE_STACK_SIZE (16 << 10) + #ifndef SECCOMP_MODE_DISABLED #define SECCOMP_MODE_DISABLED 0 #endif @@ -90,12 +91,6 @@ static int parse_pid_status(int pid, struct seize_task_status *ss, void *data) continue; } - if (!strncmp(aux, "SigBlk:", 7)) 
{ - if (sscanf(aux + 7, "%llx", &ss->sigblk) != 1) - goto err_parse; - - continue; - } } fclose(f); @@ -190,29 +185,6 @@ static int skip_sigstop(int pid, int nr_signals) return 0; } -#define SIG_MASK(sig) (1ULL << ((sig)-1)) - -#define SIG_IN_MASK(sig, mask) ((sig) > 0 && (sig) <= SIGMAX && (SIG_MASK(sig) & (mask))) - -#define SUPPORTED_STOP_MASK ((1ULL << (SIGSTOP - 1)) | (1ULL << (SIGTSTP - 1))) - -static inline int sig_stop(int sig) -{ - return SIG_IN_MASK(sig, SUPPORTED_STOP_MASK); -} - -int compel_parse_stop_signo(int pid) -{ - siginfo_t si; - - if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &si) < 0) { - pr_perror("SEIZE %d: can't parse stopped siginfo", pid); - return -1; - } - - return si.si_signo; -} - /* * This routine seizes task putting it into a special * state where we can manipulate the task via ptrace @@ -225,7 +197,7 @@ int compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_ void *data) { siginfo_t si; - int status, nr_stopsig; + int status, nr_sigstop; int ret = 0, ret2, wait_errno = 0; /* @@ -302,11 +274,6 @@ try_again: goto try_again; } - if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACESYSGOOD)) { - pr_perror("Unable to set PTRACE_O_TRACESYSGOOD for %d", pid); - return -1; - } - if (ss->seccomp_mode != SECCOMP_MODE_DISABLED && ptrace_suspend_seccomp(pid) < 0) goto err; @@ -323,32 +290,17 @@ try_again: goto err; } - nr_stopsig = 0; - if (SIG_IN_MASK(SIGSTOP, ss->sigpnd)) - nr_stopsig++; - if (SIG_IN_MASK(SIGSTOP, ss->shdpnd)) - nr_stopsig++; + nr_sigstop = 0; + if (ss->sigpnd & (1 << (SIGSTOP - 1))) + nr_sigstop++; + if (ss->shdpnd & (1 << (SIGSTOP - 1))) + nr_sigstop++; + if (si.si_signo == SIGSTOP) + nr_sigstop++; - if (SIG_IN_MASK(SIGTSTP, ss->sigpnd) && !SIG_IN_MASK(SIGTSTP, ss->sigblk)) - nr_stopsig++; - if (SIG_IN_MASK(SIGTSTP, ss->shdpnd) && !SIG_IN_MASK(SIGTSTP, ss->sigblk)) - nr_stopsig++; - - if (sig_stop(si.si_signo)) - nr_stopsig++; - - if (nr_stopsig) { - if (skip_sigstop(pid, nr_stopsig)) { - /* - * Make 
sure that the task is stopped by a supported stop signal and - * send it again to restore task state before criu intervention. - */ - if (sig_stop(si.si_signo)) - kill(pid, si.si_signo); - else - kill(pid, SIGSTOP); - goto err; - } + if (nr_sigstop) { + if (skip_sigstop(pid, nr_sigstop)) + goto err_stop; return COMPEL_TASK_STOPPED; } @@ -360,6 +312,8 @@ try_again: goto err; } +err_stop: + kill(pid, SIGSTOP); err: if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) pr_perror("Unable to detach from %d", pid); @@ -367,11 +321,6 @@ err: } int compel_resume_task(pid_t pid, int orig_st, int st) -{ - return compel_resume_task_sig(pid, orig_st, st, SIGSTOP); -} - -int compel_resume_task_sig(pid_t pid, int orig_st, int st, int stop_signo) { int ret = 0; @@ -395,18 +344,8 @@ int compel_resume_task_sig(pid_t pid, int orig_st, int st, int stop_signo) * task with STOP in queue that would get lost after * detach, so stop it again. */ - if (orig_st == COMPEL_TASK_STOPPED) { - /* - * Check that stop_signo contain supported stop signal. - * If it isn't, then send SIGSTOP. It makes sense in the case - * when we get COMPEL_TASK_STOPPED from old image, - * where stop_signo was not yet supported. - */ - if (sig_stop(stop_signo)) - kill(pid, stop_signo); - else - kill(pid, SIGSTOP); - } + if (orig_st == COMPEL_TASK_STOPPED) + kill(pid, SIGSTOP); } else { pr_err("Unknown final state %d\n", st); ret = -1; @@ -425,7 +364,7 @@ static int gen_parasite_saddr(struct sockaddr_un *saddr, int key) int sun_len; saddr->sun_family = AF_UNIX; - snprintf(saddr->sun_path, UNIX_PATH_MAX, "X/crtools-pr-%d-%s", key, compel_run_id); + snprintf(saddr->sun_path, UNIX_PATH_MAX, "X/crtools-pr-%d", key); sun_len = SUN_LEN(saddr); *saddr->sun_path = '\0'; @@ -516,8 +455,8 @@ static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack, user_ ksigfillset(&block); /* - * FIXME(issues/1429): SIGTRAP can't be blocked, otherwise its handler - * will be reset to the default one. 
+ * FIXME(issues/1429): SIGTRAP can't be blocked, otherwice its hanlder + * will be reseted to the default one. */ ksigdelset(&block, SIGTRAP); if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) { @@ -587,7 +526,7 @@ static int parasite_trap(struct parasite_ctl *ctl, pid_t pid, user_regs_struct_t } if (!WIFSTOPPED(status)) { - pr_err("Task is still running (pid: %d, status: 0x%x)\n", pid, status); + pr_err("Task is still running (pid: %d)\n", pid); goto err; } @@ -737,7 +676,6 @@ static int parasite_start_daemon(struct parasite_ctl *ctl) { pid_t pid = ctl->rpid; struct infect_ctx *ictx = &ctl->ictx; - user_fpregs_struct_t ext_regs; /* * Get task registers before going daemon, since the @@ -745,7 +683,7 @@ static int parasite_start_daemon(struct parasite_ctl *ctl) * while in daemon it is not such. */ - if (compel_get_task_regs(pid, &ctl->orig.regs, &ext_regs, ictx->save_regs, ictx->regs_arg, ictx->flags)) { + if (compel_get_task_regs(pid, &ctl->orig.regs, NULL, ictx->save_regs, ictx->regs_arg, ictx->flags)) { pr_err("Can't obtain regs for thread %d\n", pid); return -1; } @@ -758,9 +696,6 @@ static int parasite_start_daemon(struct parasite_ctl *ctl) if (ictx->make_sigframe(ictx->regs_arg, ctl->sigframe, ctl->rsigframe, &ctl->orig.sigmask)) return -1; - if (parasite_setup_shstk(ctl, &ext_regs)) - return -1; - if (parasite_init_daemon(ctl)) return -1; @@ -814,7 +749,7 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size, uint8_t orig_code[MEMFD_FNAME_SZ] = MEMFD_FNAME; pid_t pid = ctl->rpid; long sret = -ENOSYS; - int ret, fd, lfd, remote_flags; + int ret, fd, lfd; if (ctl->ictx.flags & INFECT_NO_MEMFD) return 1; @@ -858,11 +793,7 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size, goto err_cure; } - remote_flags = MAP_FILE | MAP_SHARED; - if (ctl->ictx.remote_map_addr){ - remote_flags |= MAP_FIXED_NOREPLACE; - } - ctl->remote_map = remote_mmap(ctl, (void *)ctl->ictx.remote_map_addr, size, 
remote_prot, remote_flags, fd, 0); + ctl->remote_map = remote_mmap(ctl, NULL, size, remote_prot, MAP_FILE | MAP_SHARED, fd, 0); if (!ctl->remote_map) { pr_err("Can't rmap memfd for parasite blob\n"); goto err_curef; @@ -973,7 +904,7 @@ static int compel_map_exchange(struct parasite_ctl *ctl, unsigned long size) return ret; } -int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) +int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) { int ret; unsigned long p, map_exchange_size, parasite_size = 0; @@ -1024,7 +955,7 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, ctl->args_size = args_size; parasite_size += ctl->args_size; - /* RESTORE_STACK_SIGFRAME needs a 64 bytes alignment */ + /* RESTORE_STACK_SIGFRAME needs a 64 bytes alignement */ parasite_size = round_up(parasite_size, 64); map_exchange_size = parasite_size; @@ -1054,16 +985,6 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, memcpy(ctl->local_map, ctl->pblob.hdr.mem, ctl->pblob.hdr.bsize); compel_relocs_apply(ctl->local_map, ctl->remote_map, &ctl->pblob); - /* - * Ensure the infected thread sees the updated code. - * - * On architectures like ARM64, the Data Cache (D-cache) and - * Instruction Cache (I-cache) are not automatically coherent. - * Modifications land in the D-cache, so we must flush (clean) the - * D-cache to push changes to RAM to ensure the CPU fetches the updated - * instructions. - */ - __builtin___clear_cache(ctl->local_map, ctl->local_map + ctl->pblob.hdr.bsize); p = parasite_size; @@ -1072,7 +993,7 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, p += RESTORE_STACK_SIGFRAME; p += PARASITE_STACK_SIZE; - ctl->rstack = ctl->remote_map + p - PARASITE_STACK_REDZONE; + ctl->rstack = ctl->remote_map + p; /* * x86-64 ABI requires a 16 bytes aligned stack. 
@@ -1086,7 +1007,7 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, if (nr_threads > 1) { p += PARASITE_STACK_SIZE; - ctl->r_thread_stack = ctl->remote_map + p - PARASITE_STACK_REDZONE; + ctl->r_thread_stack = ctl->remote_map + p; } ret = arch_fetch_sas(ctl, ctl->rsigframe); @@ -1095,23 +1016,15 @@ int compel_infect_no_daemon(struct parasite_ctl *ctl, unsigned long nr_threads, goto err; } + if (parasite_start_daemon(ctl)) + goto err; + return 0; err: return -1; } -int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size) -{ - if (compel_infect_no_daemon(ctl, nr_threads, args_size)) - return -1; - - if (parasite_start_daemon(ctl)) - return -1; - - return 0; -} - struct parasite_thread_ctl *compel_prepare_thread(struct parasite_ctl *ctl, int pid) { struct parasite_thread_ctl *tctl; @@ -1308,7 +1221,7 @@ struct plain_regs_struct { user_fpregs_struct_t fpregs; }; -static int save_regs_plain(pid_t pid, void *to, user_regs_struct_t *r, user_fpregs_struct_t *f) +static int save_regs_plain(void *to, user_regs_struct_t *r, user_fpregs_struct_t *f) { struct plain_regs_struct *prs = to; @@ -1395,6 +1308,7 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) pid_t pid = ctl->rpid; user_regs_struct_t regs; int status, ret = 0; + enum trace_flags flag; /* stop getting chld from parasite -- we're about to step-by-step it */ if (restore_child_handler(ctl)) @@ -1414,7 +1328,7 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) pr_debug("Daemon %d exited trapping\n", pid); if (!WIFSTOPPED(status)) { - pr_err("Task is still running (pid: %d, status: 0x%x)\n", pid, status); + pr_err("Task is still running (pid: %d)\n", pid); return -1; } @@ -1435,11 +1349,14 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) return -1; /* Go to sigreturn as closer as we can */ - ret = compel_stop_pie(pid, ctl->sigreturn_addr, ctl->ictx.flags & INFECT_NO_BREAKPOINTS); + ret = compel_stop_pie(pid, 
ctl->sigreturn_addr, &flag, ctl->ictx.flags & INFECT_NO_BREAKPOINTS); if (ret < 0) return ret; - if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1))) + if (compel_stop_on_syscall(1, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag)) + return -1; + + if (ptrace_flush_breakpoints(pid)) return -1; /* @@ -1451,11 +1368,6 @@ static int parasite_fini_seized(struct parasite_ctl *ctl) return 0; } -int compel_start_daemon(struct parasite_ctl *ctl) -{ - return parasite_start_daemon(ctl); -} - int compel_stop_daemon(struct parasite_ctl *ctl) { if (ctl->daemonized) { @@ -1563,7 +1475,7 @@ int compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd) /* * compel_unmap() is used for unmapping parasite and restorer blobs. - * A blob can contain code for unmapping itself, so the process is + * A blob can contain code for unmapping itself, so the porcess is * trapped on the exit from the munmap syscall. */ int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) @@ -1576,7 +1488,7 @@ int compel_unmap(struct parasite_ctl *ctl, unsigned long addr) if (ret) goto err; - ret = compel_stop_on_syscall(1, __NR(munmap, 0), __NR(munmap, 1)); + ret = compel_stop_on_syscall(1, __NR(munmap, 0), __NR(munmap, 1), TRACE_ENTER); /* * Don't touch extended registers here: they were restored @@ -1588,12 +1500,12 @@ err: return ret; } -int compel_stop_pie(pid_t pid, void *addr, bool no_bp) +int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp) { int ret; if (no_bp) { - pr_debug("Force no-breakpoints restore of %d\n", pid); + pr_debug("Force no-breakpoints restore\n"); ret = 0; } else ret = ptrace_set_breakpoint(pid, addr); @@ -1605,6 +1517,7 @@ int compel_stop_pie(pid_t pid, void *addr, bool no_bp) * PIE will stop on a breakpoint, next * stop after that will be syscall enter. 
*/ + *tf = TRACE_EXIT; return 0; } @@ -1617,12 +1530,14 @@ int compel_stop_pie(pid_t pid, void *addr, bool no_bp) pr_perror("Unable to restart the %d process", pid); return -1; } + + *tf = TRACE_ENTER; return 0; } static bool task_is_trapped(int status, pid_t pid) { - if (WIFSTOPPED(status) && (WSTOPSIG(status) & ~PTRACE_SYSCALL_TRAP) == SIGTRAP) + if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) return true; pr_err("Task %d is in unexpected state: %x\n", pid, status); @@ -1656,13 +1571,15 @@ static inline int is_required_syscall(user_regs_struct_t *regs, pid_t pid, const * sys_nr - the required syscall number * sys_nr_compat - the required compatible syscall number */ -int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat) +int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat, enum trace_flags trace) { - enum trace_flags trace = tasks > 1 ? TRACE_ALL : TRACE_ENTER; user_regs_struct_t regs; int status, ret; pid_t pid; + if (tasks > 1) + trace = TRACE_ALL; + /* Stop all threads on the enter point in sys_rt_sigreturn */ while (tasks) { pid = wait4(-1, &status, __WALL, NULL); @@ -1676,18 +1593,6 @@ int compel_stop_on_syscall(int tasks, const int sys_nr, const int sys_nr_compat) pr_debug("%d was trapped\n", pid); - if ((WSTOPSIG(status) & PTRACE_SYSCALL_TRAP) == 0) { - /* - * On some platforms such as ARM64, it is impossible to - * pass through a breakpoint, so let's clear it right - * after it has been triggered. 
- */ - if (ptrace_flush_breakpoints(pid)) { - pr_err("Unable to clear breakpoints\n"); - return -1; - } - goto goon; - } if (trace == TRACE_EXIT) { trace = TRACE_ENTER; pr_debug("`- Expecting exit\n"); @@ -1781,31 +1686,3 @@ uint64_t compel_get_thread_sp(struct parasite_thread_ctl *tctl) { return REG_SP(tctl->th.regs); } - -uint64_t compel_get_leader_ip(struct parasite_ctl *ctl) -{ - return REG_IP(ctl->orig.regs); -} - -uint64_t compel_get_thread_ip(struct parasite_thread_ctl *tctl) -{ - return REG_IP(tctl->th.regs); -} - -void compel_set_leader_ip(struct parasite_ctl *ctl, uint64_t v) -{ - SET_REG_IP(ctl->orig.regs, v); -} - -void compel_set_thread_ip(struct parasite_thread_ctl *tctl, uint64_t v) -{ - SET_REG_IP(tctl->th.regs, v); -} - -void compel_get_stack(struct parasite_ctl *ctl, void **rstack, void **r_thread_stack) -{ - if (rstack) - *rstack = ctl->rstack; - if (r_thread_stack) - *r_thread_stack = ctl->r_thread_stack; -} diff --git a/compel/src/lib/ptrace.c b/compel/src/lib/ptrace.c index 717ee2839..49b685d70 100644 --- a/compel/src/lib/ptrace.c +++ b/compel/src/lib/ptrace.c @@ -23,7 +23,7 @@ int ptrace_suspend_seccomp(pid_t pid) { - if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD) < 0) { + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) { pr_perror("suspending seccomp failed"); return -1; } diff --git a/compel/src/main.c b/compel/src/main.c index 21e06d7dd..a9a50959f 100644 --- a/compel/src/main.c +++ b/compel/src/main.c @@ -19,7 +19,6 @@ #define CFLAGS_DEFAULT_SET \ "-Wstrict-prototypes " \ - "-ffreestanding " \ "-fno-stack-protector -nostdlib -fomit-frame-pointer " #define COMPEL_CFLAGS_PIE CFLAGS_DEFAULT_SET "-fpie" @@ -56,13 +55,6 @@ static const flags_t flags = { .cflags = COMPEL_CFLAGS_PIE, #elif defined CONFIG_MIPS .arch = "mips", - .cflags = COMPEL_CFLAGS_PIE, -#elif defined CONFIG_LOONGARCH64 - .arch = "loongarch64", - .cflags = COMPEL_CFLAGS_PIE, -#elif defined CONFIG_RISCV64 - 
.arch = "riscv64", - .cflags = COMPEL_CFLAGS_PIE, #else #error "CONFIG_ not defined, or unsupported ARCH" #endif @@ -101,6 +93,7 @@ static int piegen(void) } if (handle_binary(mem, st.st_size)) { + close(fd), fd = -1; unlink(opts.output_filename); goto err; } @@ -108,7 +101,8 @@ static int piegen(void) ret = 0; err: - close(fd); + if (fd >= 0) + close(fd); if (opts.fout) fclose(opts.fout); if (!ret) @@ -352,9 +346,11 @@ int main(int argc, char *argv[]) printf("Version: %d.%d.%d\n", COMPEL_SO_VERSION_MAJOR, COMPEL_SO_VERSION_MINOR, COMPEL_SO_VERSION_SUBLEVEL); exit(0); + break; default: // '?' // error message already printed by getopt_long() return usage(1); + break; } } diff --git a/compel/test/Makefile b/compel/test/Makefile index f46a821ee..63fb76f80 100644 --- a/compel/test/Makefile +++ b/compel/test/Makefile @@ -1,4 +1,4 @@ -all: fdspy infect rsys stack +all: fdspy infect rsys fdspy: $(Q) $(MAKE) -C fdspy @@ -10,12 +10,8 @@ infect: $(Q) $(MAKE) -C infect run .PHONY: infect + rsys: $(Q) $(MAKE) -C rsys $(Q) $(MAKE) -C rsys run .PHONY: rsys - -stack: - $(Q) $(MAKE) -C stack - $(Q) $(MAKE) -C stack run -.PHONY: stack diff --git a/compel/test/fdspy/spy.c b/compel/test/fdspy/spy.c index 41de99e20..7f20ea2a7 100644 --- a/compel/test/fdspy/spy.c +++ b/compel/test/fdspy/spy.c @@ -110,11 +110,11 @@ static int check_pipe_ends(int wfd, int rfd) printf("Check pipe ends are connected\n"); if (write(wfd, "1", 2) != 2) { fprintf(stderr, "write to pipe failed\n"); - return 0; + return -1; } if (read(rfd, aux, sizeof(aux)) != sizeof(aux)) { fprintf(stderr, "read from pipe failed\n"); - return 0; + return -1; } if (aux[0] != '1' || aux[1] != '\0') { fprintf(stderr, "Pipe connectivity lost\n"); diff --git a/compel/test/infect/Makefile b/compel/test/infect/Makefile index 85efa5fd9..bacfad962 100644 --- a/compel/test/infect/Makefile +++ b/compel/test/infect/Makefile @@ -3,11 +3,6 @@ CFLAGS ?= -O2 -g -Wall -Werror COMPEL := ../../../compel/compel-host -ifeq ($(GCS_ENABLE),1) 
-CFLAGS += -mbranch-protection=standard -DGCS_TEST_ENABLE=1 -LDFLAGS += -z experimental-gcs=check -endif - all: victim spy run: @@ -22,7 +17,7 @@ clean: rm -f parasite.o victim: victim.c - $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) + $(CC) $(CFLAGS) -o $@ $^ spy: spy.c parasite.h $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) diff --git a/compel/test/infect/spy.c b/compel/test/infect/spy.c index 143946941..e7273b446 100644 --- a/compel/test/infect/spy.c +++ b/compel/test/infect/spy.c @@ -94,15 +94,15 @@ static inline int chk(int fd, int val) int v = 0; if (read(fd, &v, sizeof(v)) != sizeof(v)) - return 1; + return 0; printf("%d, want %d\n", v, val); - return v != val; + return v == val; } int main(int argc, char **argv) { - int p_in[2], p_out[2], p_err[2], pid, i, err = 0; + int p_in[2], p_out[2], p_err[2], pid, i, pass = 1; /* * Prepare IO-s and fork the victim binary @@ -112,9 +112,6 @@ int main(int argc, char **argv) return -1; } -#ifdef GCS_TEST_ENABLE - setenv("GLIBC_TUNABLES", "glibc.cpu.aarch64_gcs=1:glibc.cpu.aarch64_gcs_policy=2", 1); -#endif pid = vfork(); if (pid == 0) { close(p_in[1]); @@ -145,11 +142,9 @@ int main(int argc, char **argv) return 1; printf("Checking the victim alive\n"); - err = chk(p_out[0], 1); - if (err) - return 1; - err = chk(p_out[0], 42); - if (err) + pass = chk(p_out[0], 1); + pass = chk(p_out[0], 42); + if (!pass) return 1; /* @@ -181,14 +176,14 @@ int main(int argc, char **argv) printf("Checking the result\n"); /* These two came from parasite */ - err = chk(p_out[0], 138); - err |= chk(p_out[0], 403); + pass = chk(p_out[0], 138); + pass = chk(p_out[0], 403); /* These two came from post-infect */ - err |= chk(p_out[0], 1234); - err |= chk(p_out[0], 4096); + pass = chk(p_out[0], 1234); + pass = chk(p_out[0], 4096); - if (!err) + if (pass) printf("All OK\n"); else printf("Something went WRONG\n"); diff --git a/compel/test/stack/.gitignore b/compel/test/stack/.gitignore deleted file mode 100644 index 
0a554758d..000000000 --- a/compel/test/stack/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -parasite.h -parasite.po -spy -victim diff --git a/compel/test/stack/Makefile b/compel/test/stack/Makefile deleted file mode 100644 index bacfad962..000000000 --- a/compel/test/stack/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -CC := gcc -CFLAGS ?= -O2 -g -Wall -Werror - -COMPEL := ../../../compel/compel-host - -all: victim spy - -run: - ./spy -.PHONY: run - -clean: - rm -f victim - rm -f spy - rm -f parasite.h - rm -f parasite.po - rm -f parasite.o - -victim: victim.c - $(CC) $(CFLAGS) -o $@ $^ - -spy: spy.c parasite.h - $(CC) $(CFLAGS) $(shell $(COMPEL) includes) -o $@ $< $(shell $(COMPEL) --static libs) - -parasite.h: parasite.po - $(COMPEL) hgen -o $@ -f $< - -parasite.po: parasite.o - ld $(shell $(COMPEL) ldflags) -o $@ $^ $(shell $(COMPEL) plugins) - -parasite.o: parasite.c - $(CC) $(CFLAGS) -c $(shell $(COMPEL) cflags) -o $@ $^ diff --git a/compel/test/stack/parasite.c b/compel/test/stack/parasite.c deleted file mode 100644 index ad13bd25d..000000000 --- a/compel/test/stack/parasite.c +++ /dev/null @@ -1,38 +0,0 @@ -#include - -#include -#include - -/* - * Stubs for std compel plugin. 
- */ -int parasite_trap_cmd(int cmd, void *args) -{ - return 0; -} -void parasite_cleanup(void) -{ -} - -#define PARASITE_CMD_INC PARASITE_USER_CMDS -#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 - -int parasite_daemon_cmd(int cmd, void *args) -{ - int v; - - switch (cmd) { - case PARASITE_CMD_INC: - v = (*(int *)args) + 1; - break; - case PARASITE_CMD_DEC: - v = (*(int *)args) - 1; - break; - default: - v = -1; - break; - } - - sys_write(1, &v, sizeof(int)); - return 0; -} diff --git a/compel/test/stack/spy.c b/compel/test/stack/spy.c deleted file mode 100644 index 184c8ab31..000000000 --- a/compel/test/stack/spy.c +++ /dev/null @@ -1,294 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include "parasite.h" - -#define PARASITE_CMD_INC PARASITE_USER_CMDS -#define PARASITE_CMD_DEC PARASITE_USER_CMDS + 1 - -#define err_and_ret(msg) \ - do { \ - fprintf(stderr, msg); \ - return -1; \ - } while (0) - -void *saved_data = NULL; - -#define SAVED_DATA_MAX page_size() - -void cleanup_saved_data(void) -{ - free(saved_data); -} - -static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) -{ - printf("\tLC%u: ", lvl); - vprintf(fmt, parms); -} - -static void *get_parasite_rstack_start(struct parasite_ctl *ctl) -{ - void *rstack, *r_thread_stack, *rstack_start; - - compel_get_stack(ctl, &rstack, &r_thread_stack); - - rstack_start = rstack; - if (r_thread_stack != NULL && r_thread_stack < rstack_start) - rstack_start = r_thread_stack; - - return rstack_start; -} - -static void *read_proc_mem(int pid, void *offset, size_t len) -{ - char victim_mem_path[6 + 11 + 4 + 1]; - int written; - int fd; - void *data; - ssize_t mem_read; - - written = snprintf(victim_mem_path, sizeof(victim_mem_path), "/proc/%d/mem", pid); - if (written < 0 || written >= sizeof(victim_mem_path)) { - fprintf(stderr, "Failed to create path string to victim's /proc/%d/mem file\n", pid); - return NULL; - } - - fd = 
open(victim_mem_path, O_RDONLY); - if (fd < 0) { - perror("Failed to open victim's /proc/$pid/mem file"); - return NULL; - } - - data = malloc(len); - if (data == NULL) { - perror("Can't allocate memory to read victim's /proc/$pid/mem file"); - return NULL; - } - - mem_read = pread(fd, data, len, (off_t)offset); - if (mem_read == -1) { - perror("Failed to read victim's /proc/$pid/mem file"); - goto freebuf; - } - - return data; - -freebuf: - free(data); - return NULL; -} - -static int check_saved_data(struct parasite_ctl *ctl, int pid, void *stack, void *saved_data, size_t saved_data_size) -{ - if (saved_data != NULL) { - void *current_data; - - current_data = read_proc_mem(pid, stack, saved_data_size); - if (current_data == NULL) - return -1; - - if (memcmp(saved_data, current_data, saved_data_size) != 0) - return 1; - } - - return 0; -} - -static int do_infection(int pid) -{ - int state; - struct parasite_ctl *ctl; - struct infect_ctx *ictx; - int *arg; - void *stack; - size_t saved_data_size = PARASITE_STACK_REDZONE; - int saved_data_check; - - compel_log_init(print_vmsg, COMPEL_LOG_DEBUG); - - printf("Stopping task\n"); - state = compel_stop_task(pid); - if (state < 0) - err_and_ret("Can't stop task\n"); - - printf("Preparing parasite ctl\n"); - ctl = compel_prepare(pid); - if (!ctl) - err_and_ret("Can't prepare for infection\n"); - - printf("Configuring contexts\n"); - - /* - * First -- the infection context. Most of the stuff - * is already filled by compel_prepare(), just set the - * log descriptor for parasite side, library cannot - * live w/o it. 
- */ - ictx = compel_infect_ctx(ctl); - ictx->log_fd = STDERR_FILENO; - - parasite_setup_c_header(ctl); - - printf("Infecting\n"); - if (compel_infect_no_daemon(ctl, 1, sizeof(int))) - err_and_ret("Can't infect victim\n"); - - if (atexit(cleanup_saved_data)) - err_and_ret("Can't register cleanup function with atexit\n"); - - stack = get_parasite_rstack_start(ctl); - - if (compel_start_daemon(ctl)) - err_and_ret("Can't start daemon in victim\n"); - - /* - * Now get the area with arguments and run two - * commands one by one. - */ - arg = compel_parasite_args(ctl, int); - - printf("Running cmd 1\n"); - *arg = 137; - if (compel_rpc_call_sync(PARASITE_CMD_INC, ctl)) - err_and_ret("Can't run parasite command 1\n"); - - printf("Running cmd 2\n"); - *arg = 404; - if (compel_rpc_call_sync(PARASITE_CMD_DEC, ctl)) - err_and_ret("Can't run parasite command 2\n"); - - saved_data_check = check_saved_data(ctl, pid, stack, saved_data, saved_data_size); - if (saved_data_check == -1) - err_and_ret("Could not check saved data\n"); - if (saved_data_check != 0) - err_and_ret("Saved data unexpectedly modified\n"); - - /* - * Done. Cure and resume the task. 
- */ - printf("Curing\n"); - if (compel_cure(ctl)) - err_and_ret("Can't cure victim\n"); - - if (compel_resume_task(pid, state, state)) - err_and_ret("Can't unseize task\n"); - - printf("Done\n"); - - return 0; -} - -static inline int chk(int fd, int val) -{ - int v = 0; - - if (read(fd, &v, sizeof(v)) != sizeof(v)) - return 1; - - printf("%d, want %d\n", v, val); - return v != val; -} - -int main(int argc, char **argv) -{ - int p_in[2], p_out[2], p_err[2], pid, i, err = 0; - - /* - * Prepare IO-s and fork the victim binary - */ - if (pipe(p_in) || pipe(p_out) || pipe(p_err)) { - perror("Can't make pipe"); - return -1; - } - - pid = vfork(); - if (pid == 0) { - close(p_in[1]); - dup2(p_in[0], 0); - close(p_in[0]); - close(p_out[0]); - dup2(p_out[1], 1); - close(p_out[1]); - close(p_err[0]); - dup2(p_err[1], 2); - close(p_err[1]); - execl("./victim", "victim", NULL); - exit(1); - } - - close(p_in[0]); - close(p_out[1]); - close(p_err[1]); - - /* - * Tell the little guy some numbers - */ - i = 1; - if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) - return 1; - i = 42; - if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) - return 1; - - printf("Checking the victim alive\n"); - err = chk(p_out[0], 1); - if (err) - return 1; - err = chk(p_out[0], 42); - if (err) - return 1; - - /* - * Now do the infection with parasite.c - */ - - printf("Infecting the victim\n"); - if (do_infection(pid)) - return 1; - - /* - * Tell the victim some more stuff to check it's alive - */ - i = 1234; - if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) - return 1; - i = 4096; - if (write(p_in[1], &i, sizeof(i)) != sizeof(i)) - return 1; - - /* - * Stop the victim and check the infection went well - */ - printf("Closing victim stdin\n"); - close(p_in[1]); - printf("Waiting for victim to die\n"); - wait(NULL); - - printf("Checking the result\n"); - - /* These two came from parasite */ - err = chk(p_out[0], 138); - err |= chk(p_out[0], 403); - - /* These two came from post-infect */ - err |= 
chk(p_out[0], 1234); - err |= chk(p_out[0], 4096); - - if (!err) - printf("All OK\n"); - else - printf("Something went WRONG\n"); - - return 0; -} diff --git a/compel/test/stack/victim.c b/compel/test/stack/victim.c deleted file mode 100644 index f94613fa1..000000000 --- a/compel/test/stack/victim.c +++ /dev/null @@ -1,16 +0,0 @@ -#include - -int main(int argc, char **argv) -{ - int i; - - while (1) { - if (read(0, &i, sizeof(i)) != sizeof(i)) - break; - - if (write(1, &i, sizeof(i)) != sizeof(i)) - break; - } - - return 0; -} diff --git a/contrib/debian/dev-packages.lst b/contrib/debian/dev-packages.lst new file mode 100644 index 000000000..c2d1509fa --- /dev/null +++ b/contrib/debian/dev-packages.lst @@ -0,0 +1,20 @@ +# Required packages for development in Debian +build-essential +libprotobuf-dev +libprotobuf-c-dev +protobuf-c-compiler +protobuf-compiler +python3-protobuf +libnet-dev + +# Extra packages, required for testing and building other tools +pkg-config +libnl-3-dev +libbsd0 +libbsd-dev +iproute2 +libcap-dev +libaio-dev +python3-yaml +libnl-route-3-dev +python-future diff --git a/contrib/dependencies/apk-packages.sh b/contrib/dependencies/apk-packages.sh deleted file mode 100755 index c47fb9fe0..000000000 --- a/contrib/dependencies/apk-packages.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env sh - -apk add --no-cache \ - asciidoctor \ - bash \ - build-base \ - coreutils \ - e2fsprogs \ - elfutils-dev \ - git \ - gnutls-dev \ - go \ - ip6tables \ - iproute2 \ - iptables \ - iptables-legacy \ - libaio-dev \ - libbsd-dev \ - libcap-dev \ - libcap-utils \ - libdrm-dev \ - libnet-dev \ - libnl3-dev \ - libtraceevent-dev \ - libtracefs-dev \ - nftables \ - nftables-dev \ - perl \ - pkgconfig \ - procps \ - protobuf-c-compiler \ - protobuf-c-dev \ - protobuf-dev \ - py3-importlib-metadata \ - py3-pip \ - py3-protobuf \ - py3-yaml \ - python3 \ - sudo \ - tar \ - util-linux \ - util-linux-dev diff --git a/contrib/dependencies/apt-cross-packages.sh 
b/contrib/dependencies/apt-cross-packages.sh deleted file mode 100755 index 30ce6874c..000000000 --- a/contrib/dependencies/apt-cross-packages.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env sh - -APT_INSTALL="$(cd "$(dirname "$0")/.." >/dev/null 2>&1 && pwd)/apt-install" -if [ ! -x "$APT_INSTALL" ]; then - echo "Error: apt-install not found or not executable" - exit 1 -fi - -"$APT_INSTALL" \ - crossbuild-essential-"${DEBIAN_ARCH}" \ - iproute2:"${DEBIAN_ARCH}" \ - libaio-dev:"${DEBIAN_ARCH}" \ - libbz2-dev:"${DEBIAN_ARCH}" \ - libc6-"${DEBIAN_ARCH}"-cross \ - libc6-dev-"${DEBIAN_ARCH}"-cross \ - libcap-dev:"${DEBIAN_ARCH}" \ - libdrm-dev:"${DEBIAN_ARCH}" \ - libelf-dev:"${DEBIAN_ARCH}" \ - libexpat1-dev:"${DEBIAN_ARCH}" \ - libgnutls28-dev:"${DEBIAN_ARCH}" \ - libnet-dev:"${DEBIAN_ARCH}" \ - libnftables-dev:"${DEBIAN_ARCH}" \ - libnl-3-dev:"${DEBIAN_ARCH}" \ - libnl-route-3-dev:"${DEBIAN_ARCH}" \ - libprotobuf-c-dev:"${DEBIAN_ARCH}" \ - libprotobuf-dev:"${DEBIAN_ARCH}" \ - libssl-dev:"${DEBIAN_ARCH}" \ - libtraceevent-dev:"${DEBIAN_ARCH}" \ - libtracefs-dev:"${DEBIAN_ARCH}" \ - ncurses-dev:"${DEBIAN_ARCH}" \ - uuid-dev:"${DEBIAN_ARCH}" \ - build-essential \ - pkg-config \ - git \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-protobuf diff --git a/contrib/dependencies/apt-packages.sh b/contrib/dependencies/apt-packages.sh deleted file mode 100755 index 7963be7b4..000000000 --- a/contrib/dependencies/apt-packages.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env sh - -APT_INSTALL="$(cd "$(dirname "$0")/.." >/dev/null 2>&1 && pwd)/apt-install" -if [ ! 
-x "$APT_INSTALL" ]; then - echo "Error: apt-install not found or not executable" - exit 1 -fi - -"$APT_INSTALL" \ - asciidoctor \ - bash \ - bsdmainutils \ - build-essential \ - gdb \ - git-core \ - iproute2 \ - iptables \ - kmod \ - libaio-dev \ - libbsd-dev \ - libcap-dev \ - libdrm-dev \ - libelf-dev \ - libgnutls28-dev \ - libgnutls30 \ - libnet-dev \ - libnl-3-dev \ - libnl-route-3-dev \ - libperl-dev \ - libprotobuf-c-dev \ - libprotobuf-dev \ - libselinux-dev \ - libtraceevent-dev \ - libtracefs-dev \ - pkg-config \ - protobuf-c-compiler \ - protobuf-compiler \ - python3-importlib-metadata \ - python3-pip \ - python3-protobuf \ - python3-yaml \ - time \ - util-linux \ - uuid-dev diff --git a/contrib/dependencies/dnf-packages.sh b/contrib/dependencies/dnf-packages.sh deleted file mode 100755 index 793f267a5..000000000 --- a/contrib/dependencies/dnf-packages.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env sh - -dnf install -y \ - asciidoc \ - binutils \ - elfutils-libelf-devel \ - gcc \ - git \ - glibc-devel \ - gnutls-devel \ - iproute \ - iptables \ - libaio-devel \ - libasan \ - libbpf-devel \ - libbsd-devel \ - libcap-devel \ - libdrm-devel \ - libnet-devel \ - libnl3-devel \ - libselinux-devel \ - libtraceevent-devel \ - libtracefs-devel \ - libuuid-devel \ - make \ - nftables \ - pkg-config \ - protobuf \ - protobuf-c \ - protobuf-c-devel \ - protobuf-compiler \ - protobuf-devel \ - python-devel \ - python3-importlib-metadata \ - python3-protobuf \ - python3-pyyaml \ - python3-setuptools \ - python3-wheel \ - rubygem-asciidoctor \ - xmlto diff --git a/contrib/dependencies/pacman-packages.sh b/contrib/dependencies/pacman-packages.sh deleted file mode 100755 index 260797606..000000000 --- a/contrib/dependencies/pacman-packages.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env sh - -pacman -Syu --noconfirm \ - asciidoctor \ - base-devel \ - bash \ - coreutils \ - diffutils \ - git \ - gnutls \ - go \ - iproute2 \ - iptables \ - libaio \ - libbsd \ - 
libcap \ - libdrm \ - libelf \ - libnet \ - libnl \ - libtraceevent \ - libtracefs \ - nftables \ - pkg-config \ - protobuf \ - protobuf-c \ - python-importlib-metadata \ - python-pip \ - python-protobuf \ - python-yaml \ - sudo \ - tar \ - util-linux \ - util-linux-libs diff --git a/contrib/docker_cr.sh b/contrib/docker_cr.sh index 04ef676cd..9b43d8ba1 100755 --- a/contrib/docker_cr.sh +++ b/contrib/docker_cr.sh @@ -418,7 +418,7 @@ resolve_path() { local p p="${2}" - if command -v realpath > /dev/null; then + if which realpath > /dev/null; then p=$(realpath "${p}") fi ${ECHO} "${1}: ${p}" @@ -427,7 +427,7 @@ resolve_path() { resolve_cmd() { local cpath - cpath=$(command -v "${2}") + cpath=$(which "${2}") resolve_path "${1}" "${cpath}" } diff --git a/coredump/coredump b/coredump/coredump deleted file mode 100755 index 5b3e6f366..000000000 --- a/coredump/coredump +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -import platform -import argparse -import os -import sys - -import criu_coredump - -PLATFORMS = ["aarch64", "armv7l", "x86_64"] - - -def coredump(opts): - generator = criu_coredump.coredump_generator() - cores = generator(os.path.realpath(opts['in'])) - for pid in cores: - if opts['pid'] and pid != opts['pid']: - continue - with open(os.path.realpath(opts['out']) + "/core." 
+ str(pid), 'wb+') as f: - cores[pid].write(f) - - -def main(): - desc = 'CRIU core dump' - parser = argparse.ArgumentParser(description=desc, - formatter_class=argparse.RawTextHelpFormatter) - - parser.add_argument('-i', - '--in', - default='.', - help='directory where to get images from') - parser.add_argument('-p', - '--pid', - type=int, - help='generate coredump for specific pid(all pids py default)') - parser.add_argument('-o', - '--out', - default='.', - help='directory to write coredumps to') - - opts = vars(parser.parse_args()) - - if platform.machine() not in PLATFORMS: - print("ERROR: %s is only supported on: %s" % (sys.argv[0], ', '.join(PLATFORMS))) - sys.exit(1) - - try: - coredump(opts) - except SystemExit as error: - print('ERROR: %s' % error) - print('Exiting') - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/coredump/criu-coredump b/coredump/criu-coredump new file mode 100755 index 000000000..25c188c6b --- /dev/null +++ b/coredump/criu-coredump @@ -0,0 +1,40 @@ +#!/usr/bin/env python2 +import argparse +import os + +import criu_coredump + +def coredump(opts): + generator = criu_coredump.coredump_generator() + cores = generator(os.path.realpath(opts['in'])) + for pid in cores: + if opts['pid'] and pid != opts['pid']: + continue + with open(os.path.realpath(opts['out'])+"/core."+str(pid), 'w+') as f: + cores[pid].write(f) + + +def main(): + desc = 'CRIU core dump' + parser = argparse.ArgumentParser(description=desc, + formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument('-i', + '--in', + default = '.', + help = 'directory where to get images from') + parser.add_argument('-p', + '--pid', + type = int, + help = 'generate coredump for specific pid(all pids py default)') + parser.add_argument('-o', + '--out', + default = '.', + help = 'directory to write coredumps to') + + opts = vars(parser.parse_args()) + + coredump(opts) + +if __name__ == '__main__': + main() diff --git a/coredump/criu_coredump/__init__.py 
b/coredump/criu_coredump/__init__.py index c1a437cf4..213af42ec 100644 --- a/coredump/criu_coredump/__init__.py +++ b/coredump/criu_coredump/__init__.py @@ -1 +1,2 @@ -from .coredump import coredump_generator +from coredump import * +import elf diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index acb806ace..b37ef2291 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -29,12 +29,9 @@ # 4) VMAs contents; # import io -import sys +import elf import ctypes -import platform - from pycriu import images -from . import elf # Some memory-related constants PAGESIZE = 4096 @@ -54,8 +51,6 @@ status = { "VMA_AREA_SOCKET": 1 << 11, "VMA_AREA_VVAR": 1 << 12, "VMA_AREA_AIORING": 1 << 13, - "VMA_AREA_MEMFD": 1 << 14, - "VMA_AREA_UPROBES": 1 << 17, "VMA_AREA_UNSUPP": 1 << 31 } @@ -93,16 +88,11 @@ class coredump: for note in self.notes: buf.write(note.nhdr) buf.write(note.owner) - buf.write(b"\0" * (8 - len(note.owner))) + buf.write("\0" * (8 - len(note.owner))) buf.write(note.data) - bits = platform.architecture()[0] # 32 or 64 bits - - ehdr = {"32bit": elf.Elf32_Ehdr, "64bit": elf.Elf64_Ehdr} - phdr = {"32bit": elf.Elf32_Phdr, "64bit": elf.Elf64_Phdr} - - offset = ctypes.sizeof(ehdr[bits]()) - offset += (len(self.vmas) + 1) * ctypes.sizeof(phdr[bits]()) + offset = ctypes.sizeof(elf.Elf64_Ehdr()) + offset += (len(self.vmas) + 1) * ctypes.sizeof(elf.Elf64_Phdr()) filesz = 0 for note in self.notes: @@ -137,20 +127,6 @@ class coredump_generator: reg_files = None # reg-files; pagemaps = {} # pagemap by pid; - # thread info key based on the current arch - thread_info_key = { - "aarch64": "ti_aarch64", - "armv7l": "ti_arm", - "x86_64": "thread_info", - } - - machine = platform.machine() # current arch - bits = platform.architecture()[0] # 32 or 64 bits - - ehdr = {"32bit": elf.Elf32_Ehdr, "64bit": elf.Elf64_Ehdr} # 32 or 64 bits Ehdr - nhdr = {"32bit": elf.Elf32_Nhdr, "64bit": elf.Elf64_Nhdr} # 32 or 64 bits 
Nhdr - phdr = {"32bit": elf.Elf32_Phdr, "64bit": elf.Elf64_Phdr} # 32 or 64 bits Phdr - def _img_open_and_strip(self, name, single=False, pid=None): """ Load criu image and strip it from magic and redundant list. @@ -160,7 +136,7 @@ class coredump_generator: path += "-" + str(pid) path += ".img" - with open(path, 'rb') as f: + with open(path) as f: img = images.load(f) if single: @@ -201,7 +177,7 @@ class coredump_generator: for p in self.coredumps: if pid and p != pid: continue - with open(coredumps_dir + "/" + "core." + str(p), 'wb+') as f: + with open(coredumps_dir + "/" + "core." + str(p), 'w+') as f: self.coredumps[p].write(f) def _gen_coredump(self, pid): @@ -222,62 +198,44 @@ class coredump_generator: """ Generate elf header for process pid with program headers phdrs. """ - ei_class = {"32bit": elf.ELFCLASS32, "64bit": elf.ELFCLASS64} - - ehdr = self.ehdr[self.bits]() + ehdr = elf.Elf64_Ehdr() ctypes.memset(ctypes.addressof(ehdr), 0, ctypes.sizeof(ehdr)) ehdr.e_ident[elf.EI_MAG0] = elf.ELFMAG0 ehdr.e_ident[elf.EI_MAG1] = elf.ELFMAG1 ehdr.e_ident[elf.EI_MAG2] = elf.ELFMAG2 ehdr.e_ident[elf.EI_MAG3] = elf.ELFMAG3 - ehdr.e_ident[elf.EI_CLASS] = ei_class[self.bits] + ehdr.e_ident[elf.EI_CLASS] = elf.ELFCLASS64 ehdr.e_ident[elf.EI_DATA] = elf.ELFDATA2LSB ehdr.e_ident[elf.EI_VERSION] = elf.EV_CURRENT - if self.machine == "armv7l": - ehdr.e_ident[elf.EI_OSABI] = elf.ELFOSABI_ARM - else: - ehdr.e_ident[elf.EI_OSABI] = elf.ELFOSABI_NONE - ehdr.e_type = elf.ET_CORE - ehdr.e_machine = self._get_e_machine() + ehdr.e_machine = elf.EM_X86_64 ehdr.e_version = elf.EV_CURRENT - ehdr.e_phoff = ctypes.sizeof(self.ehdr[self.bits]()) - ehdr.e_ehsize = ctypes.sizeof(self.ehdr[self.bits]()) - ehdr.e_phentsize = ctypes.sizeof(self.phdr[self.bits]()) - # FIXME Case len(phdrs) > PN_XNUM should be handled properly. 
+ ehdr.e_phoff = ctypes.sizeof(elf.Elf64_Ehdr()) + ehdr.e_ehsize = ctypes.sizeof(elf.Elf64_Ehdr()) + ehdr.e_phentsize = ctypes.sizeof(elf.Elf64_Phdr()) + #FIXME Case len(phdrs) > PN_XNUM should be handled properly. # See fs/binfmt_elf.c from linux kernel. ehdr.e_phnum = len(phdrs) return ehdr - def _get_e_machine(self): - """ - Get the e_machine field based on the current architecture. - """ - e_machine_dict = { - "aarch64": elf.EM_AARCH64, - "armv7l": elf.EM_ARM, - "x86_64": elf.EM_X86_64, - } - return e_machine_dict[self.machine] - def _gen_phdrs(self, pid, notes, vmas): """ Generate program headers for process pid. """ phdrs = [] - offset = ctypes.sizeof(self.ehdr[self.bits]()) - offset += (len(vmas) + 1) * ctypes.sizeof(self.phdr[self.bits]()) + offset = ctypes.sizeof(elf.Elf64_Ehdr()) + offset += (len(vmas) + 1) * ctypes.sizeof(elf.Elf64_Phdr()) filesz = 0 for note in notes: filesz += ctypes.sizeof(note.nhdr) + ctypes.sizeof(note.data) + 8 # PT_NOTE - phdr = self.phdr[self.bits]() + phdr = elf.Elf64_Phdr() ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr)) phdr.p_type = elf.PT_NOTE phdr.p_offset = offset @@ -297,7 +255,7 @@ class coredump_generator: for vma in vmas: offset += filesz filesz = vma.filesz - phdr = self.phdr[self.bits]() + phdr = elf.Elf64_Phdr() ctypes.memset(ctypes.addressof(phdr), 0, ctypes.sizeof(phdr)) phdr.p_type = elf.PT_LOAD phdr.p_align = PAGESIZE @@ -337,7 +295,7 @@ class coredump_generator: prpsinfo.pr_state = 3 # Don't even ask me why it is so, just borrowed from linux # source and made pr_state match. - prpsinfo.pr_sname = b'.' if prpsinfo.pr_state > 5 else b"RSDTZW" [ + prpsinfo.pr_sname = '.' 
if prpsinfo.pr_state > 5 else "RSDTZW" [ prpsinfo.pr_state] prpsinfo.pr_zomb = 1 if prpsinfo.pr_state == 4 else 0 prpsinfo.pr_nice = core["thread_core"][ @@ -349,19 +307,17 @@ class coredump_generator: prpsinfo.pr_ppid = pstree["ppid"] prpsinfo.pr_pgrp = pstree["pgid"] prpsinfo.pr_sid = pstree["sid"] - # prpsinfo.pr_psargs has a limit of 80 characters which means it will - # fail here if the cmdline is longer than 80 - prpsinfo.pr_psargs = self._gen_cmdline(pid)[:80] - prpsinfo.pr_fname = core["tc"]["comm"].encode() + prpsinfo.pr_fname = core["tc"]["comm"] + prpsinfo.pr_psargs = self._gen_cmdline(pid) - nhdr = self.nhdr[self.bits]() + nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_prpsinfo()) nhdr.n_type = elf.NT_PRPSINFO note = elf_note() note.data = prpsinfo - note.owner = b"CORE" + note.owner = "CORE" note.nhdr = nhdr return note @@ -371,110 +327,82 @@ class coredump_generator: Generate NT_PRSTATUS note for thread tid of process pid. """ core = self.cores[tid] - regs = self._get_gpregs(core) + regs = core["thread_info"]["gpregs"] pstree = self.pstree[pid] prstatus = elf.elf_prstatus() ctypes.memset(ctypes.addressof(prstatus), 0, ctypes.sizeof(prstatus)) - # FIXME setting only some of the fields for now. Revisit later. + #FIXME setting only some of the fields for now. Revisit later. 
prstatus.pr_pid = tid prstatus.pr_ppid = pstree["ppid"] prstatus.pr_pgrp = pstree["pgid"] prstatus.pr_sid = pstree["sid"] - self._set_pr_regset(prstatus.pr_reg, regs) + prstatus.pr_reg.r15 = regs["r15"] + prstatus.pr_reg.r14 = regs["r14"] + prstatus.pr_reg.r13 = regs["r13"] + prstatus.pr_reg.r12 = regs["r12"] + prstatus.pr_reg.rbp = regs["bp"] + prstatus.pr_reg.rbx = regs["bx"] + prstatus.pr_reg.r11 = regs["r11"] + prstatus.pr_reg.r10 = regs["r10"] + prstatus.pr_reg.r9 = regs["r9"] + prstatus.pr_reg.r8 = regs["r8"] + prstatus.pr_reg.rax = regs["ax"] + prstatus.pr_reg.rcx = regs["cx"] + prstatus.pr_reg.rdx = regs["dx"] + prstatus.pr_reg.rsi = regs["si"] + prstatus.pr_reg.rdi = regs["di"] + prstatus.pr_reg.orig_rax = regs["orig_ax"] + prstatus.pr_reg.rip = regs["ip"] + prstatus.pr_reg.cs = regs["cs"] + prstatus.pr_reg.eflags = regs["flags"] + prstatus.pr_reg.rsp = regs["sp"] + prstatus.pr_reg.ss = regs["ss"] + prstatus.pr_reg.fs_base = regs["fs_base"] + prstatus.pr_reg.gs_base = regs["gs_base"] + prstatus.pr_reg.ds = regs["ds"] + prstatus.pr_reg.es = regs["es"] + prstatus.pr_reg.fs = regs["fs"] + prstatus.pr_reg.gs = regs["gs"] - nhdr = self.nhdr[self.bits]() + nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.elf_prstatus()) nhdr.n_type = elf.NT_PRSTATUS note = elf_note() note.data = prstatus - note.owner = b"CORE" + note.owner = "CORE" note.nhdr = nhdr return note - def _get_gpregs(self, core): - """ - Get the general purpose registers based on the current architecture. - """ - thread_info_key = self.thread_info_key[self.machine] - thread_info = core[thread_info_key] - - return thread_info["gpregs"] - - def _set_pr_regset(self, pr_reg, regs): - """ - Set the pr_reg struct based on the current architecture. 
- """ - if self.machine == "aarch64": - pr_reg.regs = (ctypes.c_ulonglong * len(regs["regs"]))(*regs["regs"]) - pr_reg.sp = regs["sp"] - pr_reg.pc = regs["pc"] - pr_reg.pstate = regs["pstate"] - elif self.machine == "armv7l": - pr_reg.r0 = regs["r0"] - pr_reg.r1 = regs["r1"] - pr_reg.r2 = regs["r2"] - pr_reg.r3 = regs["r3"] - pr_reg.r4 = regs["r4"] - pr_reg.r5 = regs["r5"] - pr_reg.r6 = regs["r6"] - pr_reg.r7 = regs["r7"] - pr_reg.r8 = regs["r8"] - pr_reg.r9 = regs["r9"] - pr_reg.r10 = regs["r10"] - pr_reg.fp = regs["fp"] - pr_reg.ip = regs["ip"] - pr_reg.sp = regs["sp"] - pr_reg.lr = regs["lr"] - pr_reg.pc = regs["pc"] - pr_reg.cpsr = regs["cpsr"] - pr_reg.orig_r0 = regs["orig_r0"] - elif self.machine == "x86_64": - pr_reg.r15 = regs["r15"] - pr_reg.r14 = regs["r14"] - pr_reg.r13 = regs["r13"] - pr_reg.r12 = regs["r12"] - pr_reg.rbp = regs["bp"] - pr_reg.rbx = regs["bx"] - pr_reg.r11 = regs["r11"] - pr_reg.r10 = regs["r10"] - pr_reg.r9 = regs["r9"] - pr_reg.r8 = regs["r8"] - pr_reg.rax = regs["ax"] - pr_reg.rcx = regs["cx"] - pr_reg.rdx = regs["dx"] - pr_reg.rsi = regs["si"] - pr_reg.rdi = regs["di"] - pr_reg.orig_rax = regs["orig_ax"] - pr_reg.rip = regs["ip"] - pr_reg.cs = regs["cs"] - pr_reg.eflags = regs["flags"] - pr_reg.rsp = regs["sp"] - pr_reg.ss = regs["ss"] - pr_reg.fs_base = regs["fs_base"] - pr_reg.gs_base = regs["gs_base"] - pr_reg.ds = regs["ds"] - pr_reg.es = regs["es"] - pr_reg.fs = regs["fs"] - pr_reg.gs = regs["gs"] - def _gen_fpregset(self, pid, tid): """ Generate NT_FPREGSET note for thread tid of process pid. 
""" core = self.cores[tid] - regs = self._get_fpregs(core) + regs = core["thread_info"]["fpregs"] fpregset = elf.elf_fpregset_t() ctypes.memset(ctypes.addressof(fpregset), 0, ctypes.sizeof(fpregset)) - self._set_fpregset(fpregset, regs) + fpregset.cwd = regs["cwd"] + fpregset.swd = regs["swd"] + fpregset.ftw = regs["twd"] + fpregset.fop = regs["fop"] + fpregset.rip = regs["rip"] + fpregset.rdp = regs["rdp"] + fpregset.mxcsr = regs["mxcsr"] + fpregset.mxcr_mask = regs["mxcsr_mask"] + fpregset.st_space = (ctypes.c_uint * len(regs["st_space"]))( + *regs["st_space"]) + fpregset.xmm_space = (ctypes.c_uint * len(regs["xmm_space"]))( + *regs["xmm_space"]) + #fpregset.padding = regs["padding"] unused nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 @@ -483,87 +411,7 @@ class coredump_generator: note = elf_note() note.data = fpregset - note.owner = b"CORE" - note.nhdr = nhdr - - return note - - def _get_fpregs(self, core): - """ - Get the floating point register dictionary based on the current architecture. - """ - fpregs_key_dict = {"aarch64": "fpsimd", "x86_64": "fpregs"} - fpregs_key = fpregs_key_dict[self.machine] - - thread_info_key = self.thread_info_key[self.machine] - - return core[thread_info_key][fpregs_key] - - def _set_fpregset(self, fpregset, regs): - """ - Set the fpregset struct based on the current architecture. 
- """ - if self.machine == "aarch64": - fpregset.vregs = (ctypes.c_ulonglong * len(regs["vregs"]))(*regs["vregs"]) - fpregset.fpsr = regs["fpsr"] - fpregset.fpcr = regs["fpcr"] - elif self.machine == "x86_64": - fpregset.cwd = regs["cwd"] - fpregset.swd = regs["swd"] - fpregset.ftw = regs["twd"] - fpregset.fop = regs["fop"] - fpregset.rip = regs["rip"] - fpregset.rdp = regs["rdp"] - fpregset.mxcsr = regs["mxcsr"] - fpregset.mxcr_mask = regs["mxcsr_mask"] - fpregset.st_space = (ctypes.c_uint * len(regs["st_space"]))( - *regs["st_space"]) - fpregset.xmm_space = (ctypes.c_uint * len(regs["xmm_space"]))( - *regs["xmm_space"]) - - def _gen_arm_tls(self, tid): - """ - Generate NT_ARM_TLS note for thread tid of process pid. - """ - core = self.cores[tid] - tls = ctypes.c_ulonglong(core["ti_aarch64"]["tls"]) - - nhdr = elf.Elf64_Nhdr() - nhdr.n_namesz = 6 - nhdr.n_descsz = ctypes.sizeof(ctypes.c_ulonglong) - nhdr.n_type = elf.NT_ARM_TLS - - note = elf_note() - note.data = tls - note.owner = b"LINUX" - note.nhdr = nhdr - - return note - - def _gen_arm_vfp(self, tid): - """ - Generate NT_ARM_VFP note for thread tid of process pid. 
- """ - core = self.cores[tid] - fpstate = core["ti_arm"]["fpstate"] - - data = elf.vfp_hard_struct() - ctypes.memset(ctypes.addressof(data), 0, ctypes.sizeof(data)) - - data.vfp_regs = (ctypes.c_uint64 * len(fpstate["vfp_regs"]))(*fpstate["vfp_regs"]) - data.fpexc = fpstate["fpexc"] - data.fpscr = fpstate["fpscr"] - data.fpinst = fpstate["fpinst"] - data.fpinst2 = fpstate["fpinst2"] - - nhdr = elf.Elf32_Nhdr() - nhdr.n_namesz = 6 - nhdr.n_descsz = ctypes.sizeof(data) - nhdr.n_type = elf.NT_ARM_VFP - - note = elf_note() - note.data = data - note.owner = b"LINUX" + note.owner = "CORE" note.nhdr = nhdr return note @@ -604,7 +452,7 @@ class coredump_generator: note = elf_note() note.data = data - note.owner = b"LINUX" + note.owner = "LINUX" note.nhdr = nhdr return note @@ -617,14 +465,14 @@ class coredump_generator: # FIXME zeroify everything for now ctypes.memset(ctypes.addressof(siginfo), 0, ctypes.sizeof(siginfo)) - nhdr = self.nhdr[self.bits]() + nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 nhdr.n_descsz = ctypes.sizeof(elf.siginfo_t()) nhdr.n_type = elf.NT_SIGINFO note = elf_note() note.data = siginfo - note.owner = b"CORE" + note.owner = "CORE" note.nhdr = nhdr return note @@ -634,29 +482,24 @@ class coredump_generator: Generate NT_AUXV note for thread tid of process pid. 
""" mm = self.mms[pid] - num_auxv = len(mm["mm_saved_auxv"]) // 2 + num_auxv = len(mm["mm_saved_auxv"]) / 2 - class elf32_auxv(ctypes.Structure): - _fields_ = [("auxv", elf.Elf32_auxv_t * num_auxv)] - - class elf64_auxv(ctypes.Structure): + class elf_auxv(ctypes.Structure): _fields_ = [("auxv", elf.Elf64_auxv_t * num_auxv)] - elf_auxv = {"32bit": elf32_auxv(), "64bit": elf64_auxv()} - - auxv = elf_auxv[self.bits] + auxv = elf_auxv() for i in range(num_auxv): auxv.auxv[i].a_type = mm["mm_saved_auxv"][i] auxv.auxv[i].a_val = mm["mm_saved_auxv"][i + 1] - nhdr = self.nhdr[self.bits]() + nhdr = elf.Elf64_Nhdr() nhdr.n_namesz = 5 - nhdr.n_descsz = ctypes.sizeof(elf_auxv[self.bits]) + nhdr.n_descsz = ctypes.sizeof(elf_auxv()) nhdr.n_type = elf.NT_AUXV note = elf_note() note.data = auxv - note.owner = b"CORE" + note.owner = "CORE" note.nhdr = nhdr return note @@ -680,10 +523,10 @@ class coredump_generator: continue shmid = vma["shmid"] - off = vma["pgoff"] // PAGESIZE + off = vma["pgoff"] / PAGESIZE files = self.reg_files - fname = next(filter(lambda x: x["id"] == shmid, files))["name"] + fname = filter(lambda x: x["id"] == shmid, files)[0]["name"] info = mmaped_file_info() info.start = vma["start"] @@ -726,17 +569,17 @@ class coredump_generator: setattr(data, "start" + str(i), info.start) setattr(data, "end" + str(i), info.end) setattr(data, "file_ofs" + str(i), info.file_ofs) - setattr(data, "name" + str(i), info.name.encode()) + setattr(data, "name" + str(i), info.name) - nhdr = self.nhdr[self.bits]() + nhdr = elf.Elf64_Nhdr() - nhdr.n_namesz = 5 # strlen + 1 + nhdr.n_namesz = 5 #XXX strlen + 1 nhdr.n_descsz = ctypes.sizeof(elf_files()) nhdr.n_type = elf.NT_FILE note = elf_note() note.nhdr = nhdr - note.owner = b"CORE" + note.owner = "CORE" note.data = data return note @@ -745,15 +588,9 @@ class coredump_generator: notes = [] notes.append(self._gen_prstatus(pid, tid)) - if self.machine != "armv7l": - notes.append(self._gen_fpregset(pid, tid)) + 
notes.append(self._gen_fpregset(pid, tid)) + notes.append(self._gen_x86_xstate(pid, tid)) notes.append(self._gen_siginfo(pid, tid)) - if self.machine == "aarch64": - notes.append(self._gen_arm_tls(tid)) - elif self.machine == "armv7l": - notes.append(self._gen_arm_vfp(tid)) - elif self.machine == "x86_64": - notes.append(self._gen_x86_xstate(pid, tid)) return notes @@ -794,9 +631,7 @@ class coredump_generator: off = 0 # in pages for m in pagemap[1:]: found = False - num_pages = m.get("nr_pages", m["compat_nr_pages"]) - - for i in range(num_pages): + for i in range(m["nr_pages"]): if m["vaddr"] + i * PAGESIZE == page_no * PAGESIZE: found = True break @@ -805,11 +640,11 @@ class coredump_generator: if not found: continue - if "in_parent" in m and m["in_parent"]: + if "in_parent" in m and m["in_parent"] == True: ppid = self.pstree[pid]["ppid"] return self._get_page(ppid, page_no) else: - with open(self._imgs_dir + "/pages-%s.img" % pages_id, 'rb') as f: + with open(self._imgs_dir + "/pages-%s.img" % pages_id) as f: f.seek(off * PAGESIZE) return f.read(PAGESIZE) @@ -822,16 +657,16 @@ class coredump_generator: f = None if size == 0: - return b"" + return "" if vma["status"] & status["VMA_AREA_VVAR"]: - # FIXME this is what gdb does, as vvar vma + #FIXME this is what gdb does, as vvar vma # is not readable from userspace? - return b"\0" * size + return "\0" * size elif vma["status"] & status["VMA_AREA_VSYSCALL"]: - # FIXME need to dump it with criu or read from + #FIXME need to dump it with criu or read from # current process. - return b"\0" * size + return "\0" * size if vma["status"] & status["VMA_FILE_SHARED"] or \ vma["status"] & status["VMA_FILE_PRIVATE"]: @@ -840,13 +675,9 @@ class coredump_generator: off = vma["pgoff"] files = self.reg_files - fname = next(filter(lambda x: x["id"] == shmid, files))["name"] - - try: - f = open(fname, 'rb') - except FileNotFoundError: - sys.exit('Required file %s not found.' 
% fname) + fname = filter(lambda x: x["id"] == shmid, files)[0]["name"] + f = open(fname) f.seek(off) start = vma["start"] @@ -868,10 +699,10 @@ class coredump_generator: # a file, and changed ones -- from pages.img. # Finally, if no page is found neither in pages.img nor # in file, hole in inserted -- a page filled with zeroes. - start_page = start // PAGESIZE - end_page = end // PAGESIZE + start_page = start / PAGESIZE + end_page = end / PAGESIZE - buf = b"" + buf = "" for page_no in range(start_page, end_page + 1): page = None @@ -879,17 +710,17 @@ class coredump_generator: # and choose appropriate. page_mem = self._get_page(pid, page_no) - if f is not None: + if f != None: page = f.read(PAGESIZE) - if page_mem is not None: + if page_mem != None: # Page from pages.img has higher priority - # than one from mapped file on disk. + # than one from maped file on disk. page = page_mem if page is None: # Hole - page = PAGESIZE * b"\0" + page = PAGESIZE * "\0" # If it is a start or end page, we need to read # only part of it. @@ -909,7 +740,7 @@ class coredump_generator: buf += page[n_skip:n_skip + n_read] # Don't forget to close file. - if f is not None: + if f != None: f.close() return buf @@ -931,25 +762,25 @@ class coredump_generator: chunk = self._gen_mem_chunk(pid, vma, size) # Replace all '\0's with spaces. - return chunk.replace(b'\0', b' ') + return chunk.replace('\0', ' ') def _get_vma_dump_size(self, vma): """ Calculate amount of vma to put into core dump. 
""" - if (vma["status"] & status["VMA_AREA_VVAR"] or - vma["status"] & status["VMA_AREA_VSYSCALL"] or - vma["status"] & status["VMA_AREA_VDSO"]): + if vma["status"] & status["VMA_AREA_VVAR"] or \ + vma["status"] & status["VMA_AREA_VSYSCALL"] or \ + vma["status"] & status["VMA_AREA_VDSO"]: size = vma["end"] - vma["start"] elif vma["prot"] == 0: size = 0 - elif (vma["prot"] & prot["PROT_READ"] and - vma["prot"] & prot["PROT_EXEC"]): + elif vma["prot"] & prot["PROT_READ"] and \ + vma["prot"] & prot["PROT_EXEC"]: size = PAGESIZE - elif (vma["status"] & status["VMA_ANON_SHARED"] or - vma["status"] & status["VMA_FILE_SHARED"] or - vma["status"] & status["VMA_ANON_PRIVATE"] or - vma["status"] & status["VMA_FILE_PRIVATE"]): + elif vma["status"] & status["VMA_ANON_SHARED"] or \ + vma["status"] & status["VMA_FILE_SHARED"] or \ + vma["status"] & status["VMA_ANON_PRIVATE"] or \ + vma["status"] & status["VMA_FILE_PRIVATE"]: size = vma["end"] - vma["start"] else: size = 0 @@ -988,6 +819,8 @@ class coredump_generator: vmas = [] for vma in mm["vmas"]: + size = self._get_vma_dump_size(vma) + v = vma_class() v.filesz = self._get_vma_dump_size(vma) v.data = self._gen_mem_chunk(pid, vma, v.filesz) diff --git a/coredump/criu_coredump/elf.py b/coredump/criu_coredump/elf.py index 2911f491e..e65919e6b 100644 --- a/coredump/criu_coredump/elf.py +++ b/coredump/criu_coredump/elf.py @@ -1,14 +1,5 @@ # Define structures and constants for generating elf file. 
import ctypes -import platform - -MACHINE = platform.machine() - -Elf32_Half = ctypes.c_uint16 # typedef uint16_t Elf32_Half; -Elf32_Word = ctypes.c_uint32 # typedef uint32_t Elf32_Word; -Elf32_Addr = ctypes.c_uint32 # typedef uint32_t Elf32_Addr; -Elf32_Off = ctypes.c_uint32 # typedef uint32_t Elf32_Off; -Elf32_Xword = ctypes.c_uint64 # typedef uint64_t Elf32_Xword; Elf64_Half = ctypes.c_uint16 # typedef uint16_t Elf64_Half; Elf64_Word = ctypes.c_uint32 # typedef uint32_t Elf64_Word; @@ -16,7 +7,7 @@ Elf64_Addr = ctypes.c_uint64 # typedef uint64_t Elf64_Addr; Elf64_Off = ctypes.c_uint64 # typedef uint64_t Elf64_Off; Elf64_Xword = ctypes.c_uint64 # typedef uint64_t Elf64_Xword; -# Elf_Ehdr related constants. +# Elf64_Ehdr related constants. # e_ident size. EI_NIDENT = 16 # #define EI_NIDENT (16) @@ -25,84 +16,58 @@ EI_MAG0 = 0 # #define EI_MAG0 0 /* File identification by ELFMAG0 = 0x7f # #define ELFMAG0 0x7f /* Magic number byte 0 */ EI_MAG1 = 1 # #define EI_MAG1 1 /* File identification byte 1 index */ -ELFMAG1 = ord('E') # #define ELFMAG1 'E' /* Magic number byte 1 */ +ELFMAG1 = ord( + 'E') # #define ELFMAG1 'E' /* Magic number byte 1 */ EI_MAG2 = 2 # #define EI_MAG2 2 /* File identification byte 2 index */ -ELFMAG2 = ord('L') # #define ELFMAG2 'L' /* Magic number byte 2 */ +ELFMAG2 = ord( + 'L') # #define ELFMAG2 'L' /* Magic number byte 2 */ EI_MAG3 = 3 # #define EI_MAG3 3 /* File identification byte 3 index */ -ELFMAG3 = ord('F') # #define ELFMAG3 'F' /* Magic number byte 3 */ +ELFMAG3 = ord( + 'F') # #define ELFMAG3 'F' /* Magic number byte 3 */ EI_CLASS = 4 # #define EI_CLASS 4 /* File class byte index */ EI_DATA = 5 # #define EI_DATA 5 /* Data encoding byte index */ -EI_OSABI = 7 # #define EI_OSABI 7 /* OS ABI identification */ - EI_VERSION = 6 # #define EI_VERSION 6 /* File version byte index */ ELFDATA2LSB = 1 # #define ELFDATA2LSB 1 /* 2's complement, little endian */ -ELFCLASS32 = 1 # #define ELFCLASS32 1 /* 32-bit objects */ ELFCLASS64 = 2 # #define 
ELFCLASS64 2 /* 64-bit objects */ # Legal values for e_type (object file type). ET_CORE = 4 # #define ET_CORE 4 /* Core file */ # Legal values for e_machine (architecture). -EM_ARM = 40 # #define EM_ARM 40 /* ARM */ EM_X86_64 = 62 # #define EM_X86_64 62 /* AMD x86-64 architecture */ -EM_AARCH64 = 183 # #define EM_AARCH64 183 /* ARM AARCH64 */ # Legal values for e_version (version). EV_CURRENT = 1 # #define EV_CURRENT 1 /* Current version */ -# Legal values for e_osabi -ELFOSABI_NONE = 0 # #define ELFOSABI_NONE 0 /* UNIX System V ABI */ -ELFOSABI_ARM = 97 # #define ELFOSABI_ARM 97 /* ARM */ - - -class Elf32_Ehdr(ctypes.Structure): # typedef struct - _fields_ = [ - ("e_ident", - ctypes.c_ubyte * EI_NIDENT), # unsigned char e_ident[EI_NIDENT]; - ("e_type", Elf32_Half), # Elf32_Half e_type; - ("e_machine", Elf32_Half), # Elf32_Half e_machine; - ("e_version", Elf32_Word), # Elf32_Word e_version; - ("e_entry", Elf32_Addr), # Elf32_Addr e_entry; - ("e_phoff", Elf32_Off), # Elf32_Off e_phoff; - ("e_shoff", Elf32_Off), # Elf32_Off e_shoff; - ("e_flags", Elf32_Word), # Elf32_Word e_flags; - ("e_ehsize", Elf32_Half), # Elf32_Half e_ehsize; - ("e_phentsize", Elf32_Half), # Elf32_Half e_phentsize; - ("e_phnum", Elf32_Half), # Elf32_Half e_phnum; - ("e_shentsize", Elf32_Half), # Elf32_Half e_shentsize; - ("e_shnum", Elf32_Half), # Elf32_Half e_shnum; - ("e_shstrndx", Elf32_Half) # Elf32_Half e_shstrndx; - ] # } Elf32_Ehdr; - class Elf64_Ehdr(ctypes.Structure): # typedef struct - _fields_ = [ + _fields_ = [ # { ("e_ident", - ctypes.c_ubyte * EI_NIDENT), # unsigned char e_ident[EI_NIDENT]; - ("e_type", Elf64_Half), # Elf64_Half e_type; - ("e_machine", Elf64_Half), # Elf64_Half e_machine; - ("e_version", Elf64_Word), # Elf64_Word e_version; - ("e_entry", Elf64_Addr), # Elf64_Addr e_entry; - ("e_phoff", Elf64_Off), # Elf64_Off e_phoff; - ("e_shoff", Elf64_Off), # Elf64_Off e_shoff; - ("e_flags", Elf64_Word), # Elf64_Word e_flags; - ("e_ehsize", Elf64_Half), # Elf64_Half e_ehsize; - 
("e_phentsize", Elf64_Half), # Elf64_Half e_phentsize; - ("e_phnum", Elf64_Half), # Elf64_Half e_phnum; - ("e_shentsize", Elf64_Half), # Elf64_Half e_shentsize; - ("e_shnum", Elf64_Half), # Elf64_Half e_shnum; - ("e_shstrndx", Elf64_Half) # Elf64_Half e_shstrndx; + ctypes.c_ubyte * EI_NIDENT), # unsigned char e_ident[EI_NIDENT]; + ("e_type", Elf64_Half), # Elf64_Half e_type; + ("e_machine", Elf64_Half), # Elf64_Half e_machine; + ("e_version", Elf64_Word), # Elf64_Word e_version; + ("e_entry", Elf64_Addr), # Elf64_Addr e_entry; + ("e_phoff", Elf64_Off), # Elf64_Off e_phoff; + ("e_shoff", Elf64_Off), # Elf64_Off e_shoff; + ("e_flags", Elf64_Word), # Elf64_Word e_flags; + ("e_ehsize", Elf64_Half), # Elf64_Half e_ehsize; + ("e_phentsize", Elf64_Half), # Elf64_Half e_phentsize; + ("e_phnum", Elf64_Half), # Elf64_Half e_phnum; + ("e_shentsize", Elf64_Half), # Elf64_Half e_shentsize; + ("e_shnum", Elf64_Half), # Elf64_Half e_shnum; + ("e_shstrndx", Elf64_Half) # Elf64_Half e_shstrndx; ] # } Elf64_Ehdr; -# Elf_Phdr related constants. +# Elf64_Phdr related constants. # Legal values for p_type (segment type). 
PT_LOAD = 1 # #define PT_LOAD 1 /* Loadable program segment */ @@ -114,51 +79,20 @@ PF_W = 1 << 1 # #define PF_W (1 << 1) /* Segment is writable PF_R = 1 << 2 # #define PF_R (1 << 2) /* Segment is readable */ -class Elf32_Phdr(ctypes.Structure): # typedef struct - _fields_ = [ - ("p_type", Elf32_Word), # Elf32_Word p_type; - ("p_offset", Elf32_Off), # Elf32_Off p_offset; - ("p_vaddr", Elf32_Addr), # Elf32_Addr p_vaddr; - ("p_paddr", Elf32_Addr), # Elf32_Addr p_paddr; - ("p_filesz", Elf32_Word), # Elf32_Word p_filesz; - ("p_memsz", Elf32_Word), # Elf32_Word p_memsz; - ("p_flags", Elf32_Word), # Elf32_Word p_flags; - ("p_align", Elf32_Word), # Elf32_Word p_align; - ] # } Elf32_Phdr; - - class Elf64_Phdr(ctypes.Structure): # typedef struct - _fields_ = [ - ("p_type", Elf64_Word), # Elf64_Word p_type; - ("p_flags", Elf64_Word), # Elf64_Word p_flags; - ("p_offset", Elf64_Off), # Elf64_Off p_offset; - ("p_vaddr", Elf64_Addr), # Elf64_Addr p_vaddr; - ("p_paddr", Elf64_Addr), # Elf64_Addr p_paddr; - ("p_filesz", Elf64_Xword), # Elf64_Xword p_filesz; - ("p_memsz", Elf64_Xword), # Elf64_Xword p_memsz; - ("p_align", Elf64_Xword), # Elf64_Xword p_align; + _fields_ = [ # { + ("p_type", Elf64_Word), # Elf64_Word p_type; + ("p_flags", Elf64_Word), # Elf64_Word p_flags; + ("p_offset", Elf64_Off), # Elf64_Off p_offset; + ("p_vaddr", Elf64_Addr), # Elf64_Addr p_vaddr; + ("p_paddr", Elf64_Addr), # Elf64_Addr p_paddr; + ("p_filesz", Elf64_Xword), # Elf64_Xword p_filesz; + ("p_memsz", Elf64_Xword), # Elf64_Xword p_memsz; + ("p_align", Elf64_Xword), # Elf64_Xword p_align; ] # } Elf64_Phdr; -# Elf_auxv_t related constants. - - -class _Elf32_auxv_t_U(ctypes.Union): - _fields_ = [("a_val", ctypes.c_uint32)] - - -class Elf32_auxv_t(ctypes.Structure): # typedef struct - _fields_ = [ - ("a_type", - ctypes.c_uint32), # uint32_t a_type; /* Entry type */ - ("a_un", _Elf32_auxv_t_U) # union - - # uint32_t a_val; /* Integer value */ - # /* We use to have pointer elements added here. 
We cannot do that, - # though, since it does not work when using 32-bit definitions - # on 64-bit platforms and vice versa. */ - # } a_un; - ] # } Elf32_auxv_t; +# Elf64_auxv_t related constants. class _Elf64_auxv_t_U(ctypes.Union): @@ -166,150 +100,78 @@ class _Elf64_auxv_t_U(ctypes.Union): class Elf64_auxv_t(ctypes.Structure): # typedef struct - _fields_ = [ + _fields_ = [ # { ("a_type", - ctypes.c_uint64), # uint64_t a_type; /* Entry type */ - ("a_un", _Elf64_auxv_t_U) # union - - # uint64_t a_val; /* Integer value */ - # /* We use to have pointer elements added here. We cannot do that, - # though, since it does not work when using 32-bit definitions - # on 64-bit platforms and vice versa. */ - # } a_un; + ctypes.c_uint64), # uint64_t a_type; /* Entry type */ + ("a_un", _Elf64_auxv_t_U) # union + # { + # uint64_t a_val; /* Integer value */ + # /* We use to have pointer elements added here. We cannot do that, + # though, since it does not work when using 32-bit definitions + # on 64-bit platforms and vice versa. */ + # } a_un; ] # } Elf64_auxv_t; -# Elf_Nhdr related constants. +# Elf64_Nhdr related constants. 
-NT_PRSTATUS = 1 # #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ -NT_FPREGSET = 2 # #define NT_FPREGSET 2 /* Contains copy of fpregset struct */ -NT_PRPSINFO = 3 # #define NT_PRPSINFO 3 /* Contains copy of prpsinfo struct */ -NT_AUXV = 6 # #define NT_AUXV 6 /* Contains copy of auxv array */ -NT_SIGINFO = 0x53494749 # #define NT_SIGINFO 0x53494749 /* Contains copy of siginfo_t, size might increase */ -NT_FILE = 0x46494c45 # #define NT_FILE 0x46494c45 /* Contains information about mapped files */ -NT_X86_XSTATE = 0x202 # #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ -NT_ARM_VFP = 0x400 # #define NT_ARM_VFP 0x400 /* ARM VFP/NEON registers */ -NT_ARM_TLS = 0x401 # #define NT_ARM_TLS 0x401 /* ARM TLS register */ - - -class Elf32_Nhdr(ctypes.Structure): # typedef struct - _fields_ = [ - ( - "n_namesz", Elf32_Word - ), # Elf32_Word n_namesz; /* Length of the note's name. */ - ( - "n_descsz", Elf32_Word - ), # Elf32_Word n_descsz; /* Length of the note's descriptor. */ - ( - "n_type", Elf32_Word - ), # Elf32_Word n_type; /* Type of the note. */ - ] # } Elf32_Nhdr; +NT_PRSTATUS = 1 # #define NT_PRSTATUS 1 /* Contains copy of prstatus struct */ +NT_FPREGSET = 2 # #define NT_FPREGSET 2 /* Contains copy of fpregset struct */ +NT_PRPSINFO = 3 # #define NT_PRPSINFO 3 /* Contains copy of prpsinfo struct */ +NT_AUXV = 6 # #define NT_AUXV 6 /* Contains copy of auxv array */ +NT_SIGINFO = 0x53494749 # #define NT_SIGINFO 0x53494749 /* Contains copy of siginfo_t, +# size might increase */ +NT_FILE = 0x46494c45 # #define NT_FILE 0x46494c45 /* Contains information about mapped +# files */ +NT_X86_XSTATE = 0x202 # #define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ class Elf64_Nhdr(ctypes.Structure): # typedef struct - _fields_ = [ + _fields_ = [ # { ( "n_namesz", Elf64_Word - ), # Elf64_Word n_namesz; /* Length of the note's name. */ + ), # Elf64_Word n_namesz; /* Length of the note's name. 
*/ ( "n_descsz", Elf64_Word - ), # Elf64_Word n_descsz; /* Length of the note's descriptor. */ + ), # Elf64_Word n_descsz; /* Length of the note's descriptor. */ ("n_type", Elf64_Word - ), # Elf64_Word n_type; /* Type of the note. */ + ), # Elf64_Word n_type; /* Type of the note. */ ] # } Elf64_Nhdr; -# Elf_Shdr related constants. +# Elf64_Shdr related constants. -class Elf32_Shdr(ctypes.Structure): - _fields_ = [ +class Elf64_Shdr(ctypes.Structure): # typedef struct + _fields_ = [ # { ( - # Section name (string tbl index) - "sh_name", Elf32_Word - ), - ( - # Section type - "sh_type", Elf32_Word - ), - ( - # Section flags - "sh_flags", Elf32_Word - ), - ( - # Section virtual addr at execution - "sh_addr", Elf32_Addr - ), - ( - # Section file offset - "sh_offset", Elf32_Off - ), - ( - # Section size in bytes - "sh_size", Elf32_Word - ), - ( - # Link to another section - "sh_link", Elf32_Word - ), - ( - # Additional section information - "sh_info", Elf32_Word - ), - ( - # Section alignment - "sh_addralign", Elf32_Word - ), - ( - # Entry size if section holds table - "sh_entsize", Elf32_Word - ) - ] - - -class Elf64_Shdr(ctypes.Structure): - _fields_ = [ - ( - # Section name (string tbl index) "sh_name", Elf64_Word - ), + ), # Elf64_Word sh_name; /* Section name (string tbl index) */ + ("sh_type", Elf64_Word + ), # Elf64_Word sh_type; /* Section type */ + ("sh_flags", Elf64_Xword + ), # Elf64_Xword sh_flags; /* Section flags */ ( - # Section type - "sh_type", Elf64_Word - ), - ( - # Section flags - "sh_flags", Elf64_Xword - ), - ( - # Section virtual addr at execution "sh_addr", Elf64_Addr - ), + ), # Elf64_Addr sh_addr; /* Section virtual addr at execution */ ( - # Section file offset "sh_offset", Elf64_Off - ), + ), # Elf64_Off sh_offset; /* Section file offset */ ( - # Section size in bytes "sh_size", Elf64_Xword - ), + ), # Elf64_Xword sh_size; /* Section size in bytes */ ( - # Link to another section "sh_link", Elf64_Word - ), + ), # Elf64_Word sh_link; /* Link 
to another section */ ( - # Additional section information "sh_info", Elf64_Word - ), + ), # Elf64_Word sh_info; /* Additional section information */ + ("sh_addralign", Elf64_Xword + ), # Elf64_Xword sh_addralign; /* Section alignment */ ( - # Section alignment - "sh_addralign", Elf64_Xword - ), - ( - # Entry size if section holds table "sh_entsize", Elf64_Xword - ) - ] + ) # Elf64_Xword sh_entsize; /* Entry size if section holds table */ + ] # } Elf64_Shdr; # elf_prstatus related constants. @@ -317,753 +179,507 @@ class Elf64_Shdr(ctypes.Structure): # Signal info. class elf_siginfo(ctypes.Structure): # struct elf_siginfo - _fields_ = [ - ( - # Signal number - "si_signo", ctypes.c_int - ), - ( - # Extra code - "si_code", ctypes.c_int - ), - ( - # Errno - "si_errno", ctypes.c_int - ) - ] + _fields_ = [ # { + ("si_signo", ctypes.c_int + ), # int si_signo; /* Signal number. */ + ("si_code", ctypes.c_int + ), # int si_code; /* Extra code. */ + ("si_errno", ctypes.c_int + ) # int si_errno; /* Errno. */ + ] # }; # A time value that is accurate to the nearest # microsecond but also has a range of years. class timeval(ctypes.Structure): # struct timeval - _fields_ = [ - ( - # __time_t tv_sec; /* Seconds. */ - "tv_sec", ctypes.c_long - ), - ( - # __suseconds_t tv_usec; /* Microseconds. */ - "tv_usec", ctypes.c_long - ) - ] + _fields_ = [ # { + ("tv_sec", + ctypes.c_long), # __time_t tv_sec; /* Seconds. */ + ("tv_usec", ctypes.c_long + ) # __suseconds_t tv_usec; /* Microseconds. 
*/ + ] # }; -class x86_64_user_regs_struct(ctypes.Structure): # struct x86_64_user_regs_struct - _fields_ = [ +class user_regs_struct(ctypes.Structure): # struct user_regs_struct + _fields_ = [ # { ("r15", - ctypes.c_ulonglong), # __extension__ unsigned long long int r15; + ctypes.c_ulonglong), # __extension__ unsigned long long int r15; ("r14", - ctypes.c_ulonglong), # __extension__ unsigned long long int r14; + ctypes.c_ulonglong), # __extension__ unsigned long long int r14; ("r13", - ctypes.c_ulonglong), # __extension__ unsigned long long int r13; + ctypes.c_ulonglong), # __extension__ unsigned long long int r13; ("r12", - ctypes.c_ulonglong), # __extension__ unsigned long long int r12; + ctypes.c_ulonglong), # __extension__ unsigned long long int r12; ("rbp", - ctypes.c_ulonglong), # __extension__ unsigned long long int rbp; + ctypes.c_ulonglong), # __extension__ unsigned long long int rbp; ("rbx", - ctypes.c_ulonglong), # __extension__ unsigned long long int rbx; + ctypes.c_ulonglong), # __extension__ unsigned long long int rbx; ("r11", - ctypes.c_ulonglong), # __extension__ unsigned long long int r11; + ctypes.c_ulonglong), # __extension__ unsigned long long int r11; ("r10", - ctypes.c_ulonglong), # __extension__ unsigned long long int r10; + ctypes.c_ulonglong), # __extension__ unsigned long long int r10; ("r9", - ctypes.c_ulonglong), # __extension__ unsigned long long int r9; + ctypes.c_ulonglong), # __extension__ unsigned long long int r9; ("r8", - ctypes.c_ulonglong), # __extension__ unsigned long long int r8; + ctypes.c_ulonglong), # __extension__ unsigned long long int r8; ("rax", - ctypes.c_ulonglong), # __extension__ unsigned long long int rax; + ctypes.c_ulonglong), # __extension__ unsigned long long int rax; ("rcx", - ctypes.c_ulonglong), # __extension__ unsigned long long int rcx; + ctypes.c_ulonglong), # __extension__ unsigned long long int rcx; ("rdx", - ctypes.c_ulonglong), # __extension__ unsigned long long int rdx; + ctypes.c_ulonglong), # 
__extension__ unsigned long long int rdx; ("rsi", - ctypes.c_ulonglong), # __extension__ unsigned long long int rsi; + ctypes.c_ulonglong), # __extension__ unsigned long long int rsi; ("rdi", - ctypes.c_ulonglong), # __extension__ unsigned long long int rdi; + ctypes.c_ulonglong), # __extension__ unsigned long long int rdi; ("orig_rax", ctypes.c_ulonglong - ), # __extension__ unsigned long long int orig_rax; + ), # __extension__ unsigned long long int orig_rax; ("rip", - ctypes.c_ulonglong), # __extension__ unsigned long long int rip; + ctypes.c_ulonglong), # __extension__ unsigned long long int rip; ("cs", - ctypes.c_ulonglong), # __extension__ unsigned long long int cs; + ctypes.c_ulonglong), # __extension__ unsigned long long int cs; ("eflags", - ctypes.c_ulonglong), # __extension__ unsigned long long int eflags; + ctypes.c_ulonglong), # __extension__ unsigned long long int eflags; ("rsp", - ctypes.c_ulonglong), # __extension__ unsigned long long int rsp; + ctypes.c_ulonglong), # __extension__ unsigned long long int rsp; ("ss", - ctypes.c_ulonglong), # __extension__ unsigned long long int ss; + ctypes.c_ulonglong), # __extension__ unsigned long long int ss; ("fs_base", ctypes.c_ulonglong - ), # __extension__ unsigned long long int fs_base; + ), # __extension__ unsigned long long int fs_base; ("gs_base", ctypes.c_ulonglong - ), # __extension__ unsigned long long int gs_base; + ), # __extension__ unsigned long long int gs_base; ("ds", - ctypes.c_ulonglong), # __extension__ unsigned long long int ds; + ctypes.c_ulonglong), # __extension__ unsigned long long int ds; ("es", - ctypes.c_ulonglong), # __extension__ unsigned long long int es; + ctypes.c_ulonglong), # __extension__ unsigned long long int es; ("fs", - ctypes.c_ulonglong), # __extension__ unsigned long long int fs; + ctypes.c_ulonglong), # __extension__ unsigned long long int fs; ("gs", ctypes.c_ulonglong - ) # __extension__ unsigned long long int gs; - ] + ) # __extension__ unsigned long long int gs; + ] # 
}; -class aarch64_user_regs_struct(ctypes.Structure): # struct aarch64_user_regs_struct - _fields_ = [ - ("regs", - ctypes.c_ulonglong * 31), # unsigned long long int regs[31]; - ("sp", - ctypes.c_ulonglong), # unsigned long long int sp; - ("pc", - ctypes.c_ulonglong), # unsigned long long int pc; - ("pstate", - ctypes.c_ulonglong), # unsigned long long int pstate; - ] - - -class arm_user_regs_struct(ctypes.Structure): # struct arm_user_regs_struct - _fields_ = [ - ("r0", - ctypes.c_ulong), # unsigned ulong int r0; - ("r1", - ctypes.c_ulong), # unsigned ulong int r1; - ("r2", - ctypes.c_ulong), # unsigned ulong int r2; - ("r3", - ctypes.c_ulong), # unsigned ulong int r3; - ("r4", - ctypes.c_ulong), # unsigned ulong int r4; - ("r5", - ctypes.c_ulong), # unsigned ulong int r5; - ("r6", - ctypes.c_ulong), # unsigned ulong int r6; - ("r7", - ctypes.c_ulong), # unsigned ulong int r7; - ("r8", - ctypes.c_ulong), # unsigned ulong int r8; - ("r9", - ctypes.c_ulong), # unsigned ulong int r9; - ("r10", - ctypes.c_ulong), # unsigned ulong int r10; - ("fp", - ctypes.c_ulong), # unsigned ulong int fp; - ("ip", - ctypes.c_ulong), # unsigned ulong int ip; - ("sp", - ctypes.c_ulong), # unsigned ulong int sp; - ("lr", - ctypes.c_ulong), # unsigned ulong int lr; - ("pc", - ctypes.c_ulong), # unsigned ulong int pc; - ("cpsr", - ctypes.c_ulong), # unsigned ulong int cpsr; - ("orig_r0", - ctypes.c_ulong), # unsigned ulong int orig_r0; - ] - - -# elf_greg_t = ctypes.c_ulonglong -# ELF_NGREG = ctypes.sizeof(user_regs_struct)/ctypes.sizeof(elf_greg_t) -# elf_gregset_t = elf_greg_t*ELF_NGREG -user_regs_dict = { - "aarch64": aarch64_user_regs_struct, - "armv7l": arm_user_regs_struct, - "x86_64": x86_64_user_regs_struct, -} - -try: - elf_gregset_t = user_regs_dict[MACHINE] -except KeyError: - raise ValueError("Current architecture %s is not supported." 
% MACHINE) +#elf_greg_t = ctypes.c_ulonglong +#ELF_NGREG = ctypes.sizeof(user_regs_struct)/ctypes.sizeof(elf_greg_t) +#elf_gregset_t = elf_greg_t*ELF_NGREG +elf_gregset_t = user_regs_struct class elf_prstatus(ctypes.Structure): # struct elf_prstatus - _fields_ = [ + _fields_ = [ # { ( - # Info associated with signal - # struct elf_siginfo pr_info; "pr_info", elf_siginfo - ), + ), # struct elf_siginfo pr_info; /* Info associated with signal. */ + ("pr_cursig", ctypes.c_short + ), # short int pr_cursig; /* Current signal. */ ( - # Current signal - # short int pr_cursig; - "pr_cursig", ctypes.c_short - ), - ( - # Set of pending signals - # unsigned long int pr_sigpend; "pr_sigpend", ctypes.c_ulong - ), + ), # unsigned long int pr_sigpend; /* Set of pending signals. */ ( - # Set of held signals - # unsigned long int pr_sighold; "pr_sighold", ctypes.c_ulong - ), + ), # unsigned long int pr_sighold; /* Set of held signals. */ + ("pr_pid", ctypes.c_int), # __pid_t pr_pid; + ("pr_ppid", ctypes.c_int), # __pid_t pr_ppid; + ("pr_pgrp", ctypes.c_int), # __pid_t pr_pgrp; + ("pr_sid", ctypes.c_int), # __pid_t pr_sid; + ("pr_utime", + timeval), # struct timeval pr_utime; /* User time. */ + ("pr_stime", timeval + ), # struct timeval pr_stime; /* System time. */ ( - # Process ID - # __pid_t pr_pid; - "pr_pid", ctypes.c_int - ), - ( - # Parent process ID - # __pid_t pr_ppid; - "pr_ppid", ctypes.c_int - ), - ( - # Parent group ID - # __pid_t pr_pgrp; - "pr_pgrp", ctypes.c_int - ), - ( - # Parent session ID - # __pid_t pr_sid; - "pr_sid", ctypes.c_int - ), - ( - # User time - # struct timeval pr_utime; - "pr_utime", timeval - ), - ( - # System time - # struct timeval pr_stime; - "pr_stime", timeval - ), - ( - # Cumulative user time - # struct timeval pr_cutime; "pr_cutime", timeval - ), + ), # struct timeval pr_cutime; /* Cumulative user time. 
*/ ( - # Cumulative system time - # struct timeval pr_cstime; "pr_cstime", timeval - ), + ), # struct timeval pr_cstime; /* Cumulative system time. */ + ("pr_reg", elf_gregset_t + ), # elf_gregset_t pr_reg; /* GP registers. */ ( - # GP registers - # elf_gregset_t pr_reg; - "pr_reg", elf_gregset_t - ), - ( - # True if math copro being used - # int pr_fpvalid; "pr_fpvalid", ctypes.c_int - ) - ] + ) # int pr_fpvalid; /* True if math copro being used. */ + ] # }; # elf_prpsinfo related constants. -# Number of chars for args -# #define ELF_PRARGSZ (80) -ELF_PRARGSZ = 80 +ELF_PRARGSZ = 80 # #define ELF_PRARGSZ (80) /* Number of chars for args. */ class elf_prpsinfo(ctypes.Structure): # struct elf_prpsinfo - _fields_ = [ + _fields_ = [ # { ( - # Numeric process state - # char pr_state; "pr_state", ctypes.c_byte - ), + ), # char pr_state; /* Numeric process state. */ ( - # Char for pr_state - # char pr_sname; "pr_sname", ctypes.c_char - ), - ( - # Zombie - # char pr_zomb; - "pr_zomb", ctypes.c_byte - ), - ( - # Nice value - # char pr_nice; - "pr_nice", ctypes.c_byte - ), - ( - # Flags - # unsigned long int pr_flag; - "pr_flag", ctypes.c_ulong - ), - ( - # User ID - # unsigned int pr_uid; - "pr_uid", ctypes.c_uint - ), - ( - # Group ID - # unsigned int pr_gid; - "pr_gid", ctypes.c_uint - ), - ("pr_pid", ctypes.c_int), + ), # char pr_sname; /* Char for pr_state. */ + ("pr_zomb", ctypes.c_byte + ), # char pr_zomb; /* Zombie. */ + ("pr_nice", ctypes.c_byte + ), # char pr_nice; /* Nice val. */ + ("pr_flag", ctypes.c_ulong + ), # unsigned long int pr_flag; /* Flags. 
*/ + # #if __WORDSIZE == 32 + # unsigned short int pr_uid; + # unsigned short int pr_gid; + # #else + ("pr_uid", ctypes.c_uint), # unsigned int pr_uid; + ("pr_gid", ctypes.c_uint), # unsigned int pr_gid; + # #endif + ("pr_pid", ctypes.c_int), # int pr_pid, pr_ppid, pr_pgrp, pr_sid; ("pr_ppid", ctypes.c_int), ("pr_pgrp", ctypes.c_int), ("pr_sid", ctypes.c_int), - # /* Lots missing */ + # /* Lots missing */ ( - # Filename of executable - # char pr_fname[16]; "pr_fname", ctypes.c_char * 16 - ), + ), # char pr_fname[16]; /* Filename of executable. */ ( - # Initial part of arg list - # char pr_psargs[ELF_PRARGSZ]; "pr_psargs", ctypes.c_char * ELF_PRARGSZ - ) - ] + ) # char pr_psargs[ELF_PRARGSZ]; /* Initial part of arg list. */ + ] # }; -class x86_64_user_fpregs_struct(ctypes.Structure): # struct x86_64_user_fpregs_struct - _fields_ = [ - # unsigned short int cwd; - ("cwd", ctypes.c_ushort), - # unsigned short int swd; - ("swd", ctypes.c_ushort), - # unsigned short int ftw; - ("ftw", ctypes.c_ushort), - # unsigned short int fop; - ("fop", ctypes.c_ushort), - # __extension__ unsigned long long int rip; - ("rip", ctypes.c_ulonglong), - # __extension__ unsigned long long int rdp; - ("rdp", ctypes.c_ulonglong), - # unsigned int mxcsr; - ("mxcsr", ctypes.c_uint), - # unsigned int mxcr_mask; - ("mxcr_mask", ctypes.c_uint), - # unsigned int st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ - ("st_space", ctypes.c_uint * 32), - # unsigned int xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ - ("xmm_space", ctypes.c_uint * 64), - # unsigned int padding[24]; - ("padding", ctypes.c_uint * 24), - ] +class user_fpregs_struct(ctypes.Structure): # struct user_fpregs_struct + _fields_ = [ # { + ("cwd", ctypes.c_ushort), # unsigned short int cwd; + ("swd", ctypes.c_ushort), # unsigned short int swd; + ("ftw", ctypes.c_ushort), # unsigned short int ftw; + ("fop", ctypes.c_ushort), # unsigned short int fop; + ("rip", + ctypes.c_ulonglong), # __extension__ unsigned 
long long int rip; + ("rdp", + ctypes.c_ulonglong), # __extension__ unsigned long long int rdp; + ("mxcsr", ctypes.c_uint), # unsigned int mxcsr; + ("mxcr_mask", ctypes.c_uint), # unsigned int mxcr_mask; + ( + "st_space", ctypes.c_uint * 32 + ), # unsigned int st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ + ( + "xmm_space", ctypes.c_uint * 64 + ), # unsigned int xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ + ("padding", + ctypes.c_uint * 24), # unsigned int padding[24]; + ] # }; -class aarch64_user_fpregs_struct(ctypes.Structure): # struct aarch64_user_fpregs_struct - _fields_ = [ - # unsigned long long int vregs[64]; - ("vregs", ctypes.c_ulonglong * 64), - # unsigned int fpsr; - ("fpsr", ctypes.c_uint), - # unsigned int fpcr; - ("fpcr", ctypes.c_uint), - # unsigned int padding[2]; - ("padding", ctypes.c_uint * 2), - ] - - -user_fpregs_dict = { - "aarch64": aarch64_user_fpregs_struct, - "armv7l": None, - "x86_64": x86_64_user_fpregs_struct, -} - -try: - elf_fpregset_t = user_fpregs_dict[MACHINE] -except KeyError: - raise ValueError("Current architecture %s is not supported." % MACHINE) +elf_fpregset_t = user_fpregs_struct # siginfo_t related constants. _SI_MAX_SIZE = 128 -_SI_PAD_SIZE = (_SI_MAX_SIZE // ctypes.sizeof(ctypes.c_int)) - 4 +_SI_PAD_SIZE = (_SI_MAX_SIZE / ctypes.sizeof(ctypes.c_int)) - 4 -# /* kill(). */ -class _siginfo_t_U_kill(ctypes.Structure): # struct - _fields_ = [ +# /* kill(). */ +class _siginfo_t_U_kill(ctypes.Structure): # struct + _fields_ = [ # { + ("si_pid", ctypes.c_int + ), # __pid_t si_pid; /* Sending process ID. */ ( - # Sending process ID - # __pid_t si_pid; - "si_pid", ctypes.c_int - ), - ( - # Real user ID of sending process - # __uid_t si_uid; "si_uid", ctypes.c_uint - ) - ] # } _kill; + ) # __uid_t si_uid; /* Real user ID of sending process. */ + ] # } _kill; # Type for data associated with a signal. 
class sigval_t(ctypes.Union): # typedef union sigval - _fields_ = [ - ("sival_int", ctypes.c_int), # int sival_int; - ("sical_ptr", ctypes.c_void_p), # void *sival_ptr; - ] # } sigval_t; + _fields_ = [ # { + ("sival_int", ctypes.c_int), # int sival_int; + ("sical_ptr", ctypes.c_void_p), # void *sival_ptr; + ] # } sigval_t; -# /* POSIX.1b timers. */ -class _siginfo_t_U_timer(ctypes.Structure): # struct - _fields_ = [ - ( - # Timer ID - # int si_tid; - "si_tid", ctypes.c_int - ), - ( - # Overrun count - # int si_overrun; - "si_overrun", ctypes.c_int - ), - ( - # Signal value - # sigval_t si_sigval; - "si_sigval", sigval_t - ) - ] # } _timer; + # /* POSIX.1b timers. */ +class _siginfo_t_U_timer(ctypes.Structure): # struct + _fields_ = [ # { + ("si_tid", + ctypes.c_int), # int si_tid; /* Timer ID. */ + ("si_overrun", ctypes.c_int + ), # int si_overrun; /* Overrun count. */ + ("si_sigval", sigval_t + ) # sigval_t si_sigval; /* Signal value. */ + ] # } _timer; -# /* POSIX.1b signals. */ -class _siginfo_t_U_rt(ctypes.Structure): # struct - _fields_ = [ + # /* POSIX.1b signals. */ +class _siginfo_t_U_rt(ctypes.Structure): # struct + _fields_ = [ # { + ("si_pid", ctypes.c_int + ), # __pid_t si_pid; /* Sending process ID. */ ( - # Sending process ID - # __pid_t si_pid; - "si_pid", ctypes.c_int - ), - ( - # Real user ID of sending process - # __uid_t si_uid; "si_uid", ctypes.c_uint - ), - ( - # Signal value - # sigval_t si_sigval; - "si_sigval", sigval_t - ) - ] # } _rt; + ), # __uid_t si_uid; /* Real user ID of sending process. */ + ("si_sigval", sigval_t + ) # sigval_t si_sigval; /* Signal value. */ + ] # } _rt; -# /* SIGCHLD. */ -class _siginfo_t_U_sigchld(ctypes.Structure): # struct - _fields_ = [ + # /* SIGCHLD. */ +class _siginfo_t_U_sigchld(ctypes.Structure): # struct + _fields_ = [ # { + ("si_pid", + ctypes.c_int), # __pid_t si_pid; /* Which child. 
*/ ( - # Which child - # __pid_t si_pid; - "si_pid", ctypes.c_int - ), - ( - # Real user ID of sending process - # __uid_t si_uid; "si_uid", ctypes.c_uint - ), - ( - # Exit value or signal - # int si_status; - "si_status", ctypes.c_int - ), - ( - # __sigchld_clock_t si_utime; - "si_utime", ctypes.c_long - ), - ( - # __sigchld_clock_t si_stime; - "si_stime", ctypes.c_long - ) - ] # } _sigchld; + ), # __uid_t si_uid; /* Real user ID of sending process. */ + ("si_status", ctypes.c_int + ), # int si_status; /* Exit value or signal. */ + ("si_utime", ctypes.c_long), # __sigchld_clock_t si_utime; + ("si_stime", ctypes.c_long) # __sigchld_clock_t si_stime; + ] # } _sigchld; -# /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ -class _siginfo_t_U_sigfault(ctypes.Structure): # struct - _fields_ = [ + # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ +class _siginfo_t_U_sigfault(ctypes.Structure): # struct + _fields_ = [ # { + ("si_addr", ctypes.c_void_p + ), # void *si_addr; /* Faulting insn/memory ref. */ ( - # Faulting insn/memory ref - # void *si_addr; - "si_addr", ctypes.c_void_p - ), - ( - # Valid LSB of the reported address - # short int si_addr_lsb; "si_addr_lsb", ctypes.c_short - ) - ] # } _sigfault; + ) # short int si_addr_lsb; /* Valid LSB of the reported address. */ + ] # } _sigfault; -# /* SIGPOLL. */ -class _siginfo_t_U_sigpoll(ctypes.Structure): # struct - _fields_ = [ - ( - # Band event for SIGPOLL - # long int si_band; - "si_band", ctypes.c_long - ), - ( - # int si_fd; - "si_fd", ctypes.c_int - ) - ] # } _sigpoll; + # /* SIGPOLL. */ +class _siginfo_t_U_sigpoll(ctypes.Structure): # struct + _fields_ = [ # { + ("si_band", ctypes.c_long + ), # long int si_band; /* Band event for SIGPOLL. */ + ("si_fd", ctypes.c_int) # int si_fd; + ] # } _sigpoll; -# /* SIGSYS. */ -class _siginfo_t_U_sigsys(ctypes.Structure): # struct - _fields_ = [ + # /* SIGSYS. 
*/ +class _siginfo_t_U_sigsys(ctypes.Structure): # struct + _fields_ = [ # { ("_call_addr", ctypes.c_void_p - ), # void *_call_addr; /* Calling user insn. */ + ), # void *_call_addr; /* Calling user insn. */ ( "_syscall", ctypes.c_int - ), # int _syscall; /* Triggering system call number. */ + ), # int _syscall; /* Triggering system call number. */ ("_arch", ctypes.c_uint - ) # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ - ] # } _sigsys; + ) # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ + ] # } _sigsys; -class _siginfo_t_U(ctypes.Union): # union - _fields_ = [ +class _siginfo_t_U(ctypes.Union): # union + _fields_ = [ # { ("_pad", - ctypes.c_int * _SI_PAD_SIZE), # int _pad[__SI_PAD_SIZE]; - - # /* kill(). */ - ("_kill", _siginfo_t_U_kill), # struct - - # __pid_t si_pid; /* Sending process ID. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # } _kill; - - # /* POSIX.1b timers. */ - ("_timer", _siginfo_t_U_timer), # struct - - # int si_tid; /* Timer ID. */ - # int si_overrun; /* Overrun count. */ - # sigval_t si_sigval; /* Signal value. */ - # } _timer; - - # /* POSIX.1b signals. */ - ("_rt", _siginfo_t_U_rt), # struct - - # __pid_t si_pid; /* Sending process ID. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # sigval_t si_sigval; /* Signal value. */ - # } _rt; - - # /* SIGCHLD. */ - ("_sigchld", _siginfo_t_U_sigchld), # struct - - # __pid_t si_pid; /* Which child. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # int si_status; /* Exit value or signal. */ - # __sigchld_clock_t si_utime; - # __sigchld_clock_t si_stime; - # } _sigchld; - - # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ - ("_sigfault", _siginfo_t_U_sigfault), # struct - - # void *si_addr; /* Faulting insn/memory ref. */ - # short int si_addr_lsb; /* Valid LSB of the reported address. */ - # } _sigfault; - - # /* SIGPOLL. */ - ("_sigpoll", _siginfo_t_U_sigpoll), # struct - - # long int si_band; /* Band event for SIGPOLL. 
*/ - # int si_fd; - # } _sigpoll; - - # /* SIGSYS. */ - ("_sigsys", _siginfo_t_U_sigpoll) # struct - - # void *_call_addr; /* Calling user insn. */ - # int _syscall; /* Triggering system call number. */ - # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ - # } _sigsys; - ] # } _sifields; + ctypes.c_int * _SI_PAD_SIZE), # int _pad[__SI_PAD_SIZE]; + # + # /* kill(). */ + ("_kill", _siginfo_t_U_kill), # struct + # { + # __pid_t si_pid; /* Sending process ID. */ + # __uid_t si_uid; /* Real user ID of sending process. */ + # } _kill; + # + # /* POSIX.1b timers. */ + ("_timer", _siginfo_t_U_timer), # struct + # { + # int si_tid; /* Timer ID. */ + # int si_overrun; /* Overrun count. */ + # sigval_t si_sigval; /* Signal value. */ + # } _timer; + # + # /* POSIX.1b signals. */ + ("_rt", _siginfo_t_U_rt), # struct + # { + # __pid_t si_pid; /* Sending process ID. */ + # __uid_t si_uid; /* Real user ID of sending process. */ + # sigval_t si_sigval; /* Signal value. */ + # } _rt; + # + # /* SIGCHLD. */ + ("_sigchld", _siginfo_t_U_sigchld), # struct + # { + # __pid_t si_pid; /* Which child. */ + # __uid_t si_uid; /* Real user ID of sending process. */ + # int si_status; /* Exit value or signal. */ + # __sigchld_clock_t si_utime; + # __sigchld_clock_t si_stime; + # } _sigchld; + # + # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ + ("_sigfault", _siginfo_t_U_sigfault), # struct + # { + # void *si_addr; /* Faulting insn/memory ref. */ + # short int si_addr_lsb; /* Valid LSB of the reported address. */ + # } _sigfault; + # + # /* SIGPOLL. */ + ("_sigpoll", _siginfo_t_U_sigpoll), # struct + # { + # long int si_band; /* Band event for SIGPOLL. */ + # int si_fd; + # } _sigpoll; + # + # /* SIGSYS. */ + ("_sigsys", _siginfo_t_U_sigpoll) # struct + # { + # void *_call_addr; /* Calling user insn. */ + # int _syscall; /* Triggering system call number. */ + # unsigned int _arch; /* AUDIT_ARCH_* of syscall. 
*/ + # } _sigsys; + ] # } _sifields; class siginfo_t(ctypes.Structure): # typedef struct - _fields_ = [ + _fields_ = [ # { + ("si_signo", ctypes.c_int + ), # int si_signo; /* Signal number. */ ( - # Signal number - # int si_signo; - "si_signo", ctypes.c_int - ), - ( - # If non-zero, an errno value associated with - # int si_errno; "si_errno", ctypes.c_int - ), - ( - # Signal code - this signal, as defined in - # int si_code; - "si_code", ctypes.c_int - ), - ( - # Union - "_sifields", _siginfo_t_U - ) - - # int _pad[__SI_PAD_SIZE]; + ), # int si_errno; /* If non-zero, an errno value associated with + # this signal, as defined in . */ + ("si_code", ctypes.c_int + ), # int si_code; /* Signal code. */ # - # /* kill(). */ - # struct - - # __pid_t si_pid; /* Sending process ID. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # } _kill; + ("_sifields", _siginfo_t_U) # union + # { + # int _pad[__SI_PAD_SIZE]; # - # /* POSIX.1b timers. */ - # struct - - # int si_tid; /* Timer ID. */ - # int si_overrun; /* Overrun count. */ - # sigval_t si_sigval; /* Signal value. */ - # } _timer; + # /* kill(). */ + # struct + # { + # __pid_t si_pid; /* Sending process ID. */ + # __uid_t si_uid; /* Real user ID of sending process. */ + # } _kill; # - # /* POSIX.1b signals. */ - # struct - - # __pid_t si_pid; /* Sending process ID. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # sigval_t si_sigval; /* Signal value. */ - # } _rt; + # /* POSIX.1b timers. */ + # struct + # { + # int si_tid; /* Timer ID. */ + # int si_overrun; /* Overrun count. */ + # sigval_t si_sigval; /* Signal value. */ + # } _timer; # - # /* SIGCHLD. */ - # struct - - # __pid_t si_pid; /* Which child. */ - # __uid_t si_uid; /* Real user ID of sending process. */ - # int si_status; /* Exit value or signal. */ - # __sigchld_clock_t si_utime; - # __sigchld_clock_t si_stime; - # } _sigchld; + # /* POSIX.1b signals. */ + # struct + # { + # __pid_t si_pid; /* Sending process ID. 
*/ + # __uid_t si_uid; /* Real user ID of sending process. */ + # sigval_t si_sigval; /* Signal value. */ + # } _rt; # - # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ - # struct - - # void *si_addr; /* Faulting insn/memory ref. */ - # short int si_addr_lsb; /* Valid LSB of the reported address. */ - # } _sigfault; + # /* SIGCHLD. */ + # struct + # { + # __pid_t si_pid; /* Which child. */ + # __uid_t si_uid; /* Real user ID of sending process. */ + # int si_status; /* Exit value or signal. */ + # __sigchld_clock_t si_utime; + # __sigchld_clock_t si_stime; + # } _sigchld; # - # /* SIGPOLL. */ - # struct - - # long int si_band; /* Band event for SIGPOLL. */ - # int si_fd; - # } _sigpoll; + # /* SIGILL, SIGFPE, SIGSEGV, SIGBUS. */ + # struct + # { + # void *si_addr; /* Faulting insn/memory ref. */ + # short int si_addr_lsb; /* Valid LSB of the reported address. */ + # } _sigfault; # - # /* SIGSYS. */ - # struct - - # void *_call_addr; /* Calling user insn. */ - # int _syscall; /* Triggering system call number. */ - # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ - # } _sigsys; - # } _sifields; - ] # } siginfo_t __SI_ALIGNMENT; + # /* SIGPOLL. */ + # struct + # { + # long int si_band; /* Band event for SIGPOLL. */ + # int si_fd; + # } _sigpoll; + # + # /* SIGSYS. */ + # struct + # { + # void *_call_addr; /* Calling user insn. */ + # int _syscall; /* Triggering system call number. */ + # unsigned int _arch; /* AUDIT_ARCH_* of syscall. */ + # } _sigsys; + # } _sifields; + ] # } siginfo_t __SI_ALIGNMENT; # xsave related. 
class ymmh_struct(ctypes.Structure): # struct ymmh_struct { - _fields_ = [ - # u32 ymmh_space[64]; - ("ymmh_space", 64 * ctypes.c_uint) - ] # } __packed; + _fields_ = [("ymmh_space", 64 * ctypes.c_uint + ) # u32 ymmh_space[64]; + ] # } __packed; class xsave_hdr_struct(ctypes.Structure): # struct xsave_hdr_struct { _fields_ = [ - # u64 xstate_bv; - ("xstate_bv", ctypes.c_ulonglong), - # u64 reserved1[2]; - ("reserved1", ctypes.c_ulonglong * 2), - # u64 reserved2[5]; - ("reserved2", ctypes.c_ulonglong * 5) + ("xstate_bv", ctypes.c_ulonglong + ), # u64 xstate_bv; + ("reserved1", ctypes.c_ulonglong * + 2), # u64 reserved1[2]; + ("reserved2", ctypes.c_ulonglong * 5 + ) # u64 reserved2[5]; ] # } __packed; class i387_fxsave_struct(ctypes.Structure): # struct i387_fxsave_struct { _fields_ = [ ( - # Control Word - # u16 cwd; "cwd", ctypes.c_ushort - ), + ), # u16 cwd; /* Control Word */ ( - # Status Word - # u16 swd; "swd", ctypes.c_ushort - ), + ), # u16 swd; /* Status Word */ ( - # Tag Word - # u16 twd; "twd", ctypes.c_ushort - ), + ), # u16 twd; /* Tag Word */ ( - # Last Instruction Opcode - # u16 fop; "fop", ctypes.c_ushort - ), - # union { - # struct { + ), # u16 fop; /* Last Instruction Opcode */ + # union { + # struct { ( - # Instruction Pointer - # u64 rip; "rip", ctypes.c_ulonglong - ), + ), # u64 rip; /* Instruction Pointer */ ( - # Data Pointer - # u64 rdp; "rdp", ctypes.c_ulonglong - ), - - # struct { - # u32 fip; /* FPU IP Offset */ - # u32 fcs; /* FPU IP Selector */ - # u32 foo; /* FPU Operand Offset */ - # u32 fos; /* FPU Operand Selector */ - + ), # u64 rdp; /* Data Pointer */ + # }; + # struct { + # u32 fip; /* FPU IP Offset */ + # u32 fcs; /* FPU IP Selector */ + # u32 foo; /* FPU Operand Offset */ + # u32 fos; /* FPU Operand Selector */ + # }; + # }; ( - # MXCSR Register State - # u32 mxcsr; "mxcsr", ctypes.c_uint - ), + ), # u32 mxcsr; /* MXCSR Register State */ ( - # MXCSR Mask - # u32 mxcsr_mask; "mxcsr_mask", ctypes.c_uint - ), - # 8*16 bytes for each 
FP-reg = 128 bytes - ( - # u32 st_space[32]; - "st_space", ctypes.c_uint * 32 - ), - # 16*16 bytes for each XMM-reg = 256 bytes - ( - # u32 xmm_space[64]; - "xmm_space", ctypes.c_uint * 64 - ), - ( - # u32 padding[12]; - "padding", ctypes.c_uint * 12 - ), - # union { - ( - # u32 padding1[12]; - "padding1", ctypes.c_uint * 12 - ) - # u32 sw_reserved[12]; + ), # u32 mxcsr_mask; /* MXCSR Mask */ + # + # /* 8*16 bytes for each FP-reg = 128 bytes */ + ("st_space", ctypes.c_uint * 32 + ), # u32 st_space[32]; + # + # /* 16*16 bytes for each XMM-reg = 256 bytes */ + ("xmm_space", ctypes.c_uint * 64 + ), # u32 xmm_space[64]; + # + ("padding", ctypes.c_uint * 12 + ), # u32 padding[12]; + # + # union { + ("padding1", ctypes.c_uint * 12 + ) # u32 padding1[12]; + # u32 sw_reserved[12]; + # }; + # ] # } __aligned(16); class elf_xsave_struct(ctypes.Structure): # struct xsave_struct { _fields_ = [ - # struct i387_fxsave_struct i387; - ("i387", i387_fxsave_struct), - # struct xsave_hdr_struct xsave_hdr; - ("xsave_hdr", xsave_hdr_struct), - # struct ymmh_struct ymmh; - ("ymmh", ymmh_struct) + ("i387", + i387_fxsave_struct), # struct i387_fxsave_struct i387; + ("xsave_hdr", xsave_hdr_struct + ), # struct xsave_hdr_struct xsave_hdr; + ("ymmh", ymmh_struct) # struct ymmh_struct ymmh; ] # } __aligned(FP_MIN_ALIGN_BYTES) __packed; - - -class vfp_hard_struct(ctypes.Structure): # struct vfp_hard_struct { - _fields_ = [ - ("vfp_regs", ctypes.c_ulonglong * 32), # __u64 fpregs[32]; - ("fpexc", ctypes.c_ulong), # __u32 fpexc; - ("fpscr", ctypes.c_ulong), # __u32 fpscr; - ("fpinst", ctypes.c_ulong), # __u32 fpinst; - ("fpinst2", ctypes.c_ulong), # __u32 fpinst2; - ] # }; diff --git a/coredump/pycriu b/coredump/pycriu index d1b6ed5c4..d13a8790a 120000 --- a/coredump/pycriu +++ b/coredump/pycriu @@ -1 +1 @@ -../lib/pycriu \ No newline at end of file +../lib/py/ \ No newline at end of file diff --git a/crit/.gitignore b/crit/.gitignore deleted file mode 100644 index 10c8ab186..000000000 --- 
a/crit/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -crit.egg-info/ -build/ -dist/ -version.py diff --git a/crit/Makefile b/crit/Makefile index 33bd68eed..988b481b6 100644 --- a/crit/Makefile +++ b/crit/Makefile @@ -1,25 +1,13 @@ -VERSION_FILE := $(if $(obj),$(addprefix $(obj)/,crit/version.py),crit/version.py) -all-y += ${VERSION_FILE} -cleanup-y += ${VERSION_FILE} +all-y += crit -${VERSION_FILE}: - $(Q) echo "__version__ = '${CRIU_VERSION}'" > $@ +crit/crit: crit/crit-$(PYTHON) + $(Q) cp $^ $@ +crit: crit/crit +.PHONY: crit -install: ${VERSION_FILE} -ifeq ($(SKIP_PIP_INSTALL),0) - $(E) " INSTALL " crit - $(Q) $(PYTHON) -m pip install $(PIPFLAGS) --prefix=$(DESTDIR)$(PREFIX) ./crit -else - $(E) " SKIP INSTALL crit" -endif -.PHONY: install - -uninstall: -ifeq ($(SKIP_PIP_INSTALL),0) - $(E) " UNINSTALL" crit - $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) crit -else - $(E) " SKIP UNINSTALL crit" -endif -.PHONY: uninstall +clean-crit: + $(Q) $(RM) crit/crit +.PHONY: clean-crit +clean: clean-crit +mrproper: clean diff --git a/crit/crit-python2 b/crit/crit-python2 new file mode 100755 index 000000000..b0b7d3c3a --- /dev/null +++ b/crit/crit-python2 @@ -0,0 +1,6 @@ +#!/usr/bin/env python2 + +from pycriu import cli + +if __name__ == '__main__': + cli.main() diff --git a/lib/setup.py b/crit/crit-python3 old mode 100644 new mode 100755 similarity index 55% rename from lib/setup.py rename to crit/crit-python3 index 618ac1de4..80467cba7 --- a/lib/setup.py +++ b/crit/crit-python3 @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -import setuptools +from pycriu import cli if __name__ == '__main__': - setuptools.setup() + cli.main() diff --git a/crit/crit/__init__.py b/crit/crit/__init__.py deleted file mode 100644 index 58f3ace6c..000000000 --- a/crit/crit/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .version import __version__ diff --git a/crit/pycriu b/crit/pycriu new file mode 120000 index 000000000..d13a8790a --- /dev/null +++ b/crit/pycriu @@ -0,0 +1 @@ 
+../lib/py/ \ No newline at end of file diff --git a/crit/pyproject.toml b/crit/pyproject.toml deleted file mode 100644 index f0b185eb7..000000000 --- a/crit/pyproject.toml +++ /dev/null @@ -1,22 +0,0 @@ -[build-system] -requires = ["setuptools"] -build-backend = "setuptools.build_meta" - -[project] -name = "crit" -description = "CRiu Image Tool" -authors = [ - {name = "CRIU team", email = "criu@lists.linux.dev"}, -] -license = {text = "GPLv2"} -dynamic = ["version"] -requires-python = ">=3.6" - -[project.scripts] -crit = "crit.__main__:main" - -[tool.setuptools] -packages = ["crit"] - -[tool.setuptools.dynamic] -version = {attr = "crit.__version__"} diff --git a/crit/setup.cfg b/crit/setup.cfg deleted file mode 100644 index 37895923f..000000000 --- a/crit/setup.cfg +++ /dev/null @@ -1,20 +0,0 @@ -# Configuring setuptools using pyproject.toml files was introduced in setuptools 61.0.0 -# https://setuptools.pypa.io/en/latest/history.html#v61-0-0 -# For older versions of setuptools, we need to use the setup.cfg file -# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html#declarative-config - -[metadata] -name = crit -description = CRiu Image Tool -author = CRIU team -author_email = criu@lists.linux.dev -license = GPLv2 -version = attr: crit.__version__ - -[options] -packages = crit -python_requires = >=3.6 - -[options.entry_points] -console_scripts = - crit = crit.__main__:main diff --git a/crit/setup.py b/crit/setup.py deleted file mode 100644 index 618ac1de4..000000000 --- a/crit/setup.py +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python3 -import setuptools - - -if __name__ == '__main__': - setuptools.setup() diff --git a/criu/Makefile b/criu/Makefile index bafdd980b..c7ac94d6a 100644 --- a/criu/Makefile +++ b/criu/Makefile @@ -85,9 +85,8 @@ $(obj)/%: pie $(obj)/criu: $(PROGRAM-BUILTINS) $(call msg-link, $@) - $(Q) $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LIBS) $(WRAPFLAGS) $(GMONLDOPT) -rdynamic -o $@ + $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) 
$(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@ -UNIT-BUILTINS += $(obj)/util.o UNIT-BUILTINS += $(obj)/config.o UNIT-BUILTINS += $(obj)/log.o UNIT-BUILTINS += $(obj)/string.o @@ -102,7 +101,7 @@ $(obj)/unittest/built-in.o: .FORCE $(obj)/unittest/unittest: $(UNIT-BUILTINS) $(call msg-link, $@) - $(Q) $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LIBS) $(WRAPFLAGS) -rdynamic -o $@ + $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(WRAPFLAGS) $(LDFLAGS) -rdynamic -o $@ unittest: $(obj)/unittest/unittest $(Q) $(obj)/unittest/$@ @@ -145,7 +144,6 @@ install: $(obj)/criu $(Q) install -m 644 $(UAPI_HEADERS) $(DESTDIR)$(INCLUDEDIR)/criu/ $(Q) mkdir -p $(DESTDIR)$(LIBEXECDIR)/criu/scripts $(Q) install -m 755 scripts/systemd-autofs-restart.sh $(DESTDIR)$(LIBEXECDIR)/criu/scripts - $(E) " INSTALL " scripts/criu-ns $(Q) install -m 755 scripts/criu-ns $(DESTDIR)$(SBINDIR) .PHONY: install diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index ba6132d2f..50a2fa9c5 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -41,7 +41,6 @@ obj-y += lsm.o obj-y += mem.o obj-y += memfd.o obj-y += mount.o -obj-y += mount-v2.o obj-y += filesystems.o obj-y += namespaces.o obj-y += netfilter.o @@ -74,7 +73,6 @@ obj-y += sk-unix.o obj-y += sockets.o obj-y += stats.o obj-y += string.o -obj-y += setproctitle.o obj-y += sysctl.o obj-y += sysfs_parse.o obj-y += timerfd.o @@ -92,16 +90,12 @@ obj-y += servicefd.o obj-y += pie-util-vdso.o obj-y += vdso.o obj-y += timens.o -obj-y += timer.o -obj-y += sigact.o obj-$(CONFIG_HAS_LIBBPF) += bpfmap.o obj-$(CONFIG_COMPAT) += pie-util-vdso-elf32.o CFLAGS_pie-util-vdso-elf32.o += -DCONFIG_VDSO_32 obj-$(CONFIG_COMPAT) += vdso-compat.o CFLAGS_REMOVE_vdso-compat.o += $(CFLAGS-ASAN) $(CFLAGS-GCOV) obj-y += pidfd-store.o -obj-y += hugetlb.o -obj-y += pidfd.o PROTOBUF_GEN := scripts/protobuf-gen.sh diff --git a/criu/Makefile.packages b/criu/Makefile.packages index 3e2e6efd1..13c346f44 100644 --- a/criu/Makefile.packages +++ b/criu/Makefile.packages @@ -6,7 +6,7 @@ 
REQ-RPM-PKG-NAMES += protobuf-devel REQ-RPM-PKG-NAMES += protobuf-python REQ-RPM-PKG-NAMES += libnl3-devel REQ-RPM-PKG-NAMES += libcap-devel -REQ-RPM-PKG-NAMES += libuuid-devel +REQ-RPM-PKG-NAMES += $(PYTHON)-future REQ-RPM-PKG-TEST-NAMES += libaio-devel @@ -15,19 +15,22 @@ REQ-DEB-PKG-NAMES += libprotobuf-c-dev REQ-DEB-PKG-NAMES += protobuf-c-compiler REQ-DEB-PKG-NAMES += protobuf-compiler REQ-DEB-PKG-NAMES += $(PYTHON)-protobuf +REQ-DEB-PKG-NAMES += $(PYTHON)-future REQ-DEB-PKG-NAMES += libnl-3-dev REQ-DEB-PKG-NAMES += libcap-dev -REQ-DEB-PKG-NAMES += uuid-dev REQ-DEB-PKG-TEST-NAMES += $(PYTHON)-yaml REQ-DEB-PKG-TEST-NAMES += libaio-dev +ifeq ($(PYTHON),python3) REQ-DEB-PKG-TEST-NAMES += libaio-dev REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-PyYAML +else +REQ-RPM-PKG-TEST-NAMES += $(PYTHON)-pyyaml +endif - -export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet -luuid +export LIBS += -lprotobuf-c -ldl -lnl-3 -lsoccr -Lsoccr/ -lnet check-packages-failed: $(warning Can not find some of the required libraries) diff --git a/criu/action-scripts.c b/criu/action-scripts.c index 6f7900186..1ce6d9c10 100644 --- a/criu/action-scripts.c +++ b/criu/action-scripts.c @@ -18,7 +18,6 @@ #include "common/scm.h" static const char *action_names[ACT_MAX] = { - [ACT_PRE_STREAM] = "pre-stream", [ACT_PRE_DUMP] = "pre-dump", [ACT_POST_DUMP] = "post-dump", [ACT_PRE_RESTORE] = "pre-restore", @@ -31,7 +30,6 @@ static const char *action_names[ACT_MAX] = { [ACT_POST_RESUME] = "post-resume", [ACT_ORPHAN_PTS_MASTER] = "orphan-pts-master", [ACT_STATUS_READY] = "status-ready", - [ACT_QUERY_EXT_FILES] = "query-ext-files", }; struct script { @@ -53,9 +51,6 @@ static int run_shell_scripts(const char *action) #define ENV_IMGDIR 0x1 #define ENV_ROOTPID 0x2 - if (list_empty(&scripts)) - return 0; - if (setenv("CRTOOLS_SCRIPT_ACTION", action, 1)) { pr_perror("Can't set CRTOOLS_SCRIPT_ACTION=%s", action); return -1; @@ -116,20 +111,6 @@ int rpc_send_fd(enum script_actions act, int fd) return 
send_criu_rpc_script(act, (char *)action, rpc_sk, fd); } -int rpc_query_external_files(void) -{ - int rpc_sk; - - if (scripts_mode != SCRIPTS_RPC) - return 0; - - rpc_sk = get_service_fd(RPC_SK_OFF); - if (rpc_sk < 0) - return -1; - - return exec_rpc_query_external_files((char *)action_names[ACT_QUERY_EXT_FILES], rpc_sk); -} - int run_scripts(enum script_actions act) { int ret = 0; @@ -137,24 +118,23 @@ int run_scripts(enum script_actions act) pr_debug("Running %s scripts\n", action); - switch (scripts_mode) { - case SCRIPTS_NONE: + if (scripts_mode == SCRIPTS_NONE) return 0; - case SCRIPTS_RPC: + + if (scripts_mode == SCRIPTS_RPC) { ret = rpc_send_fd(act, -1); - if (ret) - break; - /* Enable scripts from config file in RPC mode (fallthrough) */ - case SCRIPTS_SHELL: - ret = run_shell_scripts(action); - break; - default: - BUG(); + goto out; } + if (scripts_mode == SCRIPTS_SHELL) { + ret = run_shell_scripts(action); + goto out; + } + + BUG(); +out: if (ret) pr_err("One of more action scripts failed\n"); - return ret; } @@ -162,9 +142,8 @@ int add_script(char *path) { struct script *script; - /* Set shell mode when a script is added but don't overwrite RPC mode */ - if (scripts_mode == SCRIPTS_NONE) - scripts_mode = SCRIPTS_SHELL; + BUG_ON(scripts_mode == SCRIPTS_RPC); + scripts_mode = SCRIPTS_SHELL; script = xmalloc(sizeof(struct script)); if (script == NULL) @@ -190,6 +169,7 @@ int add_rpc_notify(int sk) return -1; } + BUG_ON(scripts_mode == SCRIPTS_SHELL); scripts_mode = SCRIPTS_RPC; if (install_service_fd(RPC_SK_OFF, fd) < 0) diff --git a/criu/apparmor.c b/criu/apparmor.c index 48b639216..328fc606b 100644 --- a/criu/apparmor.c +++ b/criu/apparmor.c @@ -35,7 +35,7 @@ * Thus, the old code that saves and restores AA profiles is still relevant, we * just need to add the new code in this file to walk the namespace and dump * any blobs in that AA namespace, and then restore these blobs on restore so - * that the profiles the old code tries to use are actually present. 
+ * that the profiles the old code tries to use are actualy present. */ static AaNamespace **namespaces = NULL; @@ -108,7 +108,7 @@ static int collect_profile(char *path, int offset, char *dir, AaNamespace *ns) return -1; aa_policy__init(cur); - __strlcat(path + my_offset, "name", PATH_MAX - my_offset); + strlcat(path + my_offset, "name", PATH_MAX - my_offset); f = fopen(path, "r"); if (!f) { xfree(cur); @@ -124,7 +124,7 @@ static int collect_profile(char *path, int offset, char *dir, AaNamespace *ns) return -1; } - __strlcpy(path + my_offset, "raw_data", PATH_MAX - my_offset); + strlcpy(path + my_offset, "raw_data", PATH_MAX - my_offset); fd = open(path, O_RDONLY); if (fd < 0) { pr_perror("failed to open aa policy %s", path); @@ -207,6 +207,8 @@ static int by_time(const struct dirent **de1, const struct dirent **de2) } else { if (sb1.st_mtim.tv_sec < sb2.st_mtim.tv_sec) return -1; + if (sb1.st_mtim.tv_sec == sb2.st_mtim.tv_sec) + return 0; return 1; } } @@ -469,7 +471,6 @@ static void *get_suspend_policy(char *name, off_t *len) ret = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (ret == MAP_FAILED) { pr_perror("mmap of %s failed", file); - ret = NULL; goto out; } @@ -519,13 +520,13 @@ static int write_aa_policy(AaNamespace *ns, char *path, int offset, char *rewrit tmp = *end; *end = 0; - __strlcpy(namespace, rewrite_pos + 1, sizeof(namespace)); + strlcpy(namespace, rewrite_pos + 1, sizeof(namespace)); *end = tmp; break; } default: - __strlcpy(namespace, ns->name, sizeof(namespace)); + strlcpy(namespace, ns->name, sizeof(namespace)); for (i = 0; i < ns->n_policies; i++) { if (strcmp(ns->policies[i]->name, rewrite_pos)) pr_warn("binary rewriting of apparmor policies not supported right now, not renaming %s to %s\n", @@ -550,8 +551,8 @@ static int write_aa_policy(AaNamespace *ns, char *path, int offset, char *rewrit goto fail; } - ret = snprintf(path + offset + my_offset, PATH_MAX - offset - my_offset, "/.replace"); - if (ret < 0 || ret >= PATH_MAX - 
offset - my_offset) { + ret = snprintf(path + offset + my_offset, sizeof(path) - offset - my_offset, "/.replace"); + if (ret < 0 || ret >= sizeof(path) - offset - my_offset) { pr_err("snprintf failed\n"); goto fail; } @@ -629,7 +630,7 @@ int suspend_aa(void) } ret = do_suspend(true); - if (rmrf(policydir) < 0) + if (rm_rf(policydir) < 0) pr_err("failed removing policy dir %s\n", policydir); return ret; @@ -667,7 +668,7 @@ int dump_aa_namespaces(void) bool check_aa_ns_dumping(void) { - char contents[49]; + char contents[48]; int major, minor, ret; FILE *f; diff --git a/criu/arch/aarch64/Makefile b/criu/arch/aarch64/Makefile index b87fcaa5b..b26487367 100644 --- a/criu/arch/aarch64/Makefile +++ b/criu/arch/aarch64/Makefile @@ -6,4 +6,3 @@ obj-y += cpu.o obj-y += crtools.o obj-y += sigframe.o obj-y += bitops.o -obj-y += gcs.o \ No newline at end of file diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index 2e89f9ce3..e87b8629a 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -1,6 +1,5 @@ #include #include -#include #include @@ -12,7 +11,6 @@ #include "common/compiler.h" #include #include "asm/dump.h" -#include "asm/gcs-types.h" #include "protobuf.h" #include "images/core.pb-c.h" #include "images/creds.pb-c.h" @@ -22,137 +20,12 @@ #include "cpu.h" #include "restorer.h" #include "compel/infect.h" -#include "pstree.h" -#include - -/* - * cr_user_pac_* are a copy of the corresponding uapi structs - * in arch/arm64/include/uapi/asm/ptrace.h - */ -struct cr_user_pac_address_keys { - __uint128_t apiakey; - __uint128_t apibkey; - __uint128_t apdakey; - __uint128_t apdbkey; -}; - -struct cr_user_pac_generic_keys { - __uint128_t apgakey; -}; - -/* - * The following HWCAP constants are copied from - * arch/arm64/include/uapi/asm/hwcap.h - */ -#ifndef HWCAP_PACA -#define HWCAP_PACA (1 << 30) -#endif - -#ifndef HWCAP_PACG -#define HWCAP_PACG (1UL << 31) -#endif - -/* - * The following NT_ARM_PAC constants are copied from - * 
include/uapi/linux/elf.h - */ -#ifndef NT_ARM_PACA_KEYS -#define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ -#endif - -#ifndef NT_ARM_PACG_KEYS -#define NT_ARM_PACG_KEYS 0x408 -#endif - -#ifndef NT_ARM_PAC_ENABLED_KEYS -#define NT_ARM_PAC_ENABLED_KEYS 0x40a /* AArch64 pointer authentication enabled keys. */ -#endif - -extern unsigned long getauxval(unsigned long type); #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e -static int save_pac_keys(int pid, CoreEntry *core) -{ - struct cr_user_pac_address_keys paca; - struct cr_user_pac_generic_keys pacg; - PacKeys *pac_entry; - long pac_enabled_key; - struct iovec iov; - int ret; - - unsigned long hwcaps = getauxval(AT_HWCAP); - - pac_entry = xmalloc(sizeof(PacKeys)); - if (!pac_entry) - return -1; - core->ti_aarch64->pac_keys = pac_entry; - pac_keys__init(pac_entry); - - if (hwcaps & HWCAP_PACA) { - PacAddressKeys *pac_address_keys; - - pr_debug("%d: Dumping address authentication keys\n", pid); - iov.iov_base = &paca; - iov.iov_len = sizeof(paca); - if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { - pr_perror("Failed to get address authentication key for %d", pid); - return -1; - } - pac_address_keys = xmalloc(sizeof(PacAddressKeys)); - if (!pac_address_keys) - return -1; - pac_address_keys__init(pac_address_keys); - pac_entry->pac_address_keys = pac_address_keys; - pac_address_keys->apiakey_lo = paca.apiakey; - pac_address_keys->apiakey_hi = paca.apiakey >> 64; - pac_address_keys->apibkey_lo = paca.apibkey; - pac_address_keys->apibkey_hi = paca.apibkey >> 64; - pac_address_keys->apdakey_lo = paca.apdakey; - pac_address_keys->apdakey_hi = paca.apdakey >> 64; - pac_address_keys->apdbkey_lo = paca.apdbkey; - pac_address_keys->apdbkey_hi = paca.apdbkey >> 64; - - iov.iov_base = &pac_enabled_key; - iov.iov_len = sizeof(pac_enabled_key); - ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov); - if (ret) { - pr_perror("Failed to get 
authentication key mask for %d", pid); - return -1; - } - - pac_address_keys->pac_enabled_key = pac_enabled_key; - - } - if (hwcaps & HWCAP_PACG) { - PacGenericKeys *pac_generic_keys; - - pr_debug("%d: Dumping generic authentication keys\n", pid); - iov.iov_base = &pacg; - iov.iov_len = sizeof(pacg); - if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { - pr_perror("Failed to get a generic authantication key for %d", pid); - return -1; - } - pac_generic_keys = xmalloc(sizeof(PacGenericKeys)); - if (!pac_generic_keys) - return -1; - pac_generic_keys__init(pac_generic_keys); - pac_entry->pac_generic_keys = pac_generic_keys; - pac_generic_keys->apgakey_lo = pacg.apgakey; - pac_generic_keys->apgakey_hi = pacg.apgakey >> 64; - } - return 0; -} - -int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) +int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) { int i; - struct cr_user_gcs gcs_live; - struct iovec gcs_iov = { - .iov_base = &gcs_live, - .iov_len = sizeof(gcs_live), - }; CoreEntry *core = x; // Save the Aarch64 CPU state @@ -164,24 +37,11 @@ int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_str // Save the FP/SIMD state for (i = 0; i < 32; ++i) { - core->ti_aarch64->fpsimd->vregs[2 * i] = fpsimd->fpstate.vregs[i]; - core->ti_aarch64->fpsimd->vregs[2 * i + 1] = fpsimd->fpstate.vregs[i] >> 64; - } - assign_reg(core->ti_aarch64->fpsimd, &fpsimd->fpstate, fpsr); - assign_reg(core->ti_aarch64->fpsimd, &fpsimd->fpstate, fpcr); - - if (save_pac_keys(pid, core)) - return -1; - - /* Save the GCS state */ - if (compel_host_supports_gcs()) { - if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_GCS, &gcs_iov) < 0) { - pr_perror("Failed to get GCS for %d", pid); - return -1; - } - core->ti_aarch64->gcs->gcspr_el0 = gcs_live.gcspr_el0; - core->ti_aarch64->gcs->features_enabled = gcs_live.features_enabled; + core->ti_aarch64->fpsimd->vregs[2 * i] = fpsimd->vregs[i]; + 
core->ti_aarch64->fpsimd->vregs[2 * i + 1] = fpsimd->vregs[i] >> 64; } + assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpsr); + assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpcr); return 0; } @@ -191,7 +51,6 @@ int arch_alloc_thread_info(CoreEntry *core) ThreadInfoAarch64 *ti_aarch64; UserAarch64RegsEntry *gpregs; UserAarch64FpsimdContextEntry *fpsimd; - UserAarch64GcsEntry *gcs; ti_aarch64 = xmalloc(sizeof(*ti_aarch64)); if (!ti_aarch64) @@ -221,15 +80,6 @@ int arch_alloc_thread_info(CoreEntry *core) if (!fpsimd->vregs) goto err; - /* Allocate & init GCS */ - if (compel_host_supports_gcs()) { - gcs = xmalloc(sizeof(*gcs)); - if (!gcs) - goto err; - user_aarch64_gcs_entry__init(gcs); - ti_aarch64->gcs = gcs; - } - return 0; err: return -1; @@ -242,12 +92,6 @@ void arch_free_thread_info(CoreEntry *core) xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd->vregs); xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd); } - if (CORE_THREAD_ARCH_INFO(core)->pac_keys) { - PacKeys *pac_entry = CORE_THREAD_ARCH_INFO(core)->pac_keys; - xfree(pac_entry->pac_address_keys); - xfree(pac_entry->pac_generic_keys); - xfree(pac_entry); - } xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs); xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); xfree(CORE_THREAD_ARCH_INFO(core)); @@ -259,7 +103,6 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { int i; struct fpsimd_context *fpsimd = RT_SIGFRAME_FPU(sigframe); - struct gcs_context *gcs; if (core->ti_aarch64->fpsimd->n_vregs != 64) return 1; @@ -273,18 +116,6 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) fpsimd->head.magic = FPSIMD_MAGIC; fpsimd->head.size = sizeof(*fpsimd); - if (compel_host_supports_gcs()) { - gcs = RT_SIGFRAME_GCS(sigframe); - - pr_debug("sigframe gcspr %llx enabled %llx\n", gcs->gcspr, gcs->features_enabled); - - gcs->head.magic = GCS_MAGIC; - gcs->head.size = sizeof(*gcs); - gcs->reserved = 0; - gcs->gcspr = core->ti_aarch64->gcs->gcspr_el0 - 8; - gcs->features_enabled = 
core->ti_aarch64->gcs->features_enabled; - } - return 0; } @@ -304,83 +135,3 @@ int restore_gpregs(struct rt_sigframe *f, UserRegsEntry *r) return 0; } - -int arch_ptrace_restore(int pid, struct pstree_item *item) -{ - unsigned long hwcaps = getauxval(AT_HWCAP); - struct cr_user_pac_address_keys upaca; - struct cr_user_pac_generic_keys upacg; - PacAddressKeys *paca; - PacGenericKeys *pacg; - long pac_enabled_keys; - struct iovec iov; - int ret; - - - pr_debug("%d: Restoring PAC keys\n", pid); - - paca = &rsti(item)->arch_info.pac_address_keys; - pacg = &rsti(item)->arch_info.pac_generic_keys; - if (rsti(item)->arch_info.has_paca) { - if (!(hwcaps & HWCAP_PACA)) { - pr_err("PACG support is required from the source system.\n"); - return 1; - } - pac_enabled_keys = rsti(item)->arch_info.pac_address_keys.pac_enabled_key; - - upaca.apiakey = paca->apiakey_lo + ((__uint128_t)paca->apiakey_hi << 64); - upaca.apibkey = paca->apibkey_lo + ((__uint128_t)paca->apibkey_hi << 64); - upaca.apdakey = paca->apdakey_lo + ((__uint128_t)paca->apdakey_hi << 64); - upaca.apdbkey = paca->apdbkey_lo + ((__uint128_t)paca->apdbkey_hi << 64); - - iov.iov_base = &upaca; - iov.iov_len = sizeof(upaca); - - if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACA_KEYS, &iov))) { - pr_perror("Failed to set address authentication keys for %d", pid); - return 1; - } - iov.iov_base = &pac_enabled_keys; - iov.iov_len = sizeof(pac_enabled_keys); - if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PAC_ENABLED_KEYS, &iov))) { - pr_perror("Failed to set enabled key mask for %d", pid); - return 1; - } - } - - if (rsti(item)->arch_info.has_pacg) { - if (!(hwcaps & HWCAP_PACG)) { - pr_err("PACG support is required from the source system.\n"); - return 1; - } - upacg.apgakey = pacg->apgakey_lo + ((__uint128_t)pacg->apgakey_hi << 64); - iov.iov_base = &upacg; - iov.iov_len = sizeof(upacg); - if ((ret = ptrace(PTRACE_SETREGSET, pid, NT_ARM_PACG_KEYS, &iov))) { - pr_perror("Failed to set the generic authentication 
key for %d", pid); - return 1; - } - } - - return 0; -} - -void arch_rsti_init(struct pstree_item *p) -{ - PacKeys *pac_keys = p->core[0]->ti_aarch64->pac_keys; - - rsti(p)->arch_info.has_paca = false; - rsti(p)->arch_info.has_pacg = false; - - if (!pac_keys) - return; - - if (pac_keys->pac_address_keys) { - rsti(p)->arch_info.has_paca = true; - rsti(p)->arch_info.pac_address_keys = *pac_keys->pac_address_keys; - } - if (pac_keys->pac_generic_keys) { - rsti(p)->arch_info.has_pacg = true; - rsti(p)->arch_info.pac_generic_keys = *pac_keys->pac_generic_keys; - } -} diff --git a/criu/arch/aarch64/gcs.c b/criu/arch/aarch64/gcs.c deleted file mode 100644 index 4bdb9d2e4..000000000 --- a/criu/arch/aarch64/gcs.c +++ /dev/null @@ -1,157 +0,0 @@ -#include -#include - -#include -#include - -#include "asm/gcs-types.h" -#include "pstree.h" -#include "restorer.h" -#include "rst-malloc.h" -#include "vma.h" - -#include -#include - -static bool task_has_gcs_enabled(UserAarch64GcsEntry *gcs) -{ - return gcs && (gcs->features_enabled & PR_SHADOW_STACK_ENABLE) != 0; -} - -static bool host_supports_gcs(void) -{ - unsigned long hwcap = getauxval(AT_HWCAP); - return (hwcap & HWCAP_GCS) != 0; -} - -static bool task_needs_gcs(struct pstree_item *item, CoreEntry *core) -{ - UserAarch64GcsEntry *gcs; - - if (!task_alive(item)) - return false; - - gcs = core->ti_aarch64->gcs; - - if (task_has_gcs_enabled(gcs)) { - if (!host_supports_gcs()) { - pr_warn_once("Restoring task with GCS on non-GCS host\n"); - return false; - } - - pr_info("Restoring task with GCS\n"); - return true; - } - - pr_info("Restoring a task without GCS\n"); - return false; -} - -static int gcs_prepare_task(struct vm_area_list *vmas, - struct rst_shstk_info *gcs) -{ - struct vma_area *vma; - - list_for_each_entry(vma, &vmas->h, list) { - if (vma_area_is(vma, VMA_AREA_SHSTK) && - in_vma_area(vma, gcs->gcspr_el0)) { - unsigned long premapped_addr = vma->premmaped_addr; - unsigned long size = vma_area_len(vma); - - 
gcs->vma_start = vma->e->start; - gcs->vma_size = size; - gcs->premapped_addr = premapped_addr; - - return 0; - } - } - - pr_err("Unable to find a shadow stack vma: %lx\n", gcs->gcspr_el0); - return -1; -} - -int arch_gcs_prepare(struct pstree_item *item, CoreEntry *core, - struct task_restore_args *ta) -{ - int i; - struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]); - struct vm_area_list *vmas = &rsti(item)->vmas; - struct rst_shstk_info *gcs = &ta->shstk; - - if (!task_needs_gcs(item, core)) - return 0; - - gcs->gcspr_el0 = core->ti_aarch64->gcs->gcspr_el0; - gcs->features_enabled = core->ti_aarch64->gcs->features_enabled; - - if (gcs_prepare_task(vmas, gcs)) { - pr_err("gcs: failed to prepare shadow stack memory\n"); - return -1; - } - - for (i = 0; i < item->nr_threads; i++) { - struct thread_restore_args *thread_args = &args_array[i]; - - core = item->core[i]; - gcs = &thread_args->shstk; - - gcs->gcspr_el0 = core->ti_aarch64->gcs->gcspr_el0; - gcs->features_enabled = core->ti_aarch64->gcs->features_enabled; - - if (gcs_prepare_task(vmas, gcs)) { - pr_err("gcs: failed to prepare GCS memory\n"); - return -1; - } - } - - return 0; -} - -int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, - int (*func)(void *arg), void *arg) -{ - int fret; - unsigned long flags = PR_SHADOW_STACK_ENABLE | - PR_SHADOW_STACK_PUSH | - PR_SHADOW_STACK_WRITE; - - long ret, x1_after, x8_after; - - /* If task doesn't need GCS, just call func */ - if (!task_needs_gcs(item, core)) { - return func(arg); - } - - pr_debug("gcs: GCS enable SVC about to fire: x8=%d x0=%d x1=0x%lx\n", - __NR_prctl, PR_SET_SHADOW_STACK_STATUS, flags); - - asm volatile( - "mov x0, %3\n" // x0 = PR_SET_SHADOW_STACK_STATUS (75) - "mov x1, %4\n" // x1 = flags - "mov x2, xzr\n" // x2 = 0 - "mov x3, xzr\n" // x3 = 0 - "mov x4, xzr\n" // x4 = 0 - "mov x8, %5\n" // x8 = __NR_prctl (167) - "svc #0\n" // Invoke syscall - "mov %0, x0\n" // Capture return value - "mov %1, x1\n" 
// Capture x1 after - "mov %2, x8\n" // Capture x8 after - : "=r"(ret), "=r"(x1_after), "=r"(x8_after) - : "i"(PR_SET_SHADOW_STACK_STATUS), // x0 - %3rd - "r"(flags), // x1 - %4th - "i"(__NR_prctl) // x8 - %5th - : "x0", "x1", "x2", "x3", "x4", "x8", "memory", "cc"); - - pr_info("gcs: after SVC: ret=%ld x1=%ld x8=%ld\n", ret, x1_after, x8_after); - - if (ret != 0) { - int err = errno; - pr_err("gcs: failed to enable GCS: ret=%ld errno=%d (%s)\n", ret, err, strerror(err)); - return -1; - } - - fret = func(arg); - exit(fret); - - return -1; -} diff --git a/criu/arch/aarch64/include/asm/dump.h b/criu/arch/aarch64/include/asm/dump.h index ecab061c3..90cd8bca8 100644 --- a/criu/arch/aarch64/include/asm/dump.h +++ b/criu/arch/aarch64/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/aarch64/include/asm/gcs.h b/criu/arch/aarch64/include/asm/gcs.h deleted file mode 100644 index 28faa23b7..000000000 --- a/criu/arch/aarch64/include/asm/gcs.h +++ /dev/null @@ -1,196 +0,0 @@ -#ifndef __CR_ASM_GCS_H__ -#define __CR_ASM_GCS_H__ - -#include - -struct rst_shstk_info { - unsigned long vma_start; /* start of GCS VMA */ - unsigned long vma_size; /* size of GCS VMA */ - unsigned long premapped_addr; /* premapped buffer */ - unsigned long tmp_gcs; /* temp area for GCS if needed */ - u64 gcspr_el0; /* GCS pointer */ - u64 features_enabled; /* GCS flags */ -}; - -#define rst_shstk_info rst_shstk_info - -struct task_restore_args; -struct pstree_item; - -int arch_gcs_prepare(struct pstree_item *item, CoreEntry *core, - struct task_restore_args *ta); -#define arch_shstk_prepare arch_gcs_prepare - -int arch_shstk_trampoline(struct pstree_item *item, CoreEntry 
*core, - int (*func)(void *arg), void *arg); -#define arch_shstk_trampoline arch_shstk_trampoline - -static always_inline void shstk_set_restorer_stack(struct rst_shstk_info *gcs, void *ptr) -{ - gcs->tmp_gcs = (long unsigned)ptr; -} -#define shstk_set_restorer_stack shstk_set_restorer_stack - -static always_inline long shstk_restorer_stack_size(void) -{ - return PAGE_SIZE; -} -#define shstk_restorer_stack_size shstk_restorer_stack_size - -#ifdef CR_NOGLIBC -#include -#include -#include "vma.h" - -static inline unsigned long gcs_map(unsigned long addr, unsigned long size, unsigned int flags) -{ - long gcspr = sys_map_shadow_stack(addr, size, flags); - pr_info("gcs: syscall: map_shadow_stack at=%lx size=%ld\n", addr, size); - - if (gcspr < 0) { - pr_err("gcs: failed to map GCS at %lx: %ld\n", addr, gcspr); - return -1; - } - - if (addr && gcspr != addr) { - pr_err("gcs: address mismatch: need %lx, got %lx\n", addr, gcspr); - return -1; - } - - pr_info("gcs: mmapped GCS at %lx\n", gcspr); - - return gcspr; -} - -/* clang-format off */ -static always_inline void gcsss1(unsigned long *Xt) -{ - asm volatile ( - "sys #3, C7, C7, #2, %0\n" - : - : "rZ" (Xt) - : "memory"); -} - -static always_inline unsigned long *gcsss2(void) -{ - unsigned long *Xt; - - asm volatile ( - "SYSL %0, #3, C7, C7, #3\n" - : "=r" (Xt) - : - : "memory"); - - return Xt; -} - -static inline void gcsstr(unsigned long addr, unsigned long val) -{ - asm volatile( - "mov x0, %0\n" - "mov x1, %1\n" - ".inst 0xd91f1c01\n" // GCSSTR x1, [x0] - "mov x0, #0\n" - : - : "r"(addr), "r"(val) - : "x0", "x1", "memory"); -} -/* clang-format on */ - -static always_inline int gcs_restore(struct rst_shstk_info *gcs) -{ - unsigned long gcspr, val; - - if (!(gcs && gcs->features_enabled & PR_SHADOW_STACK_ENABLE)) { - return 0; - } - - gcspr = gcs->gcspr_el0 - 8; - - val = ALIGN_DOWN(GCS_SIGNAL_CAP(gcspr), 8); - pr_debug("gcs: [0] GCSSTR VAL=%lx write at GCSPR=%lx\n", val, gcspr); - gcsstr(gcspr, val); - - val = 
ALIGN_DOWN(GCS_SIGNAL_CAP(gcspr), 8) | GCS_CAP_VALID_TOKEN; - gcspr -= 8; - pr_debug("gcs: [1] GCSSTR VAL=%lx write at GCSPR=%lx\n", val, gcspr); - gcsstr(gcspr, val); - - pr_debug("gcs: about to switch stacks via GCSSS1 to: %lx\n", gcspr); - gcsss1((unsigned long *)gcspr); - return 0; -} -#define arch_shstk_restore gcs_restore - -static always_inline int gcs_vma_restore(VmaEntry *vma_entry) -{ - unsigned long shstk, i, ret; - unsigned long *gcs_data = (void *)vma_premmaped_start(vma_entry); - unsigned long vma_size = vma_entry_len(vma_entry); - - shstk = gcs_map(0, vma_size, SHADOW_STACK_SET_TOKEN); - if (shstk < 0) { - pr_err("Failed to map shadow stack at %lx: %ld\n", shstk, shstk); - } - - /* restore shadow stack contents */ - for (i = 0; i < vma_size / 8; i++) - gcsstr(shstk + i * 8, gcs_data[i]); - - pr_debug("unmap %lx %ld\n", (unsigned long)gcs_data, vma_size); - ret = sys_munmap(gcs_data, vma_size); - if (ret < 0) { - pr_err("Failed to unmap premmaped shadow stack\n"); - return ret; - } - - vma_premmaped_start(vma_entry) = shstk; - - return 0; -} -#define shstk_vma_restore gcs_vma_restore - -static always_inline int gcs_switch_to_restorer(struct rst_shstk_info *gcs) -{ - int ret; - unsigned long *ssp; - unsigned long addr; - unsigned long gcspr; - - if (!(gcs && gcs->features_enabled & PR_SHADOW_STACK_ENABLE)) { - return 0; - } - - pr_debug("gcs->premapped_addr + gcs->vma_size = %lx\n", gcs->premapped_addr + gcs->vma_size); - pr_debug("gcs->tmp_gcs = %lx\n", gcs->tmp_gcs); - addr = gcs->tmp_gcs; - - if (addr % PAGE_SIZE != 0) { - pr_err("gcs: 0x%lx not page-aligned to size 0x%lx\n", addr, PAGE_SIZE); - return -1; - } - - ret = sys_munmap((void *)addr, PAGE_SIZE); - if (ret < 0) { - pr_err("gcs: Failed to unmap aarea for dumpee GCS VMAs\n"); - return -1; - } - - gcspr = gcs_map(addr, PAGE_SIZE, SHADOW_STACK_SET_TOKEN); - - if (gcspr == -1) { - pr_err("gcs: failed to gcs_map(%lx, %lx)\n", (unsigned long)addr, PAGE_SIZE); - return -1; - } - - ssp = (unsigned 
long *)(addr + PAGE_SIZE - 8); - gcsss1(ssp); - - return 0; -} -#define arch_shstk_switch_to_restorer gcs_switch_to_restorer - -#endif /* CR_NOGLIBC */ - -#endif /* __CR_ASM_GCS_H__ */ diff --git a/criu/arch/aarch64/include/asm/restore.h b/criu/arch/aarch64/include/asm/restore.h index c79605c40..75e87996a 100644 --- a/criu/arch/aarch64/include/asm/restore.h +++ b/criu/arch/aarch64/include/asm/restore.h @@ -26,14 +26,4 @@ static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); -#define ARCH_RST_INFO y -struct rst_arch_info { - bool has_paca, has_pacg; - PacAddressKeys pac_address_keys; - PacGenericKeys pac_generic_keys; -}; - -int arch_ptrace_restore(int pid, struct pstree_item *item); -void arch_rsti_init(struct pstree_item *current); - #endif diff --git a/criu/arch/aarch64/include/asm/restorer.h b/criu/arch/aarch64/include/asm/restorer.h index 8f3edc257..64a9c24eb 100644 --- a/criu/arch/aarch64/include/asm/restorer.h +++ b/criu/arch/aarch64/include/asm/restorer.h @@ -1,11 +1,10 @@ #ifndef __CR_ASM_RESTORER_H__ #define __CR_ASM_RESTORER_H__ -#include +#include #include #include "asm/types.h" -#include "asm/gcs.h" #include "images/core.pb-c.h" #include diff --git a/criu/arch/aarch64/include/asm/thread_pointer.h b/criu/arch/aarch64/include/asm/thread_pointer.h deleted file mode 100644 index f7e07066a..000000000 --- a/criu/arch/aarch64/include/asm/thread_pointer.h +++ /dev/null @@ -1,27 +0,0 @@ -/* __thread_pointer definition. Generic version. - Copyright (C) 2021 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. 
- - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - . */ - -#ifndef _SYS_THREAD_POINTER_H -#define _SYS_THREAD_POINTER_H - -static inline void *__criu_thread_pointer(void) -{ - return __builtin_thread_pointer(); -} - -#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/aarch64/include/asm/types.h b/criu/arch/aarch64/include/asm/types.h index db118cafd..c860af1cf 100644 --- a/criu/arch/aarch64/include/asm/types.h +++ b/criu/arch/aarch64/include/asm/types.h @@ -22,8 +22,6 @@ typedef UserAarch64RegsEntry UserRegsEntry; #define TI_SP(core) ((core)->ti_aarch64->gpregs->sp) -#define TI_IP(core) ((core)->ti_aarch64->gpregs->pc) - static inline void *decode_pointer(uint64_t v) { return (void *)v; @@ -33,16 +31,7 @@ static inline uint64_t encode_pointer(void *p) return (uint64_t)p; } -/** - * See also: - * * arch/arm64/include/uapi/asm/auxvec.h - * * include/linux/auxvec.h - * * include/linux/mm_types.h - */ -#define AT_VECTOR_SIZE_BASE 22 -#define AT_VECTOR_SIZE_ARCH 2 -#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) - +#define AT_VECTOR_SIZE 40 typedef uint64_t auxv_t; typedef uint64_t tls_t; diff --git a/criu/arch/arm/crtools.c b/criu/arch/arm/crtools.c index 6a5e4c89a..26b94e157 100644 --- a/criu/arch/arm/crtools.c +++ b/criu/arch/arm/crtools.c @@ -22,7 +22,7 @@ #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))((src)->ARM_##e) -int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; diff --git a/criu/arch/arm/include/asm/dump.h b/criu/arch/arm/include/asm/dump.h 
index b0ac5715d..485986065 100644 --- a/criu/arch/arm/include/asm/dump.h +++ b/criu/arch/arm/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/arm/include/asm/thread_pointer.h b/criu/arch/arm/include/asm/thread_pointer.h deleted file mode 100644 index f7e07066a..000000000 --- a/criu/arch/arm/include/asm/thread_pointer.h +++ /dev/null @@ -1,27 +0,0 @@ -/* __thread_pointer definition. Generic version. - Copyright (C) 2021 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - . 
*/ - -#ifndef _SYS_THREAD_POINTER_H -#define _SYS_THREAD_POINTER_H - -static inline void *__criu_thread_pointer(void) -{ - return __builtin_thread_pointer(); -} - -#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/arm/include/asm/types.h b/criu/arch/arm/include/asm/types.h index 93d2dc23d..cfcb8a136 100644 --- a/criu/arch/arm/include/asm/types.h +++ b/criu/arch/arm/include/asm/types.h @@ -21,8 +21,6 @@ typedef UserArmRegsEntry UserRegsEntry; #define TI_SP(core) ((core)->ti_arm->gpregs->sp) -#define TI_IP(core) ((core)->ti_arm->gpregs->ip) - static inline void *decode_pointer(u64 v) { return (void *)(u32)v; diff --git a/criu/arch/loongarch64/Makefile b/criu/arch/loongarch64/Makefile deleted file mode 100644 index 4bd99eb7e..000000000 --- a/criu/arch/loongarch64/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -builtin-name := crtools.built-in.o - -ccflags-y += -iquote $(obj)/include -ccflags-y += -iquote criu/include -iquote include -ccflags-y += $(COMPEL_UAPI_INCLUDES) - -asflags-y += -Wstrict-prototypes -asflags-y += -D__ASSEMBLY__ -nostdlib -fomit-frame-pointer -asflags-y += -iquote $(obj)/include -ldflags-y += -r -z noexecstack - -obj-y += cpu.o -obj-y += crtools.o -obj-y += sigframe.o diff --git a/criu/arch/loongarch64/cpu.c b/criu/arch/loongarch64/cpu.c deleted file mode 100644 index 5559c4288..000000000 --- a/criu/arch/loongarch64/cpu.c +++ /dev/null @@ -1,31 +0,0 @@ -#undef LOG_PREFIX -#define LOG_PREFIX "cpu: " - -int cpu_init(void) -{ - return 0; -} - -int cpu_dump_cpuinfo(void) -{ - return 0; -} - -int cpu_validate_cpuinfo(void) -{ - return 0; -} - -int cpuinfo_dump(void) -{ - if (cpu_init()) - return -1; - if (cpu_dump_cpuinfo()) - return -1; - return 0; -} - -int cpuinfo_check(void) -{ - return 0; -} diff --git a/criu/arch/loongarch64/crtools.c b/criu/arch/loongarch64/crtools.c deleted file mode 100644 index 783951b5b..000000000 --- a/criu/arch/loongarch64/crtools.c +++ /dev/null @@ -1,115 +0,0 @@ -#include -#include -#include -#include -#include 
-#include -#include -#include -#include - -#include "types.h" -#include "log.h" -#include "asm/restorer.h" -#include "asm/parasite-syscall.h" -#include -#include "asm/dump.h" -#include "cr_options.h" -#include "common/compiler.h" -#include "restorer.h" -#include "parasite-syscall.h" -#include "util.h" -#include "cpu.h" -#include -#include "kerndat.h" - -#include "protobuf.h" -#include "images/core.pb-c.h" -#include "images/creds.pb-c.h" - -#define assign_reg(dst, src, e) (dst)->e = (__typeof__(dst->e))(src)->e - -int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) -{ - int i; - CoreEntry *core = x; - UserLoongarch64GpregsEntry *gprs = core->ti_loongarch64->gpregs; - UserLoongarch64FpregsEntry *fprs = core->ti_loongarch64->fpregs; - for (i = 0; i < GPR_NUM; i++) - assign_reg(gprs, regs, regs[i]); - assign_reg(gprs, regs, pc); - - for (i = 0; i < FPR_NUM; i++) - assign_reg(fpregs, fpregs, regs[i]); - assign_reg(fprs, fpregs, fcc); - assign_reg(fprs, fpregs, fcsr); - return 0; -} - -int arch_alloc_thread_info(CoreEntry *core) -{ - ThreadInfoLoongarch64 *ti_loongarch64; - UserLoongarch64GpregsEntry *gpregs; - UserLoongarch64FpregsEntry *fpregs; - - ti_loongarch64 = xmalloc(sizeof(*ti_loongarch64)); - thread_info_loongarch64__init(ti_loongarch64); - core->ti_loongarch64 = ti_loongarch64; - - gpregs = xmalloc(sizeof(*gpregs)); - if (!gpregs) - goto err; - user_loongarch64_gpregs_entry__init(gpregs); - gpregs->n_regs = GPR_NUM; - gpregs->regs = xmalloc(GPR_NUM * sizeof(uint64_t)); - if (!gpregs->regs) - goto err; - ti_loongarch64->gpregs = gpregs; - - fpregs = xmalloc(sizeof(*fpregs)); - if (!fpregs) - goto err; - user_loongarch64_fpregs_entry__init(fpregs); - fpregs->n_regs = FPR_NUM; - fpregs->regs = xmalloc(FPR_NUM * sizeof(uint64_t)); - if (!fpregs->regs) - goto err; - ti_loongarch64->fpregs = fpregs; - - return 0; -err: - return -1; -} - -void arch_free_thread_info(CoreEntry *core) -{ - if (CORE_THREAD_ARCH_INFO(core)) { - 
if (CORE_THREAD_ARCH_INFO(core)->fpregs) { - xfree(CORE_THREAD_ARCH_INFO(core)->fpregs->regs); - xfree(CORE_THREAD_ARCH_INFO(core)->fpregs); - } - xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs); - xfree(CORE_THREAD_ARCH_INFO(core)->gpregs); - xfree(CORE_THREAD_ARCH_INFO(core)); - CORE_THREAD_ARCH_INFO(core) = NULL; - } -} - -int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) -{ - fpu_context_t *fpu = RT_SIGFRAME_FPU(sigframe); - UserLoongarch64FpregsEntry *fpregs = core->ti_loongarch64->fpregs; - - memcpy(fpu->regs, fpregs->regs, sizeof(fpu->regs)); - fpu->fcc = fpregs->fcc; - fpu->fcsr = fpregs->fcsr; - return 0; -} - -int restore_gpregs(struct rt_sigframe *sigframe, UserRegsEntry *r) -{ - sigcontext_t *sc = RT_SIGFRAME_SIGCTX(sigframe); - memcpy(sc->regs, r->regs, sizeof(sc->regs)); - sc->pc = r->pc; - return 0; -} diff --git a/criu/arch/loongarch64/include/asm/dump.h b/criu/arch/loongarch64/include/asm/dump.h deleted file mode 100644 index a1c0c4c58..000000000 --- a/criu/arch/loongarch64/include/asm/dump.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef __CR_ASM_DUMP_H__ -#define __CR_ASM_DUMP_H__ - -extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); -extern int arch_alloc_thread_info(CoreEntry *core); -extern void arch_free_thread_info(CoreEntry *core); - -static inline void core_put_tls(CoreEntry *core, tls_t tls) -{ - core->ti_loongarch64->tls = tls; -} - -#define get_task_futex_robust_list_compat(pid, info) -1 - -#endif diff --git a/criu/arch/loongarch64/include/asm/int.h b/criu/arch/loongarch64/include/asm/int.h deleted file mode 100644 index 642804e9b..000000000 --- a/criu/arch/loongarch64/include/asm/int.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __CR_ASM_INT_H__ -#define __CR_ASM_INT_H__ - -#include "asm-generic/int.h" - -#endif /* __CR_ASM_INT_H__ */ diff --git a/criu/arch/loongarch64/include/asm/kerndat.h b/criu/arch/loongarch64/include/asm/kerndat.h deleted file mode 100644 index bb70cf6cf..000000000 --- 
a/criu/arch/loongarch64/include/asm/kerndat.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef __CR_ASM_KERNDAT_H__ -#define __CR_ASM_KERNDAT_H__ - -#define kdat_compatible_cr() 0 -#define kdat_can_map_vdso() 0 - -#endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/loongarch64/include/asm/parasite-syscall.h b/criu/arch/loongarch64/include/asm/parasite-syscall.h deleted file mode 100644 index 6008c3792..000000000 --- a/criu/arch/loongarch64/include/asm/parasite-syscall.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __CR_ASM_PARASITE_SYSCALL_H__ -#define __CR_ASM_PARASITE_SYSCALL_H__ - -struct parasite_ctl; - -#endif diff --git a/criu/arch/loongarch64/include/asm/parasite.h b/criu/arch/loongarch64/include/asm/parasite.h deleted file mode 100644 index b64cb3185..000000000 --- a/criu/arch/loongarch64/include/asm/parasite.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef __ASM_PARASITE_H__ -#define __ASM_PARASITE_H__ - -static inline void arch_get_tls(tls_t *ptls) -{ - tls_t tls; - asm volatile("or %0, $zero, $tp" : "=r"(tls)); - *ptls = tls; -} - -#endif diff --git a/criu/arch/loongarch64/include/asm/restore.h b/criu/arch/loongarch64/include/asm/restore.h deleted file mode 100644 index d956231c8..000000000 --- a/criu/arch/loongarch64/include/asm/restore.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef __CR_ASM_RESTORE_H__ -#define __CR_ASM_RESTORE_H__ - -#include "asm/restorer.h" -#include "images/core.pb-c.h" - -/* clang-format off */ -#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, task_args) \ -({ \ - uint64_t save_sp; \ - asm volatile("or %0, $zero, $sp" : "=r"(save_sp) : :"memory"); \ - asm volatile( \ - "or $a0, $zero, %2 \n" \ - "or $sp, $zero, %0 \n" \ - "jirl $ra, %1, 0 \n" \ - : \ - : "r"(new_sp & ~15), \ - "r"(restore_task_exec_start), \ - "r"(task_args) \ - : "$a0", "memory"); \ - asm volatile("or $sp, $zero, %0" : : "r"(save_sp) : "memory"); \ -}) - -/* clang-format on */ - -static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) -{ - *ptls = 
pcore->ti_loongarch64->tls; -} - -int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); - -#endif diff --git a/criu/arch/loongarch64/include/asm/restorer.h b/criu/arch/loongarch64/include/asm/restorer.h deleted file mode 100644 index 7a0d35c5b..000000000 --- a/criu/arch/loongarch64/include/asm/restorer.h +++ /dev/null @@ -1,97 +0,0 @@ -#ifndef __CR_ASM_RESTORER_H__ -#define __CR_ASM_RESTORER_H__ - -#include "asm/types.h" -#include -#include "images/core.pb-c.h" -#include -#include - -/* clang-format off */ -#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ - thread_args, clone_restore_fn) \ - asm volatile( \ - "clone_emul: \n" \ - "ld.d $a1, %2 \n" \ - "addi.d $a1, $a1, -16 \n" \ - "st.d %5, $a1, 0 \n" \ - "st.d %6, $a1, 8 \n" \ - "or $a0, $zero, %1 \n" \ - "or $a2, $zero, %3 \n" \ - "or $a3, $zero, %4 \n" \ - "ori $a7, $zero, "__stringify(__NR_clone)" \n" \ - "syscall 0 \n" \ - \ - "beqz $a0, thread_run \n" \ - \ - "or %0, $zero, $a0 \n" \ - "b clone_end \n" \ - \ - "thread_run: \n" \ - "ld.d $a1, $sp, 0 \n" \ - "ld.d $a0, $sp, 8 \n" \ - "jirl $ra, $a1, 0 \n" \ - \ - "clone_end: \n" \ - : "=r"(ret) \ - : "r"(clone_flags), \ - "ZB"(new_sp), \ - "r"(&parent_tid), \ - "r"(&thread_args[i].pid), \ - "r"(&clone_restore_fn), \ - "r"(&thread_args[i]) \ - : "$a0", "$a1", "$a2", "$a3", "$a7", "memory") - -#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ - clone_restore_fn) \ - asm volatile( \ - "clone3_emul: \n" \ - "or $a0, $zero, %1 \n" \ - "or $a1, $zero, %2 \n" \ - "or $a2, $zero, %3 \n" \ - "or $a3, $zero, %4 \n" \ - "ori $a7, $zero, "__stringify(__NR_clone3)" \n" \ - "syscall 0 \n" \ - \ - "beqz $a0, clone3_thread_run \n" \ - \ - "or %0, $zero, $a0 \n" \ - "b clone3_end \n" \ - \ - "clone3_thread_run: \n" \ - "or $a0, $zero, $a3 \n" \ - "jirl $ra, $a2, 0 \n" \ - "clone3_end: \n" \ - : "=r"(ret) \ - : "r"(&clone_args), \ - "r"(size), \ - "r"(clone_restore_fn), \ - "r"(args) \ - : "$a0", "$a1", "$a2", "$a3", "$a7", "memory") -/* 
clang-format on */ - -static inline void restore_tls(tls_t *ptls) -{ - asm volatile("or $tp, $zero, %0" : : "r"(*ptls)); -} -static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) -{ - return -1; -} -static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) -{ - return -1; -} -static inline void *alloc_compat_syscall_stack(void) -{ - return NULL; -} -static inline void free_compat_syscall_stack(void *stack32) -{ -} -int restore_gpregs(struct rt_sigframe *f, UserLoongarch64GpregsEntry *r); -int restore_nonsigframe_gpregs(UserLoongarch64GpregsEntry *r); - -#define arch_map_vdso(map, compat) -1 - -#endif diff --git a/criu/arch/loongarch64/include/asm/thread_pointer.h b/criu/arch/loongarch64/include/asm/thread_pointer.h deleted file mode 100644 index f7e07066a..000000000 --- a/criu/arch/loongarch64/include/asm/thread_pointer.h +++ /dev/null @@ -1,27 +0,0 @@ -/* __thread_pointer definition. Generic version. - Copyright (C) 2021 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - . 
*/ - -#ifndef _SYS_THREAD_POINTER_H -#define _SYS_THREAD_POINTER_H - -static inline void *__criu_thread_pointer(void) -{ - return __builtin_thread_pointer(); -} - -#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/loongarch64/include/asm/types.h b/criu/arch/loongarch64/include/asm/types.h deleted file mode 100644 index 72bca2022..000000000 --- a/criu/arch/loongarch64/include/asm/types.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef __CR_ASM_TYPES_H__ -#define __CR_ASM_TYPES_H__ - -#include -#include - -#include "page.h" -#include "bitops.h" -#include "asm/int.h" -#include "images/core.pb-c.h" - -#include - -#define core_is_compat(core) false - -#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__LOONGARCH64 - -#define CORE_THREAD_ARCH_INFO(core) core->ti_loongarch64 - -#define TI_SP(core) ((core)->ti_loongarch64->gpregs->regs[4]) - -#define TI_IP(core) ((core)->ti_loongarch64->gpregs->pc) - -typedef UserLoongarch64GpregsEntry UserRegsEntry; - -static inline uint64_t encode_pointer(void *p) -{ - return (uint64_t)p; -} -static inline void *decode_pointer(uint64_t v) -{ - return (void *)v; -} - -#define AT_VECTOR_SIZE 44 -typedef uint64_t auxv_t; -typedef uint64_t tls_t; - -#endif /* __CR_ASM_TYPES_H__ */ diff --git a/criu/arch/loongarch64/include/asm/vdso.h b/criu/arch/loongarch64/include/asm/vdso.h deleted file mode 100644 index 64631dee0..000000000 --- a/criu/arch/loongarch64/include/asm/vdso.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef __CR_ASM_VDSO_H__ -#define __CR_ASM_VDSO_H__ - -#include "asm/int.h" -#include "asm-generic/vdso.h" - -/* This definition is used in pie/util-vdso.c to initialize the vdso symbol - * name string table 'vdso_symbols' - */ - -/* - * This is a minimal amount of symbols - * we should support at the moment. 
- */ -#define VDSO_SYMBOL_MAX 5 -#define VDSO_SYMBOL_GTOD 3 - -#define ARCH_VDSO_SYMBOLS_LIST \ - const char *aarch_vdso_symbol1 = "__vdso_getcpu"; \ - const char *aarch_vdso_symbol2 = "__vdso_clock_getres"; \ - const char *aarch_vdso_symbol3 = "__vdso_clock_gettime"; \ - const char *aarch_vdso_symbol4 = "__vdso_gettimeofday"; \ - const char *aarch_vdso_symbol5 = "__vdso_rt_sigreturn"; - -#define ARCH_VDSO_SYMBOLS \ - aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4, aarch_vdso_symbol5 -#endif diff --git a/criu/arch/loongarch64/restorer.c b/criu/arch/loongarch64/restorer.c deleted file mode 100644 index 730318ac1..000000000 --- a/criu/arch/loongarch64/restorer.c +++ /dev/null @@ -1,14 +0,0 @@ -#include - -#include "restorer.h" -#include "asm/restorer.h" -#include - -#include -#include "log.h" -#include "cpu.h" - -int restore_nonsigframe_gpregs(UserLoongarch64GpregsEntry *r) -{ - return 0; -} diff --git a/criu/arch/loongarch64/sigframe.c b/criu/arch/loongarch64/sigframe.c deleted file mode 100644 index 18983ff13..000000000 --- a/criu/arch/loongarch64/sigframe.c +++ /dev/null @@ -1,12 +0,0 @@ -#include -#include - -#include "asm/sigframe.h" -#include "asm/types.h" - -#include "log.h" -#include -int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) -{ - return 0; -} diff --git a/criu/arch/loongarch64/vdso-pie.c b/criu/arch/loongarch64/vdso-pie.c deleted file mode 100644 index 7a75d2741..000000000 --- a/criu/arch/loongarch64/vdso-pie.c +++ /dev/null @@ -1,48 +0,0 @@ -#include -#include "asm/types.h" - -#include -#include -#include "parasite-vdso.h" -#include "log.h" -#include "common/bug.h" - -#ifdef LOG_PREFIX -#undef LOG_PREFIX -#endif -#define LOG_PREFIX "vdso: " -static void insert_trampoline(uintptr_t from, uintptr_t to) -{ - struct { - uint32_t pcaddi; - uint32_t ldptr; - uint32_t jirl; - uint32_t guards; - uint64_t imm64; - } __packed jmp = { - .pcaddi = 0x18000095, /* pcaddi $x, 4 */ - .ldptr = 
0x260002b5, /* ldptr.d $x, $x, 0 */ - .jirl = 0x4c0002a0, /* jirl $zero, $x, 0 */ - .guards = 0x002a0000, /* break 0 */ - .imm64 = to, - }; - memcpy((void *)from, &jmp, sizeof(jmp)); -} - -int vdso_redirect_calls(unsigned long base_to, unsigned long base_from, struct vdso_symtable *sto, - struct vdso_symtable *sfrom, bool compat_vdso) -{ - unsigned int i; - unsigned long from, to; - for (i = 0; i < ARRAY_SIZE(sto->symbols); i++) { - if (vdso_symbol_empty(&sfrom->symbols[i])) - continue; - pr_debug("br: %lx/%lx -> %lx/%lx (index %d)\n", base_from, sfrom->symbols[i].offset, base_to, - sto->symbols[i].offset, i); - - from = base_from + sfrom->symbols[i].offset; - to = base_to + sto->symbols[i].offset; - insert_trampoline(from, to); - } - return 0; -} diff --git a/criu/arch/mips/crtools.c b/criu/arch/mips/crtools.c index eabbd85f4..ed4da9b7e 100644 --- a/criu/arch/mips/crtools.c +++ b/criu/arch/mips/crtools.c @@ -27,7 +27,7 @@ #include "images/core.pb-c.h" #include "images/creds.pb-c.h" -int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; diff --git a/criu/arch/mips/include/asm/dump.h b/criu/arch/mips/include/asm/dump.h index ec59b051b..58015833d 100644 --- a/criu/arch/mips/include/asm/dump.h +++ b/criu/arch/mips/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); extern int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info); diff --git a/criu/arch/mips/include/asm/thread_pointer.h b/criu/arch/mips/include/asm/thread_pointer.h deleted file mode 100644 index f7e07066a..000000000 --- 
a/criu/arch/mips/include/asm/thread_pointer.h +++ /dev/null @@ -1,27 +0,0 @@ -/* __thread_pointer definition. Generic version. - Copyright (C) 2021 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - . */ - -#ifndef _SYS_THREAD_POINTER_H -#define _SYS_THREAD_POINTER_H - -static inline void *__criu_thread_pointer(void) -{ - return __builtin_thread_pointer(); -} - -#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/mips/include/asm/types.h b/criu/arch/mips/include/asm/types.h index 2c75b6a92..237471f3c 100644 --- a/criu/arch/mips/include/asm/types.h +++ b/criu/arch/mips/include/asm/types.h @@ -18,8 +18,6 @@ #define CORE_THREAD_ARCH_INFO(core) core->ti_mips -#define TI_IP(core) ((core)->ti_mips->gpregs->cp0_epc) - typedef UserMipsRegsEntry UserRegsEntry; static inline u64 encode_pointer(void *p) diff --git a/criu/arch/ppc64/cpu.c b/criu/arch/ppc64/cpu.c index b87230f40..bb5b7256e 100644 --- a/criu/arch/ppc64/cpu.c +++ b/criu/arch/ppc64/cpu.c @@ -64,12 +64,6 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; - if (empty_image(img)) { - pr_err("No cpuinfo image\n"); - close_image(img); - return -1; - } - if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; diff --git a/criu/arch/ppc64/crtools.c b/criu/arch/ppc64/crtools.c index d57040008..a08a2ca5b 100644 --- a/criu/arch/ppc64/crtools.c +++ 
b/criu/arch/ppc64/crtools.c @@ -404,7 +404,7 @@ static int __copy_task_regs(user_regs_struct_t *regs, user_fpregs_struct_t *fpre return 0; } -int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) +int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) { return __copy_task_regs(u, f, (CoreEntry *)arg); } diff --git a/criu/arch/ppc64/include/asm/dump.h b/criu/arch/ppc64/include/asm/dump.h index 7393654fa..eb488900a 100644 --- a/criu/arch/ppc64/include/asm/dump.h +++ b/criu/arch/ppc64/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/ppc64/include/asm/thread_pointer.h b/criu/arch/ppc64/include/asm/thread_pointer.h deleted file mode 100644 index 304516fbe..000000000 --- a/criu/arch/ppc64/include/asm/thread_pointer.h +++ /dev/null @@ -1,33 +0,0 @@ -/* __thread_pointer definition. powerpc version. - Copyright (C) 2021 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - . 
*/ - -#ifndef _SYS_THREAD_POINTER_H -#define _SYS_THREAD_POINTER_H - -#ifdef __powerpc64__ -register void *__thread_register asm("r13"); -#else -register void *__thread_register asm("r2"); -#endif - -static inline void *__criu_thread_pointer(void) -{ - return __thread_register; -} - -#endif /* _SYS_THREAD_POINTER_H */ \ No newline at end of file diff --git a/criu/arch/ppc64/include/asm/types.h b/criu/arch/ppc64/include/asm/types.h index d60aadde5..fedeff22a 100644 --- a/criu/arch/ppc64/include/asm/types.h +++ b/criu/arch/ppc64/include/asm/types.h @@ -19,8 +19,6 @@ typedef UserPpc64RegsEntry UserRegsEntry; #define CORE_THREAD_ARCH_INFO(core) core->ti_ppc64 -#define TI_IP(core) ((core)->ti_ppc64->gpregs->nip) - static inline void *decode_pointer(uint64_t v) { return (void *)v; diff --git a/criu/arch/ppc64/restorer.c b/criu/arch/ppc64/restorer.c index 56c09391e..c17ba1669 100644 --- a/criu/arch/ppc64/restorer.c +++ b/criu/arch/ppc64/restorer.c @@ -45,10 +45,10 @@ unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg) unsigned long raddr; int ret; - ret = sys_ipc(21 /*SHMAT */, shmid, /* first */ - shmflg, /* second */ + ret = sys_ipc(21 /*SHMAT */, shmid, /* first */ + shmflg, /* second */ (unsigned long)&raddr, /* third */ - shmaddr, /* ptr */ + shmaddr, /* ptr */ 0 /* fifth not used */); if (ret) diff --git a/criu/arch/ppc64/vdso-pie.c b/criu/arch/ppc64/vdso-pie.c index a84ae776b..f01123efe 100644 --- a/criu/arch/ppc64/vdso-pie.c +++ b/criu/arch/ppc64/vdso-pie.c @@ -110,9 +110,9 @@ static inline void put_trampoline_call(unsigned long at, unsigned long to, unsig { uint32_t *addr = (uint32_t *)at; - *addr++ = 0x7C0802a6; /* mflr r0 */ + *addr++ = 0x7C0802a6; /* mflr r0 */ *addr++ = 0x48000001 | ((long)(tr - at - 4) & 0x3fffffc); /* bl tr */ - *(uint64_t *)addr = to; /* the address to read by the trampoline */ + *(uint64_t *)addr = to; /* the address to read by the trampoline */ invalidate_caches(at); } diff --git a/criu/arch/riscv64/Makefile 
b/criu/arch/riscv64/Makefile deleted file mode 100644 index d19895471..000000000 --- a/criu/arch/riscv64/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -builtin-name := crtools.built-in.o - -ldflags-y += -r - -obj-y += cpu.o -obj-y += crtools.o -obj-y += sigframe.o -obj-y += vdso-lookup.o \ No newline at end of file diff --git a/criu/arch/riscv64/cpu.c b/criu/arch/riscv64/cpu.c deleted file mode 100644 index 97a883b8c..000000000 --- a/criu/arch/riscv64/cpu.c +++ /dev/null @@ -1,40 +0,0 @@ -#undef LOG_PREFIX -#define LOG_PREFIX "cpu: " - -#include -#include "cpu.h" - -int cpu_init(void) -{ - return 0; -} - -int cpu_dump_cpuinfo(void) -{ - return 0; -} - -int cpu_validate_cpuinfo(void) -{ - return 0; -} - -int cpu_dump_cpuinfo_single(void) -{ - return -ENOTSUP; -} - -int cpu_validate_image_cpuinfo_single(void) -{ - return -ENOTSUP; -} - -int cpuinfo_dump(void) -{ - return -ENOTSUP; -} - -int cpuinfo_check(void) -{ - return -ENOTSUP; -} diff --git a/criu/arch/riscv64/crtools.c b/criu/arch/riscv64/crtools.c deleted file mode 100644 index eea98d6de..000000000 --- a/criu/arch/riscv64/crtools.c +++ /dev/null @@ -1,171 +0,0 @@ -#include -#include - -#include - -#include "types.h" -#include - -#include -#include "asm/restorer.h" -#include "common/compiler.h" -#include -#include "asm/dump.h" -#include "protobuf.h" -#include "images/core.pb-c.h" -#include "images/creds.pb-c.h" -#include "parasite-syscall.h" -#include "log.h" -#include "util.h" -#include "cpu.h" -#include "restorer.h" -#include "compel/infect.h" - -#define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e - -int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpsimd) -{ - int i; - CoreEntry *core = x; - - // Save riscv64 gprs - assign_reg(core->ti_riscv64->gpregs, regs, pc); - assign_reg(core->ti_riscv64->gpregs, regs, ra); - assign_reg(core->ti_riscv64->gpregs, regs, sp); - assign_reg(core->ti_riscv64->gpregs, regs, gp); - assign_reg(core->ti_riscv64->gpregs, regs, tp); 
- assign_reg(core->ti_riscv64->gpregs, regs, t0); - assign_reg(core->ti_riscv64->gpregs, regs, t1); - assign_reg(core->ti_riscv64->gpregs, regs, t2); - assign_reg(core->ti_riscv64->gpregs, regs, s0); - assign_reg(core->ti_riscv64->gpregs, regs, s1); - assign_reg(core->ti_riscv64->gpregs, regs, a0); - assign_reg(core->ti_riscv64->gpregs, regs, a1); - assign_reg(core->ti_riscv64->gpregs, regs, a2); - assign_reg(core->ti_riscv64->gpregs, regs, a3); - assign_reg(core->ti_riscv64->gpregs, regs, a4); - assign_reg(core->ti_riscv64->gpregs, regs, a5); - assign_reg(core->ti_riscv64->gpregs, regs, a6); - assign_reg(core->ti_riscv64->gpregs, regs, a7); - assign_reg(core->ti_riscv64->gpregs, regs, s2); - assign_reg(core->ti_riscv64->gpregs, regs, s3); - assign_reg(core->ti_riscv64->gpregs, regs, s4); - assign_reg(core->ti_riscv64->gpregs, regs, s5); - assign_reg(core->ti_riscv64->gpregs, regs, s6); - assign_reg(core->ti_riscv64->gpregs, regs, s7); - assign_reg(core->ti_riscv64->gpregs, regs, s8); - assign_reg(core->ti_riscv64->gpregs, regs, s9); - assign_reg(core->ti_riscv64->gpregs, regs, s10); - assign_reg(core->ti_riscv64->gpregs, regs, s11); - assign_reg(core->ti_riscv64->gpregs, regs, t3); - assign_reg(core->ti_riscv64->gpregs, regs, t4); - assign_reg(core->ti_riscv64->gpregs, regs, t5); - assign_reg(core->ti_riscv64->gpregs, regs, t6); - - // Save riscv64 fprs - for (i = 0; i < 32; ++i) - assign_reg(core->ti_riscv64->fpsimd, fpsimd, f[i]); - assign_reg(core->ti_riscv64->fpsimd, fpsimd, fcsr); - - return 0; -} - -int arch_alloc_thread_info(CoreEntry *core) -{ - ThreadInfoRiscv64 *ti_riscv64; - UserRiscv64RegsEntry *gpregs; - UserRiscv64DExtEntry *fpsimd; - - ti_riscv64 = xmalloc(sizeof(*ti_riscv64)); - if (!ti_riscv64) - goto err; - thread_info_riscv64__init(ti_riscv64); - core->ti_riscv64 = ti_riscv64; - - gpregs = xmalloc(sizeof(*gpregs)); - if (!gpregs) - goto err; - user_riscv64_regs_entry__init(gpregs); - - ti_riscv64->gpregs = gpregs; - - fpsimd = 
xmalloc(sizeof(*fpsimd)); - if (!fpsimd) - goto err; - user_riscv64_d_ext_entry__init(fpsimd); - ti_riscv64->fpsimd = fpsimd; - fpsimd->f = xmalloc(32 * sizeof(fpsimd->f[0])); - fpsimd->n_f = 32; - if (!fpsimd->f) - goto err; - - return 0; -err: - return -1; -} - -void arch_free_thread_info(CoreEntry *core) -{ - if (core->ti_riscv64) { - if (core->ti_riscv64->fpsimd) { - xfree(core->ti_riscv64->fpsimd->f); - xfree(core->ti_riscv64->fpsimd); - } - xfree(core->ti_riscv64->gpregs); - xfree(core->ti_riscv64); - core->ti_riscv64 = NULL; - } -} - -int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) -{ - int i; - UserRiscv64DExtEntry *fpsimd = core->ti_riscv64->fpsimd; - - if (fpsimd->n_f != 32) - return 1; - - for (i = 0; i < 32; ++i) - sigframe->uc.uc_mcontext.__fpregs.__d.__f[i] = fpsimd->f[i]; - sigframe->uc.uc_mcontext.__fpregs.__d.__fcsr = fpsimd->fcsr; - - return 0; -} - -int restore_gpregs(struct rt_sigframe *f, UserRiscv64RegsEntry *r) -{ - f->uc.uc_mcontext.__gregs[0] = r->pc; - f->uc.uc_mcontext.__gregs[1] = r->ra; - f->uc.uc_mcontext.__gregs[2] = r->sp; - f->uc.uc_mcontext.__gregs[3] = r->gp; - f->uc.uc_mcontext.__gregs[4] = r->tp; - f->uc.uc_mcontext.__gregs[5] = r->t0; - f->uc.uc_mcontext.__gregs[6] = r->t1; - f->uc.uc_mcontext.__gregs[7] = r->t2; - f->uc.uc_mcontext.__gregs[8] = r->s0; - f->uc.uc_mcontext.__gregs[9] = r->s1; - f->uc.uc_mcontext.__gregs[10] = r->a0; - f->uc.uc_mcontext.__gregs[11] = r->a1; - f->uc.uc_mcontext.__gregs[12] = r->a2; - f->uc.uc_mcontext.__gregs[13] = r->a3; - f->uc.uc_mcontext.__gregs[14] = r->a4; - f->uc.uc_mcontext.__gregs[15] = r->a5; - f->uc.uc_mcontext.__gregs[16] = r->a6; - f->uc.uc_mcontext.__gregs[17] = r->a7; - f->uc.uc_mcontext.__gregs[18] = r->s2; - f->uc.uc_mcontext.__gregs[19] = r->s3; - f->uc.uc_mcontext.__gregs[20] = r->s4; - f->uc.uc_mcontext.__gregs[21] = r->s5; - f->uc.uc_mcontext.__gregs[22] = r->s6; - f->uc.uc_mcontext.__gregs[23] = r->s7; - f->uc.uc_mcontext.__gregs[24] = r->s8; - 
f->uc.uc_mcontext.__gregs[25] = r->s9; - f->uc.uc_mcontext.__gregs[26] = r->s10; - f->uc.uc_mcontext.__gregs[27] = r->s11; - f->uc.uc_mcontext.__gregs[28] = r->t3; - f->uc.uc_mcontext.__gregs[29] = r->t4; - f->uc.uc_mcontext.__gregs[30] = r->t5; - f->uc.uc_mcontext.__gregs[31] = r->t6; - - return 0; -} diff --git a/criu/arch/riscv64/include/asm/dump.h b/criu/arch/riscv64/include/asm/dump.h deleted file mode 100644 index 4f0a2d209..000000000 --- a/criu/arch/riscv64/include/asm/dump.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef __CR_ASM_DUMP_H__ -#define __CR_ASM_DUMP_H__ - -extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); -extern int arch_alloc_thread_info(CoreEntry *core); -extern void arch_free_thread_info(CoreEntry *core); - -static inline void core_put_tls(CoreEntry *core, tls_t tls) -{ - core->ti_riscv64->tls = tls; -} - -#define get_task_futex_robust_list_compat(pid, info) -1 - -#endif diff --git a/criu/arch/riscv64/include/asm/int.h b/criu/arch/riscv64/include/asm/int.h deleted file mode 100644 index 642804e9b..000000000 --- a/criu/arch/riscv64/include/asm/int.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __CR_ASM_INT_H__ -#define __CR_ASM_INT_H__ - -#include "asm-generic/int.h" - -#endif /* __CR_ASM_INT_H__ */ diff --git a/criu/arch/riscv64/include/asm/kerndat.h b/criu/arch/riscv64/include/asm/kerndat.h deleted file mode 100644 index bb70cf6cf..000000000 --- a/criu/arch/riscv64/include/asm/kerndat.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef __CR_ASM_KERNDAT_H__ -#define __CR_ASM_KERNDAT_H__ - -#define kdat_compatible_cr() 0 -#define kdat_can_map_vdso() 0 - -#endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/riscv64/include/asm/parasite-syscall.h b/criu/arch/riscv64/include/asm/parasite-syscall.h deleted file mode 100644 index 6008c3792..000000000 --- a/criu/arch/riscv64/include/asm/parasite-syscall.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __CR_ASM_PARASITE_SYSCALL_H__ -#define __CR_ASM_PARASITE_SYSCALL_H__ - -struct 
parasite_ctl; - -#endif diff --git a/criu/arch/riscv64/include/asm/parasite.h b/criu/arch/riscv64/include/asm/parasite.h deleted file mode 100644 index 4798cfd8a..000000000 --- a/criu/arch/riscv64/include/asm/parasite.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef __ASM_PARASITE_H__ -#define __ASM_PARASITE_H__ - -/* - * This function is used to retrieve the value of the thread pointer (tp) - * in RISC-V architecture, which is typically used for thread-local storage (TLS). - * The value is then stored in the provided tls_t pointer. - */ -static inline void arch_get_tls(tls_t *ptls) -{ - tls_t tls; - asm("mv %0, tp" : "=r"(tls)); - *ptls = tls; -} - -#endif diff --git a/criu/arch/riscv64/include/asm/restore.h b/criu/arch/riscv64/include/asm/restore.h deleted file mode 100644 index e4f25a57b..000000000 --- a/criu/arch/riscv64/include/asm/restore.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef __CR_ASM_RESTORE_H__ -#define __CR_ASM_RESTORE_H__ - -#include "asm/restorer.h" - -#include "images/core.pb-c.h" - -/* clang-format off */ -#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \ - task_args) \ - asm volatile( \ - "and sp, %0, ~15 \n" \ - "mv a0, %2 \n" \ - "jr %1 \n" \ - : \ - : "r"(new_sp), \ - "r"(restore_task_exec_start), \ - "r"(task_args) \ - : "a0", "memory") -/* clang-format on */ - -static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) -{ - *ptls = pcore->ti_riscv64->tls; -} - -int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core); - -#endif diff --git a/criu/arch/riscv64/include/asm/restorer.h b/criu/arch/riscv64/include/asm/restorer.h deleted file mode 100644 index 45fe847a9..000000000 --- a/criu/arch/riscv64/include/asm/restorer.h +++ /dev/null @@ -1,150 +0,0 @@ -#ifndef __CR_ASM_RESTORER_H__ -#define __CR_ASM_RESTORER_H__ - -#include - -#include "asm/types.h" -#include "images/core.pb-c.h" - -#include - -// kernel arg order for clone -// unsigned long clone_flags, -// unsigned long newsp, -// int __user * parent_tidptr, -// unsigned 
long tls, -// int __user * child_tidptr -/* clang-format off */ -#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ - thread_args, clone_restore_fn) \ - asm volatile( \ - "clone_emul: \n" \ - "ld a1, %2 \n" \ - "andi a1, a1, ~15 \n" \ - "addi a1, a1, -16 \n" \ - "sd %5, 0(a1) \n" \ - "sd %6, 8(a1) \n" \ - "mv a0, %1 \n" \ - "mv a2, %3 \n" \ - "mv a3, %4 \n" \ - "li a7, "__stringify(__NR_clone)" \n" \ - "ecall \n" \ - \ - "beqz a0, thread_run \n" \ - \ - "mv %0, a0 \n" \ - "j clone_end \n" \ - \ - "thread_run: \n" \ - "ld a1, 0(sp) \n" \ - "ld a0, 8(sp) \n" \ - "jr a1 \n" \ - \ - "clone_end: \n" \ - : "=r"(ret) \ - : "r"(clone_flags), \ - "m"(new_sp), \ - "r"(&parent_tid), \ - "r"(&thread_args[i].pid), \ - "r"(clone_restore_fn), \ - "r"(&thread_args[i]) \ - : "a0", "a1", "a2", "a3", "a7", "memory") - -/* - * Based on sysdeps/unix/sysv/linux/riscv/clone.S - * - * int clone(int (*fn)(void *arg), x0 - * void *child_stack, x1 - * int flags, x2 - * void *arg, x3 - * pid_t *ptid, x4 - * struct user_desc *tls, x5 - * pid_t *ctid); x6 - * - * int clone3(struct clone_args *args, x0 - * size_t size); x1 - * - * Always consult the CLONE3 wrappers for other architectures - * for additional details. - * - */ -#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ - clone_restore_fn) \ - asm volatile( \ - /* In contrast to the clone() wrapper above this does not put - * the thread function and its arguments on the child stack, - * but uses registers to pass these parameters to the child process. - * Based on the glibc clone() wrapper at - * sysdeps/unix/sysv/linux/riscv/clone.S. - */ \ - "clone3_emul: \n" \ - /* - * Based on the glibc clone() wrapper, which uses x10 and x11 - * to save the arguments for the child process, this does the same. - * x10 for the thread function and x11 for the thread arguments. 
- */ \ - "mv t0, %3 /* clone_restore_fn */ \n" \ - "mv t1, %4 /* args */ \n" \ - "mv a0, %1 /* &clone_args */ \n" \ - "mv a1, %2 /* size */ \n" \ - /* Load syscall number */ \ - "li a7, "__stringify(__NR_clone3)" \n" \ - /* Do the syscall */ \ - "ecall \n" \ - \ - "beqz a0, clone3_thread_run \n" \ - \ - "mv %0, a0 \n" \ - "j clone3_end \n" \ - \ - "clone3_thread_run: \n" \ - /* Move args to a0 */ \ - "mv a0, t1 \n" \ - /* Jump to clone_restore_fn */ \ - "jr t0 \n" \ - \ - "clone3_end: \n" \ - : "=r"(ret) \ - : "r"(&clone_args), \ - "r"(size), \ - "r"(clone_restore_fn), \ - "r"(args) \ - : "a0", "a1", "a7", "t0", "t1", "memory") - -#define ARCH_FAIL_CORE_RESTORE \ - asm volatile( \ - "mv sp, %0 \n" \ - "li a0, 0 \n" \ - "jr x0 \n" \ - : \ - : "r"(ret) \ - : "sp", "a0", "memory") -/* clang-format on */ - -#define arch_map_vdso(map, compat) -1 - -int restore_gpregs(struct rt_sigframe *f, UserRiscv64RegsEntry *r); -int restore_nonsigframe_gpregs(UserRiscv64RegsEntry *r); - -static inline void restore_tls(tls_t *ptls) -{ - asm("mv tp, %0" : : "r"(*ptls)); -} - -static inline void *alloc_compat_syscall_stack(void) -{ - return NULL; -} -static inline void free_compat_syscall_stack(void *stack32) -{ -} -static inline int arch_compat_rt_sigaction(void *stack, int sig, void *act) -{ - return -1; -} -static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) -{ - return -1; -} - -#endif \ No newline at end of file diff --git a/criu/arch/riscv64/include/asm/thread_pointer.h b/criu/arch/riscv64/include/asm/thread_pointer.h deleted file mode 100644 index f7e07066a..000000000 --- a/criu/arch/riscv64/include/asm/thread_pointer.h +++ /dev/null @@ -1,27 +0,0 @@ -/* __thread_pointer definition. Generic version. - Copyright (C) 2021 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - . */ - -#ifndef _SYS_THREAD_POINTER_H -#define _SYS_THREAD_POINTER_H - -static inline void *__criu_thread_pointer(void) -{ - return __builtin_thread_pointer(); -} - -#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/riscv64/include/asm/types.h b/criu/arch/riscv64/include/asm/types.h deleted file mode 100644 index 83bb5f65f..000000000 --- a/criu/arch/riscv64/include/asm/types.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef __CR_ASM_TYPES_H__ -#define __CR_ASM_TYPES_H__ - -#include -#include -#include -#include "images/core.pb-c.h" - -#include "page.h" -#include "bitops.h" -#include "asm/int.h" - -#include - -#define core_is_compat(core) false - -typedef UserRiscv64RegsEntry UserRegsEntry; - -#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__RISCV64 - -#define CORE_THREAD_ARCH_INFO(core) core->ti_riscv64 - -#define TI_SP(core) ((core)->ti_riscv64->gpregs->sp) - -#define TI_IP(core) ((core)->ti_riscv64->gpregs->pc) - -static inline void *decode_pointer(uint64_t v) -{ - return (void *)v; -} -static inline uint64_t encode_pointer(void *p) -{ - return (uint64_t)p; -} - -#define AT_VECTOR_SIZE 64 -typedef uint64_t auxv_t; -typedef uint64_t tls_t; - -#endif /* __CR_ASM_TYPES_H__ */ diff --git a/criu/arch/riscv64/include/asm/vdso.h b/criu/arch/riscv64/include/asm/vdso.h deleted file mode 100644 index 322149c6e..000000000 --- 
a/criu/arch/riscv64/include/asm/vdso.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef __CR_ASM_VDSO_H__ -#define __CR_ASM_VDSO_H__ - -#include "asm/int.h" -#include "common/compiler.h" -#include "asm-generic/vdso.h" - -/* - * This is a minimal amount of symbols - * we should support at the moment. - */ -#define VDSO_SYMBOL_MAX 6 -#define VDSO_SYMBOL_GTOD 2 - -#define ARCH_VDSO_SYMBOLS_LIST \ - const char *rv64_vdso_symbol1 = "__vdso_clock_getres"; \ - const char *rv64_vdso_symbol2 = "__vdso_clock_gettime"; \ - const char *rv64_vdso_symbol3 = "__vdso_gettimeofday"; \ - const char *rv64_vdso_symbol4 = "__vdso_getcpu"; \ - const char *rv64_vdso_symbol5 = "__vdso_flush_icache"; \ - const char *rv64_vdso_symbol6 = "__vdso_rt_sigreturn"; - -#define ARCH_VDSO_SYMBOLS \ - rv64_vdso_symbol1, rv64_vdso_symbol2, rv64_vdso_symbol3, rv64_vdso_symbol4, rv64_vdso_symbol5, rv64_vdso_symbol6 - -extern void write_intraprocedure_branch(unsigned long to, unsigned long from); - -#endif /* __CR_ASM_VDSO_H__ */ \ No newline at end of file diff --git a/criu/arch/riscv64/restorer.c b/criu/arch/riscv64/restorer.c deleted file mode 100644 index d605f048d..000000000 --- a/criu/arch/riscv64/restorer.c +++ /dev/null @@ -1,14 +0,0 @@ -#include - -#include "restorer.h" -#include "asm/restorer.h" - -#include -#include "log.h" -#include -#include "cpu.h" - -int restore_nonsigframe_gpregs(UserRiscv64RegsEntry *r) -{ - return 0; -} diff --git a/criu/arch/riscv64/sigframe.c b/criu/arch/riscv64/sigframe.c deleted file mode 100644 index 8096fab66..000000000 --- a/criu/arch/riscv64/sigframe.c +++ /dev/null @@ -1,8 +0,0 @@ -#include "asm/types.h" -#include -#include "asm/sigframe.h" - -int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *rsigframe) -{ - return 0; -} diff --git a/criu/arch/riscv64/vdso-lookup.S b/criu/arch/riscv64/vdso-lookup.S deleted file mode 100644 index 50d4ecf08..000000000 --- a/criu/arch/riscv64/vdso-lookup.S +++ /dev/null @@ -1,15 +0,0 @@ -#include 
"common/asm/linkage.h" - -.section .text - -/* Expects t0 to hold the index into the lookup table. */ -GLOBAL(riscv_vdso_lookup) - /* Get the beginning of the lookup table */ - la t1, riscv_vdso_lookup_end - /* Scale the index */ - slli t0, t0, 3 - add t1, t0, t1 - ld t2, 0(t1) - jr t2 - -GLOBAL(riscv_vdso_lookup_end) \ No newline at end of file diff --git a/criu/arch/riscv64/vdso-pie.c b/criu/arch/riscv64/vdso-pie.c deleted file mode 100644 index aa9272fb5..000000000 --- a/criu/arch/riscv64/vdso-pie.c +++ /dev/null @@ -1,159 +0,0 @@ -#include - -#include "asm/types.h" - -#include -#include -#include -#include -#include "atomic.h" -#include "parasite-vdso.h" -#include "log.h" -#include "common/bug.h" - -#ifdef LOG_PREFIX -#undef LOG_PREFIX -#endif -#define LOG_PREFIX "vdso: " - -/* These symbols are defined in vdso-lookup.S */ -extern char *riscv_vdso_lookup, *riscv_vdso_lookup_end; - -/* - * li t0, INDEX - * jal x0, riscv_vdso_lookup - */ -#define TRAMP_CALL_SIZE (2 * sizeof(uint32_t)) - -static inline void invalidate_caches(void) -{ - // We're supposed to use the VDSO as the officially sanctioned ABI. But oh well. - int ret; - __smp_mb(); - asm volatile("li a0, 0\n" - "li a1, 0\n" - "li a2, 1\n" /* SYS_RISCV_FLUSH_ICACHE_ALL */ - "li a7, 259\n" /* __NR_arch_specific_syscall */ - "ecall\n" - : "=r"(ret) - : - : "a7"); -} - -static inline size_t vdso_trampoline_size(void) -{ - return (size_t)&riscv_vdso_lookup_end - (size_t)&riscv_vdso_lookup; -} - -static uint64_t put_trampoline(uint64_t at, struct vdso_symtable *sym) -{ - int i, j; - uint64_t total_size, trampoline_size; - uint64_t trampoline = 0; - - /* First of all we have to find a place where to put the trampoline - * code. 
- */ - trampoline_size = vdso_trampoline_size(); - total_size = trampoline_size + VDSO_SYMBOL_MAX * sizeof(uint64_t); - - for (i = 0; i < ARRAY_SIZE(sym->symbols); i++) { - if (vdso_symbol_empty(&sym->symbols[i])) - continue; - - pr_debug("Checking '%s' at %lx\n", sym->symbols[i].name, sym->symbols[i].offset); - - /* find the nearest following symbol we are interested in */ - for (j = 0; j < ARRAY_SIZE(sym->symbols); j++) { - if (i == j || vdso_symbol_empty(&sym->symbols[j])) - continue; - - if (sym->symbols[j].offset <= sym->symbols[i].offset) - /* this symbol is above the current one */ - continue; - - if ((sym->symbols[i].offset + TRAMP_CALL_SIZE) > sym->symbols[j].offset) { - /* we have a major issue here since we cannot - * even put the trampoline call for this symbol - */ - pr_err("Can't handle small vDSO symbol %s\n", sym->symbols[i].name); - return 0; - } - - if (trampoline) - /* no need to put it twice */ - continue; - - if ((sym->symbols[j].offset - (sym->symbols[i].offset + TRAMP_CALL_SIZE)) <= total_size) - /* not enough place */ - continue; - - /* We can put the trampoline there */ - trampoline = at + sym->symbols[i].offset; - trampoline += TRAMP_CALL_SIZE; - - pr_debug("Putting vDSO trampoline in %s at %lx\n", sym->symbols[i].name, trampoline); - memcpy((void *)trampoline, &riscv_vdso_lookup, trampoline_size); - invalidate_caches(); - return trampoline; - } - } - - return 0; -} - -static inline void put_trampoline_call(uint64_t from, uint64_t to, uint64_t trampoline, unsigned int idx) -{ - size_t trampoline_size = vdso_trampoline_size(); - uint64_t *lookup_table = NULL; - /* - * li t0, INDEX - * addi t0, x0 INDEX - * jal x0, riscv_vdso_lookup - */ - uint32_t trampoline_call[2] = { - 0x00000293, - 0x0000006f, - }; - const size_t insts_len = ARRAY_SIZE(trampoline_call); - uint32_t *call_addr = (uint32_t *)from; - // Offset from the jal instruction to the lookup trampoline. 
- ssize_t trampoline_offset = trampoline - (from + sizeof(uint32_t)); - - trampoline_call[0] = trampoline_call[0] | (idx << 24); - trampoline_call[1] = trampoline_call[1] | riscv_j_imm(trampoline_offset); - - for (unsigned int i = 0; i < insts_len; i++) { - call_addr[i] = trampoline_call[i]; - } - - // Set the lookup table pointer for this vdso symbol. - lookup_table = (uint64_t *)(trampoline + trampoline_size); - lookup_table[idx] = to; -} - -int vdso_redirect_calls(uint64_t base_to, uint64_t base_from, struct vdso_symtable *to, struct vdso_symtable *from, - bool __always_unused compat_vdso) -{ - unsigned int i, valid_idx = 0; - - uint64_t trampoline = (uint64_t)put_trampoline(base_from, from); - if (!trampoline) - return 1; - - for (i = 0; i < ARRAY_SIZE(to->symbols); i++) { - if (vdso_symbol_empty(&from->symbols[i])) - continue; - - pr_debug("br: %lx/%lx -> %lx/%lx (index %d) '%s'\n", base_from, from->symbols[i].offset, base_to, - to->symbols[i].offset, i, from->symbols[i].name); - - put_trampoline_call(base_from + from->symbols[i].offset, base_to + to->symbols[i].offset, trampoline, - valid_idx); - valid_idx++; - } - - invalidate_caches(); - - return 0; -} \ No newline at end of file diff --git a/criu/arch/s390/cpu.c b/criu/arch/s390/cpu.c index e227fad5e..3f430f455 100644 --- a/criu/arch/s390/cpu.c +++ b/criu/arch/s390/cpu.c @@ -87,12 +87,6 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; - if (empty_image(img)) { - pr_err("No cpuinfo image\n"); - close_image(img); - return -1; - } - ret = 0; if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0) goto error; diff --git a/criu/arch/s390/crtools.c b/criu/arch/s390/crtools.c index e08c83878..b22b64e2b 100644 --- a/criu/arch/s390/crtools.c +++ b/criu/arch/s390/crtools.c @@ -142,29 +142,6 @@ static void print_core_fp_regs(const char *msg, CoreEntry *core) print_core_ri_cb(core); } -/* - * Allocate floating point registers - */ -static UserS390FpregsEntry *allocate_fp_regs(void) -{ - UserS390FpregsEntry *fpregs; - - 
fpregs = xmalloc(sizeof(*fpregs)); - if (!fpregs) - return NULL; - user_s390_fpregs_entry__init(fpregs); - - fpregs->n_fprs = 16; - fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); - if (!fpregs->fprs) - goto fail_free_fpregs; - return fpregs; - -fail_free_fpregs: - xfree(fpregs); - return NULL; -} - /* * Allocate VxrsLow registers */ @@ -257,7 +234,7 @@ fail_free_gs_cb: } /* - * Free Guarded Storage control blocks + * Free Guareded Storage control blocks */ static void free_gs_cb(UserS390GsCbEntry *gs_cb) { @@ -305,7 +282,7 @@ static void free_ri_cb(UserS390RiEntry *ri_cb) /* * Copy internal structures into Google Protocol Buffers */ -int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) +int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f) { UserS390VxrsHighEntry *vxrs_high = NULL; UserS390VxrsLowEntry *vxrs_low = NULL; @@ -317,13 +294,7 @@ int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_stru CoreEntry *core = arg; gpregs = CORE_THREAD_ARCH_INFO(core)->gpregs; - /* - * We delay allocating this until now because checkpointing can fail earlier. - * When it fails we need to know if we reached here or not so that the cleanup - * code doesn't restore FPRs that were never saved in the first place. 
- */ - fpregs = allocate_fp_regs(); - CORE_THREAD_ARCH_INFO(core)->fpregs = fpregs; + fpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; /* Vector registers */ if (f->flags & USER_FPREGS_VXRS) { @@ -428,15 +399,36 @@ int restore_fpu(struct rt_sigframe *f, CoreEntry *core) return 0; } +/* + * Allocate floating point registers + */ +static UserS390FpregsEntry *allocate_fp_regs(void) +{ + UserS390FpregsEntry *fpregs; + + fpregs = xmalloc(sizeof(*fpregs)); + if (!fpregs) + return NULL; + user_s390_fpregs_entry__init(fpregs); + + fpregs->n_fprs = 16; + fpregs->fprs = xzalloc(16 * sizeof(uint64_t)); + if (!fpregs->fprs) + goto fail_free_fpregs; + return fpregs; + +fail_free_fpregs: + xfree(fpregs); + return NULL; +} + /* * Free floating point registers */ static void free_fp_regs(UserS390FpregsEntry *fpregs) { - if (fpregs) { - xfree(fpregs->fprs); - xfree(fpregs); - } + xfree(fpregs->fprs); + xfree(fpregs); } /* @@ -495,17 +487,15 @@ int arch_alloc_thread_info(CoreEntry *core) ti_s390->gpregs = allocate_gp_regs(); if (!ti_s390->gpregs) goto fail_free_ti_s390; - - /* - * Delay allocating space until needed. Checkpointing can fail before that - * and the cleanup code needs to be able to tell if FPRs were saved or not - * before trying to restore the register state. - */ - ti_s390->fpregs = NULL; + ti_s390->fpregs = allocate_fp_regs(); + if (!ti_s390->fpregs) + goto fail_free_gp_regs; CORE_THREAD_ARCH_INFO(core) = ti_s390; return 0; +fail_free_gp_regs: + free_gp_regs(ti_s390->gpregs); fail_free_ti_s390: xfree(ti_s390); return -1; @@ -688,18 +678,14 @@ static int set_task_regs(pid_t pid, CoreEntry *core) user_fpregs_struct_t fpregs; memset(&fpregs, 0, sizeof(fpregs)); - /* - * Floating point registers - * Optional on checkpoint; checkpoint may have failed and we may reach here as part of cleanup - * so there's no guarantee that we saved FPRs for this thread. 
- */ + /* Floating point registers */ cfpregs = CORE_THREAD_ARCH_INFO(core)->fpregs; - if (cfpregs) { - fpregs.prfpreg.fpc = cfpregs->fpc; - memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); - if (set_fp_regs(pid, &fpregs) < 0) - return -1; - } + if (!cfpregs) + return -1; + fpregs.prfpreg.fpc = cfpregs->fpc; + memcpy(fpregs.prfpreg.fprs, cfpregs->fprs, sizeof(fpregs.prfpreg.fprs)); + if (set_fp_regs(pid, &fpregs) < 0) + return -1; /* Vector registers (optional) */ cvxrs_low = CORE_THREAD_ARCH_INFO(core)->vxrs_low; if (cvxrs_low != NULL) { diff --git a/criu/arch/s390/include/asm/dump.h b/criu/arch/s390/include/asm/dump.h index 5a24c5b3d..c200724d7 100644 --- a/criu/arch/s390/include/asm/dump.h +++ b/criu/arch/s390/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -int save_task_regs(pid_t pid, void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f); +int save_task_regs(void *arg, user_regs_struct_t *u, user_fpregs_struct_t *f); int arch_alloc_thread_info(CoreEntry *core); void arch_free_thread_info(CoreEntry *core); diff --git a/criu/arch/s390/include/asm/restorer.h b/criu/arch/s390/include/asm/restorer.h index b8472afc8..668f9a413 100644 --- a/criu/arch/s390/include/asm/restorer.h +++ b/criu/arch/s390/include/asm/restorer.h @@ -17,15 +17,15 @@ asm volatile( \ "lgr %%r0,%6\n" /* Save thread_args in %r0 */ \ "lgr %%r1,%5\n" /* Save clone_restore_fn in %r1 */ \ - "lgr %%r2,%2\n" /* Parameter 1: new_sp (child stack) */ \ - "lgr %%r3,%1\n" /* Parameter 2: clone_flags */ \ - "lgr %%r4,%3\n" /* Parameter 3: &parent_tid */ \ - "lgr %%r5,%4\n" /* Parameter 4: &thread_args[i].pid */ \ - "lghi %%r6,0\n" /* Parameter 5: tls = 0 */ \ + "lgr %%r2,%2\n" /* Parm 1: new_sp (child stack) */ \ + "lgr %%r3,%1\n" /* Parm 2: clone_flags */ \ + "lgr %%r4,%3\n" /* Parm 3: &parent_tid */ \ + "lgr %%r5,%4\n" /* Parm 4: &thread_args[i].pid */ \ + "lghi %%r6,0\n" /* Parm 5: tls = 0 */ \ "svc "__stringify(__NR_clone)"\n" \ "ltgr 
%0,%%r2\n" /* Set and check "ret" */ \ "jnz 0f\n" /* ret != 0: Continue caller */ \ - "lgr %%r2,%%r0\n" /* Parameter 1: &thread_args */ \ + "lgr %%r2,%%r0\n" /* Parm 1: &thread_args */ \ "aghi %%r15,-160\n" /* Prepare stack frame */ \ "xc 0(8,%%r15),0(%%r15)\n" \ "basr %%r14,%%r1\n" /* Jump to clone_restore_fn() */ \ diff --git a/criu/arch/s390/include/asm/thread_pointer.h b/criu/arch/s390/include/asm/thread_pointer.h deleted file mode 100644 index f7e07066a..000000000 --- a/criu/arch/s390/include/asm/thread_pointer.h +++ /dev/null @@ -1,27 +0,0 @@ -/* __thread_pointer definition. Generic version. - Copyright (C) 2021 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - . 
*/ - -#ifndef _SYS_THREAD_POINTER_H -#define _SYS_THREAD_POINTER_H - -static inline void *__criu_thread_pointer(void) -{ - return __builtin_thread_pointer(); -} - -#endif /* _SYS_THREAD_POINTER_H */ diff --git a/criu/arch/s390/include/asm/types.h b/criu/arch/s390/include/asm/types.h index abf12dec0..7522cf2cd 100644 --- a/criu/arch/s390/include/asm/types.h +++ b/criu/arch/s390/include/asm/types.h @@ -19,8 +19,6 @@ typedef UserS390RegsEntry UserRegsEntry; #define CORE_THREAD_ARCH_INFO(core) core->ti_s390 -#define TI_IP(core) ((core)->ti_s390->gpregs->psw_addr) - static inline u64 encode_pointer(void *p) { return (u64)p; diff --git a/criu/arch/s390/restorer.c b/criu/arch/s390/restorer.c index 8b3bc44ba..6907ad75b 100644 --- a/criu/arch/s390/restorer.c +++ b/criu/arch/s390/restorer.c @@ -23,10 +23,10 @@ unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg) unsigned long raddr; int ret; - ret = sys_ipc(21 /*SHMAT */, shmid, /* first */ - shmflg, /* second */ + ret = sys_ipc(21 /*SHMAT */, shmid, /* first */ + shmflg, /* second */ (unsigned long)&raddr, /* third */ - shmaddr, /* ptr */ + shmaddr, /* ptr */ 0 /* fifth not used */); if (ret) diff --git a/criu/arch/s390/vdso-pie.c b/criu/arch/s390/vdso-pie.c index bf0366b0e..ad504beda 100644 --- a/criu/arch/s390/vdso-pie.c +++ b/criu/arch/s390/vdso-pie.c @@ -18,9 +18,9 @@ */ typedef struct { u8 larl[6]; /* Load relative address of imm64 */ - u8 lg[6]; /* Load %r1 with imm64 */ - u8 br[2]; /* Branch to %r1 */ - u64 addr; /* Jump address */ + u8 lg[6]; /* Load %r1 with imm64 */ + u8 br[2]; /* Branch to %r1 */ + u64 addr; /* Jump address */ u32 guards; /* Guard bytes */ } __packed jmp_t; diff --git a/criu/arch/x86/Makefile b/criu/arch/x86/Makefile index 46f00e9e9..618e85bb3 100644 --- a/criu/arch/x86/Makefile +++ b/criu/arch/x86/Makefile @@ -9,7 +9,6 @@ obj-y += cpu.o obj-y += crtools.o obj-y += kerndat.o obj-y += sigframe.o -obj-y += shstk.o ifeq ($(CONFIG_COMPAT),y) obj-y += sigaction_compat.o endif diff --git 
a/criu/arch/x86/cpu.c b/criu/arch/x86/cpu.c index 2e1f2de9a..d02f4abd5 100644 --- a/criu/arch/x86/cpu.c +++ b/criu/arch/x86/cpu.c @@ -78,7 +78,7 @@ int cpu_dump_cpuinfo(void) cpu_info.n_x86_entry = 1; cpu_x86_info.vendor_id = (rt_cpu_info.x86_vendor == X86_VENDOR_INTEL) ? CPUINFO_X86_ENTRY__VENDOR__INTEL : - CPUINFO_X86_ENTRY__VENDOR__AMD; + CPUINFO_X86_ENTRY__VENDOR__AMD; cpu_x86_info.cpu_family = rt_cpu_info.x86_family; cpu_x86_info.model = rt_cpu_info.x86_model; @@ -107,103 +107,64 @@ int cpu_dump_cpuinfo(void) #define __ins_bit(__l, __v) (1u << ((__v)-32u * (__l))) -// clang-format off static uint32_t x86_ins_capability_mask[NCAPINTS] = { - [CPUID_1_EDX] = - __ins_bit(CPUID_1_EDX, X86_FEATURE_FPU) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_TSC) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_CX8) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_SEP) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_CMOV) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_CLFLUSH) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_MMX) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_FXSR) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_XMM) | - __ins_bit(CPUID_1_EDX, X86_FEATURE_XMM2), + [CPUID_1_EDX] = __ins_bit(CPUID_1_EDX, X86_FEATURE_FPU) | __ins_bit(CPUID_1_EDX, X86_FEATURE_TSC) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_CX8) | __ins_bit(CPUID_1_EDX, X86_FEATURE_SEP) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_CMOV) | __ins_bit(CPUID_1_EDX, X86_FEATURE_CLFLUSH) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_MMX) | __ins_bit(CPUID_1_EDX, X86_FEATURE_FXSR) | + __ins_bit(CPUID_1_EDX, X86_FEATURE_XMM) | __ins_bit(CPUID_1_EDX, X86_FEATURE_XMM2), - [CPUID_8000_0001_EDX] = - __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_SYSCALL) | - __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_MMXEXT) | - __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_RDTSCP) | - __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_3DNOWEXT) | - __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_3DNOW), + [CPUID_8000_0001_EDX] = __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_SYSCALL) | + __ins_bit(CPUID_8000_0001_EDX, 
X86_FEATURE_MMXEXT) | + __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_RDTSCP) | + __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_3DNOWEXT) | + __ins_bit(CPUID_8000_0001_EDX, X86_FEATURE_3DNOW), - [CPUID_LNX_1] = - __ins_bit(CPUID_LNX_1, X86_FEATURE_REP_GOOD) | - __ins_bit(CPUID_LNX_1, X86_FEATURE_NOPL), + [CPUID_LNX_1] = __ins_bit(CPUID_LNX_1, X86_FEATURE_REP_GOOD) | __ins_bit(CPUID_LNX_1, X86_FEATURE_NOPL), - [CPUID_1_ECX] = - __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM3) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_PCLMULQDQ) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_MWAIT) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_SSSE3) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_CX16) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM4_1) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM4_2) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_MOVBE) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_POPCNT) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_AES) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_XSAVE) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_OSXSAVE) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_AVX) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_F16C) | - __ins_bit(CPUID_1_ECX, X86_FEATURE_RDRAND), + [CPUID_1_ECX] = __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM3) | __ins_bit(CPUID_1_ECX, X86_FEATURE_PCLMULQDQ) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_MWAIT) | __ins_bit(CPUID_1_ECX, X86_FEATURE_SSSE3) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_CX16) | __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM4_1) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_XMM4_2) | __ins_bit(CPUID_1_ECX, X86_FEATURE_MOVBE) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_POPCNT) | __ins_bit(CPUID_1_ECX, X86_FEATURE_AES) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_XSAVE) | __ins_bit(CPUID_1_ECX, X86_FEATURE_OSXSAVE) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_AVX) | __ins_bit(CPUID_1_ECX, X86_FEATURE_F16C) | + __ins_bit(CPUID_1_ECX, X86_FEATURE_RDRAND), [CPUID_8000_0001_ECX] = - __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_ABM) | - __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_SSE4A) | - __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_MISALIGNSSE) | 
- __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_3DNOWPREFETCH) | - __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_XOP) | - __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_FMA4) | - __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_TBM), + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_ABM) | __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_SSE4A) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_MISALIGNSSE) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_3DNOWPREFETCH) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_XOP) | __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_FMA4) | + __ins_bit(CPUID_8000_0001_ECX, X86_FEATURE_TBM), [CPUID_7_0_EBX] = - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_FSGSBASE) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_BMI1) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_HLE) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX2) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_BMI2) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_ERMS) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_RTM) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_MPX) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512F) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512DQ) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_RDSEED) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_ADX) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_CLFLUSHOPT) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512PF) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512ER) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512CD) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_SHA_NI) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512BW) | - __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512VL), + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_FSGSBASE) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_BMI1) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_HLE) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX2) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_BMI2) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_ERMS) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_RTM) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_MPX) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512F) | __ins_bit(CPUID_7_0_EBX, 
X86_FEATURE_AVX512DQ) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_RDSEED) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_ADX) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_CLFLUSHOPT) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512PF) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512ER) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512CD) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_SHA_NI) | __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512BW) | + __ins_bit(CPUID_7_0_EBX, X86_FEATURE_AVX512VL), - [CPUID_D_1_EAX] = - __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XSAVEOPT) | - __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XSAVEC) | - __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XGETBV1), + [CPUID_D_1_EAX] = __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XSAVEOPT) | + __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XSAVEC) | __ins_bit(CPUID_D_1_EAX, X86_FEATURE_XGETBV1), [CPUID_7_0_ECX] = - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512VBMI) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VBMI2) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_GFNI) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_VAES) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_VPCLMULQDQ) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VNNI) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_BITALG) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_TME) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VPOPCNTDQ) | - __ins_bit(CPUID_7_0_ECX, X86_FEATURE_RDPID), + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512VBMI) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VBMI2) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_GFNI) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_VAES) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_VPCLMULQDQ) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VNNI) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_BITALG) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_TME) | + __ins_bit(CPUID_7_0_ECX, X86_FEATURE_AVX512_VPOPCNTDQ) | __ins_bit(CPUID_7_0_ECX, X86_FEATURE_RDPID), - [CPUID_8000_0008_EBX] = - __ins_bit(CPUID_8000_0008_EBX, X86_FEATURE_CLZERO), + [CPUID_8000_0008_EBX] = __ins_bit(CPUID_8000_0008_EBX, 
X86_FEATURE_CLZERO), - [CPUID_7_0_EDX] = - __ins_bit(CPUID_7_0_EDX, X86_FEATURE_AVX512_4VNNIW) | - __ins_bit(CPUID_7_0_EDX, X86_FEATURE_AVX512_4FMAPS), + [CPUID_7_0_EDX] = __ins_bit(CPUID_7_0_EDX, X86_FEATURE_AVX512_4VNNIW) | + __ins_bit(CPUID_7_0_EDX, X86_FEATURE_AVX512_4FMAPS), }; -// clang-format on #undef __ins_bit @@ -407,12 +368,6 @@ int cpu_validate_cpuinfo(void) if (!img) return -1; - if (empty_image(img)) { - pr_err("No cpuinfo image\n"); - close_image(img); - return -1; - } - if (pb_read_one(img, &img_cpu_info, PB_CPUINFO) < 0) goto err; diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c index 1f4d0736b..f177b9e7b 100644 --- a/criu/arch/x86/crtools.c +++ b/criu/arch/x86/crtools.c @@ -15,7 +15,7 @@ #define XSAVE_PB_NELEMS(__s, __obj, __member) (sizeof(__s) / sizeof(*(__obj)->__member)) -int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) +int save_task_regs(void *x, user_regs_struct_t *regs, user_fpregs_struct_t *fpregs) { CoreEntry *core = x; UserX86RegsEntry *gpregs = core->thread_info->gpregs; @@ -133,14 +133,6 @@ int save_task_regs(pid_t pid, void *x, user_regs_struct_t *regs, user_fpregs_str #undef assign_array #undef assign_xsave - if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { - UserX86CetEntry *cet = core->thread_info->fpregs->xsave->cet; - struct cet_user_state *regs = &fpregs->cet; - - cet->cet = regs->cet; - cet->ssp = regs->ssp; - } - return 0; } @@ -207,13 +199,6 @@ static int alloc_xsave_extends(UserX86XsaveEntry *xsave) goto err; } - if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) { - xsave->cet = xzalloc(sizeof(UserX86CetEntry)); - if (!xsave->cet) - goto err; - user_x86_cet_entry__init(xsave->cet); - } - return 0; err: return -1; @@ -235,8 +220,6 @@ int arch_alloc_thread_info(CoreEntry *core) with_xsave = compel_cpu_has_feature(X86_FEATURE_OSXSAVE); if (with_xsave) sz += sizeof(UserX86XsaveEntry); - if (compel_cpu_has_feature(X86_FEATURE_SHSTK)) - sz += sizeof(UserX86CetEntry); } m = 
xmalloc(sz); @@ -426,7 +409,7 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) { fpu_state_t *fpu_state = core_is_compat(core) ? &sigframe->compat.fpu_state : &sigframe->native.fpu_state; struct xsave_struct *x = core_is_compat(core) ? (void *)&fpu_state->fpu_state_ia32.xsave : - (void *)&fpu_state->fpu_state_64.xsave; + (void *)&fpu_state->fpu_state_64.xsave; /* * If no FPU information provided -- we're restoring @@ -450,7 +433,7 @@ int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core) #define assign_array(dst, src, e) memcpy(dst.e, (src)->e, sizeof(dst.e)) #define assign_xsave(feature, xsave, member, area) \ do { \ - if (compel_fpu_has_feature(feature) && (xsave->xstate_bv & (1UL << feature))) { \ + if (compel_fpu_has_feature(feature)) { \ uint32_t off = compel_fpu_feature_offset(feature); \ void *to = &area[off]; \ void *from = xsave->member; \ diff --git a/criu/arch/x86/include/asm/compat.h b/criu/arch/x86/include/asm/compat.h index 4ca704fd7..867357fa2 100644 --- a/criu/arch/x86/include/asm/compat.h +++ b/criu/arch/x86/include/asm/compat.h @@ -11,8 +11,6 @@ #include -#include "log.h" - static inline void *alloc_compat_syscall_stack(void) { void *mem = (void *)sys_mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE, diff --git a/criu/arch/x86/include/asm/dump.h b/criu/arch/x86/include/asm/dump.h index 925ea91ff..03715e003 100644 --- a/criu/arch/x86/include/asm/dump.h +++ b/criu/arch/x86/include/asm/dump.h @@ -1,7 +1,7 @@ #ifndef __CR_ASM_DUMP_H__ #define __CR_ASM_DUMP_H__ -extern int save_task_regs(pid_t pid, void *, user_regs_struct_t *, user_fpregs_struct_t *); +extern int save_task_regs(void *, user_regs_struct_t *, user_fpregs_struct_t *); extern int arch_alloc_thread_info(CoreEntry *core); extern void arch_free_thread_info(CoreEntry *core); extern int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info); @@ -25,7 +25,7 @@ static inline void core_put_tls(CoreEntry *core, tls_t tls) 
COPY_TLS(read_exec_only); COPY_TLS(limit_in_pages); COPY_TLS(seg_not_present); - COPY_TLS(usable); + COPY_TLS(useable); #undef COPY_TLS } } diff --git a/criu/arch/x86/include/asm/kerndat.h b/criu/arch/x86/include/asm/kerndat.h index 5c3717230..903bc80f7 100644 --- a/criu/arch/x86/include/asm/kerndat.h +++ b/criu/arch/x86/include/asm/kerndat.h @@ -4,6 +4,5 @@ extern int kdat_compatible_cr(void); extern int kdat_can_map_vdso(void); extern int kdat_x86_has_ptrace_fpu_xsave_bug(void); -extern int kdat_has_shstk(void); #endif /* __CR_ASM_KERNDAT_H__ */ diff --git a/criu/arch/x86/include/asm/restore.h b/criu/arch/x86/include/asm/restore.h index addf716a4..7cb725d98 100644 --- a/criu/arch/x86/include/asm/restore.h +++ b/criu/arch/x86/include/asm/restore.h @@ -49,7 +49,7 @@ static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) COPY_TLS(read_exec_only); COPY_TLS(limit_in_pages); COPY_TLS(seg_not_present); - COPY_TLS(usable); + COPY_TLS(useable); #undef COPY_TLS } } diff --git a/criu/arch/x86/include/asm/restorer.h b/criu/arch/x86/include/asm/restorer.h index 3a673958d..23438314f 100644 --- a/criu/arch/x86/include/asm/restorer.h +++ b/criu/arch/x86/include/asm/restorer.h @@ -8,13 +8,12 @@ #include #include #include "asm/compat.h" -#include "asm/shstk.h" #ifdef CONFIG_COMPAT extern void restore_tls(tls_t *ptls); extern int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act); extern int set_compat_robust_list(uint32_t head_ptr, uint32_t len); -#else /* CONFIG_COMPAT */ +#else /* CONFIG_COMPAT */ static inline void restore_tls(tls_t *ptls) { } diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h deleted file mode 100644 index d113fd8ab..000000000 --- a/criu/arch/x86/include/asm/shstk.h +++ /dev/null @@ -1,304 +0,0 @@ -#ifndef __CR_ASM_SHSTK_H__ -#define __CR_ASM_SHSTK_H__ - -/* - * Shadow stack constants from Linux - */ -/* arch/x86/include/uapi/asm/mman.h */ -#ifndef SHADOW_STACK_SET_TOKEN -#define 
SHADOW_STACK_SET_TOKEN 0x1 /* Set up a restore token in the shadow stack */ -#endif - -/* arch/x86/include/uapi/asm/prctl.h */ -#define ARCH_SHSTK_ENABLE 0x5001 -#define ARCH_SHSTK_DISABLE 0x5002 -#define ARCH_SHSTK_LOCK 0x5003 -#define ARCH_SHSTK_UNLOCK 0x5004 -#define ARCH_SHSTK_STATUS 0x5005 - -#define ARCH_SHSTK_SHSTK (1ULL << 0) -#define ARCH_SHSTK_WRSS (1ULL << 1) - -#define ARCH_HAS_SHSTK - -/* from arch/x86/kernel/shstk.c */ -#define SHSTK_DATA_BIT (1UL << 63) /* BIT(63) */ - -/* - * Shadow stack memory cannot be restored with memcpy/pread but only using - * a special instruction that can write to shadow stack. - * That instruction is only available when shadow stack is enabled, - * otherwise it causes #UD. - * - * Also, shadow stack VMAs cannot be mmap()ed or mrepmap()ed, they must be - * created using map_shadow_stack() system call. This pushes creation of - * shadow stack VMAs to the restorer blob after CRIU mappings are freed. - * - * And there is an additional jungling with shadow stacks to ensure that we - * don't unmap an active shadow stack - * - * The overall sequence of restoring shadow stack is - * - Enable shadow stack early after clone()ing the task - * - Unlock shadow stack features using ptrace - * - In the restorer blob: - * - switch to a temporary shadow stack to be able to unmap shadow stack - * with the CRIU mappings - * - after memory mappigns are restored, recreate shadow stack VMAs, - * populate them using wrss instruction and switch to the task shadow - * stack - * - lock shadow stack features - */ -struct rst_shstk_info { - unsigned long vma_start; /* start of shadow stack VMA */ - unsigned long vma_size; /* size of shadow stack VMA */ - unsigned long premmaped_addr; /* address of shadow stack copy in - the premmaped area */ - unsigned long tmp_shstk; /* address of temporary shadow stack */ - u64 ssp; /* shadow stack pointer */ - u64 cet; /* CET conrtol state */ -}; -#define rst_shstk_info rst_shstk_info - -struct task_restore_args; 
-struct pstree_item; - -int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, - struct task_restore_args *ta); -#define arch_shstk_prepare arch_shstk_prepare - -int arch_shstk_unlock(struct pstree_item *item, CoreEntry *core, pid_t pid); -#define arch_shstk_unlock arch_shstk_unlock - -int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, - int (*func)(void *arg), void *arg); -#define arch_shstk_trampoline arch_shstk_trampoline - -static always_inline long shstk_restorer_stack_size(void) -{ - return PAGE_SIZE; -} -#define shstk_restorer_stack_size shstk_restorer_stack_size -static always_inline void shstk_set_restorer_stack(struct rst_shstk_info *info, void *ptr) -{ - info->tmp_shstk = (unsigned long)ptr; -} -#define shstk_set_restorer_stack shstk_set_restorer_stack - -static always_inline long shstk_min_mmap_addr(struct rst_shstk_info *info, unsigned long __maybe_unused def) -{ - return !(info->cet & ARCH_SHSTK_SHSTK) ? def : (4UL << 30); -} -#define shstk_min_mmap_addr shstk_min_mmap_addr - -#ifdef CR_NOGLIBC - -#include -#include -#include "vma.h" - -#define SHSTK_BUSY_BIT (1UL << 0) /* BIT(0) */ - -static inline int shstk_map(unsigned long addr, unsigned long size) -{ - long shstk = sys_map_shadow_stack(addr, size, SHADOW_STACK_SET_TOKEN); - - if (shstk < 0) { - pr_err("Failed to map shadow stack at %lx: %ld\n", addr, shstk); - return -1; - } - - if (shstk != addr) { - pr_err("Shadow stack address mismatch: need %lx, got %lx\n", addr, shstk); - return -1; - } - - pr_info("Created shadow stack at %lx\n", shstk); - - return 0; -} - -/* clang-format off */ -static inline unsigned long get_ssp(void) -{ - unsigned long ssp; - - asm volatile("rdsspq %0" : "=r"(ssp) :: ); - - return ssp; -} - -static inline void wrssq(unsigned long addr, unsigned long val) -{ - asm volatile("wrssq %1, (%0)" :: "r"(addr), "r"(val) : "memory"); -} -/* clang-format off */ - -static always_inline void shstk_switch_ssp(unsigned long new_ssp) -{ - unsigned long 
old_ssp = get_ssp(); - - asm volatile("rstorssp (%0)\n" :: "r"(new_ssp)); - asm volatile("saveprevssp"); - - pr_debug("changed ssp from %lx to %lx\n", old_ssp, new_ssp); -} - -/* - * Disable writes to the shadow stack and lock it's disable/enable control - */ -static inline int shstk_finalize(void) -{ - int ret = 0; - - ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_WRSS); - if (ret) { - pr_err("Failed to disable writes to shadow stack\n"); - return ret; - } - - ret = sys_arch_prctl(ARCH_SHSTK_LOCK, ARCH_SHSTK_SHSTK); - if (ret) - pr_err("Failed to lock shadow stack controls\n"); - - return ret; -} - -/* - * Create shadow stack vma and restore its content from premmapped anonymous (non-shstk) vma - */ -static always_inline int shstk_vma_restore(VmaEntry *vma_entry) -{ - long shstk, i; - unsigned long *shstk_data = (void *)vma_premmaped_start(vma_entry); - unsigned long vma_size = vma_entry_len(vma_entry); - long ret; - - shstk = sys_map_shadow_stack(0, vma_size, SHADOW_STACK_SET_TOKEN); - if (shstk < 0) { - pr_err("Failed to map shadow stack: %ld\n", shstk); - return -1; - } - - /* restore shadow stack contents */ - for (i = 0; i < vma_size / 8; i++) - wrssq(shstk + i * 8, shstk_data[i]); - - ret = sys_munmap(shstk_data, vma_size); - if (ret < 0) { - pr_err("Failed to unmap premmaped shadow stack\n"); - return ret; - } - - /* - * From that point premapped vma is (shstk) and we need - * to mremap() it to the final location. Originally premapped - * (shstk_data) has been unmapped already. - */ - vma_premmaped_start(vma_entry) = shstk; - - return 0; -} -#define shstk_vma_restore shstk_vma_restore - -/* - * Restore contents of the shadow stack and set shadow stack pointer - */ -static always_inline int shstk_restore(struct rst_shstk_info *cet) -{ - unsigned long ssp, val; - - if (!(cet->cet & ARCH_SHSTK_SHSTK)) - return 0; - - /* - * Add tokens for sigreturn frame and for switch of the shadow stack. 
- * The sigreturn token will be checked by the kernel during - * processing of sigreturn - * The token for stack switch is required by rstorssp and - * saveprevssp semantics - */ - - /* token for sigreturn frame */ - ssp = cet->ssp - 8; - val = ALIGN_DOWN(cet->ssp, 8) | SHSTK_DATA_BIT; - wrssq(ssp, val); - - /* shadow stack switch token */ - val = ssp | SHSTK_BUSY_BIT; - ssp -= 8; - wrssq(ssp, val); - - /* reset shadow stack pointer to the proper location */ - shstk_switch_ssp(ssp); - - return shstk_finalize(); -} -#define arch_shstk_restore shstk_restore - -/* - * Disable shadow stack - */ -static inline int shstk_disable(void) -{ - int ret; - - ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_WRSS); - if (ret) { - pr_err("Failed to disable writes to shadow stack\n"); - return ret; - } - - ret = sys_arch_prctl(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK); - if (ret) { - pr_err("Failed to disable shadow stack\n"); - return ret; - } - - ret = sys_arch_prctl(ARCH_SHSTK_LOCK, ARCH_SHSTK_SHSTK); - if (ret) - pr_err("Failed to lock shadow stack controls\n"); - - return 0; -} - -/* - * Switch to temporary shadow stack - */ -static always_inline int shstk_switch_to_restorer(struct rst_shstk_info *cet) -{ - unsigned long ssp; - long ret; - - if (!(cet->cet & ARCH_SHSTK_SHSTK)) - return 0; - - ret = sys_munmap((void *)cet->tmp_shstk, PAGE_SIZE); - if (ret < 0) { - pr_err("Failed to unmap area for temporary shadow stack\n"); - return -1; - } - - ret = shstk_map(cet->tmp_shstk, PAGE_SIZE); - if (ret < 0) - return -1; - - /* - * Switch shadow stack from the default created by the kernel to a - * temporary shadow stack allocated in the premmaped area - */ - ssp = cet->tmp_shstk + PAGE_SIZE - 8; - shstk_switch_ssp(ssp); - - ret = sys_arch_prctl(ARCH_SHSTK_ENABLE, ARCH_SHSTK_WRSS); - if (ret) { - pr_err("Failed to enable writes to shadow stack\n"); - return ret; - } - - return 0; -} -#define arch_shstk_switch_to_restorer shstk_switch_to_restorer - -#endif /* CR_NOGLIBC */ - -#endif 
/* __CR_ASM_SHSTK_H__ */ diff --git a/criu/arch/x86/include/asm/thread_pointer.h b/criu/arch/x86/include/asm/thread_pointer.h deleted file mode 100644 index 08603aed4..000000000 --- a/criu/arch/x86/include/asm/thread_pointer.h +++ /dev/null @@ -1,37 +0,0 @@ -/* __thread_pointer definition. x86 version. - Copyright (C) 2021 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library. If not, see - . 
*/ - -#ifndef _SYS_THREAD_POINTER_H -#define _SYS_THREAD_POINTER_H - -static inline void *__criu_thread_pointer(void) -{ -#if __GNUC_PREREQ(11, 1) - return __builtin_thread_pointer(); -#else - void *__result; -#ifdef __x86_64__ - __asm__("mov %%fs:0, %0" : "=r"(__result)); -#else - __asm__("mov %%gs:0, %0" : "=r"(__result)); -#endif - return __result; -#endif /* !GCC 11 */ -} - -#endif /* _SYS_THREAD_POINTER_H */ \ No newline at end of file diff --git a/criu/arch/x86/include/asm/types.h b/criu/arch/x86/include/asm/types.h index 8919d0ae6..a0a8ed987 100644 --- a/criu/arch/x86/include/asm/types.h +++ b/criu/arch/x86/include/asm/types.h @@ -28,8 +28,6 @@ static inline int core_is_compat(CoreEntry *c) #define CORE_THREAD_ARCH_INFO(core) core->thread_info -#define TI_IP(core) ((core)->thread_info->gpregs->ip) - typedef UserX86RegsEntry UserRegsEntry; static inline u64 encode_pointer(void *p) diff --git a/criu/arch/x86/include/asm/vdso.h b/criu/arch/x86/include/asm/vdso.h index ca46374a5..3b3f292bd 100644 --- a/criu/arch/x86/include/asm/vdso.h +++ b/criu/arch/x86/include/asm/vdso.h @@ -12,7 +12,7 @@ * This is a minimal amount of symbols * we should support at the moment. 
*/ -#define VDSO_SYMBOL_MAX 7 +#define VDSO_SYMBOL_MAX 6 #define VDSO_SYMBOL_GTOD 2 /* @@ -42,12 +42,11 @@ const char *aarch_vdso_symbol3 = "__vdso_gettimeofday"; \ const char *aarch_vdso_symbol4 = "__vdso_time"; \ const char *aarch_vdso_symbol5 = "__kernel_sigreturn"; \ - const char *aarch_vdso_symbol6 = "__kernel_rt_sigreturn"; \ - const char *aarch_vdso_symbol7 = "__vdso_clock_gettime64"; \ + const char *aarch_vdso_symbol6 = "__kernel_rt_sigreturn"; #define ARCH_VDSO_SYMBOLS \ aarch_vdso_symbol1, aarch_vdso_symbol2, aarch_vdso_symbol3, aarch_vdso_symbol4, aarch_vdso_symbol5, \ - aarch_vdso_symbol6, aarch_vdso_symbol7 + aarch_vdso_symbol6 /* "__kernel_vsyscall", */ diff --git a/criu/arch/x86/kerndat.c b/criu/arch/x86/kerndat.c index 3a58bbea7..a98797d39 100644 --- a/criu/arch/x86/kerndat.c +++ b/criu/arch/x86/kerndat.c @@ -17,7 +17,6 @@ #include "asm/compat.h" #include "asm/dump.h" -#include "asm/shstk.h" int kdat_can_map_vdso(void) { @@ -252,29 +251,3 @@ out_kill: return ret; } - -/* - * Unlike most kerndat knobs, this does not check for availability of the - * shadow stack in the kernel, but rather checks if criu runs with shadow - * stack enabled. - * - * This depends on hardware availability, kernel and glibc support, compiler - * options and glibc tunables. 
- */ -int kdat_has_shstk(void) -{ - unsigned long features; - - if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) - return 0; - - if (syscall(__NR_arch_prctl, ARCH_SHSTK_STATUS, &features)) { - /* kernels that don't support shadow stack return -EINVAL */ - if (errno == EINVAL) - return 0; - pr_perror("Cannot get shadow stack status"); - return 1; - } - - return !!(features & ARCH_SHSTK_SHSTK); -} diff --git a/criu/arch/x86/shstk.c b/criu/arch/x86/shstk.c deleted file mode 100644 index 0810efac5..000000000 --- a/criu/arch/x86/shstk.c +++ /dev/null @@ -1,222 +0,0 @@ -#include -#include - -#include - -#include - -#include "pstree.h" -#include "restorer.h" -#include "rst-malloc.h" -#include "vma.h" - -static bool task_needs_shstk(struct pstree_item *item, CoreEntry *core) -{ - UserX86FpregsEntry *fpregs; - - if (!task_alive(item)) - return false; - - fpregs = core->thread_info->fpregs; - if (fpregs->xsave && fpregs->xsave->cet) { - if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) { - pr_warn_once("Restoring task with shadow stack on non-CET machine\n"); - return false; - } - - if (fpregs->xsave->cet->cet & ARCH_SHSTK_SHSTK) - return true; - } - - return false; -} - -static int shstk_prepare_task(struct vm_area_list *vmas, - struct rst_shstk_info *shstk) -{ - struct vma_area *vma; - - list_for_each_entry(vma, &vmas->h, list) { - if (vma_area_is(vma, VMA_AREA_SHSTK) && - in_vma_area(vma, shstk->ssp)) { - unsigned long premmaped_addr = vma->premmaped_addr; - unsigned long size = vma_area_len(vma); - - shstk->vma_start = vma->e->start; - shstk->vma_size = size; - shstk->premmaped_addr = premmaped_addr; - - break; - } - } - - return 0; -} - -int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, - struct task_restore_args *ta) -{ - struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]); - UserX86FpregsEntry *fpregs = core->thread_info->fpregs; - struct vm_area_list *vmas = &rsti(item)->vmas; - struct rst_shstk_info *shstk = &ta->shstk; - int 
i; - - if (!task_needs_shstk(item, core)) - return 0; - - shstk->cet = fpregs->xsave->cet->cet; - shstk->ssp = fpregs->xsave->cet->ssp; - - if (shstk_prepare_task(vmas, shstk)) { - pr_err("Failed to prepare shadow stack memory\n"); - return -1; - } - - for (i = 0; i < item->nr_threads; i++) { - struct thread_restore_args *thread_args = &args_array[i]; - - core = item->core[i]; - fpregs = core->thread_info->fpregs; - shstk = &thread_args->shstk; - - shstk->cet = fpregs->xsave->cet->cet; - shstk->ssp = fpregs->xsave->cet->ssp; - if (shstk_prepare_task(vmas, shstk)) { - pr_err("Failed to prepare shadow stack memory\n"); - return -1; - } - } - - return 0; -} - -int arch_shstk_unlock(struct pstree_item *item, CoreEntry *core, pid_t pid) -{ - unsigned long features; - int status; - int ret = -1; - - /* - * CRIU runs with no shadow stack and the task does not need one, - * nothing to do. - */ - if (!kdat.has_shstk && !task_needs_shstk(item, core)) - return 0; - - futex_wait_until(&rsti(item)->shstk_enable, 1); - - if (ptrace(PTRACE_SEIZE, pid, 0, 0)) { - pr_perror("Cannot attach to %d", pid); - goto futex_wake; - } - - if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { - pr_perror("Cannot interrupt the %d task", pid); - goto detach; - } - - if (wait4(pid, &status, __WALL, NULL) != pid) { - pr_perror("waitpid(%d) failed", pid); - goto detach; - } - - features = ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS; - if (ptrace(PTRACE_ARCH_PRCTL, pid, features, ARCH_SHSTK_UNLOCK)) { - pr_perror("Cannot unlock CET for %d task", pid); - goto detach; - } - -detach: - if (ptrace(PTRACE_DETACH, pid, NULL, 0)) { - pr_perror("Unable to detach %d", pid); - goto futex_wake; - } - - ret = 0; - -futex_wake: - futex_set_and_wake(&rsti(item)->shstk_unlock, 1); - - return ret; -} - -static void shstk_sync_unlock(struct pstree_item *item) -{ - /* notify parent that shadow stack is enabled ... */ - futex_set_and_wake(&rsti(item)->shstk_enable, 1); - - /* ... 
and wait until it unlocks its features with ptrace */ - futex_wait_until(&rsti(item)->shstk_unlock, 1); -} - -static void __arch_shstk_enable(struct pstree_item *item, - int (*func)(void *arg), void *arg) -{ - int ret; - - shstk_sync_unlock(item); - - /* return here would cause #CP, use exit() instead */ - ret = func(arg); - exit(ret); -} - -static int shstk_disable(struct pstree_item *item) -{ - shstk_sync_unlock(item); - - /* disable shadow stack, implicitly clears ARCH_SHSTK_WRSS */ - if (syscall(__NR_arch_prctl, ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK)) { - pr_perror("Failed to disable shadow stack"); - return -1; - } - - if (syscall(__NR_arch_prctl, ARCH_SHSTK_LOCK, - ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS)) { - pr_perror("Failed to lock shadow stack controls"); - return -1; - } - - return 0; -} - -int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, - int (*func)(void *arg), void *arg) -{ - unsigned long features = ARCH_SHSTK_SHSTK; - int code = ARCH_SHSTK_ENABLE; - - /* - * If task does not need shadow stack but CRIU runs with shadow - * stack enabled, we should disable it before continuing with - * restore - */ - if (!task_needs_shstk(item, core)) { - if (kdat.has_shstk && shstk_disable(item)) - return -1; - return func(arg); - } - - /* - * Calling sys_arch_prctl() means there will be use of retq - * instruction after shadow stack is enabled and this will cause - * Control Protectiond fault. Open code sys_arch_prctl() in - * assembly. - * - * code and addr should be in %rdi and %rsi and will be passed to - * the system call as is. 
- */ - asm volatile("movq $"__stringify(__NR_arch_prctl)", %%rax \n" - "syscall \n" - "cmpq $0, %%rax \n" - "je 1f \n" - "retq \n" - "1: \n" - :: "D"(code), "S"(features)); - - __arch_shstk_enable(item, func, arg); - - /* never reached */ - return -1; -} diff --git a/criu/arch/x86/sigaction_compat.c b/criu/arch/x86/sigaction_compat.c index 506a8d1bb..f02b2cc0e 100644 --- a/criu/arch/x86/sigaction_compat.c +++ b/criu/arch/x86/sigaction_compat.c @@ -44,8 +44,8 @@ int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act) memcpy(stack32, act, sizeof(rt_sigaction_t_compat)); arg.nr = __NR32_rt_sigaction; arg.arg0 = sig; - arg.arg1 = (uint32_t)act_stack; /* act */ - arg.arg2 = 0; /* oldact */ + arg.arg1 = (uint32_t)act_stack; /* act */ + arg.arg2 = 0; /* oldact */ arg.arg3 = (uint32_t)sizeof(act->rt_sa_mask); /* sigsetsize */ return do_full_int80(&arg); diff --git a/criu/arch/x86/sigframe.c b/criu/arch/x86/sigframe.c index 46612e70d..4fa7eb3dc 100644 --- a/criu/arch/x86/sigframe.c +++ b/criu/arch/x86/sigframe.c @@ -23,7 +23,7 @@ int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, struct rt_sigframe *r } sigframe->native.uc.uc_mcontext.fpstate = (uint64_t)addr; - } else { + } else if (!sigframe->is_native) { unsigned long addr = (unsigned long)(void *)&fpu_state->fpu_state_ia32.xsave; sigframe->compat.uc.uc_mcontext.fpstate = (uint32_t)(unsigned long)(void *)&fpu_state->fpu_state_ia32; if ((addr % 64ul)) { diff --git a/criu/autofs.c b/criu/autofs.c index a1775cbc9..71edc7bce 100644 --- a/criu/autofs.c +++ b/criu/autofs.c @@ -388,7 +388,7 @@ free_str: */ static int access_autofs_mount(struct mount_info *pm) { - const char *mnt_path = service_mountpoint(pm) + 1; + const char *mnt_path = pm->mountpoint + 1; dev_t dev_id = pm->s_dev; int new_pid_ns = -1, old_pid_ns = -1; int old_mnt_ns, old_cwd_fd; @@ -431,7 +431,8 @@ static int access_autofs_mount(struct mount_info *pm) pr_err("failed to fork\n"); goto close_autofs_mnt; case 0: - /* We don't 
care about results, all we need is to "touch" */ + /* We don't care about results. + * All we need is to "touch" */ /* coverity[check_return] */ openat(autofs_mnt, mnt_path, O_RDONLY | O_NONBLOCK | O_DIRECTORY); _exit(0); @@ -497,7 +498,7 @@ static int autofs_create_entry(struct mount_info *pm, AutofsEntry *entry) * options, then we can read them again and dump it. */ if (access_autofs_mount(pm)) { - pr_err("failed to access autofs %s\n", service_mountpoint(pm) + 1); + pr_err("failed to access autofs %s\n", pm->mountpoint + 1); return -1; } if (parse_options(pm->options, entry, &pipe_ino)) @@ -658,7 +659,7 @@ static int autofs_mnt_make_catatonic(const char *mnt_path, int mnt_fd) static int autofs_mnt_set_timeout(time_t timeout, const char *mnt_path, int mnt_fd) { - pr_info("%s: set timeout %" PRId64 " for %s\n", __func__, (int64_t)timeout, mnt_path); + pr_info("%s: set timeout %ld for %s\n", __func__, timeout, mnt_path); return autofs_ioctl(mnt_path, mnt_fd, AUTOFS_IOC_SETTIMEOUT, &timeout); } @@ -724,19 +725,14 @@ static int autofs_create_dentries(const struct mount_info *mi, char *mnt_path) struct mount_info *c; list_for_each_entry(c, &mi->children, siblings) { - char *path, *rel_path; + char *path, *basename; - rel_path = get_relative_path(c->ns_mountpoint, mi->ns_mountpoint); - if (!rel_path) { - pr_err("Can't get path %s relative to %s\n", c->ns_mountpoint, mi->ns_mountpoint); + basename = strrchr(c->mountpoint, '/'); + if (!basename) { + pr_info("%s: mount path \"%s\" doesn't have '/'\n", __func__, c->mountpoint); return -1; } - - /* Skip children-overmount */ - if (*rel_path == '\0') - continue; - - path = xsprintf("%s/%s", mnt_path, rel_path); + path = xsprintf("%s%s", mnt_path, basename); if (!path) return -1; if (mkdir(path, 0555) < 0) { @@ -754,7 +750,7 @@ static int autofs_populate_mount(const struct mount_info *mi, const AutofsEntry if (entry->mode != AUTOFS_MODE_INDIRECT) return 0; - return autofs_create_dentries(mi, service_mountpoint(mi)); + return 
autofs_create_dentries(mi, mi->mountpoint); } static int autofs_post_mount(const char *mnt_path, dev_t mnt_dev, time_t timeout) @@ -770,7 +766,7 @@ static int autofs_post_mount(const char *mnt_path, dev_t mnt_dev, time_t timeout } if (autofs_mnt_set_timeout(timeout, mnt_path, mnt_fd)) { - pr_err("Failed to set timeout %" PRId64 " for %s\n", (int64_t)timeout, mnt_path); + pr_err("Failed to set timeout %ld for %s\n", timeout, mnt_path); return -1; } @@ -1030,10 +1026,10 @@ int autofs_mount(struct mount_info *mi, const char *source, const char *filesyst goto close_pipe; } - pr_info("autofs: mounting to %s with options: \"%s\"\n", service_mountpoint(mi), opts); + pr_info("autofs: mounting to %s with options: \"%s\"\n", mi->mountpoint, opts); - if (mount(source, service_mountpoint(mi), filesystemtype, mountflags, opts) < 0) { - pr_perror("Failed to mount autofs to %s", service_mountpoint(mi)); + if (mount(source, mi->mountpoint, filesystemtype, mountflags, opts) < 0) { + pr_perror("Failed to mount autofs to %s", mi->mountpoint); goto free_opts; } @@ -1048,8 +1044,8 @@ int autofs_mount(struct mount_info *mi, const char *source, const char *filesyst * data is not ready yet. So, let's put in on mi->private and copy to * shared data in autofs_add_mount_info(). 
*/ - if (stat(service_mountpoint(mi), &buf) < 0) { - pr_perror("Failed to stat %s", service_mountpoint(mi)); + if (stat(mi->mountpoint, &buf) < 0) { + pr_perror("Failed to stat %s", mi->mountpoint); goto free_info; } info->mnt_dev = buf.st_dev; @@ -1060,7 +1056,7 @@ int autofs_mount(struct mount_info *mi, const char *source, const char *filesyst goto free_info; /* In case of catatonic mounts all we need as the function call below */ - ret = autofs_post_mount(service_mountpoint(mi), buf.st_dev, entry->timeout); + ret = autofs_post_mount(mi->mountpoint, buf.st_dev, entry->timeout); if (ret < 0) goto free_info; @@ -1083,7 +1079,7 @@ close_pipe: free_info: free(info); umount: - if (umount(service_mountpoint(mi)) < 0) - pr_perror("Failed to umount %s", service_mountpoint(mi)); + if (umount(mi->mountpoint) < 0) + pr_perror("Failed to umount %s", mi->mountpoint); goto close_pipe; } diff --git a/criu/bpfmap.c b/criu/bpfmap.c index 25098368d..55b381c18 100644 --- a/criu/bpfmap.c +++ b/criu/bpfmap.c @@ -1,4 +1,5 @@ #include +#include #include #include "common/compiler.h" @@ -11,11 +12,6 @@ #include "protobuf.h" -#ifndef LIBBPF_OPTS -#define LIBBPF_OPTS DECLARE_LIBBPF_OPTS -#define LEGACY_LIBBPF /* Using libbpf < 0.7 */ -#endif - int is_bpfmap_link(char *link) { return is_anon_link_type(link, "bpf-map"); @@ -70,7 +66,7 @@ int restore_bpfmap_data(int map_fd, uint32_t map_id, struct bpfmap_data_rst **bp void *keys = NULL; void *values = NULL; unsigned int count; - LIBBPF_OPTS(bpf_map_batch_opts, opts); + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, .elem_flags = 0, .flags = 0, ); for (map_data = bpf_hash_table[map_id & BPFMAP_DATA_HASH_MASK]; map_data != NULL; map_data = map_data->next) { if (map_data->bde->map_id == map_id) @@ -153,7 +149,7 @@ int dump_one_bpfmap_data(BpfmapFileEntry *bpf, int lfd, const struct fd_parms *p void *keys = NULL, *values = NULL; void *in_batch = NULL, *out_batch = NULL; BpfmapDataEntry bde = BPFMAP_DATA_ENTRY__INIT; - 
LIBBPF_OPTS(bpf_map_batch_opts, opts); + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, .elem_flags = 0, .flags = 0, ); int ret; key_size = bpf->key_size; @@ -220,14 +216,9 @@ static int dump_one_bpfmap(int lfd, u32 id, const struct fd_parms *p) { BpfmapFileEntry bpf = BPFMAP_FILE_ENTRY__INIT; FileEntry fe = FILE_ENTRY__INIT; - int ret; - /* If we are using a bigger struct than the kernel knows of, - * ensure all the unknown bits are 0 - i.e. new user-space - * does not rely on any unknown kernel feature extensions. - * https://github.com/torvalds/linux/blob/a1994480/kernel/bpf/syscall.c#L70 - */ - struct bpf_map_info map_info = {}; + struct bpf_map_info map_info; uint32_t info_len = sizeof(struct bpf_map_info); + int ret; if (parse_fdinfo(lfd, FD_TYPES__BPFMAP, &bpf)) return -1; @@ -275,19 +266,12 @@ static int bpfmap_open(struct file_desc *d, int *new_fd) { struct bpfmap_file_info *info; BpfmapFileEntry *bpfe; - int bpfmap_fd; -#ifdef LEGACY_LIBBPF struct bpf_create_map_attr xattr; -#else - LIBBPF_OPTS(bpf_map_create_opts, bpfmap_opts); -#endif + int bpfmap_fd; info = container_of(d, struct bpfmap_file_info, d); bpfe = info->bpfe; - pr_info_bpfmap("Creating and opening ", bpfe); - -#ifdef LEGACY_LIBBPF xattr.name = xstrdup(bpfe->map_name); xattr.map_type = bpfe->map_type; xattr.map_flags = bpfe->map_flags; @@ -301,25 +285,13 @@ static int bpfmap_open(struct file_desc *d, int *new_fd) xattr.map_ifindex = bpfe->ifindex; xattr.inner_map_fd = 0; + pr_info_bpfmap("Creating and opening ", bpfe); bpfmap_fd = bpf_create_map_xattr(&xattr); -#else - bpfmap_opts.map_flags = bpfe->map_flags; - bpfmap_opts.map_ifindex = bpfe->ifindex; - if (bpfe->has_map_extra) - bpfmap_opts.map_extra = bpfe->map_extra; - - bpfmap_fd = bpf_map_create(bpfe->map_type, bpfe->map_name, bpfe->key_size, bpfe->value_size, bpfe->max_entries, - &bpfmap_opts); -#endif - if (bpfmap_fd < 0) { pr_perror("Can't create bpfmap %#08x", bpfe->id); return -1; } - if (bpfe->has_map_extra && bpfe->map_extra) - 
pr_warn("bpfmap map_extra has non-zero value. This will not be restored.\n"); - if (restore_bpfmap_data(bpfmap_fd, bpfe->map_id, bpfmap_data_hash_table)) return -1; diff --git a/criu/cgroup-props.c b/criu/cgroup-props.c index 1b85c5b5a..5bed7dd9d 100644 --- a/criu/cgroup-props.c +++ b/criu/cgroup-props.c @@ -35,29 +35,12 @@ static const char *____criu_global_props____[] = { "tasks", }; -/* cgroup2 global properties */ -// clang-format off -static const char *____criu_global_props_v2____[] = { - "cgroup.subtree_control", - "cgroup.max.descendants", - "cgroup.max.depth", - "cgroup.freeze", - "cgroup.type", -}; -// clang-format on - cgp_t cgp_global = { .name = "____criu_global_props____", .nr_props = ARRAY_SIZE(____criu_global_props____), .props = ____criu_global_props____, }; -cgp_t cgp_global_v2 = { - .name = "____criu_global_props_v2____", - .nr_props = ARRAY_SIZE(____criu_global_props_v2____), - .props = ____criu_global_props_v2____, -}; - typedef struct { struct list_head list; cgp_t cgp; diff --git a/criu/cgroup.c b/criu/cgroup.c index 9246be639..ccac37fcc 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -8,7 +8,6 @@ #include #include #include -#include #include "common/list.h" #include "xmalloc.h" @@ -55,7 +54,6 @@ static u32 cg_set_ids = 1; static LIST_HEAD(cgroups); static unsigned int n_cgroups; -static pid_t cgroupd_pid; static CgSetEntry *find_rst_set_by_id(u32 id) { @@ -83,7 +81,7 @@ static bool cg_set_compare(struct cg_set *set, struct list_head *ctls, int what) if (l2->next != ctls) c2 = list_first_entry(l2, struct cg_ctl, l); - if (!c1 || !c2) /* Nowhere to move next */ + if (!c1 || !c2) /* Nowhere to move next */ return !c1 && !c2; /* Both lists scanned -- match */ if (strcmp(c1->name, c2->name)) @@ -175,7 +173,6 @@ struct cg_controller *new_controller(const char *name) nc->n_controllers = 1; nc->n_heads = 0; - nc->is_threaded = false; INIT_LIST_HEAD(&nc->heads); return nc; @@ -248,7 +245,7 @@ static int find_dir(const char *path, struct list_head 
*dirs, struct cgroup_dir return EXACT_MATCH; } - if (issubpath(path, d->path)) { + if (strstartswith(path, d->path)) { int ret = find_dir(path, &d->children, rdir); if (ret == NO_MATCH) { *rdir = d; @@ -373,8 +370,7 @@ static void free_all_cgroup_props(struct cgroup_dir *ncd) ncd->n_properties = 0; } -static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const cgp_t *cgp, - struct cg_controller *controller) +static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const cgp_t *cgp) { int j; char buf[PATH_MAX]; @@ -425,14 +421,6 @@ static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd, const prop->value = new; } - /* - * Set the is_threaded flag if cgroup.type's value is threaded - * or it is a cgroup v1 (it has a 'tasks' property). - * Ignore all other values. - */ - if ((!strcmp("cgroup.type", prop->name) && !strcmp("threaded", prop->value)) || !strcmp("tasks", prop->name)) - controller->is_threaded = true; - pr_info("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name); list_add_tail(&prop->list, &ncd->properties); ncd->n_properties++; @@ -448,20 +436,12 @@ static int add_cgroup_properties(const char *fpath, struct cgroup_dir *ncd, stru for (i = 0; i < controller->n_controllers; ++i) { const cgp_t *cgp = cgp_get_props(controller->controllers[i]); - if (dump_cg_props_array(fpath, ncd, cgp, controller) < 0) { + if (dump_cg_props_array(fpath, ncd, cgp) < 0) { pr_err("dumping known properties failed\n"); return -1; } - } - /* cgroup v2 */ - if (controller->controllers[0][0] == 0) { - if (dump_cg_props_array(fpath, ncd, &cgp_global_v2, controller) < 0) { - pr_err("dumping global properties v2 failed\n"); - return -1; - } - } else { - if (dump_cg_props_array(fpath, ncd, &cgp_global, controller) < 0) { + if (dump_cg_props_array(fpath, ncd, &cgp_global) < 0) { pr_err("dumping global properties failed\n"); return -1; } @@ -580,15 +560,14 @@ static int __new_open_cgroupfs(struct cg_ctl *cc) int fsfd, fd; 
char *name; - fsfd = cr_fsopen(fstype, 0); + fsfd = sys_fsopen(fstype, 0); if (fsfd < 0) { pr_perror("Unable to open the cgroup file system"); return -1; } if (strstartswith(cc->name, namestr)) { - if (cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "name", cc->name + strlen(namestr), 0)) { - fsfd_dump_messages(fsfd); + if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "name", cc->name + strlen(namestr), 0)) { pr_perror("Unable to configure the cgroup (%s) file system", cc->name); goto err; } @@ -596,8 +575,7 @@ static int __new_open_cgroupfs(struct cg_ctl *cc) char *saveptr = NULL, *buf = strdupa(cc->name); name = strtok_r(buf, ",", &saveptr); while (name) { - if (cr_fsconfig(fsfd, FSCONFIG_SET_FLAG, name, NULL, 0)) { - fsfd_dump_messages(fsfd); + if (sys_fsconfig(fsfd, FSCONFIG_SET_FLAG, name, NULL, 0)) { pr_perror("Unable to configure the cgroup (%s) file system", name); goto err; } @@ -605,17 +583,14 @@ static int __new_open_cgroupfs(struct cg_ctl *cc) } } - if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) { - fsfd_dump_messages(fsfd); + if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) { pr_perror("Unable to create the cgroup (%s) file system", cc->name); goto err; } - fd = cr_fsmount(fsfd, 0, 0); - if (fd < 0) { - fsfd_dump_messages(fsfd); + fd = sys_fsmount(fsfd, 0, 0); + if (fd < 0) pr_perror("Unable to mount the cgroup (%s) file system", cc->name); - } close(fsfd); return fd; @@ -644,8 +619,8 @@ static int open_cgroupfs(struct cg_ctl *cc) return -1; } - if (mount("none", prefix, fstype, 0, mopts[0] ? 
mopts : NULL) < 0) { - pr_perror("Unable to mount %s %s", fstype, mopts); + if (mount("none", prefix, fstype, 0, mopts) < 0) { + pr_perror("Unable to mount %s", mopts); rmdir(prefix); return -1; } @@ -719,8 +694,6 @@ static int collect_cgroups(struct list_head *ctls) } } else { fd = open_cgroupfs(cc); - if (fd < 0) - return -1; } path_pref_len = snprintf(path, PATH_MAX, "/proc/self/fd/%d", fd); @@ -753,28 +726,20 @@ static int collect_cgroups(struct list_head *ctls) return 0; } -int dump_thread_cgroup(const struct pstree_item *item, u32 *cg_id, struct parasite_dump_cgroup_args *args, int id) +int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_cgroup_args *args) { - int pid, tid; + int pid; LIST_HEAD(ctls); unsigned int n_ctls = 0; struct cg_set *cs; - if (opts.unprivileged) - return 0; - if (item) pid = item->pid->real; else pid = getpid(); - if (id < 0) - tid = pid; - else - tid = item->threads[id].real; - - pr_info("Dumping cgroups for thread %d\n", tid); - if (parse_thread_cgroup(pid, tid, args, &ctls, &n_ctls)) + pr_info("Dumping cgroups for %d\n", pid); + if (parse_task_cgroup(pid, args, &ctls, &n_ctls)) return -1; cs = get_cg_set(&ctls, n_ctls, item); @@ -787,10 +752,9 @@ int dump_thread_cgroup(const struct pstree_item *item, u32 *cg_id, struct parasi pr_info("Set %d is criu one\n", cs->id); } else { if (item == root_item) { - if (!root_cgset) { - root_cgset = cs; - pr_info("Set %d is root one\n", cs->id); - } + BUG_ON(root_cgset); + root_cgset = cs; + pr_info("Set %d is root one\n", cs->id); } else { struct cg_ctl *root, *stray; @@ -896,7 +860,7 @@ static int dump_cg_dirs(struct list_head *dirs, size_t n_dirs, CgroupDirEntry ** cde->dir_perms->gid = cur->gid; cde->dir_name = cur->path + poff; - if (poff != 1) /* parent isn't "/" */ + if (poff != 1) /* parent isn't "/" */ cde->dir_name++; /* leading / */ cde->n_children = cur->n_children; if (cur->n_children > 0) @@ -937,8 +901,6 @@ static int dump_controllers(CgroupEntry *cg) 
list_for_each_entry(cur, &cgroups, l) { cg_controller_entry__init(ce); - ce->has_is_threaded = true; - ce->is_threaded = cur->is_threaded; ce->cnames = cur->controllers; ce->n_cnames = cur->n_controllers; ce->n_dirs = cur->n_heads; @@ -1026,9 +988,6 @@ int dump_cgroups(void) CgroupEntry cg = CGROUP_ENTRY__INIT; int ret = -1; - if (opts.unprivileged) - return 0; - BUG_ON(!criu_cgset || !root_cgset); /* @@ -1076,7 +1035,7 @@ static int ctrl_dir_and_opt(CgControllerEntry *ctl, char *dir, int ds, char *opt } if (n[0] == 0) - doff += snprintf(dir + doff, ds - doff, "unified,"); + doff += snprintf(dir + doff, ds - doff, "unified"); else doff += snprintf(dir + doff, ds - doff, "%s,", n); if (opt) @@ -1095,15 +1054,8 @@ static int ctrl_dir_and_opt(CgControllerEntry *ctl, char *dir, int ds, char *opt * it. We restore these properties as soon as the cgroup is created. */ static const char *special_props[] = { - "cpuset.cpus", - "cpuset.mems", - "devices.list", - "memory.kmem.limit_in_bytes", - "memory.swappiness", - "memory.oom_control", - "memory.use_hierarchy", - "cgroup.type", - NULL, + "cpuset.cpus", "cpuset.mems", "devices.list", "memory.kmem.limit_in_bytes", + "memory.swappiness", "memory.oom_control", "memory.use_hierarchy", NULL, }; bool is_special_property(const char *prop) @@ -1209,12 +1161,17 @@ static int prepare_cgns(CgSetEntry *se) return 0; } -static int move_in_cgroup(CgSetEntry *se) +static int move_in_cgroup(CgSetEntry *se, bool setup_cgns) { int i; pr_info("Move into %d\n", se->id); + if (setup_cgns && prepare_cgns(se) < 0) { + pr_err("failed preparing cgns\n"); + return -1; + } + for (i = 0; i < se->n_ctls; i++) { char aux[PATH_MAX]; int fd = -1, err, j, aux_off; @@ -1254,52 +1211,12 @@ static int move_in_cgroup(CgSetEntry *se) return 0; } -int prepare_cgroup_namespace(struct pstree_item *root_task) -{ - CgSetEntry *se; - - if (opts.manage_cgroups == CG_MODE_IGNORE) - return 0; - - if (root_task->parent) { - pr_err("Expecting root_task to restore cgroup 
namespace\n"); - return -1; - } - - /* - * If on dump all dumped tasks are in same cgset with criu we don't - * dump cgsets and thus cgroup namespaces and rely that on restore - * criu caller would prepare proper cgset/cgns for us. Also in case - * of --unprivileged we don't even have the root cgset here. - */ - if (!rsti(root_task)->cg_set || rsti(root_task)->cg_set == root_cg_set) { - pr_info("Cgroup namespace inherited from parent\n"); - return 0; - } - - se = find_rst_set_by_id(rsti(root_task)->cg_set); - if (!se) { - pr_err("No set %d found\n", rsti(root_task)->cg_set); - return -1; - } - - if (prepare_cgns(se) < 0) { - pr_err("failed preparing cgns\n"); - return -1; - } - - return 0; -} - -int restore_task_cgroup(struct pstree_item *me) +int prepare_task_cgroup(struct pstree_item *me) { struct pstree_item *parent = me->parent; CgSetEntry *se; u32 current_cgset; - if (opts.manage_cgroups == CG_MODE_IGNORE) - return 0; - if (!rsti(me)->cg_set) return 0; @@ -1323,7 +1240,13 @@ int restore_task_cgroup(struct pstree_item *me) return -1; } - return move_in_cgroup(se); + /* Since don't support nesting of cgroup namespaces, let's only set up + * the cgns (if it exists) in the init task. In the future, we should + * just check that the cgns prefix string matches for all the entries + * in the cgset, and only unshare if that's true. 
+ */ + + return move_in_cgroup(se, !me->parent); } void fini_cgroup(void) @@ -1342,78 +1265,39 @@ void fini_cgroup(void) cg_yard = NULL; } -static int add_subtree_control_prop_prefix(char *input, char *output, char prefix) +static int restore_perms(int fd, const char *path, CgroupPerms *perms) { - char *current, *next; - size_t len, off = 0; + struct stat sb; - current = input; - do { - next = strchrnul(current, ' '); - len = next - current; + if (perms) { + if (fstat(fd, &sb) < 0) { + pr_perror("stat of property %s failed", path); + return -1; + } - output[off] = prefix; - off++; - memcpy(output + off, current, len); - off += len; - output[off] = ' '; - off++; + /* only chmod/chown if the perms are actually different: we aren't + * allowed to chmod some cgroup props (e.g. the read only ones), so we + * don't want to try if the perms already match. + */ + if (sb.st_mode != (mode_t)perms->mode && fchmod(fd, perms->mode) < 0) { + pr_perror("chmod of %s failed", path); + return -1; + } - current = next + 1; - } while (*next != '\0'); - - return off; -} - -static int restore_cgroup_subtree_control(const CgroupPropEntry *cg_prop_entry_p, int fd) -{ - char buf[1024]; - char line[1024]; - int ret, off = 0; - - ret = read(fd, buf, sizeof(buf) - 1); - if (ret < 0) { - pr_perror("read from cgroup.subtree_control"); - return ret; - } - /* Remove the trailing newline */ - buf[ret] = '\0'; - - /* Remove all current subsys in subtree_control */ - if (buf[0] != '\0') - off = add_subtree_control_prop_prefix(buf, line, '-'); - - /* Add subsys need to be restored in subtree_control */ - if (cg_prop_entry_p->value[0] != '\0') - off += add_subtree_control_prop_prefix(cg_prop_entry_p->value, line + off, '+'); - - /* Remove the trailing space */ - if (off != 0) { - off--; - line[off] = '\0'; - } - - if (write(fd, line, off) != off) { - pr_perror("write to cgroup.subtree_control"); - return -1; + if ((sb.st_uid != perms->uid || sb.st_gid != perms->gid) && fchown(fd, perms->uid, 
perms->gid)) { + pr_perror("chown of %s failed", path); + return -1; + } } return 0; } -/* - * Note: The path string can be modified in this function, - * the length of path string should be at least PATH_MAX. - */ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *path, int off, bool split_lines, bool skip_fails) { - int cg, fd, exit_code = -1, flag; + int cg, fd, ret = -1; CgroupPerms *perms = cg_prop_entry_p->perms; - int is_subtree_control = !strcmp(cg_prop_entry_p->name, "cgroup.subtree_control"); - - if (opts.manage_cgroups == CG_MODE_IGNORE) - return 0; if (!cg_prop_entry_p->value) { pr_err("cg_prop_entry->value was empty when should have had a value\n"); @@ -1427,35 +1311,19 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *pat pr_info("Restoring cgroup property value [%s] to [%s]\n", cg_prop_entry_p->value, path); - if (is_subtree_control) - flag = O_RDWR; - else - flag = O_WRONLY; - cg = get_service_fd(CGROUP_YARD); - fd = openat(cg, path, flag); + fd = openat(cg, path, O_WRONLY); if (fd < 0) { pr_perror("bad cgroup path: %s", path); return -1; } - if (perms && cr_fchperm(fd, perms->uid, perms->gid, perms->mode) < 0) + if (restore_perms(fd, path, perms) < 0) goto out; /* skip these two since restoring their values doesn't make sense */ if (!strcmp(cg_prop_entry_p->name, "cgroup.procs") || !strcmp(cg_prop_entry_p->name, "tasks")) { - exit_code = 0; - goto out; - } - - if (is_subtree_control) { - exit_code = restore_cgroup_subtree_control(cg_prop_entry_p, fd); - goto out; - } - - /* skip restoring cgroup.type if its value is not "threaded" */ - if (!strcmp(cg_prop_entry_p->name, "cgroup.type") && strcmp(cg_prop_entry_p->value, "threaded")) { - exit_code = 0; + ret = 0; goto out; } @@ -1477,28 +1345,21 @@ static int restore_cgroup_prop(const CgroupPropEntry *cg_prop_entry_p, char *pat } while (*next_line != '\0'); } else { size_t len = strlen(cg_prop_entry_p->value); - int ret; - ret = write(fd, 
cg_prop_entry_p->value, len); - /* memory.kmem.limit_in_bytes has been deprecated. Look at - * 58056f77502f3 ("memcg, kmem: further deprecate - * kmem.limit_in_bytes") for more details. */ - if (ret == -1 && errno == EOPNOTSUPP && - !strcmp(cg_prop_entry_p->name, "memory.kmem.limit_in_bytes")) - ret = len; - if (ret != len) { + if (write(fd, cg_prop_entry_p->value, len) != len) { pr_perror("Failed writing %s to %s", cg_prop_entry_p->value, path); if (!skip_fails) goto out; } } - exit_code = 0; + ret = 0; + out: if (close(fd) != 0) pr_perror("Failed closing %s", path); - return exit_code; + return ret; } static CgroupPropEntry *freezer_state_entry; @@ -1763,7 +1624,7 @@ static int restore_special_props(char *paux, size_t off, CgroupDirEntry *e) static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms) { - int fd, ret = 0; + int fd, ret; fd = openat(cg, path, O_DIRECTORY); if (fd < 0) { @@ -1771,8 +1632,7 @@ static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms) return -1; } - if (perms) - ret = cr_fchperm(fd, perms->uid, perms->gid, perms->mode); + ret = restore_perms(fd, path, perms); close(fd); return ret; } @@ -1811,9 +1671,12 @@ static int prepare_cgroup_dirs(char **controllers, int n_controllers, char *paux return -1; for (j = 0; j < n_controllers; j++) { - if (restore_special_props(paux, off2, e) < 0) { - pr_err("Restoring special cpuset props failed!\n"); - return -1; + if (!strcmp(controllers[j], "cpuset") || !strcmp(controllers[j], "memory") || + !strcmp(controllers[j], "devices")) { + if (restore_special_props(paux, off2, e) < 0) { + pr_err("Restoring special cpuset props failed!\n"); + return -1; + } } } } else { @@ -1927,7 +1790,7 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) if (ctrl->cnames[0][0] == 0) fstype = "cgroup2"; - pr_debug("\tMaking controller dir %s (%s), type %s\n", paux, opt, fstype); + pr_debug("\tMaking controller dir %s (%s)\n", paux, opt); if (mkdir(paux, 0700)) { pr_perror("\tCan't make controller dir %s", 
paux); return -1; @@ -1951,161 +1814,6 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) return 0; } -static int cgroupd_unblock_sigterm(void) -{ - sigset_t unblockmask; - - sigemptyset(&unblockmask); - sigaddset(&unblockmask, SIGTERM); - - if (sigprocmask(SIG_UNBLOCK, &unblockmask, NULL)) { - pr_perror("cgroupd: can't unblock SIGTERM"); - return -1; - } - - return 0; -} - -/* - * If a thread is a different cgroup set than the main thread in process, - * it means it is in a threaded controller. This daemon receives the cg_set - * number from the restored thread and move this thread to the correct - * cgroup controllers - */ -static int cgroupd(int sk) -{ - /* - * This pairs with SIGTERM in stop_cgroupd(), and ensures that cgroupd - * will receive termination signal, regardless of which signal block - * mask was inherited. - */ - if (cgroupd_unblock_sigterm()) - return -1; - - pr_info("cgroud: Daemon started\n"); - - while (1) { - struct unsc_msg um; - uns_call_t call; - pid_t tid; - int fd, cg_set, i; - CgSetEntry *cg_set_entry; - int ret; - - unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, NULL); - ret = recvmsg(sk, &um.h, 0); - if (ret <= 0) { - pr_perror("cgroupd: recv req error"); - return -1; - } - - unsc_msg_pid_fd(&um, &tid, &fd); - pr_debug("cgroupd: move process %d into cg_set %d\n", tid, cg_set); - - cg_set_entry = find_rst_set_by_id(cg_set); - if (!cg_set_entry) { - pr_err("cgroupd: No set found %d\n", cg_set); - return -1; - } - - for (i = 0; i < cg_set_entry->n_ctls; i++) { - int j, aux_off; - CgMemberEntry *ce = cg_set_entry->ctls[i]; - char aux[PATH_MAX]; - CgControllerEntry *ctrl = NULL; - const char *format; - - for (j = 0; j < n_controllers; j++) { - CgControllerEntry *cur = controllers[j]; - if (cgroup_contains(cur->cnames, cur->n_cnames, ce->name, NULL)) { - ctrl = cur; - break; - } - } - - if (!ctrl) { - pr_err("cgroupd: No cg_controller_entry found for %s/%s\n", ce->name, ce->path); - return -1; - } - - /* - * This is not a threaded controller, 
all threads in this - * process must be in this controller. Main thread has been - * restored, so this thread is in this controller already. - */ - if (!ctrl->has_is_threaded || !ctrl->is_threaded) - continue; - - aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0); - format = ctrl->cnames[0][0] ? "/%s/tasks" : "/%s/cgroup.threads"; - snprintf(aux + aux_off, sizeof(aux) - aux_off, format, ce->path); - - /* - * Cgroupd runs outside of the namespaces so we don't - * need to use userns_call here - */ - if (userns_move(aux, 0, tid)) { - pr_err("cgroupd: Can't move thread %d into %s/%s\n", tid, ce->name, ce->path); - return -1; - } - } - - /* - * We only want to send the cred which contains thread id back. - * The restored thread recvmsg(MSG_PEEK) until it gets its own - * thread id. - */ - unsc_msg_init(&um, &call, &cg_set, NULL, 0, 0, &tid); - if (sendmsg(sk, &um.h, 0) <= 0) { - pr_perror("cgroupd: send req error"); - return -1; - } - } - - return 0; -} - -int stop_cgroupd(void) -{ - if (cgroupd_pid) { - sigset_t blockmask, oldmask; - - /* - * Block the SIGCHLD signal to avoid triggering - * sigchld_handler() - */ - sigemptyset(&blockmask); - sigaddset(&blockmask, SIGCHLD); - sigprocmask(SIG_BLOCK, &blockmask, &oldmask); - - kill(cgroupd_pid, SIGTERM); - waitpid(cgroupd_pid, NULL, 0); - - sigprocmask(SIG_SETMASK, &oldmask, NULL); - } - - return 0; -} - -static int prepare_cgroup_thread_sfd(void) -{ - int sk; - - sk = start_unix_cred_daemon(&cgroupd_pid, cgroupd); - if (sk < 0) { - pr_err("failed to start cgroupd\n"); - return -1; - } - - if (install_service_fd(CGROUPD_SK, sk) < 0) { - kill(cgroupd_pid, SIGKILL); - waitpid(cgroupd_pid, NULL, 0); - return -1; - } - - return 0; -} - static int rewrite_cgsets(CgroupEntry *cge, char **controllers, int n_controllers, char **dir_name, char *newroot) { size_t dirlen = strlen(*dir_name); @@ -2260,19 +1968,15 @@ int prepare_cgroup(void) n_controllers = ce->n_controllers; controllers = ce->controllers; - if (n_sets) { + 
if (n_sets) /* * We rely on the fact that all sets contain the same * set of controllers. This is checked during dump * with cg_set_compare(CGCMP_ISSUB) call. */ ret = prepare_cgroup_sfd(ce); - if (ret < 0) - return ret; - ret = prepare_cgroup_thread_sfd(); - } else { + else ret = 0; - } return ret; } diff --git a/criu/config.c b/criu/config.c index d7ef3f8e8..91fb0b64d 100644 --- a/criu/config.c +++ b/criu/config.c @@ -18,10 +18,8 @@ #include "cr_options.h" #include "filesystems.h" #include "file-lock.h" -#include "image.h" #include "irmap.h" #include "mount.h" -#include "mount-v2.h" #include "namespaces.h" #include "net.h" #include "sk-inet.h" @@ -231,7 +229,7 @@ out: tmp_string[0] = 0; /* Check for unsupported configuration file entries */ - if (strchr(configuration[i] + offset, ' ')) { + if (configuration[i] + offset + 1 != 0 && strchr(configuration[i] + offset, ' ')) { int j; len = strlen(configuration[i] + offset); for (j = 0; j < len - 1; j++) { @@ -431,7 +429,6 @@ void init_opts(void) opts.pre_dump_mode = PRE_DUMP_SPLICE; opts.file_validation_method = FILE_VALIDATION_DEFAULT; opts.network_lock_method = NETWORK_LOCK_DEFAULT; - opts.ghost_fiemap = FIEMAP_DEFAULT; } bool deprecated_ok(char *what) @@ -552,7 +549,7 @@ static size_t parse_size(char *optarg) static int parse_join_ns(const char *ptr) { char *aux, *ns_file, *extra_opts = NULL; - cleanup_free char *ns = NULL; + char *ns; ns = xstrdup(ptr); if (ns == NULL) @@ -698,21 +695,13 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "cgroup-yard", required_argument, 0, 1096 }, { "pre-dump-mode", required_argument, 0, 1097 }, { "file-validation", required_argument, 0, 1098 }, - BOOL_OPT("skip-file-rwx-check", &opts.skip_file_rwx_check), { "lsm-mount-context", required_argument, 0, 1099 }, { "network-lock", required_argument, 0, 1100 }, - BOOL_OPT("mntns-compat-mode", &opts.mntns_compat_mode), - BOOL_OPT("unprivileged", &opts.unprivileged), - BOOL_OPT("ghost-fiemap", 
&opts.ghost_fiemap), - BOOL_OPT(OPT_ALLOW_UPROBES, &opts.allow_uprobes), {}, }; #undef BOOL_OPT - if (argv && argv[0]) - SET_CHAR_OPTS(argv_0, argv[0]); - ret = pre_parse(argc, argv, usage_error, &no_default_config, &cfg_file); if (ret) @@ -1038,8 +1027,6 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, opts.network_lock_method = NETWORK_LOCK_IPTABLES; } else if (!strcmp("nftables", optarg)) { opts.network_lock_method = NETWORK_LOCK_NFTABLES; - } else if (!strcmp("skip", optarg) || !strcmp("none", optarg)) { - opts.network_lock_method = NETWORK_LOCK_SKIP; } else { pr_err("Invalid value for --network-lock: %s\n", optarg); return 1; @@ -1084,7 +1071,7 @@ int check_options(void) if (opts.link_remap_ok) pr_info("Will allow link remaps on FS\n"); if (opts.weak_sysctls) - pr_info("Will skip non-existent sysctls on restore\n"); + pr_info("Will skip non-existant sysctls on restore\n"); if (opts.deprecated_ok) pr_info("Turn deprecated stuff ON\n"); @@ -1116,21 +1103,6 @@ int check_options(void) } #endif - if (opts.mntns_compat_mode && opts.mode != CR_RESTORE) { - pr_err("Option --mntns-compat-mode is only valid on restore\n"); - return 1; - } else if (!opts.mntns_compat_mode && opts.mode == CR_RESTORE) { - if (check_mount_v2()) { - pr_debug("Mount engine fallback to --mntns-compat-mode mode\n"); - opts.mntns_compat_mode = true; - } - } - - if (opts.track_mem && !kdat.has_dirty_track) { - pr_err("Tracking memory is not available. 
Consider omitting --track-mem option.\n"); - return 1; - } - if (check_namespace_opts()) { pr_err("Error: namespace flags conflict\n"); return 1; diff --git a/criu/cr-check.c b/criu/cr-check.c index 7c3dc76dd..3575fb3b3 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -21,8 +21,7 @@ #include #include #include -#include -#include +#include #include "../soccr/soccr.h" @@ -31,7 +30,7 @@ #include "sockets.h" #include "crtools.h" #include "log.h" -#include "util-caps.h" +#include "util-pie.h" #include "prctl.h" #include "files.h" #include "sk-inet.h" @@ -53,8 +52,6 @@ #include "net.h" #include "restorer.h" #include "uffd.h" -#include "linux/aio_abi.h" -#include "mount-v2.h" #include "images/inventory.pb-c.h" @@ -107,7 +104,7 @@ out: static int check_apparmor_stacking(void) { - if (!kdat.apparmor_ns_dumping_enabled) + if (!check_aa_ns_dumping()) return -1; return 0; @@ -518,14 +515,6 @@ static int check_ipc(void) { int ret; - /* - * Since kernel 5.16 sem_next_id can be accessed via CAP_CHECKPOINT_RESTORE, however - * for non-root users access() runs with an empty set of caps and will therefore always - * fail. 
- */ - if (opts.uid) - return 0; - ret = access("/proc/sys/kernel/sem_next_id", R_OK | W_OK); if (!ret) return 0; @@ -548,6 +537,61 @@ static int check_sigqueuinfo(void) return 0; } +static pid_t fork_and_ptrace_attach(int (*child_setup)(void)) +{ + pid_t pid; + int sk_pair[2], sk; + char c = 0; + + if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { + pr_perror("socketpair"); + return -1; + } + + pid = fork(); + if (pid < 0) { + pr_perror("fork"); + return -1; + } else if (pid == 0) { + sk = sk_pair[1]; + close(sk_pair[0]); + + if (child_setup && child_setup() != 0) + exit(1); + + if (write(sk, &c, 1) != 1) { + pr_perror("write"); + exit(1); + } + + while (1) + sleep(1000); + exit(1); + } + + sk = sk_pair[0]; + close(sk_pair[1]); + + if (read(sk, &c, 1) != 1) { + close(sk); + kill(pid, SIGKILL); + pr_perror("read"); + return -1; + } + + close(sk); + + if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) { + pr_perror("Unable to ptrace the child"); + kill(pid, SIGKILL); + return -1; + } + + waitpid(pid, NULL, 0); + + return pid; +} + static int check_ptrace_peeksiginfo(void) { struct ptrace_peeksiginfo_args arg; @@ -574,7 +618,6 @@ static int check_ptrace_peeksiginfo(void) } kill(pid, SIGKILL); - waitpid(pid, NULL, 0); return ret; } @@ -725,7 +768,6 @@ static int check_special_mapping_mremap(void) /* Probably, we're interrupted with a signal - cleanup */ pr_err("Failed to wait for a child %d\n", errno); kill(child, SIGKILL); - waitpid(child, NULL, 0); return -1; } @@ -764,7 +806,6 @@ static int check_ptrace_suspend_seccomp(void) } kill(pid, SIGKILL); - waitpid(pid, NULL, 0); return ret; } @@ -805,19 +846,9 @@ static int check_ptrace_dump_seccomp_filters(void) } kill(pid, SIGKILL); - waitpid(pid, NULL, 0); return ret; } -static int check_ptrace_get_rseq_conf(void) -{ - if (!kdat.has_ptrace_get_rseq_conf) { - pr_warn("ptrace(PTRACE_GET_RSEQ_CONFIGURATION) isn't supported. 
C/R of processes which are using rseq() won't work.\n"); - return -1; - } - return 0; -} - static int check_mem_dirty_track(void) { if (!kdat.has_dirty_track) { @@ -1050,14 +1081,10 @@ static int check_tcp(void) } val = 1; - if (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) { - ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val)); - if (ret < 0) { - pr_perror("Can't turn TCP repair mode ON"); - goto out; - } - } else { - pr_info("Not checking for TCP repair mode. Please set CAP_NET_ADMIN\n"); + ret = setsockopt(sk, SOL_TCP, TCP_REPAIR, &val, sizeof(val)); + if (ret < 0) { + pr_perror("Can't turn TCP repair mode ON"); + goto out; } optlen = sizeof(val); @@ -1088,8 +1115,6 @@ static int kerndat_tcp_repair_window(void) int sk, val = 1; sk = socket(AF_INET, SOCK_STREAM, 0); - if (sk < 0 && errno == EAFNOSUPPORT) - sk = socket(AF_INET6, SOCK_STREAM, 0); if (sk < 0) { pr_perror("Unable to create inet socket"); goto errn; @@ -1197,7 +1222,7 @@ static int check_ipt_legacy(void) char *ipt_legacy_bin; char *ip6t_legacy_bin; - ipt_legacy_bin = get_legacy_iptables_bin(false, false); + ipt_legacy_bin = get_legacy_iptables_bin(false); if (!ipt_legacy_bin) { pr_warn("Couldn't find iptables version which is using iptables legacy API\n"); return -1; @@ -1208,7 +1233,7 @@ static int check_ipt_legacy(void) if (!kdat.ipv6) return 0; - ip6t_legacy_bin = get_legacy_iptables_bin(true, false); + ip6t_legacy_bin = get_legacy_iptables_bin(true); if (!ip6t_legacy_bin) { pr_warn("Couldn't find ip6tables version which is using iptables legacy API\n"); return -1; @@ -1308,7 +1333,7 @@ static int check_net_diag_raw(void) { check_sock_diag(); return (socket_test_collect_bit(AF_INET, IPPROTO_RAW) && socket_test_collect_bit(AF_INET6, IPPROTO_RAW)) ? 
0 : - -1; + -1; } static int check_pidfd_store(void) @@ -1328,15 +1353,10 @@ static int check_pidfd_store(void) static int check_ns_pid(void) { - if (!kdat.has_nspid) + if (kerndat_has_nspid() < 0) return -1; - return 0; -} - -static int check_memfd_hugetlb(void) -{ - if (!kdat.has_memfd_hugetlb) + if (!kdat.has_nspid) return -1; return 0; @@ -1352,260 +1372,6 @@ static int check_network_lock_nftables(void) return 0; } -static int check_sockopt_buf_lock(void) -{ - if (!kdat.has_sockopt_buf_lock) - return -1; - - return 0; -} - -static int check_move_mount_set_group(void) -{ - if (!kdat.has_move_mount_set_group) - return -1; - - return 0; -} - -static int check_openat2(void) -{ - if (!kdat.has_openat2) - return -1; - - return 0; -} - -static int check_ipv6_freebind(void) -{ - if (!kdat.has_ipv6_freebind) - return -1; - - return 0; -} - -static int check_pagemap_scan(void) -{ - if (!kdat.has_pagemap_scan) - return -1; - - return 0; -} - -static int check_timer_cr_ids(void) -{ - if (!kdat.has_timer_cr_ids) - return -1; - - return 0; -} - -/* musl doesn't have a statx wrapper... 
*/ -struct staty { - __u32 stx_dev_major; - __u32 stx_dev_minor; - __u64 stx_ino; -}; - -static long get_file_dev_and_inode(void *addr, struct staty *stx) -{ - char buf[4096]; - FILE *mapf; - - mapf = fopen("/proc/self/maps", "r"); - if (mapf == NULL) { - pr_perror("fopen(/proc/self/maps)"); - return -1; - } - - while (fgets(buf, sizeof(buf), mapf)) { - unsigned long start, end; - uint32_t maj, min; - __u64 ino; - - if (sscanf(buf, "%lx-%lx %*s %*s %x:%x %llu", - &start, &end, &maj, &min, &ino) != 5) { - pr_perror("Unable to parse: %s", buf); - return -1; - } - if (start == (unsigned long)addr) { - stx->stx_dev_major = maj; - stx->stx_dev_minor = min; - stx->stx_ino = ino; - return 0; - } - } - - pr_err("Unable to find the mapping\n"); - return -1; -} - -static int ovl_mount(void) -{ - int tmpfs, fsfd, ovl; - - fsfd = cr_fsopen("tmpfs", 0); - if (fsfd == -1) { - pr_perror("Unable to fsopen tmpfs"); - return -1; - } - - if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { - pr_perror("Unable to create tmpfs mount"); - return -1; - } - - tmpfs = cr_fsmount(fsfd, 0, 0); - if (tmpfs == -1) { - pr_perror("Unable to mount tmpfs"); - return -1; - } - - close(fsfd); - - /* overlayfs can't be constructed on top of a detached mount. 
*/ - if (sys_move_mount(tmpfs, "", AT_FDCWD, "/tmp", MOVE_MOUNT_F_EMPTY_PATH)) { - pr_perror("Unable to attach tmpfs mount"); - return -1; - } - close(tmpfs); - - if (chdir("/tmp")) { - pr_perror("Unable to change working directory"); - return -1; - } - - if (mkdir("/tmp/w", 0755) == -1 || - mkdir("/tmp/u", 0755) == -1 || - mkdir("/tmp/l", 0755) == -1) { - pr_perror("mkdir"); - return -1; - } - - fsfd = cr_fsopen("overlay", 0); - if (fsfd == -1) { - pr_perror("Unable to fsopen overlayfs"); - return -1; - } - if (cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "test", 0) == -1 || - cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir", "/tmp/l", 0) == -1 || - cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "upperdir", "/tmp/u", 0) == -1 || - cr_fsconfig(fsfd, FSCONFIG_SET_STRING, "workdir", "/tmp/w", 0) == -1) { - pr_perror("Unable to configure overlayfs"); - return -1; - } - if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) { - pr_perror("Unable to create overlayfs"); - return -1; - } - ovl = cr_fsmount(fsfd, 0, 0); - if (ovl == -1) { - pr_perror("Unable to mount overlayfs"); - return -1; - } - - return ovl; -} - -/* - * Check that the file device and inode shown in /proc/pid/maps match values - * returned by stat(2). - */ -static int do_check_overlayfs_maps(void) -{ - struct staty stx, mstx; - struct stat st; - int ovl, fd; - void *addr; - - /* Create a new mount namespace to not care about cleaning test mounts. 
*/ - if (unshare(CLONE_NEWNS) == -1) { - pr_warn("Unable to create a new mount namespace\n"); - return 0; - } - - if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { - pr_perror("Unable to remount / with MS_SLAVE"); - return -1; - } - - ovl = ovl_mount(); - if (ovl == -1) - return -1; - - fd = openat(ovl, "test", O_RDWR | O_CREAT, 0644); - if (fd == -1) { - pr_perror("Unable to open a test file"); - return -1; - } - - addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); - if (addr == MAP_FAILED) { - pr_perror("Unable to map the test file"); - return -1; - } - - if (get_file_dev_and_inode(addr, &mstx)) - return -1; - if (fstat(fd, &st)) { - pr_perror("stat"); - return -1; - } - stx.stx_dev_major = major(st.st_dev); - stx.stx_dev_minor = minor(st.st_dev); - stx.stx_ino = st.st_ino; - - if (stx.stx_dev_major != mstx.stx_dev_major || - stx.stx_dev_minor != mstx.stx_dev_minor || - stx.stx_ino != mstx.stx_ino) { - pr_err("unmatched dev:ino %x:%x:%llx (expected %x:%x:%llx)\n", - mstx.stx_dev_major, mstx.stx_dev_minor, mstx.stx_ino, - stx.stx_dev_major, stx.stx_dev_minor, stx.stx_ino); - return -1; - } - - return 0; -} - -static int check_overlayfs_maps(void) -{ - pid_t pid; - int status; - - pid = fork(); - if (pid == -1) { - pr_perror("Unable to fork a child"); - return -1; - } - if (pid == 0) { - if (do_check_overlayfs_maps()) - exit(1); - exit(0); - } - if (waitpid(pid, &status, 0) == -1) { - pr_perror("waitpid"); - return -1; - } - return status == 0 ? 0 : -1; -} - -static int check_breakpoints(void) -{ - if (!kdat.has_breakpoints) { - pr_warn("Hardware breakpoints don't seem to work\n"); - return -1; - } - - return 0; -} - -static int check_pagemap_scan_guard_pages(void) -{ - kerndat_warn_about_madv_guards(); - - return kdat.has_pagemap_scan_guard_pages ? 
0 : -1; -} - static int (*chk_feature)(void); /* @@ -1633,12 +1399,14 @@ static int (*chk_feature)(void); return ret; \ } \ } while (0) - int cr_check(void) { struct ns_id *ns; int ret = 0; + if (!is_root_user()) + return -1; + root_item = alloc_pstree_item(); if (root_item == NULL) return -1; @@ -1720,20 +1488,8 @@ int cr_check(void) ret |= check_newifindex(); ret |= check_pidfd_store(); ret |= check_ns_pid(); + ret |= check_apparmor_stacking(); ret |= check_network_lock_nftables(); - ret |= check_sockopt_buf_lock(); - ret |= check_memfd_hugetlb(); - ret |= check_move_mount_set_group(); - ret |= check_openat2(); - ret |= check_ptrace_get_rseq_conf(); - ret |= check_ipv6_freebind(); - ret |= check_pagemap_scan(); - ret |= check_overlayfs_maps(); - ret |= check_timer_cr_ids(); - ret |= check_pagemap_scan_guard_pages(); - - if (kdat.lsm == LSMTYPE__APPARMOR) - ret |= check_apparmor_stacking(); } /* @@ -1743,10 +1499,6 @@ int cr_check(void) ret |= check_autofs(); ret |= check_compat_cr(); } - /* - * Category 4 - optional. - */ - check_breakpoints(); pr_msg("%s\n", ret ? 
CHECK_MAYBE : CHECK_GOOD); return ret; @@ -1850,17 +1602,6 @@ static struct feature_list feature_list[] = { { "ns_pid", check_ns_pid }, { "apparmor_stacking", check_apparmor_stacking }, { "network_lock_nftables", check_network_lock_nftables }, - { "sockopt_buf_lock", check_sockopt_buf_lock }, - { "memfd_hugetlb", check_memfd_hugetlb }, - { "move_mount_set_group", check_move_mount_set_group }, - { "openat2", check_openat2 }, - { "get_rseq_conf", check_ptrace_get_rseq_conf }, - { "ipv6_freebind", check_ipv6_freebind }, - { "pagemap_scan", check_pagemap_scan }, - { "timer_cr_ids", check_timer_cr_ids }, - { "overlayfs_maps", check_overlayfs_maps }, - { "breakpoints", check_breakpoints }, - { "pagemap_scan_guard_pages", check_pagemap_scan_guard_pages }, { NULL, NULL }, }; @@ -1880,7 +1621,7 @@ void pr_check_features(const char *offset, const char *sep, int width) } pr_msg("%s", fl->name); // no \n pos += len; - if ((fl + 1)->name) { // not the last item + if ((fl + 1)->name) { // not the last item pr_msg("%s", sep); // no \n pos += sep_len; } @@ -1912,54 +1653,3 @@ static char *feature_name(int (*func)(void)) } return NULL; } - -static int pr_set_dumpable(int value) -{ - int ret = prctl(PR_SET_DUMPABLE, value, 0, 0, 0); - if (ret < 0) - pr_perror("Unable to set PR_SET_DUMPABLE"); - return ret; -} - -int check_caps(void) -{ - /* Read out effective capabilities and store in opts.cap_eff. */ - if (set_opts_cap_eff()) - goto out; - - /* - * No matter if running as root or not. CRIU always needs - * at least these capabilities. - */ - if (!has_cap_checkpoint_restore(opts.cap_eff)) - goto out; - - /* For some things we need to know if we are running as root. */ - opts.uid = geteuid(); - - if (!opts.uid) { - /* CRIU is running as root. No further checks are necessary. 
*/ - return 0; - } - - if (!opts.unprivileged) { - pr_msg("Running as non-root requires '--unprivileged'\n"); - pr_msg("Please consult the documentation for limitations when running as non-root\n"); - return -1; - } - - /* - * At his point we know we are running as non-root with the necessary - * capabilities available. Now we have to make the process dumpable - * so that /proc/self is not owned by root. - */ - if (pr_set_dumpable(1)) - return -1; - - return 0; -out: - pr_msg("CRIU needs to have the CAP_SYS_ADMIN or the CAP_CHECKPOINT_RESTORE capability: \n"); - pr_msg("setcap cap_checkpoint_restore+eip %s\n", opts.argv_0); - - return -1; -} diff --git a/criu/cr-dedup.c b/criu/cr-dedup.c index feeb9ebb0..c0c21f53e 100644 --- a/criu/cr-dedup.c +++ b/criu/cr-dedup.c @@ -87,8 +87,7 @@ static int cr_dedup_one_pagemap(unsigned long img_id, int flags) if (ret <= 0) goto exit; - pr_debug("dedup iovec %" PRIx64 " - %" PRIx64 "\n", - pr.pe->vaddr, pr.pe->vaddr + pagemap_len(pr.pe)); + pr_debug("dedup iovec base=%" PRIx64 ", len=%lu\n", pr.pe->vaddr, pagemap_len(pr.pe)); if (!pagemap_in_parent(pr.pe)) { ret = dedup_one_iovec(prp, pr.pe->vaddr, pagemap_len(pr.pe)); if (ret) diff --git a/criu/cr-dump.c b/criu/cr-dump.c index a58aaf34a..940f62246 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -45,7 +45,6 @@ #include "proc_parse.h" #include "parasite.h" #include "parasite-syscall.h" -#include "compel/ptrace.h" #include "files.h" #include "files-reg.h" #include "shmem.h" @@ -86,8 +85,6 @@ #include "pidfd-store.h" #include "apparmor.h" #include "asm/dump.h" -#include "timer.h" -#include "sigact.h" /* * Architectures can overwrite this function to restore register sets that @@ -130,23 +127,6 @@ int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap if (ret < 0) goto err; - /* - * In addition to real process VMAs we should keep an info about - * madvise(MADV_GUARD_INSTALL) pages. 
While these are not represented - * as a struct vm_area_struct in the kernel, it is convenient to treat - * them as mappings in CRIU and reuse the same VMA images but with only - * VMA_AREA_GUARD flag set. - * - * Also, we don't need to dump them during pre-dump. - */ - if (dump_file) { - ret = collect_madv_guards(pid, vma_area_list); - if (ret < 0) { - pr_err("Collect MADV_GUARD_INSTALL pages (pid: %d) failed with %d\n", pid, ret); - goto err; - } - } - pr_info("Collected, longest area occupies %lu pages\n", vma_area_list->nr_priv_pages_longest); pr_info_vma_list(&vma_area_list->h); @@ -176,11 +156,6 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) tc->has_sched_policy = true; tc->sched_policy = ret; - /* The reset-on-fork flag might be used in combination - * with SCHED_FIFO or SCHED_RR to reset the scheduling - * policy/priority in child processes. - */ - ret &= ~SCHED_RESET_ON_FORK; if ((ret == SCHED_RR) || (ret == SCHED_FIFO)) { ret = syscall(__NR_sched_getparam, pid, &sp); if (ret < 0) { @@ -212,25 +187,6 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) return 0; } -static int check_thread_rseq(pid_t tid, const struct parasite_check_rseq *ti_rseq) -{ - if (!kdat.has_rseq || kdat.has_ptrace_get_rseq_conf) - return 0; - - pr_debug("%d has rseq_inited = %d\n", tid, ti_rseq->rseq_inited); - - /* - * We have no kdat.has_ptrace_get_rseq_conf and user - * process has rseq() used, let's fail dump. 
- */ - if (ti_rseq->rseq_inited) { - pr_err("%d has rseq but kernel lacks get_rseq_conf feature\n", tid); - return -1; - } - - return 0; -} - struct cr_imgset *glob_imgset; static int collect_fds(pid_t pid, struct parasite_drain_fd **dfds) @@ -453,7 +409,7 @@ static int dump_filemap(struct vma_area *vma_area, int fd) if (vma_area->aufs_rpath) { struct fd_link aufs_link; - __strlcpy(aufs_link.name, vma_area->aufs_rpath, sizeof(aufs_link.name)); + strlcpy(aufs_link.name, vma_area->aufs_rpath, sizeof(aufs_link.name)); aufs_link.len = strlen(aufs_link.name); p.link = &aufs_link; } @@ -659,7 +615,7 @@ static int dump_task_kobj_ids(struct pstree_item *item) TaskKobjIdsEntry *ids = item->ids; elem.pid = pid; - elem.idx = 0; /* really 0 for all */ + elem.idx = 0; /* really 0 for all */ elem.genid = 0; /* FIXME optimize */ new = 0; @@ -761,17 +717,6 @@ int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread if (!ret) ret = seccomp_dump_thread(pid, tc); - /* - * We are dumping rseq() in the dump_thread_rseq() function, - * *before* processes gets infected (because of ptrace requests - * API restriction). At this point, if the kernel lacks - * kdat.has_ptrace_get_rseq_conf support we have to ensure - * that dumpable processes haven't initialized rseq() or - * fail dump if rseq() was used. 
- */ - if (!ret) - ret = check_thread_rseq(pid, &ti->rseq); - return ret; } @@ -783,7 +728,6 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item pid_t pid = item->pid->real; int ret = -1; struct parasite_dump_cgroup_args cgroup_args, *info = NULL; - u32 *cg_set; BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN); @@ -794,16 +738,11 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item core->tc->child_subreaper = misc->child_subreaper; core->tc->has_child_subreaper = true; - if (misc->membarrier_registration_mask) { - core->tc->membarrier_registration_mask = misc->membarrier_registration_mask; - core->tc->has_membarrier_registration_mask = true; - } - ret = get_task_personality(pid, &core->tc->personality); if (ret < 0) goto err; - __strlcpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN); + strlcpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN); core->tc->flags = stat->flags; core->tc->task_state = item->pid->state; core->tc->exit_code = 0; @@ -811,11 +750,6 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item core->thread_core->creds->lsm_profile = dmpi(item)->thread_lsms[0]->profile; core->thread_core->creds->lsm_sockcreate = dmpi(item)->thread_lsms[0]->sockcreate; - if (core->tc->task_state == TASK_STOPPED) { - core->tc->has_stop_signo = true; - core->tc->stop_signo = item->pid->stop_signo; - } - ret = parasite_dump_thread_leader_seized(ctl, pid, core); if (ret) goto err; @@ -834,15 +768,13 @@ static int dump_task_core_all(struct parasite_ctl *ctl, struct pstree_item *item */ if (item->ids->has_cgroup_ns_id && !item->parent) { info = &cgroup_args; - strcpy(cgroup_args.thread_cgrp, "self/cgroup"); ret = parasite_dump_cgroup(ctl, &cgroup_args); if (ret) goto err; } - core->thread_core->has_cg_set = true; - cg_set = &core->thread_core->cg_set; - ret = dump_thread_cgroup(item, cg_set, info, -1); + core->tc->has_cg_set = true; + ret = dump_task_cgroup(item, 
&core->tc->cg_set, info); if (ret) goto err; @@ -904,72 +836,6 @@ static int collect_file_locks(void) return parse_file_locks(); } -static bool task_in_rseq(struct criu_rseq_cs *rseq_cs, uint64_t addr) -{ - return addr >= rseq_cs->start_ip && addr < rseq_cs->start_ip + rseq_cs->post_commit_offset; -} - -static int fixup_thread_rseq(const struct pstree_item *item, int i) -{ - CoreEntry *core = item->core[i]; - struct criu_rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; - pid_t tid = item->threads[i].real; - - if (!kdat.has_ptrace_get_rseq_conf) - return 0; - - /* equivalent to (struct rseq)->rseq_cs is NULL */ - if (!rseq_cs->start_ip) - return 0; - - pr_debug( - "fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n", - tid, rseq_cs->start_ip, rseq_cs->abort_ip, rseq_cs->post_commit_offset, rseq_cs->flags, - rseq_cs->version, (unsigned long)TI_IP(core)); - - if (rseq_cs->version != 0) { - pr_err("unsupported RSEQ ABI version = %d\n", rseq_cs->version); - return -1; - } - - if (task_in_rseq(rseq_cs, TI_IP(core))) { - struct pid *tid = &item->threads[i]; - - /* - * We need to fixup task instruction pointer from - * the original one (which lays inside rseq critical section) - * to rseq abort handler address. But we need to look on rseq_cs->flags - * (please refer to struct rseq -> flags field description). - * Naive idea of flags support may be like... let's change instruction pointer (IP) - * to rseq_cs->abort_ip if !(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL). - * But unfortunately, it doesn't work properly, because the kernel does - * clean up of rseq_cs field in the struct rseq (modifies userspace memory). - * So, we need to preserve original value of (struct rseq)->rseq_cs field in the - * image and restore it's value before releasing threads (see restore_rseq_cs()). 
- * - * It's worth to mention that we need to fixup IP in CoreEntry - * (used when full dump/restore is performed) and also in - * the parasite regs storage (used if --leave-running option is used, - * or if dump error occurred and process execution is resumed). - */ - - if (!(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL)) { - pr_warn("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n", - tid->real); - - TI_IP(core) = rseq_cs->abort_ip; - - if (item->pid->real == tid->real) { - compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip); - } else { - compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip); - } - } - } - - return 0; -} - static int dump_task_thread(struct parasite_ctl *parasite_ctl, const struct pstree_item *item, int id) { struct parasite_thread_ctl *tctl = dmpi(item)->thread_ctls[id]; @@ -993,12 +859,6 @@ static int dump_task_thread(struct parasite_ctl *parasite_ctl, const struct pstr core->thread_core->creds->lsm_profile = dmpi(item)->thread_lsms[id]->profile; core->thread_core->creds->lsm_sockcreate = dmpi(item)->thread_lsms[0]->sockcreate; - ret = fixup_thread_rseq(item, id); - if (ret) { - pr_err("Can't fixup rseq for pid %d\n", pid); - goto err; - } - img = open_image(CR_FD_CORE, O_DUMP, tid->ns[0].virt); if (!img) goto err; @@ -1007,7 +867,6 @@ static int dump_task_thread(struct parasite_ctl *parasite_ctl, const struct pstr close_image(img); err: - compel_release_thread(tctl); pr_info("----------------------------------------\n"); return ret; } @@ -1022,7 +881,7 @@ static int dump_one_zombie(const struct pstree_item *item, const struct proc_pid if (!core) return -1; - __strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN); + strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN); core->tc->task_state = TASK_DEAD; core->tc->exit_code = pps->exit_code; @@ -1144,152 +1003,11 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item) return 0; } -static int 
read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, struct criu_rseq_cs *rseq_cs, - struct criu_rseq *rseq) -{ - int ret; - - /* rseq is not registered */ - if (!rseqc->rseq_abi_pointer) - return 0; - - /* - * We need to cover the case when victim process was inside rseq critical section - * at the moment when CRIU comes and seized it. We need to determine the borders - * of rseq critical section at first. To achieve that we need to access thread - * memory and read pointer to struct rseq_cs. - * - * We have two ways to access thread memory: from the parasite and using ptrace(). - * But it this case we can't use parasite, because if victim process returns to the - * execution, on the kernel side __rseq_handle_notify_resume hook will be called, - * then rseq_ip_fixup() -> clear_rseq_cs() and user space memory with struct rseq - * will be cleared. So, let's use ptrace(PTRACE_PEEKDATA). - */ - ret = ptrace_peek_area(tid, rseq, decode_pointer(rseqc->rseq_abi_pointer), sizeof(struct criu_rseq)); - if (ret) { - pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq struct\n", tid, (unsigned long)rseq, - (unsigned long)(rseqc->rseq_abi_pointer), (unsigned long)sizeof(uint64_t)); - return -1; - } - - if (!rseq->rseq_cs) - return 0; - - ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(rseq->rseq_cs), sizeof(struct criu_rseq_cs)); - if (ret) { - pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs struct\n", tid, - (unsigned long)rseq_cs, (unsigned long)rseq->rseq_cs, - (unsigned long)sizeof(struct criu_rseq_cs)); - return -1; - } - - return 0; -} - -static int dump_thread_rseq(struct pstree_item *item, int i) -{ - struct __ptrace_rseq_configuration rseqc; - RseqEntry *rseqe = NULL; - int ret; - CoreEntry *core = item->core[i]; - RseqEntry **rseqep = &core->thread_core->rseq_entry; - struct criu_rseq rseq = {}; - struct criu_rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i]; - pid_t tid = item->threads[i].real; - - /* - * If we are here 
it means that rseq() syscall is supported, - * but ptrace(PTRACE_GET_RSEQ_CONFIGURATION) isn't supported, - * we can just fail dump here. But this is bad idea, IMHO. - * - * So, we will try to detect if victim process was used rseq(). - * See check_rseq() and check_thread_rseq() functions. - */ - if (!kdat.has_ptrace_get_rseq_conf) - return 0; - - ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseqc), &rseqc); - if (ret != sizeof(rseqc)) { - pr_perror("ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) = %d", tid, ret); - return -1; - } - - if (rseqc.flags != 0) { - pr_err("something wrong with ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) flags = 0x%x\n", tid, - rseqc.flags); - return -1; - } - - pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseqc.rseq_abi_pointer, - rseqc.signature); - - rseqe = xmalloc(sizeof(*rseqe)); - if (!rseqe) - return -1; - - rseq_entry__init(rseqe); - - rseqe->rseq_abi_pointer = rseqc.rseq_abi_pointer; - rseqe->rseq_abi_size = rseqc.rseq_abi_size; - rseqe->signature = rseqc.signature; - - if (read_rseq_cs(tid, &rseqc, rseq_cs, &rseq)) - goto err; - - /* we won't save rseq_cs to the image (only pointer), - * so let's combine flags from both struct rseq and struct rseq_cs - * (kernel does the same when interpreting RSEQ_CS_FLAG_*) - */ - rseq_cs->flags |= rseq.flags; - - if (rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) { - rseqe->has_rseq_cs_pointer = true; - rseqe->rseq_cs_pointer = rseq.rseq_cs; - } - - /* save rseq entry to the image */ - *rseqep = rseqe; - - return 0; - -err: - xfree(rseqe); - return -1; -} - -static int dump_task_rseq(pid_t pid, struct pstree_item *item) -{ - int i; - struct criu_rseq_cs *thread_rseq_cs; - - /* if rseq() syscall isn't supported then nothing to dump */ - if (!kdat.has_rseq) - return 0; - - thread_rseq_cs = xzalloc(sizeof(*thread_rseq_cs) * item->nr_threads); - if (!thread_rseq_cs) - return -1; - - dmpi(item)->thread_rseq_cs = thread_rseq_cs; - - for (i = 0; i < 
item->nr_threads; i++) { - if (dump_thread_rseq(item, i)) - goto free_rseq; - } - - return 0; - -free_rseq: - xfree(thread_rseq_cs); - dmpi(item)->thread_rseq_cs = NULL; - return -1; -} - static struct proc_pid_stat pps_buf; static int dump_task_threads(struct parasite_ctl *parasite_ctl, const struct pstree_item *item) { - int i, ret = 0; + int i; for (i = 0; i < item->nr_threads; i++) { /* Leader is already dumped */ @@ -1297,21 +1015,18 @@ static int dump_task_threads(struct parasite_ctl *parasite_ctl, const struct pst item->threads[i].ns[0].virt = vpid(item); continue; } - ret = dump_task_thread(parasite_ctl, item, i); - if (ret) - break; + if (dump_task_thread(parasite_ctl, item, i)) + return -1; } - xfree(dmpi(item)->thread_rseq_cs); - dmpi(item)->thread_rseq_cs = NULL; - return ret; + return 0; } /* * What this routine does is just reads pid-s of dead * tasks in item's children list from item's ns proc. * - * It does *not* find which real pid corresponds to + * It does *not* find wihch real pid corresponds to * which virtual one, but it's not required -- all we * need to dump for zombie can be found in the same * ns proc. 
@@ -1413,14 +1128,7 @@ static int dump_zombies(void) item->sid = pps_buf.sid; item->pgid = pps_buf.pgid; - BUG_ON(has_children(item)); - - if (!item->sid) { - pr_err("A session leader of zombie process %d(%d) is outside of its pid namespace\n", - item->pid->real, vpid(item)); - goto err; - } - + BUG_ON(!list_empty(&item->children)); if (dump_one_zombie(item, &pps_buf) < 0) goto err; } @@ -1433,39 +1141,6 @@ err: return ret; } -static int dump_task_cgroup(struct parasite_ctl *parasite_ctl, const struct pstree_item *item) -{ - struct parasite_dump_cgroup_args cgroup_args, *info; - int i; - - BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN); - for (i = 0; i < item->nr_threads; i++) { - CoreEntry *core = item->core[i]; - - /* Leader is already dumped */ - if (item->pid->real == item->threads[i].real) - continue; - - /* For now, we only need to dump the root task's cgroup ns, because we - * know all the tasks are in the same cgroup namespace because we don't - * allow nesting. - */ - info = NULL; - if (item->ids->has_cgroup_ns_id && !item->parent) { - info = &cgroup_args; - sprintf(cgroup_args.thread_cgrp, "self/task/%d/cgroup", item->threads[i].ns[0].virt); - if (parasite_dump_cgroup(parasite_ctl, &cgroup_args)) - return -1; - } - - core->thread_core->has_cg_set = true; - if (dump_thread_cgroup(item, &core->thread_core->cg_set, info, i)) - return -1; - } - - return 0; -} - static int pre_dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) { pid_t pid = item->pid->real; @@ -1478,7 +1153,7 @@ static int pre_dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie vm_area_list_init(&vmas); pr_info("========================================\n"); - pr_info("Pre-dumping task (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); + pr_info("Pre-dumping task (pid: %d)\n", pid); pr_info("========================================\n"); /* @@ -1568,7 +1243,7 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) 
vm_area_list_init(&vmas); pr_info("========================================\n"); - pr_info("Dumping task (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); + pr_info("Dumping task (pid: %d)\n", pid); pr_info("========================================\n"); if (item->pid->state == TASK_DEAD) @@ -1616,24 +1291,12 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err; } - ret = dump_task_rseq(pid, item); - if (ret) { - pr_err("Dump %d rseq failed %d\n", pid, ret); - goto err; - } - parasite_ctl = parasite_infect_seized(pid, item, &vmas); if (!parasite_ctl) { pr_err("Can't infect (pid: %d) with parasite\n", pid); goto err; } - ret = fixup_thread_rseq(item, 0); - if (ret) { - pr_err("Fixup rseq for %d failed %d\n", pid, ret); - goto err; - } - if (fault_injected(FI_DUMP_EARLY)) { pr_info("fault: CRIU sudden detach\n"); kill(getpid(), SIGKILL); @@ -1645,29 +1308,29 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) pfd = parasite_get_proc_fd_seized(parasite_ctl); if (pfd < 0) { pr_err("Can't get proc fd (pid: %d)\n", pid); - goto err_cure; + goto err_cure_imgset; } if (install_service_fd(CR_PROC_FD_OFF, pfd) < 0) - goto err_cure; + goto err_cure_imgset; } ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas); if (ret) { pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid); - goto err_cure; + goto err_cure_imgset; } ret = parasite_collect_aios(parasite_ctl, &vmas); /* FIXME -- merge with above */ if (ret) { pr_err("Failed to check aio rings (pid: %d)\n", pid); - goto err_cure; + goto err_cure_imgset; } ret = parasite_dump_misc_seized(parasite_ctl, &misc); if (ret) { pr_err("Can't dump misc (pid: %d)\n", pid); - goto err_cure; + goto err_cure_imgset; } item->pid->ns[0].virt = misc.pid; @@ -1738,12 +1401,6 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err_cure; } - ret = dump_task_cgroup(parasite_ctl, item); - if (ret) { - pr_err("Dump cgroup of threads in process (pid: %d) 
failed with %d\n", pid, ret); - goto err_cure; - } - ret = compel_stop_daemon(parasite_ctl); if (ret) { pr_err("Can't stop daemon in parasite (pid: %d)\n", pid); @@ -1781,15 +1438,17 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) goto err; } + close_cr_imgset(&cr_imgset); exit_code = 0; err: - close_cr_imgset(&cr_imgset); close_pid_proc(); free_mappings(&vmas); xfree(dfds); return exit_code; err_cure: + close_cr_imgset(&cr_imgset); +err_cure_imgset: ret = compel_cure(parasite_ctl); if (ret) pr_err("Can't cure (pid: %d) from parasite\n", pid); @@ -1808,7 +1467,7 @@ static void alarm_handler(int signo) pr_err("Timeout reached. Try to interrupt: %d\n", alarm_attempts); if (alarm_attempts++ < 5) { alarm(1); - /* A current syscall will be exited with EINTR */ + /* A curren syscall will be exited with EINTR */ return; } pr_err("FATAL: Unable to interrupt the current operation\n"); @@ -2052,6 +1711,7 @@ static int cr_dump_finish(int ret) if (bfd_flush_images()) ret = -1; + cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); cgp_fini(); if (!ret) { @@ -2105,9 +1765,6 @@ static int cr_dump_finish(int ret) if (arch_set_thread_regs(root_item, true) < 0) return -1; - - cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret); - pstree_switch_state(root_item, (ret || post_dump_ret) ? 
TASK_ALIVE : opts.final_state); timing_stop(TIME_FROZEN); free_pstree(root_item); @@ -2120,11 +1777,7 @@ static int cr_dump_finish(int ret) close_service_fd(CR_PROC_FD_OFF); close_image_dir(); - if (ret || post_dump_ret) { - if (fault_injected(FI_DUMP_CRASH)) { - pr_info("fault: CRIU dump crashed!\n"); - abort(); - } + if (ret) { pr_err("Dumping FAILED.\n"); } else { write_stats(DUMP_STATS); @@ -2138,13 +1791,11 @@ int cr_dump_tasks(pid_t pid) InventoryEntry he = INVENTORY_ENTRY__INIT; InventoryEntry *parent_ie = NULL; struct pstree_item *item; - int ret; - int exit_code = -1; - - kerndat_warn_about_madv_guards(); + int pre_dump_ret = 0; + int ret = -1; pr_info("========================================\n"); - pr_info("Dumping processes (pid: %d comm: %s)\n", pid, __task_comm_info(pid)); + pr_info("Dumping processes (pid: %d)\n", pid); pr_info("========================================\n"); /* @@ -2159,9 +1810,9 @@ int cr_dump_tasks(pid_t pid) goto err; root_item->pid->real = pid; - ret = run_scripts(ACT_PRE_DUMP); - if (ret != 0) { - pr_err("Pre dump script failed with %d!\n", ret); + pre_dump_ret = run_scripts(ACT_PRE_DUMP); + if (pre_dump_ret != 0) { + pr_err("Pre dump script failed with %d!\n", pre_dump_ret); goto err; } if (init_stats(DUMP_STATS)) @@ -2211,18 +1862,12 @@ int cr_dump_tasks(pid_t pid) if (collect_pstree()) goto err; - if (checkpoint_devices()) - goto err; - if (collect_pstree_ids()) goto err; if (network_lock()) goto err; - if (rpc_query_external_files()) - goto err; - if (collect_file_locks()) goto err; @@ -2247,10 +1892,6 @@ int cr_dump_tasks(pid_t pid) goto err; } - ret = run_plugins(DUMP_DEVICES_LATE, pid); - if (ret && ret != -ENOTSUP) - goto err; - if (parent_ie) { inventory_entry__free_unpacked(parent_ie, NULL); parent_ie = NULL; @@ -2287,44 +1928,49 @@ int cr_dump_tasks(pid_t pid) * ipc shared memory, but an ipc namespace is dumped in a child * process. 
*/ - if (cr_dump_shmem()) + ret = cr_dump_shmem(); + if (ret) goto err; if (root_ns_mask) { - if (dump_namespaces(root_item, root_ns_mask)) + ret = dump_namespaces(root_item, root_ns_mask); + if (ret) goto err; } if ((root_ns_mask & CLONE_NEWTIME) == 0) { - if (dump_time_ns(0)) + ret = dump_time_ns(0); + if (ret) goto err; } if (dump_aa_namespaces() < 0) goto err; - if (dump_cgroups()) + ret = dump_cgroups(); + if (ret) goto err; - if (fix_external_unix_sockets()) + ret = fix_external_unix_sockets(); + if (ret) goto err; - if (tty_post_actions()) + ret = tty_post_actions(); + if (ret) goto err; - if (inventory_save_uptime(&he)) + ret = inventory_save_uptime(&he); + if (ret) goto err; he.has_pre_dump_mode = false; - if (found_uprobes_vma()) { - he.has_allow_uprobes = true; - he.allow_uprobes = true; - } - exit_code = write_img_inventory(&he); + ret = write_img_inventory(&he); + if (ret) + goto err; err: if (parent_ie) inventory_entry__free_unpacked(parent_ie, NULL); - return cr_dump_finish(exit_code); + return cr_dump_finish(ret); } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index b92b92715..9d2d957f8 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -17,13 +17,12 @@ #include #include #include -#include #include "types.h" #include #include "common/compiler.h" -#include "linux/rseq.h" +#include "linux/mount.h" #include "clone-noasan.h" #include "cr_options.h" @@ -80,15 +79,12 @@ #include "timens.h" #include "bpfmap.h" #include "apparmor.h" -#include "pidfd.h" #include "parasite-syscall.h" #include "files-reg.h" #include #include "compel/include/asm/syscall.h" -#include "linux/mount.h" - #include "protobuf.h" #include "images/sa.pb-c.h" #include "images/timer.pb-c.h" @@ -100,8 +96,6 @@ #include "restore.h" #include "cr-errno.h" -#include "timer.h" -#include "sigact.h" #ifndef arch_export_restore_thread #define arch_export_restore_thread __export_restore_thread @@ -122,6 +116,7 @@ static int restore_task_with_children(void *); static int 
sigreturn_restore(pid_t pid, struct task_restore_args *ta, unsigned long alen, CoreEntry *core); static int prepare_restorer_blob(void); static int prepare_rlimits(int pid, struct task_restore_args *, CoreEntry *core); +static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core); static int prepare_signals(int pid, struct task_restore_args *, CoreEntry *core); /* @@ -282,7 +277,7 @@ static struct collect_image_info *cinfos_files[] = { &unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo, &netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo, &tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo, - &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, &pidfd_cinfo + &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, }; /* These images are required to restore namespaces */ @@ -355,10 +350,6 @@ static int root_prepare_shared(void) if (ret) goto err; - ret = add_fake_unix_queuers(); - if (ret) - goto err; - /* * This should be called with all packets collected AND all * fdescs and fles prepared BUT post-prep-s not run. @@ -375,6 +366,10 @@ static int root_prepare_shared(void) if (ret) goto err; + ret = add_fake_unix_queuers(); + if (ret) + goto err; + show_saved_files(); err: return ret; @@ -410,6 +405,268 @@ static int populate_pid_proc(void) return 0; } +static rt_sigaction_t sigchld_act; +/* + * If parent's sigaction has blocked SIGKILL (which is non-sense), + * this parent action is non-valid and shouldn't be inherited. + * Used to mark parent_act* no more valid. + */ +static rt_sigaction_t parent_act[SIGMAX]; +#ifdef CONFIG_COMPAT +static rt_sigaction_t_compat parent_act_compat[SIGMAX]; +#endif + +static bool sa_inherited(int sig, rt_sigaction_t *sa) +{ + rt_sigaction_t *pa; + int i; + + if (current == root_item) + return false; /* XXX -- inherit from CRIU? 
*/ + + pa = &parent_act[sig]; + + /* Omitting non-valid sigaction */ + if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) + return false; + + for (i = 0; i < _KNSIG_WORDS; i++) + if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) + return false; + + return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && + pa->rt_sa_restorer == sa->rt_sa_restorer; +} + +static int restore_native_sigaction(int sig, SaEntry *e) +{ + rt_sigaction_t act; + int ret; + + ASSIGN_TYPED(act.rt_sa_handler, decode_pointer(e->sigaction)); + ASSIGN_TYPED(act.rt_sa_flags, e->flags); + ASSIGN_TYPED(act.rt_sa_restorer, decode_pointer(e->restorer)); +#ifdef CONFIG_MIPS + e->has_mask_extended = 1; + BUILD_BUG_ON(sizeof(e->mask) * 2 != sizeof(act.rt_sa_mask.sig)); + + memcpy(&(act.rt_sa_mask.sig[0]), &e->mask, sizeof(act.rt_sa_mask.sig[0])); + memcpy(&(act.rt_sa_mask.sig[1]), &e->mask_extended, sizeof(act.rt_sa_mask.sig[1])); +#else + BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); + memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); +#endif + if (sig == SIGCHLD) { + sigchld_act = act; + return 0; + } + + if (sa_inherited(sig - 1, &act)) + return 1; + + /* + * A pure syscall is used, because glibc + * sigaction overwrites se_restorer. 
+ */ + ret = syscall(SYS_rt_sigaction, sig, &act, NULL, sizeof(k_rtsigset_t)); + if (ret < 0) { + pr_perror("Can't restore sigaction"); + return ret; + } + + parent_act[sig - 1] = act; + /* Mark SIGKILL blocked which makes compat sigaction non-valid */ +#ifdef CONFIG_COMPAT + parent_act_compat[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; +#endif + + return 1; +} + +static void *stack32; + +#ifdef CONFIG_COMPAT +static bool sa_compat_inherited(int sig, rt_sigaction_t_compat *sa) +{ + rt_sigaction_t_compat *pa; + int i; + + if (current == root_item) + return false; + + pa = &parent_act_compat[sig]; + + /* Omitting non-valid sigaction */ + if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) + return false; + + for (i = 0; i < _KNSIG_WORDS; i++) + if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) + return false; + + return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && + pa->rt_sa_restorer == sa->rt_sa_restorer; +} + +static int restore_compat_sigaction(int sig, SaEntry *e) +{ + rt_sigaction_t_compat act; + int ret; + + ASSIGN_TYPED(act.rt_sa_handler, (u32)e->sigaction); + ASSIGN_TYPED(act.rt_sa_flags, e->flags); + ASSIGN_TYPED(act.rt_sa_restorer, (u32)e->restorer); + BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); + memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); + + if (sig == SIGCHLD) { + memcpy(&sigchld_act, &act, sizeof(rt_sigaction_t_compat)); + return 0; + } + + if (sa_compat_inherited(sig - 1, &act)) + return 1; + + if (!stack32) { + stack32 = alloc_compat_syscall_stack(); + if (!stack32) + return -1; + } + + ret = arch_compat_rt_sigaction(stack32, sig, &act); + if (ret < 0) { + pr_err("Can't restore compat sigaction: %d\n", ret); + return ret; + } + + parent_act_compat[sig - 1] = act; + /* Mark SIGKILL blocked which makes native sigaction non-valid */ + parent_act[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; + + return 1; +} +#else +static int restore_compat_sigaction(int sig, SaEntry *e) +{ + return -1; 
+} +#endif + +static int prepare_sigactions_from_core(TaskCoreEntry *tc) +{ + int sig, i; + + if (tc->n_sigactions != SIGMAX - 2) { + pr_err("Bad number of sigactions in the image (%d, want %d)\n", (int)tc->n_sigactions, SIGMAX - 2); + return -1; + } + + pr_info("Restore on-core sigactions for %d\n", vpid(current)); + + for (sig = 1, i = 0; sig <= SIGMAX; sig++) { + int ret; + SaEntry *e; + bool sigaction_is_compat; + + if (sig == SIGKILL || sig == SIGSTOP) + continue; + + e = tc->sigactions[i++]; + sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; + if (sigaction_is_compat) + ret = restore_compat_sigaction(sig, e); + else + ret = restore_native_sigaction(sig, e); + + if (ret < 0) + return ret; + } + + return 0; +} + +/* Returns number of restored signals, -1 or negative errno on fail */ +static int restore_one_sigaction(int sig, struct cr_img *img, int pid) +{ + bool sigaction_is_compat; + SaEntry *e; + int ret = 0; + + BUG_ON(sig == SIGKILL || sig == SIGSTOP); + + ret = pb_read_one_eof(img, &e, PB_SIGACT); + if (ret == 0) { + if (sig != SIGMAX_OLD + 1) { /* backward compatibility */ + pr_err("Unexpected EOF %d\n", sig); + return -1; + } + pr_warn("This format of sigacts-%d.img is deprecated\n", pid); + return -1; + } + if (ret < 0) + return ret; + + sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; + if (sigaction_is_compat) + ret = restore_compat_sigaction(sig, e); + else + ret = restore_native_sigaction(sig, e); + + sa_entry__free_unpacked(e, NULL); + + return ret; +} + +static int prepare_sigactions_from_image(void) +{ + int pid = vpid(current); + struct cr_img *img; + int sig, rst = 0; + int ret = 0; + + pr_info("Restore sigacts for %d\n", pid); + + img = open_image(CR_FD_SIGACT, O_RSTR, pid); + if (!img) + return -1; + + for (sig = 1; sig <= SIGMAX; sig++) { + if (sig == SIGKILL || sig == SIGSTOP) + continue; + + ret = restore_one_sigaction(sig, img, pid); + if (ret < 0) + break; + if (ret) + rst++; + } + + 
pr_info("Restored %d/%d sigacts\n", rst, SIGMAX - 3 /* KILL, STOP and CHLD */); + + close_image(img); + return ret; +} + +static int prepare_sigactions(CoreEntry *core) +{ + int ret; + + if (!task_alive(current)) + return 0; + + if (core->tc->n_sigactions != 0) + ret = prepare_sigactions_from_core(core->tc); + else + ret = prepare_sigactions_from_image(); + + if (stack32) { + free_compat_syscall_stack(stack32); + stack32 = NULL; + } + + return ret; +} + static int __collect_child_pids(struct pstree_item *p, int state, unsigned int *n) { struct pstree_item *pi; @@ -554,23 +811,6 @@ static int open_cores(int pid, CoreEntry *leader_core) } } - for (i = 0; i < current->nr_threads; i++) { - ThreadCoreEntry *tc = cores[i]->thread_core; - struct rst_rseq *rseqs = rsti(current)->rseqe; - RseqEntry *rseqe = tc->rseq_entry; - - /* compatibility with older CRIU versions */ - if (!rseqe) - continue; - - /* rseq cs had no RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL */ - if (!rseqe->has_rseq_cs_pointer) - continue; - - rseqs[i].rseq_abi_pointer = rseqe->rseq_abi_pointer; - rseqs[i].rseq_cs_pointer = rseqe->rseq_cs_pointer; - } - return 0; err: xfree(cores); @@ -604,9 +844,6 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc, struct task_restore_a if (tc->has_child_subreaper) args->child_subreaper = tc->child_subreaper; - if (tc->has_membarrier_registration_mask) - args->membarrier_registration_mask = tc->membarrier_registration_mask; - /* loginuid value is critical to restore */ if (kdat.luid == LUID_FULL && tc->has_loginuid && tc->loginuid != INVALID_UID) { ret = prepare_loginuid(tc->loginuid); @@ -623,6 +860,7 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc, struct task_restore_a return 0; } +static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core); static int prepare_mm(pid_t pid, struct task_restore_args *args); static int restore_one_alive_task(int pid, CoreEntry *core) @@ -715,9 +953,6 @@ static int restore_one_alive_task(int pid, 
CoreEntry *core) if (setup_uffd(pid, ta)) return -1; - if (arch_shstk_prepare(current, core, ta)) - return -1; - return sigreturn_restore(pid, ta, args_len, core); } @@ -1095,22 +1330,7 @@ static inline int fork_with_pid(struct pstree_item *item) return -1; item->pid->state = ca.core->tc->task_state; - - /* - * Zombie tasks' cgroup is not dumped/restored. - * cg_set == 0 is skipped in prepare_task_cgroup() - */ - if (item->pid->state == TASK_DEAD) { - rsti(item)->cg_set = 0; - } else { - if (ca.core->thread_core->has_cg_set) - rsti(item)->cg_set = ca.core->thread_core->cg_set; - else - rsti(item)->cg_set = ca.core->tc->cg_set; - } - - if (ca.core->tc->has_stop_signo) - item->pid->stop_signo = ca.core->tc->stop_signo; + rsti(item)->cg_set = ca.core->tc->cg_set; if (item->pid->state != TASK_DEAD && !task_alive(item)) { pr_err("Unknown task state %d\n", item->pid->state); @@ -1238,8 +1458,6 @@ static inline int fork_with_pid(struct pstree_item *item) pr_debug("PID: real %d virt %d\n", item->pid->real, vpid(item)); } - arch_shstk_unlock(item, ca.core, ret); - err_unlock: if (!(ca.clone_flags & CLONE_NEWPID)) unlock_last_pid(); @@ -1506,7 +1724,7 @@ static int create_children_and_session(void) return 0; } -static int __restore_task_with_children(void *_arg) +static int restore_task_with_children(void *_arg) { struct cr_clone_arg *ca = _arg; pid_t pid; @@ -1542,7 +1760,7 @@ static int __restore_task_with_children(void *_arg) } if (log_init_by_pid(vpid(current))) - goto err; + return -1; if (current->parent == NULL) { /* @@ -1569,19 +1787,9 @@ static int __restore_task_with_children(void *_arg) goto err; } - if (set_opts_cap_eff()) - goto err; - /* Wait prepare_userns */ if (restore_finish_ns_stage(CR_STATE_ROOT_TASK, CR_STATE_PREPARE_NAMESPACES) < 0) goto err; - - /* - * Since we don't support nesting of cgroup namespaces, let's - * only set up the cgns (if it exists) in the init task. 
- */ - if (prepare_cgroup_namespace(current) < 0) - goto err; } if (needs_prep_creds(current) && (prepare_userns_creds())) @@ -1593,7 +1801,7 @@ static int __restore_task_with_children(void *_arg) * we will only move the root one there, others will * just have it inherited. */ - if (restore_task_cgroup(current) < 0) + if (prepare_task_cgroup(current) < 0) goto err; /* Restore root task */ @@ -1698,19 +1906,6 @@ err: exit(1); } -static int restore_task_with_children(void *_arg) -{ - struct cr_clone_arg *arg = _arg; - struct pstree_item *item = arg->item; - CoreEntry *core = arg->core; - - return arch_shstk_trampoline(item, core, __restore_task_with_children, - arg); -} - -int __attribute((weak)) arch_ptrace_restore(int pid, struct pstree_item *item); -int arch_ptrace_restore(int pid, struct pstree_item *item) { return 0; } - static int attach_to_tasks(bool root_seized) { struct pstree_item *item; @@ -1747,12 +1942,6 @@ static int attach_to_tasks(bool root_seized) return -1; } - if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACESYSGOOD)) { - pr_perror("Unable to set PTRACE_O_TRACESYSGOOD for %d", pid); - return -1; - } - if (arch_ptrace_restore(pid, item)) - return -1; /* * Suspend seccomp if necessary. 
We need to do this because * although seccomp is restored at the very end of the @@ -1773,55 +1962,10 @@ static int attach_to_tasks(bool root_seized) return 0; } -static int restore_rseq_cs(void) +static int catch_tasks(bool root_seized, enum trace_flags *flag) { struct pstree_item *item; - for_each_pstree_item(item) { - int i; - - if (!task_alive(item)) - continue; - - if (item->nr_threads == 1) { - item->threads[0].real = item->pid->real; - } else { - if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) { - pr_err("restore_rseq_cs: parse_threads failed\n"); - return -1; - } - } - - for (i = 0; i < item->nr_threads; i++) { - pid_t pid = item->threads[i].real; - struct rst_rseq *rseqe = rsti(item)->rseqe; - - if (!rseqe) { - pr_err("restore_rseq_cs: rsti(item)->rseqe is NULL\n"); - return -1; - } - - if (!rseqe[i].rseq_cs_pointer || !rseqe[i].rseq_abi_pointer) - continue; - - if (ptrace_poke_area( - pid, &rseqe[i].rseq_cs_pointer, - decode_pointer(rseqe[i].rseq_abi_pointer + offsetof(struct criu_rseq, rseq_cs)), - sizeof(uint64_t))) { - pr_err("Can't restore rseq_cs pointer (pid: %d)\n", pid); - return -1; - } - } - } - - return 0; -} - -static int catch_tasks(bool root_seized) -{ - struct pstree_item *item; - bool nobp = fault_injected(FI_NO_BREAKPOINTS) || !kdat.has_breakpoints; - for_each_pstree_item(item) { int status, i, ret; @@ -1848,7 +1992,7 @@ static int catch_tasks(bool root_seized) return -1; } - ret = compel_stop_pie(pid, rsti(item)->breakpoint, nobp); + ret = compel_stop_pie(pid, rsti(item)->breakpoint, flag, fault_injected(FI_NO_BREAKPOINTS)); if (ret < 0) return -1; } @@ -1857,6 +2001,24 @@ static int catch_tasks(bool root_seized) return 0; } +static int clear_breakpoints(void) +{ + struct pstree_item *item; + int ret = 0, i; + + if (fault_injected(FI_NO_BREAKPOINTS)) + return 0; + + for_each_pstree_item(item) { + if (!task_alive(item)) + continue; + for (i = 0; i < item->nr_threads; i++) + ret |= 
ptrace_flush_breakpoints(item->threads[i].real); + } + + return ret; +} + static void finalize_restore(void) { struct pstree_item *item; @@ -1880,14 +2042,8 @@ static void finalize_restore(void) xfree(ctl); - if (opts.final_state == TASK_STOPPED) + if ((item->pid->state == TASK_STOPPED) || (opts.final_state == TASK_STOPPED)) kill(item->pid->real, SIGSTOP); - else if (item->pid->state == TASK_STOPPED) { - if (item->pid->stop_signo > 0) - kill(item->pid->real, item->pid->stop_signo); - else - kill(item->pid->real, SIGSTOP); - } } } @@ -1905,7 +2061,7 @@ static int finalize_restore_detach(void) for (i = 0; i < item->nr_threads; i++) { pid = item->threads[i].real; if (pid < 0) { - pr_err("pstree item has invalid pid %d\n", pid); + pr_err("pstree item has unvalid pid %d\n", pid); continue; } @@ -1997,6 +2153,7 @@ static void reap_zombies(void) static int restore_root_task(struct pstree_item *init) { + enum trace_flags flag = TRACE_ALL; int ret, fd, mnt_ns_fd = -1; int root_seized = 0; struct pstree_item *item; @@ -2120,7 +2277,7 @@ static int restore_root_task(struct pstree_item *init) * the '--empty-ns net' mode no iptables C/R is done and we * need to return these rules by hands. 
*/ - ret = network_lock_internal(/* restore = */ true); + ret = network_lock_internal(); if (ret) goto out_kill; } @@ -2132,9 +2289,6 @@ static int restore_root_task(struct pstree_item *init) __restore_switch_stage(CR_STATE_FORKING); skip_ns_bouncing: - ret = run_plugins(POST_FORKING); - if (ret < 0 && ret != -ENOTSUP) - goto out_kill; ret = restore_wait_inprogress_tasks(); if (ret < 0) @@ -2162,10 +2316,6 @@ skip_ns_bouncing: if (ret < 0) goto out_kill; - ret = stop_cgroupd(); - if (ret < 0) - goto out_kill; - ret = move_veth_to_bridge(); if (ret < 0) goto out_kill; @@ -2218,7 +2368,7 @@ skip_ns_bouncing: timing_stop(TIME_RESTORE); - if (catch_tasks(root_seized)) { + if (catch_tasks(root_seized, &flag)) { pr_err("Can't catch all tasks\n"); goto out_kill_network_unlocked; } @@ -2228,44 +2378,17 @@ skip_ns_bouncing: __restore_switch_stage(CR_STATE_COMPLETE); - ret = compel_stop_on_syscall(task_entries->nr_threads, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1)); + ret = compel_stop_on_syscall(task_entries->nr_threads, __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); if (ret) { pr_err("Can't stop all tasks on rt_sigreturn\n"); goto out_kill_network_unlocked; } + if (clear_breakpoints()) + pr_err("Unable to flush breakpoints\n"); + finalize_restore(); - /* just before releasing threads we have to restore rseq_cs */ - if (restore_rseq_cs()) - pr_err("Unable to restore rseq_cs state\n"); - - /* - * Some external devices such as GPUs might need a very late - * trigger to kick-off some events, memory notifiers and for - * restarting the previously restored queues during criu restore - * stage. This is needed since criu pie code may shuffle VMAs - * around so things such as registering MMU notifiers (for GPU - * mapped memory) could be done sanely once the pie code hands - * over the control to master process. 
- */ - pr_info("Run late stage hook from criu master for external devices\n"); - for_each_pstree_item(item) { - if (!task_alive(item)) - continue; - ret = run_plugins(RESUME_DEVICES_LATE, item->pid->real); - /* - * This may not really be an error. Only certain plugin hooks - * (if available) will return success such as amdgpu_plugin that - * validates the pid of the resuming tasks in the kernel mode. - * Most of the times, it'll be -ENOTSUP and in few cases, it - * might actually be a true error code but that would be also - * captured in the plugin so no need to print the error here. - */ - if (ret < 0 && ret != -ENOTSUP) - pr_debug("restore late stage hook for external plugin failed\n"); - } - ret = run_scripts(ACT_PRE_RESUME); if (ret) pr_err("Pre-resume script ret code %d\n", ret); @@ -2339,7 +2462,6 @@ int prepare_task_entries(void) task_entries->nr_helpers = 0; futex_set(&task_entries->start, CR_STATE_FAIL); mutex_init(&task_entries->userns_sync_lock); - mutex_init(&task_entries->cgroupd_sync_lock); mutex_init(&task_entries->last_pid_mutex); return 0; @@ -2365,48 +2487,42 @@ int cr_restore_tasks(void) if (init_service_fd()) return 1; - if (check_img_inventory(/* restore = */ true) < 0) + if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) return -1; + if (check_img_inventory(/* restore = */ true) < 0) + goto err; + if (init_stats(RESTORE_STATS)) - return -1; + goto err; if (lsm_check_opts()) - return -1; + goto err; timing_start(TIME_RESTORE); if (cpu_init() < 0) - return -1; + goto err; if (vdso_init_restore()) - return -1; + goto err; if (tty_init_restore()) - return -1; + goto err; if (opts.cpu_cap & CPU_CAP_IMAGE) { if (cpu_validate_cpuinfo()) - return -1; + goto err; } if (prepare_task_entries() < 0) - return -1; + goto err; if (prepare_pstree() < 0) - return -1; + goto err; if (fdstore_init()) - return -1; - - /* - * For the AMDGPU plugin, its parallel restore feature needs to use fdstore to store - * its socket file descriptor. 
This allows the main process and the target process to - * communicate with each other through this file descriptor. Therefore, cr_plugin_init - * must be initialized after fdstore_init. - */ - if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE)) - return -1; + goto err; if (inherit_fd_move_to_fdstore()) goto err; @@ -2431,24 +2547,23 @@ err: return ret; } -static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_head *self_vma_list, long min_addr, long vma_len) +static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_head *self_vma_list, long vma_len) { struct vma_area *t_vma, *s_vma; - long prev_vma_end = min_addr; + long prev_vma_end = 0; struct vma_area end_vma; VmaEntry end_e; end_vma.e = &end_e; end_e.start = end_e.end = kdat.task_size; - INIT_LIST_HEAD(&end_vma.list); + prev_vma_end = kdat.mmap_min_addr; s_vma = list_first_entry(self_vma_list, struct vma_area, list); t_vma = list_first_entry(tgt_vma_list, struct vma_area, list); while (1) { if (prev_vma_end + vma_len > s_vma->e->start) { - if ((s_vma->list.next == self_vma_list) || - vma_area_is(vma_next(s_vma), VMA_AREA_GUARD)) { + if (s_vma->list.next == self_vma_list) { s_vma = &end_vma; continue; } @@ -2461,8 +2576,7 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he } if (prev_vma_end + vma_len > t_vma->e->start) { - if ((t_vma->list.next == tgt_vma_list) || - vma_area_is(vma_next(t_vma), VMA_AREA_GUARD)) { + if (t_vma->list.next == tgt_vma_list) { t_vma = &end_vma; continue; } @@ -2480,6 +2594,251 @@ static long restorer_get_vma_hint(struct list_head *tgt_vma_list, struct list_he return -1; } +static inline int timeval_valid(struct timeval *tv) +{ + return (tv->tv_sec >= 0) && ((unsigned long)tv->tv_usec < USEC_PER_SEC); +} + +static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) +{ + if (ie->isec == 0 && ie->iusec == 0) { + memzero_p(val); + return 0; + } + + val->it_interval.tv_sec = ie->isec; + 
val->it_interval.tv_usec = ie->iusec; + + if (!timeval_valid(&val->it_interval)) { + pr_err("Invalid timer interval\n"); + return -1; + } + + if (ie->vsec == 0 && ie->vusec == 0) { + /* + * Remaining time was too short. Set it to + * interval to make the timer armed and work. + */ + val->it_value.tv_sec = ie->isec; + val->it_value.tv_usec = ie->iusec; + } else { + val->it_value.tv_sec = ie->vsec; + val->it_value.tv_usec = ie->vusec; + } + + if (!timeval_valid(&val->it_value)) { + pr_err("Invalid timer value\n"); + return -1; + } + + pr_info("Restored %s timer to %ld.%ld -> %ld.%ld\n", n, val->it_value.tv_sec, val->it_value.tv_usec, + val->it_interval.tv_sec, val->it_interval.tv_usec); + + return 0; +} + +/* + * Legacy itimers restore from CR_FD_ITIMERS + */ + +static int prepare_itimers_from_fd(int pid, struct task_restore_args *args) +{ + int ret = -1; + struct cr_img *img; + ItimerEntry *ie; + + if (!deprecated_ok("Itimers")) + return -1; + + img = open_image(CR_FD_ITIMERS, O_RSTR, pid); + if (!img) + return -1; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("real", ie, &args->itimers[0]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("virt", ie, &args->itimers[1]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; + + ret = pb_read_one(img, &ie, PB_ITIMER); + if (ret < 0) + goto out; + ret = decode_itimer("prof", ie, &args->itimers[2]); + itimer_entry__free_unpacked(ie, NULL); + if (ret < 0) + goto out; +out: + close_image(img); + return ret; +} + +static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core) +{ + int ret = 0; + TaskTimersEntry *tte = core->tc->timers; + + if (!tte) + return prepare_itimers_from_fd(pid, args); + + ret |= decode_itimer("real", tte->real, &args->itimers[0]); + ret |= decode_itimer("virt", tte->virt, &args->itimers[1]); + ret |= 
decode_itimer("prof", tte->prof, &args->itimers[2]); + + return ret; +} + +static inline int timespec_valid(struct timespec *ts) +{ + return (ts->tv_sec >= 0) && ((unsigned long)ts->tv_nsec < NSEC_PER_SEC); +} + +static inline int decode_posix_timer(PosixTimerEntry *pte, struct restore_posix_timer *pt) +{ + pt->val.it_interval.tv_sec = pte->isec; + pt->val.it_interval.tv_nsec = pte->insec; + + if (!timespec_valid(&pt->val.it_interval)) { + pr_err("Invalid timer interval(posix)\n"); + return -1; + } + + if (pte->vsec == 0 && pte->vnsec == 0) { + /* + * Remaining time was too short. Set it to + * interval to make the timer armed and work. + */ + pt->val.it_value.tv_sec = pte->isec; + pt->val.it_value.tv_nsec = pte->insec; + } else { + pt->val.it_value.tv_sec = pte->vsec; + pt->val.it_value.tv_nsec = pte->vnsec; + } + + if (!timespec_valid(&pt->val.it_value)) { + pr_err("Invalid timer value(posix)\n"); + return -1; + } + + pt->spt.it_id = pte->it_id; + pt->spt.clock_id = pte->clock_id; + pt->spt.si_signo = pte->si_signo; + pt->spt.it_sigev_notify = pte->it_sigev_notify; + pt->spt.sival_ptr = decode_pointer(pte->sival_ptr); + pt->spt.notify_thread_id = pte->notify_thread_id; + pt->overrun = pte->overrun; + + return 0; +} + +static int cmp_posix_timer_proc_id(const void *p1, const void *p2) +{ + return ((struct restore_posix_timer *)p1)->spt.it_id - ((struct restore_posix_timer *)p2)->spt.it_id; +} + +static void sort_posix_timers(struct task_restore_args *ta) +{ + void *tmem; + + /* + * This is required for restorer's create_posix_timers(), + * it will probe them one-by-one for the desired ID, since + * kernel doesn't provide another API for timer creation + * with given ID. 
+ */ + + if (ta->posix_timers_n > 0) { + tmem = rst_mem_remap_ptr((unsigned long)ta->posix_timers, RM_PRIVATE); + qsort(tmem, ta->posix_timers_n, sizeof(struct restore_posix_timer), cmp_posix_timer_proc_id); + } +} + +/* + * Legacy posix timers restoration from CR_FD_POSIX_TIMERS + */ + +static int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) +{ + struct cr_img *img; + int ret = -1; + struct restore_posix_timer *t; + + if (!deprecated_ok("Posix timers")) + return -1; + + img = open_image(CR_FD_POSIX_TIMERS, O_RSTR, pid); + if (!img) + return -1; + + ta->posix_timers_n = 0; + while (1) { + PosixTimerEntry *pte; + + ret = pb_read_one_eof(img, &pte, PB_POSIX_TIMER); + if (ret <= 0) + break; + + t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); + if (!t) + break; + + ret = decode_posix_timer(pte, t); + if (ret < 0) + break; + + posix_timer_entry__free_unpacked(pte, NULL); + ta->posix_timers_n++; + } + + close_image(img); + if (!ret) + sort_posix_timers(ta); + + return ret; +} + +static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) +{ + int i, ret = -1; + TaskTimersEntry *tte = core->tc->timers; + struct restore_posix_timer *t; + + ta->posix_timers = (struct restore_posix_timer *)rst_mem_align_cpos(RM_PRIVATE); + + if (!tte) + return prepare_posix_timers_from_fd(pid, ta); + + ta->posix_timers_n = tte->n_posix; + for (i = 0; i < ta->posix_timers_n; i++) { + t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); + if (!t) + goto out; + + if (decode_posix_timer(tte->posix[i], t)) + goto out; + } + + ret = 0; + sort_posix_timers(ta); +out: + return ret; +} + +static inline int verify_cap_size(CredsEntry *ce) +{ + return ((ce->n_cap_inh == CR_CAP_SIZE) && (ce->n_cap_eff == CR_CAP_SIZE) && (ce->n_cap_prm == CR_CAP_SIZE) && + (ce->n_cap_bnd == CR_CAP_SIZE)); +} + static int prepare_mm(pid_t pid, struct task_restore_args *args) { int exe_fd, i, ret = -1; @@ -2505,7 +2864,7 @@ static int 
prepare_mm(pid_t pid, struct task_restore_args *args) args->fd_exe_link = exe_fd; - args->thp_disabled = mm->has_thp_disabled && mm->thp_disabled; + args->has_thp_enabled = rsti(current)->has_thp_enabled; ret = 0; out: @@ -2571,17 +2930,6 @@ static int remap_restorer_blob(void *addr) restorer_setup_c_header_desc(&pbd, true); compel_relocs_apply(addr, addr, &pbd); - /* - * Ensure the infected thread sees the updated code. - * - * On architectures like ARM64, the Data Cache (D-cache) and - * Instruction Cache (I-cache) are not automatically coherent. - * Modifications land in the D-cache, so we must flush (clean) the - * D-cache to push changes to RAM to ensure the CPU fetches the updated - * instructions. - */ - __builtin___clear_cache(addr, addr + pbd.hdr.bsize); - return 0; } @@ -2590,7 +2938,7 @@ static int validate_sched_parm(struct rst_sched_param *sp) if ((sp->nice < -20) || (sp->nice > 19)) return 0; - switch (sp->policy & ~SCHED_RESET_ON_FORK) { + switch (sp->policy) { case SCHED_RR: case SCHED_FIFO: return ((sp->prio > 0) && (sp->prio < 100)); @@ -2623,55 +2971,6 @@ static int prep_sched_info(struct rst_sched_param *sp, ThreadCoreEntry *tc) return 0; } -static int prep_rseq(struct rst_rseq_param *rseq, ThreadCoreEntry *tc) -{ - /* compatibility with older CRIU versions */ - if (!tc->rseq_entry) - return 0; - - rseq->rseq_abi_pointer = tc->rseq_entry->rseq_abi_pointer; - rseq->rseq_abi_size = tc->rseq_entry->rseq_abi_size; - rseq->signature = tc->rseq_entry->signature; - - if (rseq->rseq_abi_pointer && !kdat.has_rseq) { - pr_err("rseq: can't restore as kernel doesn't support it\n"); - return -1; - } - - return 0; -} - -static void prep_libc_rseq_info(struct rst_rseq_param *rseq) -{ - if (!kdat.has_rseq) { - rseq->rseq_abi_pointer = 0; - return; - } - - if (!kdat.has_ptrace_get_rseq_conf) { -#if defined(__GLIBC__) && defined(RSEQ_SIG) - rseq->rseq_abi_pointer = encode_pointer(__criu_thread_pointer() + __rseq_offset); - /* - * Current glibc reports the 
feature/active size in - * __rseq_size, not the size passed to the kernel. - * This could be 20, but older kernels expect 32 for - * the size argument even if only 20 bytes are used. - */ - rseq->rseq_abi_size = __rseq_size; - if (rseq->rseq_abi_size < 32) - rseq->rseq_abi_size = 32; - rseq->signature = RSEQ_SIG; -#else - rseq->rseq_abi_pointer = 0; -#endif - return; - } - - rseq->rseq_abi_pointer = kdat.libc_rseq_conf.rseq_abi_pointer; - rseq->rseq_abi_size = kdat.libc_rseq_conf.rseq_abi_size; - rseq->signature = kdat.libc_rseq_conf.signature; -} - static rlim_t decode_rlim(rlim_t ival) { return ival == -1 ? RLIM_INFINITY : ival; @@ -2759,11 +3058,11 @@ static int prepare_rlimits(int pid, struct task_restore_args *ta, CoreEntry *cor return 0; } -static int signal_to_mem(SiginfoEntry *se) +static int signal_to_mem(SiginfoEntry *sie) { siginfo_t *info, *t; - info = (siginfo_t *)se->siginfo.data; + info = (siginfo_t *)sie->siginfo.data; t = rst_mem_alloc(sizeof(siginfo_t), RM_PRIVATE); if (!t) return -1; @@ -2784,24 +3083,24 @@ static int open_signal_image(int type, pid_t pid, unsigned int *nr) *nr = 0; while (1) { - SiginfoEntry *se; + SiginfoEntry *sie; - ret = pb_read_one_eof(img, &se, PB_SIGINFO); + ret = pb_read_one_eof(img, &sie, PB_SIGINFO); if (ret <= 0) break; - if (se->siginfo.len != sizeof(siginfo_t)) { + if (sie->siginfo.len != sizeof(siginfo_t)) { pr_err("Unknown image format\n"); ret = -1; break; } - ret = signal_to_mem(se); + ret = signal_to_mem(sie); if (ret) break; (*nr)++; - siginfo_entry__free_unpacked(se, NULL); + siginfo_entry__free_unpacked(sie, NULL); } close_image(img); @@ -2913,31 +3212,17 @@ static bool groups_match(gid_t *groups, int n_groups) return ret; } -static void copy_caps(u32 *out_caps, u32 *in_caps, int n_words) -{ - int i, cap_end; - - for (i = kdat.last_cap + 1; i < 32 * n_words; ++i) { - if (~in_caps[i / 32] & (1 << (i % 32))) - continue; - - pr_warn("Dropping unsupported capability %d > %d)\n", i, kdat.last_cap); - /* extra 
caps will be cleared below */ - } - - n_words = min(n_words, (kdat.last_cap + 31) / 32); - cap_end = (kdat.last_cap & 31) + 1; - memcpy(out_caps, in_caps, sizeof(*out_caps) * n_words); - if ((cap_end & 31) && n_words) - out_caps[n_words - 1] &= (1 << cap_end) - 1; - memset(out_caps + n_words, 0, sizeof(*out_caps) * (CR_CAP_SIZE - n_words)); -} - static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos) { unsigned long this_pos; struct thread_creds_args *args; + if (!verify_cap_size(ce)) { + pr_err("Caps size mismatch %d %d %d %d\n", (int)ce->n_cap_inh, (int)ce->n_cap_eff, (int)ce->n_cap_prm, + (int)ce->n_cap_bnd); + return ERR_PTR(-EINVAL); + } + this_pos = rst_mem_align_cpos(RM_PRIVATE); args = rst_mem_alloc(sizeof(*args), RM_PRIVATE); @@ -2973,7 +3258,7 @@ static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo args = rst_mem_remap_ptr(this_pos, RM_PRIVATE); args->lsm_profile = lsm_profile; - __strlcpy(args->lsm_profile, rendered, lsm_profile_len + 1); + strlcpy(args->lsm_profile, rendered, lsm_profile_len + 1); xfree(rendered); } } else { @@ -3007,7 +3292,7 @@ static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo args = rst_mem_remap_ptr(this_pos, RM_PRIVATE); args->lsm_sockcreate = lsm_sockcreate; - __strlcpy(args->lsm_sockcreate, rendered, lsm_sockcreate_len + 1); + strlcpy(args->lsm_sockcreate, rendered, lsm_sockcreate_len + 1); xfree(rendered); } } else { @@ -3022,15 +3307,13 @@ static struct thread_creds_args *rst_prep_creds_args(CredsEntry *ce, unsigned lo args->creds.cap_eff = NULL; args->creds.cap_prm = NULL; args->creds.cap_bnd = NULL; - args->creds.cap_amb = NULL; args->creds.groups = NULL; args->creds.lsm_profile = NULL; - copy_caps(args->cap_inh, ce->cap_inh, ce->n_cap_inh); - copy_caps(args->cap_eff, ce->cap_eff, ce->n_cap_eff); - copy_caps(args->cap_prm, ce->cap_prm, ce->n_cap_prm); - copy_caps(args->cap_bnd, ce->cap_bnd, ce->n_cap_bnd); - 
copy_caps(args->cap_amb, ce->cap_amb, ce->n_cap_amb); + memcpy(args->cap_inh, ce->cap_inh, sizeof(args->cap_inh)); + memcpy(args->cap_eff, ce->cap_eff, sizeof(args->cap_eff)); + memcpy(args->cap_prm, ce->cap_prm, sizeof(args->cap_prm)); + memcpy(args->cap_bnd, ce->cap_bnd, sizeof(args->cap_bnd)); if (ce->n_groups && !groups_match(ce->groups, ce->n_groups)) { unsigned int *groups; @@ -3133,9 +3416,6 @@ static void *restorer_munmap_addr(CoreEntry *core, void *restorer_blob) return restorer_sym(restorer_blob, arch_export_unmap); } -void arch_rsti_init(struct pstree_item *p) __attribute__((weak)); -void arch_rsti_init(struct pstree_item *p) {} - static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, unsigned long alen, CoreEntry *core) { void *mem = MAP_FAILED; @@ -3196,7 +3476,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns rst_mem_size = rst_mem_lock(); memzone_size = round_up(sizeof(struct restore_mem_zone) * current->nr_threads, page_size()); - task_args->bootstrap_len = restorer_len + memzone_size + alen + rst_mem_size + shstk_restorer_stack_size(); + task_args->bootstrap_len = restorer_len + memzone_size + alen + rst_mem_size; BUG_ON(task_args->bootstrap_len & (PAGE_SIZE - 1)); pr_info("%d threads require %ldK of memory\n", current->nr_threads, KBYTES(task_args->bootstrap_len)); @@ -3226,9 +3506,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns * or inited from scratch). 
*/ - mem = (void *)restorer_get_vma_hint(&vmas->h, &self_vmas.h, - shstk_min_mmap_addr(&task_args->shstk, kdat.mmap_min_addr), - task_args->bootstrap_len); + mem = (void *)restorer_get_vma_hint(&vmas->h, &self_vmas.h, task_args->bootstrap_len); if (mem == (void *)-1) { pr_err("No suitable area for task_restore bootstrap (%ldK)\n", task_args->bootstrap_len); goto err; @@ -3346,18 +3624,11 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns strncpy(task_args->comm, core->tc->comm, TASK_COMM_LEN - 1); task_args->comm[TASK_COMM_LEN - 1] = 0; - prep_libc_rseq_info(&task_args->libc_rseq); - - task_args->uid = opts.uid; - for (i = 0; i < CR_CAP_SIZE; i++) - task_args->cap_eff[i] = opts.cap_eff[i]; - /* * Fill up per-thread data. */ creds_pos_next = creds_pos; siginfo_n = task_args->siginfo_n; - arch_rsti_init(current); for (i = 0; i < current->nr_threads; i++) { CoreEntry *tcore; struct rt_sigframe *sigframe; @@ -3410,17 +3681,6 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr; core_get_tls(tcore, &thread_args[i].tls); - if (tcore->thread_core->has_cg_set && rsti(current)->cg_set != tcore->thread_core->cg_set) { - thread_args[i].cg_set = tcore->thread_core->cg_set; - thread_args[i].cgroupd_sk = dup(get_service_fd(CGROUPD_SK)); - } else { - thread_args[i].cg_set = -1; - } - - ret = prep_rseq(&thread_args[i].rseq, tcore->thread_core); - if (ret) - goto err; - rst_reloc_creds(&thread_args[i], &creds_pos_next); thread_args[i].futex_rla = tcore->thread_core->futex_rla; @@ -3467,10 +3727,6 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns * self-vmas are unmaped. 
*/ mem += rst_mem_size; - - shstk_set_restorer_stack(&task_args->shstk, mem); - mem += shstk_restorer_stack_size(); - task_args->vdso_rt_parked_at = (unsigned long)mem; task_args->vdso_maps_rt = vdso_maps_rt; task_args->vdso_rt_size = vdso_rt_size; @@ -3515,7 +3771,6 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns close_service_fd(USERNSD_SK); close_service_fd(FDSTORE_SK_OFF); close_service_fd(RPC_SK_OFF); - close_service_fd(CGROUPD_SK); __gcov_flush(); diff --git a/criu/cr-service.c b/criu/cr-service.c index dccf4ef38..0f8bc4cc1 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -14,7 +14,6 @@ #include #include #include -#include #include "version.h" #include "crtools.h" @@ -170,11 +169,11 @@ int send_criu_dump_resp(int socket_fd, bool success, bool restored) return send_criu_msg(socket_fd, &msg); } -static int send_criu_pre_dump_resp(int socket_fd, bool success, bool single) +static int send_criu_pre_dump_resp(int socket_fd, bool success) { CriuResp msg = CRIU_RESP__INIT; - msg.type = single ? 
CRIU_REQ_TYPE__SINGLE_PRE_DUMP : CRIU_REQ_TYPE__PRE_DUMP; + msg.type = CRIU_REQ_TYPE__PRE_DUMP; msg.success = success; set_resp_err(&msg); @@ -240,165 +239,15 @@ int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd) return 0; } -int exec_rpc_query_external_files(char *name, int sk) -{ - int i, ret; - CriuNotify cn = CRIU_NOTIFY__INIT; - CriuResp msg = CRIU_RESP__INIT; - CriuReq *req; - - cn.script = name; - - msg.type = CRIU_REQ_TYPE__NOTIFY; - msg.success = true; - msg.notify = &cn; - - ret = send_criu_msg_with_fd(sk, &msg, -1); - if (ret < 0) - return ret; - - ret = recv_criu_msg(sk, &req); - if (ret < 0) - return ret; - - if (req->type != CRIU_REQ_TYPE__NOTIFY || !req->notify_success) { - pr_err("RPC client reported script error\n"); - return -1; - } - - ret = 0; - if (req->opts) - for (i = 0; i < req->opts->n_external; i++) { - char *key = req->opts->external[i]; - pr_info("Adding external object: %s\n", key); - if (add_external(key)) { - pr_err("Failed to add external object: %s\n", key); - ret = -1; - } - } - else - pr_info("RPC NOTIFY %s: no `opts` returned.\n", name); - - criu_req__free_unpacked(req, NULL); - return ret; -} - -static int resolve_images_dir_path(char *images_dir_path, - bool imgs_changed_by_rpc_conf, - const CriuOpts *req, - pid_t peer_pid) -{ - /* - * images_dir_fd is a required RPC parameter with -1 as default value. - * - * This assumes that if opts.imgs_dir is set, we have a value - * from the configuration file parser. The test to see that - * imgs_changed_by_rpc_conf is true is used to make sure the value - * is from the RPC configuration file. 
The idea is that only the - * RPC configuration file is able to overwrite RPC settings: - * * apply_config(global_conf) - * * apply_config(user_conf) - * * apply_config(environment variable) - * * apply_rpc_options() - * * apply_config(rpc_conf) - */ - if (imgs_changed_by_rpc_conf) { - strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); - images_dir_path[PATH_MAX - 1] = '\0'; - } else if (req->images_dir_fd != -1) { - snprintf(images_dir_path, PATH_MAX, "/proc/%d/fd/%d", peer_pid, req->images_dir_fd); - } else if (req->images_dir) { - strncpy(images_dir_path, req->images_dir, PATH_MAX - 1); - images_dir_path[PATH_MAX - 1] = '\0'; - } else { - /* - * Since images dir is not required in CHECK mode, we need to - * check for work_dir_fd in setup_images_and_workdir() - */ - if (opts.mode == CR_CHECK) - return 0; - pr_err("Neither images_dir_fd nor images_dir was passed by RPC client.\n"); - return -1; - } - - return 0; -} - -static int setup_images_and_workdir(const char *images_dir_path, - bool work_changed_by_rpc_conf, - CriuOpts *req, - pid_t peer_pid) -{ - char work_dir_path[PATH_MAX] = ""; - - /* We don't need to open images dir in CHECK mode. */ - if (opts.mode != CR_CHECK) { - /* - * Image streaming is not supported with CRIU's service feature as - * the streamer must be started for each dump/restore operation. - * It is unclear how to do that with RPC, so we punt for now. - * This explains why we provide the argument mode=-1 instead of - * O_RSTR or O_DUMP. 
- */ - if (open_image_dir(images_dir_path, -1) < 0) { - pr_perror("Can't open images directory"); - return -1; - } - } - - if (work_changed_by_rpc_conf) - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else if (req->has_work_dir_fd) - sprintf(work_dir_path, "/proc/%d/fd/%d", peer_pid, req->work_dir_fd); - else if (opts.work_dir) - strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); - else if (images_dir_path[0] != '\0') - strcpy(work_dir_path, images_dir_path); - - if (work_dir_path[0] == '\0') { - pr_err("images-dir or work-dir is required when using log file\n"); - return -1; - } - - if (chdir(work_dir_path)) { - pr_perror("Can't chdir to work_dir"); - return -1; - } - - return 0; -} - -static int setup_logging_from_req(CriuOpts *req, bool output_changed_by_rpc_conf) -{ - if (req->log_file && !output_changed_by_rpc_conf) { - if (strchr(req->log_file, '/')) { - pr_perror("No subdirs are allowed in log_file name"); - return -1; - } - SET_CHAR_OPTS(output, req->log_file); - } else if (req->has_log_to_stderr && req->log_to_stderr && !output_changed_by_rpc_conf) { - xfree(opts.output); - opts.output = NULL; /* log_init(NULL) writes to stderr */ - } else if (!opts.output) { - SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); - } - - opts.log_level = req->log_level; - log_set_loglevel(opts.log_level); - if (log_init(opts.output)) { - pr_perror("Can't initiate log"); - return -1; - } - - return 0; -} +static char images_dir[PATH_MAX]; static int setup_opts_from_req(int sk, CriuOpts *req) { struct ucred ids; struct stat st; socklen_t ids_len = sizeof(struct ucred); - char images_dir_path[PATH_MAX] = ""; + char images_dir_path[PATH_MAX]; + char work_dir_path[PATH_MAX]; char status_fd[PATH_MAX]; bool output_changed_by_rpc_conf = false; bool work_changed_by_rpc_conf = false; @@ -411,23 +260,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) goto err; } - /* - * The options relevant in CHECK mode are: log_file, log_to_stderr, and log_level. 
- * When logging to a file, we also need to resolve images_dir and work_dir. - */ - if (opts.mode == CR_CHECK) { - if (!req) - return 0; /* nothing to do */ - - /* - * A log file is needed only if: - * - log_file is explicitly set, or - * - log_to_stderr is NOT requested (i.e., using DEFAULT_LOG_FILENAME) - */ - if (!req->log_file || (req->has_log_to_stderr && req->log_to_stderr)) - return 0; /* no log file, don't require images_dir or work_dir */ - } - if (fstat(sk, &st)) { pr_perror("Can't get socket stat"); goto err; @@ -436,8 +268,149 @@ static int setup_opts_from_req(int sk, CriuOpts *req) BUG_ON(st.st_ino == -1); service_sk_ino = st.st_ino; - if (req->has_unprivileged) - opts.unprivileged = req->unprivileged; + /* + * Evaluate an additional configuration file if specified. + * This needs to happen twice, because it is needed early to detect + * things like work_dir, imgs_dir and logfile. The second parsing + * of the optional RPC configuration file happens at the end and + * overwrites all options set via RPC. + */ + if (req->config_file) { + char *tmp_output = opts.output; + char *tmp_work = opts.work_dir; + char *tmp_imgs = opts.imgs_dir; + + opts.output = NULL; + opts.work_dir = NULL; + opts.imgs_dir = NULL; + + rpc_cfg_file = req->config_file; + i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); + if (i) { + xfree(tmp_output); + xfree(tmp_work); + xfree(tmp_imgs); + goto err; + } + /* If this is non-NULL, the RPC configuration file had a value, use it.*/ + if (opts.output) + output_changed_by_rpc_conf = true; + /* If this is NULL, use the old value if it was set. 
*/ + if (!opts.output && tmp_output) { + opts.output = tmp_output; + tmp_output = NULL; + } + + if (opts.work_dir) + work_changed_by_rpc_conf = true; + if (!opts.work_dir && tmp_work) { + opts.work_dir = tmp_work; + tmp_work = NULL; + } + + if (opts.imgs_dir) + imgs_changed_by_rpc_conf = true; + /* + * As the images directory is a required RPC setting, it is not + * necessary to use the value from other configuration files. + * Either it is set in the RPC configuration file or it is set + * via RPC. + */ + xfree(tmp_output); + xfree(tmp_work); + xfree(tmp_imgs); + } + + /* + * open images_dir - images_dir_fd is a required RPC parameter + * + * This assumes that if opts.imgs_dir is set we have a value + * from the configuration file parser. The test to see that + * imgs_changed_by_rpc_conf is true is used to make sure the value + * is from the RPC configuration file. + * The idea is that only the RPC configuration file is able to + * overwrite RPC settings: + * * apply_config(global_conf) + * * apply_config(user_conf) + * * apply_config(environment variable) + * * apply_rpc_options() + * * apply_config(rpc_conf) + */ + if (imgs_changed_by_rpc_conf) + strncpy(images_dir_path, opts.imgs_dir, PATH_MAX - 1); + else + sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd); + + if (req->parent_img) + SET_CHAR_OPTS(img_parent, req->parent_img); + + /* + * Image streaming is not supported with CRIU's service feature as + * the streamer must be started for each dump/restore operation. + * It is unclear how to do that with RPC, so we punt for now. + * This explains why we provide the argument mode=-1 instead of + * O_RSTR or O_DUMP. 
+ */ + if (open_image_dir(images_dir_path, -1) < 0) { + pr_perror("Can't open images directory"); + goto err; + } + + /* get full path to images_dir to use in process title */ + if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) { + pr_perror("Can't readlink %s", images_dir_path); + goto err; + } + + /* chdir to work dir */ + if (work_changed_by_rpc_conf) + /* Use the value from the RPC configuration file first. */ + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else if (req->has_work_dir_fd) + /* Use the value set via RPC. */ + sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd); + else if (opts.work_dir) + /* Use the value from one of the other configuration files. */ + strncpy(work_dir_path, opts.work_dir, PATH_MAX - 1); + else + /* Use the images directory a work directory. */ + strcpy(work_dir_path, images_dir_path); + + if (chdir(work_dir_path)) { + pr_perror("Can't chdir to work_dir"); + goto err; + } + + /* initiate log file in work dir */ + if (req->log_file && !output_changed_by_rpc_conf) { + /* + * If RPC sets a log file and if there nothing from the + * RPC configuration file, use the RPC value. 
+ */ + if (strchr(req->log_file, '/')) { + pr_perror("No subdirs are allowed in log_file name"); + goto err; + } + + SET_CHAR_OPTS(output, req->log_file); + } else if (!opts.output) { + SET_CHAR_OPTS(output, DEFAULT_LOG_FILENAME); + } + + /* This is needed later to correctly set the log_level */ + opts.log_level = req->log_level; + log_set_loglevel(req->log_level); + if (log_init(opts.output) == -1) { + pr_perror("Can't initiate log"); + goto err; + } + + if (req->config_file) { + pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); + } + + if (kerndat_init()) + return 1; if (log_keep_err()) { pr_perror("Can't tune log"); @@ -448,9 +421,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_leave_running && req->leave_running) opts.final_state = TASK_ALIVE; - if (req->has_leave_stopped && req->leave_stopped) - opts.final_state = TASK_STOPPED; - if (!req->has_pid) { req->has_pid = true; req->pid = ids.pid; @@ -494,9 +464,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->has_shell_job) opts.shell_job = req->shell_job; - if (req->has_skip_file_rwx_check) - opts.skip_file_rwx_check = req->skip_file_rwx_check; - if (req->has_file_locks) opts.handle_file_locks = req->file_locks; @@ -543,9 +510,6 @@ static int setup_opts_from_req(int sk, CriuOpts *req) case CRIU_NETWORK_LOCK_METHOD__NFTABLES: opts.network_lock_method = NETWORK_LOCK_NFTABLES; break; - case CRIU_NETWORK_LOCK_METHOD__SKIP: - opts.network_lock_method = NETWORK_LOCK_SKIP; - break; default: goto err; } @@ -721,6 +685,14 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->empty_ns & ~(CLONE_NEWNET)) goto err; } + + if (req->n_irmap_scan_paths) { + for (i = 0; i < req->n_irmap_scan_paths; i++) { + if (irmap_scan_path_add(req->irmap_scan_paths[i])) + goto err; + } + } + if (req->has_status_fd) { pr_warn("status_fd is obsoleted; use status-ready notification instead\n"); @@ -732,95 +704,22 @@ static int setup_opts_from_req(int sk, 
CriuOpts *req) } } - if (req->orphan_pts_master) - opts.orphan_pts_master = true; - - if (req->has_display_stats) - opts.display_stats = req->display_stats; - - /* Evaluate additional configuration file (e.g., runc.conf) to overwrite all RPC settings. */ - if (req->config_file) { - char *tmp_output = opts.output; - char *tmp_work = opts.work_dir; - - opts.output = NULL; - opts.work_dir = NULL; - - /* - * As the images directory is a required RPC setting, it is not - * necessary to use the value from other configuration files. - * Either it is set in the RPC configuration file or it is set - * via RPC. - */ - xfree(opts.imgs_dir); - opts.imgs_dir = NULL; - - pr_debug("Would overwrite RPC settings with values from %s\n", req->config_file); - - rpc_cfg_file = req->config_file; - i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); - if (i) { - xfree(tmp_output); - xfree(tmp_work); - goto err; - } - - /* If opts.{output,work_dir} is non-NULL, the RPC configuration file had a value, use it.*/ - /* If opts.{output,work_dir} is NULL, use the old value if it was set. 
*/ - if (opts.output) { - output_changed_by_rpc_conf = true; - } else { - opts.output = tmp_output; - tmp_output = NULL; - } - - if (opts.work_dir) { - work_changed_by_rpc_conf = true; - } else { - opts.work_dir = tmp_work; - tmp_work = NULL; - } - - if (opts.imgs_dir) - imgs_changed_by_rpc_conf = true; - - xfree(tmp_output); - xfree(tmp_work); - } - - if (resolve_images_dir_path(images_dir_path, imgs_changed_by_rpc_conf, req, ids.pid) < 0) - goto err; - - if (req->parent_img) - SET_CHAR_OPTS(img_parent, req->parent_img); - - if (setup_images_and_workdir(images_dir_path, work_changed_by_rpc_conf, req, ids.pid)) - goto err; - - if (req->n_irmap_scan_paths) { - for (i = 0; i < req->n_irmap_scan_paths; i++) { - if (irmap_scan_path_add(req->irmap_scan_paths[i])) - goto err; - } - } - - /* initiate log file in work dir */ - if (setup_logging_from_req(req, output_changed_by_rpc_conf)) - goto err; - - if (check_caps()) - goto err; - - if (kerndat_init()) - goto err; - - /* init_pidfd_store_sk must be called after kerndat_init. */ if (req->has_pidfd_store_sk && init_pidfd_store_sk(ids.pid, req->pidfd_store_sk)) goto err; - if (req->mntns_compat_mode) - opts.mntns_compat_mode = true; + if (req->orphan_pts_master) + opts.orphan_pts_master = true; + /* Evaluate additional configuration file a second time to overwrite + * all RPC settings. 
*/ + if (req->config_file) { + rpc_cfg_file = req->config_file; + i = parse_options(0, NULL, &dummy, &dummy, PARSING_RPC_CONF); + if (i) + goto err; + } + + log_set_loglevel(opts.log_level); if (check_options()) goto err; @@ -836,11 +735,10 @@ static int dump_using_req(int sk, CriuOpts *req) bool success = false; bool self_dump = !req->pid; - opts.mode = CR_DUMP; if (setup_opts_from_req(sk, req)) goto exit; - __setproctitle("dump --rpc -t %d", req->pid); + setproctitle("dump --rpc -t %d -D %s", req->pid, images_dir); if (init_pidfd_store_hash()) goto pidfd_store_err; @@ -879,11 +777,10 @@ static int restore_using_req(int sk, CriuOpts *req) opts.restore_detach = true; - opts.mode = CR_RESTORE; if (setup_opts_from_req(sk, req)) goto exit; - __setproctitle("restore --rpc"); + setproctitle("restore --rpc -D %s", images_dir); if (cr_restore_tasks()) goto exit; @@ -922,11 +819,6 @@ static int check(int sk, CriuOpts *req) resp.type = CRIU_REQ_TYPE__CHECK; - if (log_keep_err()) { - pr_perror("Can't tune log"); - goto out; - } - pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -934,9 +826,8 @@ static int check(int sk, CriuOpts *req) } if (pid == 0) { - __setproctitle("check --rpc"); + setproctitle("check --rpc"); - opts.mode = CR_CHECK; if (setup_opts_from_req(sk, req)) exit(1); @@ -951,20 +842,14 @@ static int check(int sk, CriuOpts *req) resp.success = true; out: - set_resp_err(&resp); return send_criu_msg(sk, &resp); } -static int pre_dump_using_req(int sk, CriuOpts *req, bool single) +static int pre_dump_using_req(int sk, CriuOpts *req) { int pid, status; bool success = false; - if (log_keep_err()) { - pr_perror("Can't tune log"); - goto out; - } - pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -974,11 +859,10 @@ static int pre_dump_using_req(int sk, CriuOpts *req, bool single) if (pid == 0) { int ret = 1; - opts.mode = CR_PRE_DUMP; if (setup_opts_from_req(sk, req)) goto cout; - __setproctitle("pre-dump --rpc -t %d", req->pid); + setproctitle("pre-dump 
--rpc -t %d -D %s", req->pid, images_dir); if (init_pidfd_store_hash()) goto pidfd_store_err; @@ -1002,7 +886,7 @@ static int pre_dump_using_req(int sk, CriuOpts *req, bool single) success = true; out: - if (send_criu_pre_dump_resp(sk, success, single) == -1) { + if (send_criu_pre_dump_resp(sk, success) == -1) { pr_perror("Can't send pre-dump resp"); success = false; } @@ -1015,7 +899,7 @@ static int pre_dump_loop(int sk, CriuReq *msg) int ret; do { - ret = pre_dump_using_req(sk, msg->opts, false); + ret = pre_dump_using_req(sk, msg->opts); if (ret < 0) return ret; @@ -1043,11 +927,6 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) CriuPageServerInfo ps = CRIU_PAGE_SERVER_INFO__INIT; struct ps_info info; - if (log_keep_err()) { - pr_perror("Can't tune log"); - goto out; - } - if (pipe(start_pipe)) { pr_perror("No start pipe"); goto out; @@ -1057,11 +936,10 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) if (pid == 0) { close(start_pipe[0]); - opts.mode = CR_PAGE_SERVER; if (setup_opts_from_req(sk, req)) goto out_ch; - __setproctitle("page-server --rpc --address %s --port %hu", opts.addr, opts.port); + setproctitle("page-server --rpc --address %s --port %hu", opts.addr, opts.port); pr_debug("Starting page server\n"); @@ -1121,7 +999,6 @@ static int start_page_server_req(int sk, CriuOpts *req, bool daemon_mode) out: resp.type = CRIU_REQ_TYPE__PAGE_SERVER; resp.success = success; - set_resp_err(&resp); return send_criu_msg(sk, &resp); } @@ -1222,7 +1099,7 @@ static int handle_feature_check(int sk, CriuReq *msg) if (kerndat_init()) exit(1); - __setproctitle("feature-check --rpc"); + setproctitle("feature-check --rpc"); if ((msg->features->has_mem_track == 1) && (msg->features->mem_track == true)) feat.mem_track = kdat.has_dirty_track; @@ -1296,11 +1173,6 @@ static int handle_cpuinfo(int sk, CriuReq *msg) bool success = false; int pid, status; - if (log_keep_err()) { - pr_perror("Can't tune log"); - goto out; - } 
- pid = fork(); if (pid < 0) { pr_perror("Can't fork"); @@ -1310,11 +1182,11 @@ static int handle_cpuinfo(int sk, CriuReq *msg) if (pid == 0) { int ret = 1; - opts.mode = (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ? CR_CPUINFO_DUMP : CR_CPUINFO_CHECK; if (setup_opts_from_req(sk, msg->opts)) goto cout; - __setproctitle("cpuinfo %s --rpc", msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ? "dump" : "check"); + setproctitle("cpuinfo %s --rpc -D %s", msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ? "dump" : "check", + images_dir); if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP) ret = cpuinfo_dump(); @@ -1349,7 +1221,7 @@ static int handle_cpuinfo(int sk, CriuReq *msg) out: resp.type = msg->type; resp.success = success; - set_resp_err(&resp); + return send_criu_msg(sk, &resp); } @@ -1358,17 +1230,7 @@ int cr_service_work(int sk) int ret = -1; CriuReq *msg = 0; - /* - * util_init initializes criu_run_id and compel_run_id so that sockets - * are generated with an unique name identifying the specific process - * even in cases where multiple processes with the same pid in - * different pid namespaces are sharing the same network namespace. 
- */ - util_init(); - more: - opts.mode = CR_SWRK; - if (recv_criu_msg(sk, &msg) != 0) { pr_perror("Can't recv request"); goto err; @@ -1409,9 +1271,6 @@ more: case CRIU_REQ_TYPE__VERSION: ret = handle_version(sk, msg); break; - case CRIU_REQ_TYPE__SINGLE_PRE_DUMP: - ret = pre_dump_using_req(sk, msg->opts, true); - break; default: send_criu_err(sk, "Invalid req"); diff --git a/criu/crtools.c b/criu/crtools.c index 4dc55a065..6a75cd1ea 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -54,74 +54,16 @@ void flush_early_log_to_stderr(void) flush_early_log_buffer(STDERR_FILENO); } -static int image_dir_mode(void) +static int image_dir_mode(char *argv[], int optind) { - switch (opts.mode) { - case CR_DUMP: - /* fallthrough */ - case CR_CPUINFO_DUMP: - /* fallthrough */ - case CR_PRE_DUMP: + if (!strcmp(argv[optind], "dump") || !strcmp(argv[optind], "pre-dump") || + (!strcmp(argv[optind], "cpuinfo") && !strcmp(argv[optind + 1], "dump"))) return O_DUMP; - case CR_RESTORE: + + if (!strcmp(argv[optind], "restore") || + (!strcmp(argv[optind], "cpuinfo") && !strcmp(argv[optind + 1], "restore"))) return O_RSTR; - default: - return -1; - } - /* never reached */ - BUG(); - return -1; -} - -struct { - char *cmd; - int mode; -} commands[] = { - { "dump", CR_DUMP }, - { "pre-dump", CR_PRE_DUMP }, - { "restore", CR_RESTORE }, - { "lazy-pages", CR_LAZY_PAGES }, - { "check", CR_CHECK }, - { "page-server", CR_PAGE_SERVER }, - { "service", CR_SERVICE }, - { "swrk", CR_SWRK }, - { "dedup", CR_DEDUP }, - { "exec", CR_EXEC_DEPRECATED }, - { "show", CR_SHOW_DEPRECATED }, -}; - -static int parse_criu_mode(int argc, char **argv, int *optind) -{ - char *cmd = argv[*optind]; - bool has_sub_command = (argc - *optind) > 1; - char *subcommand = has_sub_command ? 
argv[*optind + 1] : NULL; - int i; - - for (i = 0; i < ARRAY_SIZE(commands); i++) { - if (strcmp(cmd, commands[i].cmd)) - continue; - opts.mode = commands[i].mode; - return 0; - } - - if (!strcmp(cmd, "cpuinfo")) { - if (subcommand == NULL) { - pr_err("cpuinfo requires an action: dump or check\n"); - return -1; - } - if (!strcmp(subcommand, "dump")) - opts.mode = CR_CPUINFO_DUMP; - else if (!strcmp(subcommand, "check")) - opts.mode = CR_CPUINFO_CHECK; - else { - pr_err("unknown cpuinfo sub-command: %s\n", subcommand); - return -1; - } - (*optind)++; - return 0; - } - pr_err("unknown command: %s\n", argv[*optind]); return -1; } @@ -132,7 +74,6 @@ int main(int argc, char *argv[], char *envp[]) bool has_exec_cmd = false; bool has_sub_command; int state = PARSING_GLOBAL_CONF; - char *cmd; BUILD_BUG_ON(CTL_32 != SYSCTL_TYPE__CTL_32); BUILD_BUG_ON(__CTL_STR != SYSCTL_TYPE__CTL_STR); @@ -145,7 +86,7 @@ int main(int argc, char *argv[], char *envp[]) } cr_pb_init(); - __setproctitle_init(argc, argv, envp); + setproctitle_init(argc, argv, envp); if (argc < 2) goto usage; @@ -165,37 +106,7 @@ int main(int argc, char *argv[], char *envp[]) log_set_loglevel(opts.log_level); - /* - * There kernel might send us lethal signals in the following cases: - * 1) Writing a pipe which reader has disappeared. - * 2) Writing to a socket of type SOCK_STREAM which is no longer connected. - * We deal with write()/Send() failures on our own, and prefer not to get killed. - * So we ignore SIGPIPEs. - * - * Pipes are used in various places: - * 1) Receiving application page data - * 2) Transmitting data to the image streamer - * 3) Emitting logs (potentially to a pipe). - * Sockets are mainly used in transmitting memory data. 
- */ - if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { - pr_perror("Failed to set a SIGPIPE signal ignore."); - return 1; - } - - cmd = argv[optind]; - ret = parse_criu_mode(argc, argv, &optind); - if (ret) - goto usage; - - /* - * util_init initializes criu_run_id and compel_run_id so that sockets - * are generated with an unique name identifying the specific process - * even in cases where multiple processes with the same pid in - * different pid namespaces are sharing the same network namespace. - */ - util_init(); - if (opts.mode == CR_SWRK) { + if (optind < argc && !strcmp(argv[optind], "swrk")) { if (argc != optind + 2) { fprintf(stderr, "Usage: criu swrk \n"); return 1; @@ -210,7 +121,7 @@ int main(int argc, char *argv[], char *envp[]) return cr_service_work(atoi(argv[optind + 1])); } - if (check_caps()) + if (check_options()) return 1; if (opts.imgs_dir == NULL) @@ -227,7 +138,7 @@ int main(int argc, char *argv[], char *envp[]) goto usage; } - if (opts.mode != CR_RESTORE) { + if (strcmp(argv[optind], "restore")) { pr_err("--exec-cmd is available for the restore command only\n"); goto usage; } @@ -242,30 +153,49 @@ int main(int argc, char *argv[], char *envp[]) return 1; memcpy(opts.exec_cmd, &argv[optind + 1], (argc - optind - 1) * sizeof(char *)); opts.exec_cmd[argc - optind - 1] = NULL; - } else if (has_sub_command) { - pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? "s" : "", cmd); - goto usage; + } else { + /* No subcommands except for cpuinfo and restore --exec-cmd */ + if (strcmp(argv[optind], "cpuinfo") && has_sub_command) { + pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? 
"s" : "", argv[optind]); + goto usage; + } } - if (opts.stream && image_dir_mode() == -1) { - pr_err("--stream cannot be used with the %s command\n", cmd); + if (opts.stream && image_dir_mode(argv, optind) == -1) { + pr_err("--stream cannot be used with the %s command\n", argv[optind]); goto usage; } /* We must not open imgs dir, if service is called */ - if (opts.mode != CR_SERVICE) { - ret = open_image_dir(opts.imgs_dir, image_dir_mode()); + if (strcmp(argv[optind], "service")) { + ret = open_image_dir(opts.imgs_dir, image_dir_mode(argv, optind)); if (ret < 0) { pr_err("Couldn't open image dir %s\n", opts.imgs_dir); return 1; } } + /* + * The kernel might send us lethal signals when writing to a pipe + * which reader has disappeared. We deal with write() failures on our + * own, and prefer not to get killed. So we ignore SIGPIPEs. + * + * Pipes are used in various places: + * 1) Receiving application page data + * 2) Transmitting data to the image streamer + * 3) Emitting logs (potentially to a pipe). 
+ */ + if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { + pr_perror("Failed to set a SIGPIPE signal ignore."); + return 1; + } + /* * When a process group becomes an orphan, * its processes are sent a SIGHUP signal */ - if (opts.mode == CR_RESTORE && opts.restore_detach && opts.final_state == TASK_STOPPED && opts.shell_job) + if (!strcmp(argv[optind], "restore") && opts.restore_detach && opts.final_state == TASK_STOPPED && + opts.shell_job) pr_warn("Stopped and detached shell job will get SIGHUP from OS.\n"); if (chdir(opts.work_dir)) { @@ -281,14 +211,11 @@ int main(int argc, char *argv[], char *envp[]) return 1; } - if (check_options()) - return 1; - if (fault_injected(FI_CANNOT_MAP_VDSO)) kdat.can_map_vdso = 0; if (!list_empty(&opts.inherit_fds)) { - if (opts.mode != CR_RESTORE) { + if (strcmp(argv[optind], "restore")) { pr_err("--inherit-fd is restore-only option\n"); return 1; } @@ -299,13 +226,13 @@ int main(int argc, char *argv[], char *envp[]) if (opts.img_parent) pr_info("Will do snapshot from %s\n", opts.img_parent); - switch (opts.mode) { - case CR_DUMP: + if (!strcmp(argv[optind], "dump")) { if (!opts.tree_id) goto opt_pid_missing; - return cr_dump_tasks(opts.tree_id); - case CR_PRE_DUMP: + } + + if (!strcmp(argv[optind], "pre-dump")) { if (!opts.tree_id) goto opt_pid_missing; @@ -315,7 +242,9 @@ int main(int argc, char *argv[], char *envp[]) } return cr_pre_dump_tasks(opts.tree_id) != 0; - case CR_RESTORE: + } + + if (!strcmp(argv[optind], "restore")) { if (opts.tree_id) pr_warn("Using -t with criu restore is obsoleted\n"); @@ -328,41 +257,46 @@ int main(int argc, char *argv[], char *envp[]) } return ret != 0; + } - case CR_LAZY_PAGES: + if (!strcmp(argv[optind], "lazy-pages")) return cr_lazy_pages(opts.daemon_mode) != 0; - case CR_CHECK: + if (!strcmp(argv[optind], "check")) return cr_check() != 0; - case CR_PAGE_SERVER: + if (!strcmp(argv[optind], "page-server")) return cr_page_server(opts.daemon_mode, false, -1) != 0; - case CR_SERVICE: + if 
(!strcmp(argv[optind], "service")) return cr_service(opts.daemon_mode); - case CR_DEDUP: + if (!strcmp(argv[optind], "dedup")) return cr_dedup() != 0; - case CR_CPUINFO_DUMP: - return cpuinfo_dump(); + if (!strcmp(argv[optind], "cpuinfo")) { + if (!argv[optind + 1]) { + pr_err("cpuinfo requires an action: dump or check\n"); + goto usage; + } + if (!strcmp(argv[optind + 1], "dump")) + return cpuinfo_dump(); + else if (!strcmp(argv[optind + 1], "check")) + return cpuinfo_check(); + } - case CR_CPUINFO_CHECK: - return cpuinfo_check(); - - case CR_EXEC_DEPRECATED: + if (!strcmp(argv[optind], "exec")) { pr_err("The \"exec\" action is deprecated by the Compel library.\n"); return -1; + } - case CR_SHOW_DEPRECATED: + if (!strcmp(argv[optind], "show")) { pr_err("The \"show\" action is deprecated by the CRIT utility.\n"); pr_err("To view an image use the \"crit decode -i $name --pretty\" command.\n"); return -1; - - case CR_UNSET: - default: - pr_err("unknown command: %s\n", cmd); } + + pr_err("unknown command: %s\n", argv[optind]); usage: pr_msg("\n" "Usage:\n" @@ -420,13 +354,9 @@ usage: " in lazy-pages mode: 'criu lazy-pages -D DIR'\n" " --lazy-pages and lazy-pages mode require userfaultfd\n" " --stream dump/restore images using criu-image-streamer\n" - " --mntns-compat-mode Use mount engine in compatibility mode. 
By default criu\n" - " tries to use mount-v2 mode with more reliable algorithm\n" - " based on MOVE_MOUNT_SET_GROUP kernel feature\n" - " --network-lock METHOD network locking/unlocking method; argument\n" - " can be 'nftables' or 'iptables' (default).\n" - " --unprivileged accept limitations when running as non-root\n" - " --allow-uprobes allow dump/restore with uprobes vma\n" + " --network-lock METHOD\n" + " network locking/unlocking method; argument\n" + " can be 'nftables' or 'iptables' (default).\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" @@ -454,13 +384,10 @@ usage: " is inaccessible\n" " --link-remap allow one to link unlinked files back when possible\n" " --ghost-limit size limit max size of deleted file contents inside image\n" - " --ghost-fiemap enable dumping of deleted files using fiemap\n" " --action-script FILE add an external action script\n" " -j|--" OPT_SHELL_JOB " allow one to dump and restore shell jobs\n" " -l|--" OPT_FILE_LOCKS " handle file locks, for safety, only used for container\n" " -L|--libdir path to a plugin directory (by default " CR_PLUGIN_DEFAULT ")\n" - " --timeout NUM a timeout (in seconds) on collecting tasks during dump\n" - " (default 10 seconds)\n" " --force-irmap force resolving names for inotify/fsnotify watches\n" " --irmap-scan-path FILE\n" " add a path the irmap hints to scan\n" @@ -503,8 +430,8 @@ usage: " Inherit file descriptors, treating fd NUM as being\n" " already opened via an existing RES, which can be:\n" " tty[rdev:dev]\n" - " pipe:[inode]\n" - " socket:[inode]\n" + " pipe[inode]\n" + " socket[inode]\n" " file[mnt_id:inode]\n" " /memfd:name\n" " path/to/file\n" @@ -518,9 +445,6 @@ usage: " --file-validation METHOD\n" " pass the validation method to be used; argument\n" " can be 'filesize' or 'buildid' (default).\n" - " --skip-file-rwx-check\n" - " Skip checking file permissions\n" - " (r/w/x for u/g/o) on restore.\n" "\n" "Check options:\n" " 
Without options, \"criu check\" checks availability of absolutely required\n" diff --git a/criu/eventpoll.c b/criu/eventpoll.c index ca5ee9c59..978dca5be 100644 --- a/criu/eventpoll.c +++ b/criu/eventpoll.c @@ -308,7 +308,7 @@ static int dump_one_eventpoll(int lfd, u32 id, const struct fd_parms *p) * files is tricky: we need to use kcmp * to find out where file came from. Until * it's implemented lets use simpler approach - * just check the targets are belonging to the + * just check the targets are blonging to the * pid's file set. */ if (p->dfds) { diff --git a/criu/fault-injection.c b/criu/fault-injection.c index 5dd9acf60..83dc1fc8d 100644 --- a/criu/fault-injection.c +++ b/criu/fault-injection.c @@ -1,7 +1,6 @@ #include #include "criu-log.h" #include "fault-injection.h" -#include "seize.h" enum faults fi_strategy; @@ -22,13 +21,5 @@ int fault_injection_init(void) } fi_strategy = start; - - switch (fi_strategy) { - case FI_COMPEL_INTERRUPT_ONLY_MODE: - set_compel_interrupt_only_mode(); - break; - default: - break; - }; return 0; } diff --git a/criu/fdstore.c b/criu/fdstore.c index 6ac639c55..77935484f 100644 --- a/criu/fdstore.c +++ b/criu/fdstore.c @@ -12,17 +12,11 @@ #include "xmalloc.h" #include "rst-malloc.h" #include "log.h" -#include "util.h" -#include "cr_options.h" -#include "util-caps.h" -#include "sockets.h" -/* clang-format off */ static struct fdstore_desc { int next_id; mutex_t lock; /* to protect a peek offset */ -} *desc; -/* clang-format on */ +} * desc; int fdstore_init(void) { @@ -52,14 +46,15 @@ int fdstore_init(void) return -1; } - if (sk_setbufs(sk, buf)) { + if (setsockopt(sk, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0], sizeof(buf[0])) < 0 || + setsockopt(sk, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1], sizeof(buf[1])) < 0) { + pr_perror("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE"); close(sk); return -1; } addr.sun_family = AF_UNIX; - addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-fdstore-%" PRIx64 "-%s", st.st_ino, - criu_run_id); + 
addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-fdstore-%" PRIx64, st.st_ino); addrlen += sizeof(addr.sun_family); addr.sun_path[0] = 0; diff --git a/criu/file-ids.c b/criu/file-ids.c index 772bd92cf..1b9d68888 100644 --- a/criu/file-ids.c +++ b/criu/file-ids.c @@ -77,14 +77,8 @@ int fd_id_generate_special(struct fd_parms *p, u32 *id) fi = fd_id_cache_lookup(p); if (fi) { - if (p->stat.st_mode & (S_IFCHR | S_IFBLK)) { - /* Don't cache the id for mapped devices */ - *id = fd_tree.subid++; - return 1; - } else { - *id = fi->id; - return 0; - } + *id = fi->id; + return 0; } } diff --git a/criu/files-ext.c b/criu/files-ext.c index 4cc99d921..95ec8e37c 100644 --- a/criu/files-ext.c +++ b/criu/files-ext.c @@ -45,11 +45,10 @@ static int open_fd(struct file_desc *d, int *new_fd) { struct ext_file_info *xfi; int fd; - bool retry_needed; xfi = container_of(d, struct ext_file_info, d); - fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id, &retry_needed); + fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id); if (fd < 0) { pr_err("Unable to restore %#x\n", xfi->xfe->id); return -1; @@ -58,11 +57,8 @@ static int open_fd(struct file_desc *d, int *new_fd) if (restore_fown(fd, xfi->xfe->fown)) return -1; - if (!retry_needed) - *new_fd = fd; - else - *new_fd = -1; - return retry_needed; + *new_fd = fd; + return 0; } static struct file_desc_ops ext_desc_ops = { diff --git a/criu/files-reg.c b/criu/files-reg.c index 66c0e6cda..ee54d1d7d 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -11,13 +11,8 @@ #include #include #include -#include +#include #include -#include -#include - -#include "tty.h" -#include "stats.h" #ifndef SEEK_DATA #define SEEK_DATA 3 @@ -34,8 +29,6 @@ * and checked. 
*/ #define BUILD_ID_MAP_SIZE 1048576 -#define ST_UNIT 512 -#define EXTENT_MAX_COUNT 512 #include "cr_options.h" #include "imgset.h" @@ -60,7 +53,6 @@ #include "files-reg.h" #include "plugin.h" -#include "string.h" int setfsuid(uid_t fsuid); int setfsgid(gid_t fsuid); @@ -86,7 +78,7 @@ static LIST_HEAD(ghost_files); /* * When opening remaps we first create a link on the remap * target, then open one, then unlink. In case the remap - * source has more than one instance, these three steps + * source has more than one instance, these tree steps * should be serialized with each other. */ static mutex_t *remap_open_lock; @@ -225,92 +217,6 @@ static int copy_file_to_chunks(int fd, struct cr_img *img, size_t file_size) return 0; } -static int skip_outstanding(struct fiemap_extent *fe, size_t file_size) -{ - /* Skip outstanding extent */ - if (fe->fe_logical > file_size) - return 1; - - /* Skip outstanding part of the extent */ - if (fe->fe_logical + fe->fe_length > file_size) - fe->fe_length = file_size - fe->fe_logical; - return 0; -} - -static int copy_file_to_chunks_fiemap(int fd, struct cr_img *img, size_t file_size) -{ - GhostChunkEntry ce = GHOST_CHUNK_ENTRY__INIT; - struct fiemap *fiemap_buf; - struct fiemap_extent *ext_buf; - int ext_buf_size, fie_buf_size; - off_t pos = 0; - unsigned int i; - int ret = 0; - int exit_code = 0; - - ext_buf_size = EXTENT_MAX_COUNT * sizeof(struct fiemap_extent); - fie_buf_size = sizeof(struct fiemap) + ext_buf_size; - - fiemap_buf = xzalloc(fie_buf_size); - if (!fiemap_buf) { - pr_perror("Out of memory when allocating fiemap"); - return -1; - } - - ext_buf = fiemap_buf->fm_extents; - fiemap_buf->fm_length = FIEMAP_MAX_OFFSET; - fiemap_buf->fm_flags |= FIEMAP_FLAG_SYNC; - fiemap_buf->fm_extent_count = EXTENT_MAX_COUNT; - - do { - fiemap_buf->fm_start = pos; - memzero(ext_buf, ext_buf_size); - ret = ioctl(fd, FS_IOC_FIEMAP, fiemap_buf); - if (ret < 0) { - if (errno == EOPNOTSUPP) { - exit_code = -EOPNOTSUPP; - } else { - exit_code = 
-1; - pr_perror("fiemap ioctl() failed"); - } - goto out; - } else if (fiemap_buf->fm_mapped_extents == 0) { - goto out; - } - - for (i = 0; i < fiemap_buf->fm_mapped_extents; i++) { - if (skip_outstanding(&fiemap_buf->fm_extents[i], file_size)) - continue; - - ce.len = fiemap_buf->fm_extents[i].fe_length; - ce.off = fiemap_buf->fm_extents[i].fe_logical; - - if (pb_write_one(img, &ce, PB_GHOST_CHUNK)) { - exit_code = -1; - goto out; - } - - if (copy_chunk_from_file(fd, img_raw_fd(img), ce.off, ce.len)) { - exit_code = -1; - goto out; - } - - if (fiemap_buf->fm_extents[i].fe_flags & FIEMAP_EXTENT_LAST) { - /* there are no extents left, break. */ - goto out; - } - } - - /* Record file's logical offset as pos */ - pos = ce.len + ce.off; - - /* Since there are still extents left, continue. */ - } while (fiemap_buf->fm_mapped_extents == EXTENT_MAX_COUNT); -out: - xfree(fiemap_buf); - return exit_code; -} - static int copy_chunk_to_file(int img, int fd, off_t off, size_t len) { int ret; @@ -407,31 +313,74 @@ static int mklnk_ghost(char *path, GhostFileEntry *gfe) static int ghost_apply_metadata(const char *path, GhostFileEntry *gfe) { struct timeval tv[2]; + int ret = -1; - if (cr_fchpermat(AT_FDCWD, path, gfe->uid, gfe->gid, gfe->mode, AT_SYMLINK_NOFOLLOW) < 0) - return -1; + if (S_ISLNK(gfe->mode)) { + if (lchown(path, gfe->uid, gfe->gid) < 0) { + pr_perror("Can't reset user/group on ghost %s", path); + goto err; + } - if (!gfe->atim) - return 0; + /* + * We have no lchmod() function, and fchmod() will fail on + * O_PATH | O_NOFOLLOW fd. Yes, we have fchmodat() + * function and flag AT_SYMLINK_NOFOLLOW described in + * man 2 fchmodat, but it is not currently implemented. 
%) + */ + } else { + if (chown(path, gfe->uid, gfe->gid) < 0) { + pr_perror("Can't reset user/group on ghost %s", path); + goto err; + } - tv[0].tv_sec = gfe->atim->tv_sec; - tv[0].tv_usec = gfe->atim->tv_usec; - tv[1].tv_sec = gfe->mtim->tv_sec; - tv[1].tv_usec = gfe->mtim->tv_usec; - - if (lutimes(path, tv)) { - pr_perror("Can't set access and modification times on ghost %s", path); - return -1; + if (chmod(path, gfe->mode)) { + pr_perror("Can't set perms %o on ghost %s", gfe->mode, path); + goto err; + } } - return 0; + if (gfe->atim) { + tv[0].tv_sec = gfe->atim->tv_sec; + tv[0].tv_usec = gfe->atim->tv_usec; + tv[1].tv_sec = gfe->mtim->tv_sec; + tv[1].tv_usec = gfe->mtim->tv_usec; + if (lutimes(path, tv)) { + pr_perror("Can't set access and modification times on ghost %s", path); + goto err; + } + } + + ret = 0; +err: + return ret; } -static int create_ghost_dentry(char *path, GhostFileEntry *gfe, struct cr_img *img) +static int create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_img *img) { - int ret = -1; + struct mount_info *mi; + char path[PATH_MAX]; + int ret, root_len; char *msg; + root_len = ret = rst_get_mnt_root(gf->remap.rmnt_id, path, sizeof(path)); + if (ret < 0) { + pr_err("The %d mount is not found for ghost\n", gf->remap.rmnt_id); + goto err; + } + + /* Add a '/' only if we have no at the end */ + if (path[root_len - 1] != '/') { + path[root_len++] = '/'; + path[root_len] = '\0'; + } + + snprintf(path + root_len, sizeof(path) - root_len, "%s", gf->remap.rpath); + ret = -1; + + mi = lookup_mnt_id(gf->remap.rmnt_id); + /* We get here while in service mntns */ + if (mi && try_remount_writable(mi, false)) + goto err; again: if (S_ISFIFO(gfe->mode)) { if ((ret = mknod(path, gfe->mode, 0)) < 0) @@ -468,83 +417,18 @@ again: goto err; } + strcpy(gf->remap.rpath, path + root_len); + pr_debug("Remap rpath is %s\n", gf->remap.rpath); + + ret = -1; + if (ghost_apply_metadata(path, gfe)) + goto err; + ret = 0; err: return ret; } -static int 
nomntns_create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_img *img) -{ - char path[PATH_MAX]; - - snprintf(path, sizeof(path), "/%s", gf->remap.rpath); - - if (create_ghost_dentry(path, gfe, img)) - return -1; - - if (ghost_apply_metadata(path, gfe)) - return -1; - - __strlcpy(gf->remap.rpath, path + 1, PATH_MAX); - pr_debug("Remap rpath is %s\n", gf->remap.rpath); - return 0; -} - -static int create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_img *img) -{ - struct mount_info *mi; - char path[PATH_MAX], *rel_path, *rel_mp; - - if (!(root_ns_mask & CLONE_NEWNS)) - return nomntns_create_ghost(gf, gfe, img); - - mi = lookup_mnt_id(gf->remap.rmnt_id); - if (!mi) { - pr_err("The %d mount is not found for ghost\n", gf->remap.rmnt_id); - return -1; - } - - /* Get path relative to mountpoint from path relative to mntns */ - rel_path = get_relative_path(gf->remap.rpath, mi->ns_mountpoint); - if (!rel_path) { - pr_err("Can't get path %s relative to %s\n", gf->remap.rpath, mi->ns_mountpoint); - return -1; - } - - snprintf(path, sizeof(path), "%s%s%s", service_mountpoint(mi), rel_path[0] ? "/" : "", rel_path); - pr_debug("Trying to create ghost on path %s\n", path); - - /* We get here while in service mntns */ - if (try_remount_writable(mi, false)) - return -1; - - if (create_ghost_dentry(path, gfe, img)) - return -1; - - if (ghost_apply_metadata(path, gfe)) - return -1; - - /* - * Convert the path back to mntns relative, as create_ghost_dentry - * might have changed it. - */ - rel_path = get_relative_path(path, service_mountpoint(mi)); - if (!rel_path) { - pr_err("Can't get path %s relative to %s\n", path, service_mountpoint(mi)); - return -1; - } - - rel_mp = get_relative_path(mi->ns_mountpoint, "/"); - if (!rel_mp) { - pr_err("Can't get path %s relative to %s\n", mi->ns_mountpoint, "/"); - return -1; - } - - snprintf(gf->remap.rpath, PATH_MAX, "%s%s%s", rel_mp, (rel_mp[0] && rel_path[0]) ? 
"/" : "", rel_path); - pr_debug("Remap rpath is %s\n", gf->remap.rpath); - return 0; -} - static inline void ghost_path(char *path, int plen, struct reg_file_info *rfi, RemapFilePathEntry *rpe) { snprintf(path, plen, "%s.cr.%x.ghost", rfi->path, rpe->remap_id); @@ -616,7 +500,7 @@ static int open_remap_ghost(struct reg_file_info *rfi, RemapFilePathEntry *rpe) gf->remap.rmnt_id = rfi->rfe->mnt_id; if (S_ISDIR(gfe->mode)) - __strlcpy(gf->remap.rpath, rfi->path, PATH_MAX); + strlcpy(gf->remap.rpath, rfi->path, PATH_MAX); else ghost_path(gf->remap.rpath, PATH_MAX, rfi, rpe); @@ -819,50 +703,44 @@ int prepare_remaps(void) static int clean_one_remap(struct remap_info *ri) { struct file_remap *remap = ri->rfi->remap; - int mnt_id, ret; + int mnt_id, ret, rmntns_root; struct mount_info *mi; - char path[PATH_MAX], *rel_path; + char path[PATH_MAX]; if (remap->rpath[0] == 0) return 0; - if (!(root_ns_mask & CLONE_NEWNS)) { - snprintf(path, sizeof(path), "/%s", remap->rpath); - goto nomntns; - } - mnt_id = ri->rfi->rfe->mnt_id; /* rirfirfe %) */ + ret = rst_get_mnt_root(mnt_id, path, sizeof(path)); + if (ret < 0) + return -1; + if (ret >= sizeof(path) - 1) { + pr_err("The path buffer is too small\n"); + return -1; + } + + rmntns_root = open(path, O_RDONLY); + if (rmntns_root < 0) { + pr_perror("Unable to open %s", path); + return -1; + } + mi = lookup_mnt_id(mnt_id); - if (!mi) { - pr_err("The %d mount is not found for ghost\n", mnt_id); - return -1; - } - - rel_path = get_relative_path(remap->rpath, mi->ns_mountpoint); - if (!rel_path) { - pr_err("Can't get path %s relative to %s\n", remap->rpath, mi->ns_mountpoint); - return -1; - } - - snprintf(path, sizeof(path), "%s%s%s", service_mountpoint(mi), strlen(rel_path) ? 
"/" : "", rel_path); - /* We get here while in service mntns */ - if (try_remount_writable(mi, false)) - return -1; - -nomntns: - pr_info("Unlink remap %s\n", path); - - if (remap->is_dir) - ret = rmdir(path); - else - ret = unlink(path); - - if (ret) { - pr_perror("Couldn't unlink remap %s", path); + if (mi && try_remount_writable(mi, false)) { + close(rmntns_root); return -1; } + pr_info("Unlink remap %s\n", remap->rpath); + + ret = unlinkat(rmntns_root, remap->rpath, remap->is_dir ? AT_REMOVEDIR : 0); + if (ret < 0) { + close(rmntns_root); + pr_perror("Couldn't unlink remap %s %s", path, remap->rpath); + return -1; + } + close(rmntns_root); remap->rpath[0] = 0; return 0; @@ -981,20 +859,10 @@ static int dump_ghost_file(int _fd, u32 id, const struct stat *st, dev_t phys_de goto err_out; } - if (gfe.chunks) { - if (opts.ghost_fiemap) { - ret = copy_file_to_chunks_fiemap(fd, img, st->st_size); - if (ret == -EOPNOTSUPP) { - pr_debug("file system don't support fiemap\n"); - ret = copy_file_to_chunks(fd, img, st->st_size); - } - } else { - ret = copy_file_to_chunks(fd, img, st->st_size); - } - } else { + if (gfe.chunks) + ret = copy_file_to_chunks(fd, img, st->st_size); + else ret = copy_file(fd, img_raw_fd(img), st->st_size); - } - close(fd); if (ret) goto err_out; @@ -1027,8 +895,8 @@ static int dump_ghost_remap(char *path, const struct stat *st, int lfd, u32 id, pr_info("Dumping ghost file for fd %d id %#x\n", lfd, id); - if (st->st_blocks * ST_UNIT > opts.ghost_limit) { - pr_err("Can't dump ghost file %s of %" PRIu64 " size, increase limit\n", path, st->st_blocks * ST_UNIT); + if (st->st_size > opts.ghost_limit) { + pr_err("Can't dump ghost file %s of %" PRIu64 " size, increase limit\n", path, st->st_size); return -1; } @@ -1091,32 +959,14 @@ void free_link_remaps(void) } static int linkat_hard(int odir, char *opath, int ndir, char *npath, uid_t uid, gid_t gid, int flags); -static void check_overlayfs_fallback(char *path, const struct fd_parms *parms, bool 
*fallback) -{ - if (!fallback || parms->fs_type != OVERLAYFS_SUPER_MAGIC) - return; - - /* - * In overlayFS, linkat() fails with ENOENT if the removed file is - * originated from lower layer. The cause of failure is that linkat() - * sees the file has st_nlink=0, which is different than st_nlink=1 we - * got from earlier fstat() on lfd. By setting *fb=true, we will fall - * back to dump_ghost_remap() as it is what should have been done to - * removed files with st_nlink=0. - */ - pr_info("Unable to link-remap %s on overlayFS, fall back to dump_ghost_remap\n", path); - *fallback = true; -} - -static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_id *nsid, const struct fd_parms *parms, - bool *fallback) +static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_id *nsid, const struct stat *st) { char link_name[PATH_MAX], *tmp; FileEntry fe = FILE_ENTRY__INIT; RegFileEntry rfe = REG_FILE_ENTRY__INIT; FownEntry fwn = FOWN_ENTRY__INIT; int mntns_root; - const struct stat *ost = &parms->stat; + int ret; if (!opts.link_remap_ok) { pr_err("Can't create link remap for %s. " @@ -1150,22 +1000,22 @@ static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_i rfe.name = link_name + 1; /* Any 'unique' name works here actually. Remap works by reg-file ids. */ - snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name) - 1, "link_remap.%d", rfe.id); + snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name - 1), "link_remap.%d", rfe.id); mntns_root = mntns_get_root_fd(nsid); - while (linkat_hard(lfd, "", mntns_root, link_name, ost->st_uid, ost->st_gid, AT_EMPTY_PATH) < 0) { - if (errno != ENOENT) { - pr_perror("Can't link remap to %s", path); - return -1; - } - +again: + ret = linkat_hard(lfd, "", mntns_root, link_name, st->st_uid, st->st_gid, AT_EMPTY_PATH); + if (ret < 0 && errno == ENOENT) { /* Use grand parent, if parent directory does not exist. 
*/ if (trim_last_parent(link_name) < 0) { pr_err("trim failed: @%s@\n", link_name); - check_overlayfs_fallback(path, parms, fallback); return -1; } + goto again; + } else if (ret < 0) { + pr_perror("Can't link remap to %s", path); + return -1; } if (note_link_remap(link_name, nsid)) @@ -1178,13 +1028,12 @@ static int create_link_remap(char *path, int len, int lfd, u32 *idp, struct ns_i return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); } -static int dump_linked_remap(char *path, int len, const struct fd_parms *parms, int lfd, u32 id, struct ns_id *nsid, - bool *fallback) +static int dump_linked_remap(char *path, int len, const struct stat *ost, int lfd, u32 id, struct ns_id *nsid) { u32 lid; RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT; - if (create_link_remap(path, len, lfd, &lid, nsid, parms, fallback)) + if (create_link_remap(path, len, lfd, &lid, nsid, ost)) return -1; rpe.orig_id = id; @@ -1301,7 +1150,6 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, struct stat pst; const struct stat *ost = &parms->stat; int flags = 0; - bool fallback = false; if (parms->fs_type == PROC_SUPER_MAGIC) { /* The file points to /proc/pid/ where pid is a dead @@ -1391,7 +1239,7 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, * links on it) to have some persistent name at hands. */ pr_debug("Dump silly-rename linked remap for %x\n", id); - return dump_linked_remap(rpath + 1, plen - 1, parms, lfd, id, nsid, NULL); + return dump_linked_remap(rpath + 1, plen - 1, ost, lfd, id, nsid); } mntns_root = mntns_get_root_fd(nsid); @@ -1412,15 +1260,7 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, if (errno == ENOENT) { link_strip_deleted(link); - ret = dump_linked_remap(rpath + 1, plen - 1, parms, lfd, id, nsid, &fallback); - if (ret < 0 && fallback) { - /* fallback is true only if following conditions are true: - * 1. 
linkat() inside dump_linked_remap() failed with ENOENT - * 2. parms->fs_type == overlayFS - */ - return dump_ghost_remap(rpath + 1, ost, lfd, id, nsid); - } - return ret; + return dump_linked_remap(rpath + 1, plen - 1, ost, lfd, id, nsid); } pr_perror("Can't stat path"); @@ -1479,7 +1319,7 @@ static int get_build_id_32(Elf32_Ehdr *file_header, unsigned char **build_id, co return -1; /* - * If the file doesn't have at least 1 program header entry, it definitely can't + * If the file doesn't have atleast 1 program header entry, it definitely can't * have a build-id. */ if (!file_header->e_phnum) { @@ -1569,7 +1409,7 @@ static int get_build_id_64(Elf64_Ehdr *file_header, unsigned char **build_id, co return -1; /* - * If the file doesn't have at least 1 program header entry, it definitely can't + * If the file doesn't have atleast 1 program header entry, it definitely can't * have a build-id. */ if (!file_header->e_phnum) { @@ -1650,10 +1490,22 @@ static int get_build_id_64(Elf64_Ehdr *file_header, unsigned char **build_id, co */ static int get_build_id(const int fd, const struct stat *fd_status, unsigned char **build_id) { - char *start_addr; + char buf[SELFMAG + 1]; + void *start_addr; size_t mapped_size; int ret = -1; + if (read(fd, buf, SELFMAG + 1) != SELFMAG + 1) + return -1; + + /* + * The first 4 bytes contain a magic number identifying the file as an + * ELF file. They should contain the characters ‘\x7f’, ‘E’, ‘L’, and + * ‘F’, respectively. These characters are together defined as ELFMAG. + */ + if (strncmp(buf, ELFMAG, SELFMAG)) + return -1; + /* * If the build-id exists, then it will most likely be present in the * beginning of the file. 
Therefore at most only the first 1 MB of the @@ -1661,25 +1513,16 @@ static int get_build_id(const int fd, const struct stat *fd_status, unsigned cha */ mapped_size = min_t(size_t, fd_status->st_size, BUILD_ID_MAP_SIZE); start_addr = mmap(0, mapped_size, PROT_READ, MAP_PRIVATE | MAP_FILE, fd, 0); - if ((void*)start_addr == MAP_FAILED) { + if (start_addr == MAP_FAILED) { pr_warn("Couldn't mmap file with fd %d\n", fd); return -1; } - /* - * The first 4 bytes contain a magic number identifying the file as an - * ELF file. They should contain the characters ‘\x7f’, ‘E’, ‘L’, and - * ‘F’, respectively. These characters are together defined as ELFMAG. - */ - if (memcmp(start_addr, ELFMAG, SELFMAG)) - goto out; + if (buf[EI_CLASS] == ELFCLASS32) + ret = get_build_id_32(start_addr, build_id, fd, mapped_size); + if (buf[EI_CLASS] == ELFCLASS64) + ret = get_build_id_64(start_addr, build_id, fd, mapped_size); - if (start_addr[EI_CLASS] == ELFCLASS32) - ret = get_build_id_32((Elf32_Ehdr *)start_addr, build_id, fd, mapped_size); - if (start_addr[EI_CLASS] == ELFCLASS64) - ret = get_build_id_64((Elf64_Ehdr *)start_addr, build_id, fd, mapped_size); - -out: munmap(start_addr, mapped_size); return ret; } @@ -1697,7 +1540,7 @@ static int store_validation_data_build_id(RegFileEntry *rfe, int lfd, const stru int fd; /* - * Checks whether the file is at least big enough to try and read the first + * Checks whether the file is atleast big enough to try and read the first * four (SELFMAG) bytes which should correspond to the ELF magic number * and the next byte which indicates whether the file is 32-bit or 64-bit. */ @@ -1735,7 +1578,7 @@ static int store_validation_data_build_id(RegFileEntry *rfe, int lfd, const stru * This routine stores metadata about the open file (File size, build-id, CRC32C checksum) * so that validation can be done while restoring to make sure that the right file is * being restored. 
- * Returns true if at least some metadata was stored, if there was an error it returns false. + * Returns true if atleast some metadata was stored, if there was an error it returns false. */ static bool store_validation_data(RegFileEntry *rfe, const struct fd_parms *p, int lfd) { @@ -1764,7 +1607,6 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) int ret; FileEntry fe = FILE_ENTRY__INIT; RegFileEntry rfe = REG_FILE_ENTRY__INIT; - bool skip_for_shell_job = false; if (!p->link) { if (fill_fdlink(lfd, p, &_link)) @@ -1784,17 +1626,12 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) mi = lookup_mnt_id(p->mnt_id); if (mi == NULL) { - if (opts.shell_job && is_tty(p->stat.st_rdev, p->stat.st_dev)) { - skip_for_shell_job = true; - } else { - pr_err("Can't lookup mount=%d for fd=%d path=%s\n", p->mnt_id, p->fd, link->name + 1); - return -1; - } + pr_err("Can't lookup mount=%d for fd=%d path=%s\n", p->mnt_id, p->fd, link->name + 1); + return -1; } - if (!skip_for_shell_job && mnt_is_overmounted(mi)) { - pr_err("Open files on overmounted mounts are not supported yet; mount=%d fd=%d path=%s\n", - p->mnt_id, p->fd, link->name + 1); + if (mnt_is_overmounted(mi)) { + pr_err("Open files on overmounted mounts are not supported yet\n"); return -1; } @@ -1813,7 +1650,7 @@ int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p) return -1; } - if (!skip_for_shell_job && check_path_remap(link, p, lfd, id, mi->nsid)) + if (check_path_remap(link, p, lfd, id, mi->nsid)) return -1; rfe.name = &link->name[1]; ext: @@ -1955,46 +1792,34 @@ out: return ret; } -int rm_parent_dirs(int mntns_root, char *path, int count) +static void rm_parent_dirs(int mntns_root, char *path, int count) { char *p, *prev = NULL; - int ret = -1; - while (count-- > 0) { + if (!count) + return; + + while (count > 0) { + count -= 1; p = strrchr(path, '/'); - if (p) { - /* We don't handle "//" in path */ - BUG_ON(prev && (prev - p == 1)); + if (p) *p = '\0'; - } else { - /* 
Inconsistent path and count */ - pr_perror("Can't strrchr \"/\" in \"%s\"/\"%s\"]" - " left count=%d\n", - path, prev ? prev + 1 : "", count + 1); - goto err; - } - if (prev) *prev = '/'; - prev = p; - if (unlinkat(mntns_root, path, AT_REMOVEDIR)) { + if (unlinkat(mntns_root, path, AT_REMOVEDIR)) pr_perror("Can't remove %s AT %d", path, mntns_root); - goto err; - } - pr_debug("Unlinked parent dir: %s AT %d\n", path, mntns_root); + else + pr_debug("Unlinked parent dir: %s AT %d\n", path, mntns_root); + prev = p; } - ret = 0; -err: if (prev) *prev = '/'; - - return ret; } /* Construct parent dir name and mkdir parent/grandparents if they're not exist */ -int make_parent_dirs_if_need(int mntns_root, char *path) +static int make_parent_dirs_if_need(int mntns_root, char *path) { char *p, *last_delim; int err, count = 0; @@ -2022,7 +1847,6 @@ int make_parent_dirs_if_need(int mntns_root, char *path) err = mkdirat(mntns_root, path, 0777); if (err && errno != EEXIST) { pr_perror("Can't create dir: %s AT %d", path, mntns_root); - /* Failing anyway -> no retcode check */ rm_parent_dirs(mntns_root, path, count); count = -1; goto out; @@ -2043,9 +1867,6 @@ out: * This routine properly resolves d's path handling ghost/link-remaps. * The open_cb is a routine that does actual open, it differs for * files, directories, fifos, etc. - * - * Return 0 on success, -1 on error and 1 to indicate soft error, which can be - * retried. 
*/ static int rfi_remap(struct reg_file_info *rfi, int *level) @@ -2090,7 +1911,7 @@ static int rfi_remap(struct reg_file_info *rfi, int *level) BUG_ON(tmi->s_dev != rmi->s_dev); BUG_ON(tmi->s_dev != mi->s_dev); - /* Calculate paths on the device (root mount) */ + /* Calcalate paths on the device (root mount) */ convert_path_from_another_mp(rfi->path, path, sizeof(_path), mi, tmi); convert_path_from_another_mp(rfi->remap->rpath, rpath, sizeof(_rpath), rmi, tmi); @@ -2109,11 +1930,8 @@ out_root: if (linkat_hard(mntns_root, rpath, mntns_root, path, rfi->remap->uid, rfi->remap->gid, 0) < 0) { int errno_saved = errno; - - if (!rm_parent_dirs(mntns_root, path, *level) && errno_saved == EEXIST) { - errno = errno_saved; - return 1; - } + rm_parent_dirs(mntns_root, path, *level); + errno = errno_saved; return -1; } @@ -2190,12 +2008,11 @@ static bool validate_file(const int fd, const struct stat *fd_status, const stru int open_path(struct file_desc *d, int (*open_cb)(int mntns_root, struct reg_file_info *, void *), void *arg) { - int tmp = -1, mntns_root, level = 0; + int tmp, mntns_root, level = 0; struct reg_file_info *rfi; char *orig_path = NULL; char path[PATH_MAX]; int inh_fd = -1; - int ret; if (inherited_fd(d, &tmp)) return tmp; @@ -2232,9 +2049,14 @@ int open_path(struct file_desc *d, int (*open_cb)(int mntns_root, struct reg_fil */ orig_path = rfi->path; rfi->path = rfi->remap->rpath; - } else if ((ret = rfi_remap(rfi, &level)) == 1) { + } else if (rfi_remap(rfi, &level) < 0) { static char tmp_path[PATH_MAX]; + if (errno != EEXIST) { + pr_perror("Can't link %s -> %s", rfi->remap->rpath, rfi->path); + return -1; + } + /* * The file whose name we're trying to create * exists. 
Need to pick some other one, we're @@ -2248,15 +2070,12 @@ int open_path(struct file_desc *d, int (*open_cb)(int mntns_root, struct reg_fil orig_path = rfi->path; rfi->path = tmp_path; snprintf(tmp_path, sizeof(tmp_path), "%s.cr_link", orig_path); - pr_debug("Fake %s -> %s link\n", rfi->remap->rpath, rfi->path); + pr_debug("Fake %s -> %s link\n", rfi->path, rfi->remap->rpath); - if (rfi_remap(rfi, &level)) { + if (rfi_remap(rfi, &level) < 0) { pr_perror("Can't create even fake link!"); - goto err; + return -1; } - } else if (ret < 0) { - pr_perror("Can't link %s -> %s", rfi->remap->rpath, rfi->path); - goto err; } } @@ -2266,7 +2085,7 @@ ext: if (tmp < 0) { pr_perror("Can't open file %s", rfi->path); close_safe(&inh_fd); - goto err; + return -1; } close_safe(&inh_fd); @@ -2275,27 +2094,15 @@ ext: if (fstat(tmp, &st) < 0) { pr_perror("Can't fstat opened file"); - goto err; + return -1; } if (!validate_file(tmp, &st, rfi)) - goto err; + return -1; - if (rfi->rfe->has_mode) { - mode_t curr_mode = st.st_mode; - mode_t saved_mode = rfi->rfe->mode; - - if (opts.skip_file_rwx_check) { - curr_mode &= ~(S_IRWXU | S_IRWXG | S_IRWXO); - saved_mode &= ~(S_IRWXU | S_IRWXG | S_IRWXO); - } - - if (curr_mode != saved_mode) { - pr_err("File %s has bad mode 0%o (expect 0%o)\n" - "File r/w/x checks can be skipped with the --skip-file-rwx-check option\n", - rfi->path, (int)curr_mode, saved_mode); - goto err; - } + if (rfi->rfe->has_mode && (st.st_mode != rfi->rfe->mode)) { + pr_err("File %s has bad mode 0%o (expect 0%o)\n", rfi->path, (int)st.st_mode, rfi->rfe->mode); + return -1; } /* @@ -2308,18 +2115,8 @@ ext: if (rfi->remap) { if (!rfi->remap->is_dir) { - struct mount_info *mi = lookup_mnt_id(rfi->rfe->mnt_id); - - if (mi && try_remount_writable(mi, true)) - goto err; - - pr_debug("Unlink: %d:%s\n", rfi->rfe->mnt_id, rfi->path); - if (unlinkat(mntns_root, rfi->path, 0)) { - pr_perror("Failed to unlink the remap file"); - goto err; - } - if (rm_parent_dirs(mntns_root, rfi->path, 
level)) - goto err; + unlinkat(mntns_root, rfi->path, 0); + rm_parent_dirs(mntns_root, rfi->path, level); } mutex_unlock(remap_open_lock); @@ -2327,17 +2124,10 @@ ext: if (orig_path) rfi->path = orig_path; - if (restore_fown(tmp, rfi->rfe->fown)) { - close(tmp); + if (restore_fown(tmp, rfi->rfe->fown)) return -1; - } return tmp; -err: - if (rfi->remap) - mutex_unlock(remap_open_lock); - close_safe(&tmp); - return -1; } int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg) @@ -2446,8 +2236,8 @@ static struct filemap_ctx ctx; void filemap_ctx_init(bool auto_close) { ctx.desc = NULL; /* to fail the first comparison in open_ */ - ctx.fd = -1; /* not to close random fd in _fini */ - ctx.vma = NULL; /* not to put spurious VMA_CLOSE in _fini */ + ctx.fd = -1; /* not to close random fd in _fini */ + ctx.vma = NULL; /* not to put spurious VMA_CLOSE in _fini */ /* flags may remain any */ ctx.close = auto_close; } @@ -2467,7 +2257,6 @@ static int open_filemap(int pid, struct vma_area *vma) { u32 flags; int ret; - int plugin_fd = -1; /* * The vma->fd should have been assigned in collect_filemap @@ -2478,38 +2267,11 @@ static int open_filemap(int pid, struct vma_area *vma) BUG_ON((vma->vmfd == NULL) || !vma->e->has_fdflags); flags = vma->e->fdflags; - /* update the new device file page offsets and file paths set during restore */ - if (vma->e->status & VMA_EXT_PLUGIN) { - uint64_t new_pgoff; - int ret; - - struct reg_file_info *rfi = container_of(vma->vmfd, struct reg_file_info, d); - ret = run_plugins(UPDATE_VMA_MAP, rfi->rfe->name, vma->e->start, vma->e->pgoff, &new_pgoff, &plugin_fd); - if (ret == 1) { - pr_info("New mmap %#016" PRIx64 ":%#016" PRIx64 "->%#016" PRIx64 " fd %d\n", vma->e->start, - vma->e->pgoff, new_pgoff, plugin_fd); - vma->e->pgoff = new_pgoff; - } - /* Device plugin will restore vma contents, so no need for write permission */ - vma->e->status |= VMA_NO_PROT_WRITE; - } - if (ctx.flags != flags || ctx.desc != vma->vmfd) { - if 
(plugin_fd >= 0) { - /* - * Vma handled by device plugin. - * Some device drivers (e.g DRM) only allow the file descriptor that was used to create vma to - * be used when calling mmap. In this case, use the FD returned by plugin. FD can be copied - * using dup because dup returns a reference to the same struct file inside kernel, but we - * cannot open a new FD. - */ - ret = plugin_fd; - } else if (vma->e->status & VMA_AREA_MEMFD) { - if (!inherited_fd(vma->vmfd, &ret)) - ret = memfd_open(vma->vmfd, &flags, true); - } else { + if (vma->e->status & VMA_AREA_MEMFD) + ret = memfd_open(vma->vmfd, &flags); + else ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); - } if (ret < 0) return ret; diff --git a/criu/files.c b/criu/files.c index af4b8aeac..93754fb44 100644 --- a/criu/files.c +++ b/criu/files.c @@ -21,7 +21,7 @@ #include "image.h" #include "common/list.h" #include "rst-malloc.h" -#include "util-caps.h" +#include "util-pie.h" #include "common/lock.h" #include "sockets.h" #include "pstree.h" @@ -49,7 +49,6 @@ #include "kerndat.h" #include "fdstore.h" #include "bpfmap.h" -#include "pidfd.h" #include "protobuf.h" #include "util.h" @@ -184,18 +183,6 @@ out: return fd; } -int find_unused_fd_pid(pid_t pid) -{ - struct pstree_item *task; - - task = pstree_item_by_virt(pid); - if (!task) { - pr_err("Invalid pid:%d\n", pid); - return -1; - } - return find_unused_fd(task, -1); -} - int set_fds_event(pid_t virt) { struct pstree_item *item; @@ -299,12 +286,12 @@ static int fixup_overlayfs(struct fd_parms *p, struct fd_link *link) * If the bug is present, the file path from /proc//fd * does not include the mountpoint, so we prepend it ourselves. 
*/ - if (strcmp("./", m->ns_mountpoint) != 0) { + if (strcmp("./", m->mountpoint) != 0) { char buf[PATH_MAX]; int n; - __strlcpy(buf, link->name, PATH_MAX); - n = snprintf(link->name, PATH_MAX, "%s/%s", m->ns_mountpoint, buf + 2); + strlcpy(buf, link->name, PATH_MAX); + n = snprintf(link->name, PATH_MAX, "%s/%s", m->mountpoint, buf + 2); if (n >= PATH_MAX) { pr_err("Not enough space to replace %s\n", buf); return -1; @@ -519,7 +506,7 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, } p.fd_ctl = ctl; /* Some dump_opts require this to talk to parasite */ - p.dfds = dfds; /* epoll needs to verify if target fd exist */ + p.dfds = dfds; /* epoll needs to verify if target fd exist */ if (S_ISSOCK(p.stat.st_mode)) return dump_socket(&p, lfd, e); @@ -545,8 +532,6 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, ops = &signalfd_dump_ops; else if (is_timerfd_link(link)) ops = &timerfd_dump_ops; - else if (is_pidfd_link(link)) - ops = &pidfd_dump_ops; #ifdef CONFIG_HAS_LIBBPF else if (is_bpfmap_link(link)) ops = &bpfmap_dump_ops; @@ -557,19 +542,13 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, return do_dump_gen_file(&p, lfd, ops, e); } - if (p.fs_type == PID_FS_MAGIC) { - ops = &pidfd_dump_ops; - return do_dump_gen_file(&p, lfd, ops, e); - } - if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode) || S_ISLNK(p.stat.st_mode)) { if (fill_fdlink(lfd, &p, &link)) return -1; p.link = &link; - /* TODO: Dump for hugetlb fd when memfd hugetlb is not supported */ - if (is_memfd(p.stat.st_dev) || (kdat.has_memfd_hugetlb && is_hugetlb_dev(p.stat.st_dev, NULL))) + if (is_memfd(p.stat.st_dev)) ops = &memfd_dump_ops; else if (link.name[1] == '/') ops = ®file_dump_ops; @@ -604,13 +583,13 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, int dump_my_file(int lfd, u32 *id, int *type) { struct pid me = {}; - struct fd_opts fdo = {}; + struct fd_opts fo 
= {}; FdinfoEntry e = FDINFO_ENTRY__INIT; me.real = getpid(); me.ns[0].virt = -1; /* FIXME */ - if (dump_one_file(&me, lfd, lfd, &fdo, NULL, &e, NULL)) + if (dump_one_file(&me, lfd, lfd, &fo, NULL, &e, NULL)) return -1; *id = e.id; @@ -978,7 +957,7 @@ static int receive_fd(struct fdinfo_list_entry *fle); static void transport_name_gen(struct sockaddr_un *addr, int *len, int pid) { addr->sun_family = AF_UNIX; - snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-fd-%d-%s", pid, criu_run_id); + snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-fd-%d", pid); *len = SUN_LEN(addr); *addr->sun_path = '\0'; } @@ -1135,11 +1114,11 @@ int setup_and_serve_out(struct fdinfo_list_entry *fle, int new_fd) static int open_fd(struct fdinfo_list_entry *fle) { struct file_desc *d = fle->desc; - struct fdinfo_list_entry *fle_m; + struct fdinfo_list_entry *flem; int new_fd = -1, ret; - fle_m = file_master(d); - if (fle != fle_m) { + flem = file_master(d); + if (fle != flem) { BUG_ON(fle->stage != FLE_INITIALIZED); ret = receive_fd(fle); if (ret != 0) @@ -1262,7 +1241,7 @@ int close_old_fds(void) int fd, ret; /** - * Close previous /proc/self/ service fd, as we don't want to reuse it + * Close previous /proc/self/ service fd, as we don't wan't to reuse it * from a different task. Also there can be some junk fd in it's place * after we've moved our service fds (e.g. from other task of parents * shared fdtable), we need to close it before opendir_proc() below. 
@@ -1329,6 +1308,7 @@ int prepare_fds(struct pstree_item *me) } } + BUG_ON(current->pid->state == TASK_HELPER); ret = open_fdinfos(me); if (rsti(me)->fdt) @@ -1353,35 +1333,10 @@ static int fchroot(int fd) return chroot("."); } -static int need_chroot(int saved_root) -{ - struct stat saved_root_stat, cur_root_stat; - int psd; - - if (fstat(saved_root, &saved_root_stat) == -1) { - pr_perror("Failed to stat saved root dir"); - return -1; - } - - psd = open_pid_proc(PROC_SELF); - if (psd < 0) { - pr_perror("Failed to open PROC_SELF"); - return -1; - } - - if (fstatat(psd, "root", &cur_root_stat, 0) == -1) { - pr_perror("Failed to stat current root dir"); - return -1; - } - - return saved_root_stat.st_ino != cur_root_stat.st_ino || saved_root_stat.st_dev != cur_root_stat.st_dev; -} - int restore_fs(struct pstree_item *me) { int dd_root = -1, dd_cwd = -1, ret, err = -1; struct rst_info *ri = rsti(me); - bool do_chroot = true; /* * First -- open both descriptors. We will not @@ -1400,24 +1355,15 @@ int restore_fs(struct pstree_item *me) goto out; } - /* - * In unprivileged mode chroot() may fail if we don't have - * sufficient privileges, therefore only do it if the process - * is actually chrooted. - */ - if (opts.unprivileged) - do_chroot = need_chroot(dd_root); - /* * Now do chroot/chdir. Chroot goes first as it calls chdir into * dd_root so we'd need to fix chdir after it anyway. 
*/ - if (do_chroot) { - ret = fchroot(dd_root); - if (ret < 0) { - pr_perror("Can't change root"); - goto out; - } + + ret = fchroot(dd_root); + if (ret < 0) { + pr_perror("Can't change root"); + goto out; } ret = fchdir(dd_cwd); @@ -1540,7 +1486,7 @@ int shared_fdt_prepare(struct pstree_item *item) struct inherit_fd { struct list_head inh_list; char *inh_id; /* file identifier */ - int inh_fd; /* criu's descriptor to inherit */ + int inh_fd; /* criu's descriptor to inherit */ int inh_fd_id; }; @@ -1606,10 +1552,8 @@ int inherit_fd_add(int fd, char *key) inh_fd_max = fd; inh->inh_id = xstrdup(key); - if (inh->inh_id == NULL) { - xfree(inh); + if (inh->inh_id == NULL) return -1; - } inh->inh_fd = fd; list_add_tail(&inh->inh_list, &opts.inherit_fds); @@ -1785,9 +1729,6 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) case FD_TYPES__MEMFD: ret = collect_one_file_entry(fe, fe->memfd->id, &fe->memfd->base, &memfd_cinfo); break; - case FD_TYPES__PIDFD: - ret = collect_one_file_entry(fe, fe->pidfd->id, &fe->pidfd->base, &pidfd_cinfo); - break; #ifdef CONFIG_HAS_LIBBPF case FD_TYPES__BPFMAP: ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo); @@ -1810,6 +1751,5 @@ int prepare_files(void) { init_fdesc_hash(); init_sk_info_hash(); - init_dead_pidfd_hash(); return collect_image(&files_cinfo); } diff --git a/criu/filesystems.c b/criu/filesystems.c index 093e1c492..3e0ec2eb3 100644 --- a/criu/filesystems.c +++ b/criu/filesystems.c @@ -290,7 +290,7 @@ static int binfmt_misc_restore_bme(struct mount_info *mi, BinfmtMiscEntry *bme, goto bad_dump; pr_debug("binfmt_misc_pattern=%s\n", buf); - ret = write_binfmt_misc_entry(service_mountpoint(mi), buf, bme); + ret = write_binfmt_misc_entry(mi->mountpoint, buf, bme); return ret; @@ -452,7 +452,7 @@ static int tmpfs_restore(struct mount_info *pm) ret = cr_system(img_raw_fd(img), -1, -1, "tar", (char *[]){ "tar", "--extract", "--gzip", "--no-unquote", "--no-wildcards", 
"--directory", - service_mountpoint(pm), NULL }, + pm->mountpoint, NULL }, 0); close_image(img); @@ -547,9 +547,8 @@ static int fusectl_dump(struct mount_info *pm) } for (it = mntinfo; it; it = it->next) { - if (it->fstype->code == FSTYPE__FUSE && id == kdev_minor(it->s_dev) && - !mnt_is_external_bind(it)) { - pr_err("%s is a fuse mount but not external\n", it->ns_mountpoint); + if (it->fstype->code == FSTYPE__FUSE && id == kdev_minor(it->s_dev) && !it->external) { + pr_err("%s is a fuse mount but not external\n", it->mountpoint); goto out; } } @@ -660,7 +659,7 @@ static int dump_empty_fs(struct mount_info *pm) */ static int always_fail(struct mount_info *pm) { - pr_err("failed to dump fs %s (%s): always fail\n", pm->ns_mountpoint, pm->fstype->name); + pr_err("failed to dump fs %s (%s): always fail\n", pm->mountpoint, pm->fstype->name); return -1; } diff --git a/criu/fsnotify.c b/criu/fsnotify.c index 8572dc2f3..b5dd15dd8 100644 --- a/criu/fsnotify.c +++ b/criu/fsnotify.c @@ -132,7 +132,7 @@ static char *alloc_openable(unsigned int s_dev, unsigned long i_ino, FhEntry *f_ if (!mnt_is_dir(m)) continue; - mntfd = __open_mountpoint(m); + mntfd = __open_mountpoint(m, -1); pr_debug("\t\tTrying via mntid %d root %s ns_mountpoint @%s (%d)\n", m->mnt_id, m->root, m->ns_mountpoint, mntfd); if (mntfd < 0) @@ -183,7 +183,7 @@ static char *alloc_openable(unsigned int s_dev, unsigned long i_ino, FhEntry *f_ return path; } } else - pr_debug("\t\t\tnot openable as %s (%s)\n", __path, strerror(errno)); + pr_debug("\t\t\tnot openable as %s (%m)\n", __path); } err: @@ -206,7 +206,7 @@ static int open_handle(unsigned int s_dev, unsigned long i_ino, FhEntry *f_handl if (m->s_dev != s_dev || !mnt_is_dir(m)) continue; - mntfd = __open_mountpoint(m); + mntfd = __open_mountpoint(m, -1); if (mntfd < 0) { pr_warn("Can't open mount for s_dev %x, continue\n", s_dev); continue; @@ -404,7 +404,7 @@ static int check_one_mark(FanotifyMarkEntry *fme) return -1; } if (!(root_ns_mask & CLONE_NEWNS)) 
- fme->me->path = m->ns_mountpoint + 1; + fme->me->path = m->mountpoint + 1; fme->s_dev = m->s_dev; pr_info("mark: s_dev %#08x mnt_id %#08x mask %#08x\n", fme->s_dev, fme->me->mnt_id, fme->mask); @@ -514,7 +514,7 @@ static char *get_mark_path(const char *who, struct file_remap *remap, FhEntry *f /* * fanotify/inotify open syscalls want path to attach * watch to. But the only thing we have is an FD obtained - * via fhandle. Fortunately, when trying to attach the + * via fhandle. Fortunatelly, when trying to attach the * /proc/pid/fd/ link, we will watch the inode the link * points to, i.e. -- just what we want. */ diff --git a/criu/hugetlb.c b/criu/hugetlb.c deleted file mode 100644 index 866c4050f..000000000 --- a/criu/hugetlb.c +++ /dev/null @@ -1,60 +0,0 @@ -#include "hugetlb.h" -#include "kerndat.h" -#include "sizes.h" - -// clang-format off -struct htlb_info hugetlb_info[HUGETLB_MAX] = { - [HUGETLB_16KB] = { SZ_16K, MAP_HUGETLB_16KB }, - [HUGETLB_64KB] = { SZ_64K, MAP_HUGETLB_64KB }, - [HUGETLB_512KB] = { SZ_512K, MAP_HUGETLB_512KB }, - [HUGETLB_1MB] = { SZ_1M, MAP_HUGETLB_1MB }, - [HUGETLB_2MB] = { SZ_2M, MAP_HUGETLB_2MB }, - [HUGETLB_8MB] = { SZ_8M, MAP_HUGETLB_8MB }, - [HUGETLB_16MB] = { SZ_16M, MAP_HUGETLB_16MB }, - [HUGETLB_32MB] = { SZ_32M, MAP_HUGETLB_32MB }, - [HUGETLB_256MB] = { SZ_256M, MAP_HUGETLB_256MB }, - [HUGETLB_512MB] = { SZ_512M, MAP_HUGETLB_512MB }, - [HUGETLB_1GB] = { SZ_1G, MAP_HUGETLB_1GB }, - [HUGETLB_2GB] = { SZ_2G, MAP_HUGETLB_2GB }, - [HUGETLB_16GB] = { SZ_16G, MAP_HUGETLB_16GB }, -}; -// clang-format on - -int is_hugetlb_dev(dev_t dev, int *hugetlb_size_flag) -{ - int i; - - for (i = 0; i < HUGETLB_MAX; i++) { - if (kdat.hugetlb_dev[i] == dev) { - if (hugetlb_size_flag) - *hugetlb_size_flag = hugetlb_info[i].flag; - return 1; - } - } - - return 0; -} - -int can_dump_with_memfd_hugetlb(dev_t dev, int *hugetlb_size_flag, const char *file_path, struct vma_area *vma) -{ - /* - * Dump the hugetlb backed mapping using memfd_hugetlb when it 
is not - * anonymous private mapping. - */ - if (kdat.has_memfd_hugetlb && is_hugetlb_dev(dev, hugetlb_size_flag) && - !((vma->e->flags & MAP_PRIVATE) && !strncmp(file_path, ANON_HUGEPAGE_PREFIX, ANON_HUGEPAGE_PREFIX_LEN))) - return 1; - - return 0; -} - -unsigned long get_size_from_hugetlb_flag(int flag) -{ - int i; - - for (i = 0; i < HUGETLB_MAX; i++) - if (flag == hugetlb_info[i].flag) - return hugetlb_info[i].size; - - return -1; -} diff --git a/criu/image-desc.c b/criu/image-desc.c index 2d87c7381..d65d9c098 100644 --- a/criu/image-desc.c +++ b/criu/image-desc.c @@ -107,7 +107,6 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY_F(BPFMAP_FILE, "bpfmap-file", O_NOBUF), FD_ENTRY_F(BPFMAP_DATA, "bpfmap-data", O_NOBUF), FD_ENTRY(APPARMOR, "apparmor"), - FD_ENTRY(PIDFD, "pidfd"), [CR_FD_STATS] = { .fmt = "stats-%s", diff --git a/criu/image.c b/criu/image.c index 91101c3eb..353de48e8 100644 --- a/criu/image.c +++ b/criu/image.c @@ -25,15 +25,6 @@ bool img_common_magic = true; TaskKobjIdsEntry *root_ids; u32 root_cg_set; Lsmtype image_lsm; -char dump_criu_run_id[RUN_ID_HASH_LENGTH]; - -struct inventory_plugin { - struct list_head node; - char *name; -}; - -struct list_head inventory_plugins_list = LIST_HEAD_INIT(inventory_plugins_list); -static int n_inventory_plugins; int check_img_inventory(bool restore) { @@ -95,11 +86,6 @@ int check_img_inventory(bool restore) goto out_err; } - if (restore && he->allow_uprobes && !opts.allow_uprobes) { - pr_err("Dumped with --" OPT_ALLOW_UPROBES ". Need to set it on restore as well.\n"); - goto out_err; - } - if (restore) { if (!he->has_network_lock_method) { /* @@ -113,37 +99,6 @@ int check_img_inventory(bool restore) } else { opts.network_lock_method = he->network_lock_method; } - - if (!he->plugins_entry) { - /* backwards compatibility: if the 'plugins_entry' field is missing, - * all plugins should be enabled during restore. 
- */ - n_inventory_plugins = -1; - } else { - PluginsEntry *pe = he->plugins_entry; - for (int i = 0; i < pe->n_plugins; i++) { - if (add_inventory_plugin(pe->plugins[i])) - goto out_err; - } - } - - /** - * This contains the criu_run_id during dumping of the process. - * For things like removing network locking (nftables) this - * information is needed to identify the name of the network - * locking table. - */ - if (he->dump_criu_run_id) { - strncpy(dump_criu_run_id, he->dump_criu_run_id, sizeof(dump_criu_run_id) - 1); - pr_info("Dump CRIU run id = %s\n", dump_criu_run_id); - } else { - /** - * If restoring from an old image this is a marker - * that no dump_criu_run_id exists. - */ - dump_criu_run_id[0] = NO_DUMP_CRIU_RUN_ID; - } - } ret = 0; @@ -155,92 +110,8 @@ out_close: return ret; } -/** - * Check if the 'plugins' field in the inventory image contains - * the specified plugin name. If found, the plugin is removed - * from the linked list. - */ -bool check_and_remove_inventory_plugin(const char *name, size_t n) -{ - if (n_inventory_plugins == -1) - return true; /* backwards compatibility */ - - if (n_inventory_plugins > 0) { - struct inventory_plugin *p, *tmp; - - list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) { - if (!strncmp(name, p->name, n)) { - xfree(p->name); - list_del(&p->node); - xfree(p); - n_inventory_plugins--; - return true; - } - } - } - - return false; -} - -/** - * We expect during restore all loaded plugins to be removed from - * the inventory_plugins_list. If the list is not empty, show an - * error message for each missing plugin. - */ -int check_inventory_plugins(void) -{ - struct inventory_plugin *p; - - if (n_inventory_plugins <= 0) - return 0; - - list_for_each_entry(p, &inventory_plugins_list, node) { - pr_err("Missing required plugin: %s\n", p->name); - } - - return -1; -} - -/** - * Add plugin name to the inventory image. These values - * can be used to identify required plugins during restore. 
- */ -int add_inventory_plugin(const char *name) -{ - struct inventory_plugin *p; - - p = xmalloc(sizeof(struct inventory_plugin)); - if (p == NULL) - return -1; - - p->name = xstrdup(name); - if (!p->name) { - xfree(p); - return -1; - } - list_add(&p->node, &inventory_plugins_list); - n_inventory_plugins++; - - return 0; -} - -void free_inventory_plugins_list(void) -{ - struct inventory_plugin *p, *tmp; - - if (!list_empty(&inventory_plugins_list)) { - list_for_each_entry_safe(p, tmp, &inventory_plugins_list, node) { - xfree(p->name); - list_del(&p->node); - xfree(p); - } - } - n_inventory_plugins = 0; -} - int write_img_inventory(InventoryEntry *he) { - PluginsEntry pe = PLUGINS_ENTRY__INIT; struct cr_img *img; int ret; @@ -250,27 +121,8 @@ int write_img_inventory(InventoryEntry *he) if (!img) return -1; - if (!list_empty(&inventory_plugins_list)) { - struct inventory_plugin *p; - int i = 0; - - pe.n_plugins = n_inventory_plugins; - pe.plugins = xmalloc(n_inventory_plugins * sizeof(char *)); - if (!pe.plugins) - return -1; - - list_for_each_entry(p, &inventory_plugins_list, node) { - pe.plugins[i] = p->name; - i++; - } - } - he->plugins_entry = &pe; - ret = pb_write_one(img, he, PB_INVENTORY); - free_inventory_plugins_list(); - xfree(pe.plugins); - xfree(he->root_ids); close_image(img); if (ret < 0) @@ -374,9 +226,8 @@ int prepare_inventory(InventoryEntry *he) if (get_task_ids(&crt.i)) return -1; - if (!opts.unprivileged) - he->has_root_cg_set = true; - if (dump_thread_cgroup(NULL, &he->root_cg_set, NULL, -1)) + he->has_root_cg_set = true; + if (dump_task_cgroup(NULL, &he->root_cg_set, NULL)) return -1; he->root_ids = crt.i.ids; @@ -391,17 +242,6 @@ int prepare_inventory(InventoryEntry *he) he->has_network_lock_method = true; he->network_lock_method = opts.network_lock_method; - /** - * This contains the criu_run_id during dumping of the process. 
- * For things like removing network locking (nftables) this - * information is needed to identify the name of the network - * locking table. - */ - he->dump_criu_run_id = xstrdup(criu_run_id); - - if (!he->dump_criu_run_id) - return -1; - return 0; } @@ -717,7 +557,7 @@ struct cr_img *img_from_fd(int fd) * This is used when opts.stream is enabled for picking the right streamer * socket name. `mode` is ignored when opts.stream is not enabled. */ -int open_image_dir(const char *dir, int mode) +int open_image_dir(char *dir, int mode) { int fd, ret; diff --git a/criu/img-streamer.c b/criu/img-streamer.c index 305e6fae5..7e36eae01 100644 --- a/criu/img-streamer.c +++ b/criu/img-streamer.c @@ -12,7 +12,6 @@ #include "rst-malloc.h" #include "common/scm.h" #include "common/lock.h" -#include "action-scripts.h" /* * We use different path names for the dump and restore sockets because: @@ -50,17 +49,10 @@ static const char *socket_name_for_mode(int mode) int img_streamer_init(const char *image_dir, int mode) { struct sockaddr_un addr; - int pre_stream_ret; int sockfd; img_streamer_mode = mode; - pre_stream_ret = run_scripts(ACT_PRE_STREAM); - if (pre_stream_ret != 0) { - pr_err("Pre-stream script failed with %d!\n", pre_stream_ret); - return -1; - } - sockfd = socket(AF_UNIX, SOCK_STREAM, 0); if (sockfd < 0) { pr_perror("Unable to instantiate UNIX socket"); diff --git a/criu/include/action-scripts.h b/criu/include/action-scripts.h index 6a331a32f..c2e8850aa 100644 --- a/criu/include/action-scripts.h +++ b/criu/include/action-scripts.h @@ -4,7 +4,6 @@ #include "asm/int.h" enum script_actions { - ACT_PRE_STREAM, ACT_PRE_DUMP, ACT_POST_DUMP, ACT_PRE_RESTORE, @@ -17,7 +16,6 @@ enum script_actions { ACT_PRE_RESUME, ACT_ORPHAN_PTS_MASTER, ACT_STATUS_READY, - ACT_QUERY_EXT_FILES, ACT_MAX }; @@ -26,8 +24,6 @@ extern int add_script(char *path); extern int add_rpc_notify(int sk); extern int run_scripts(enum script_actions); extern int rpc_send_fd(enum script_actions, int fd); -extern 
int rpc_query_external_files(void); -extern int exec_rpc_query_external_files(char *name, int sk); extern int send_criu_rpc_script(enum script_actions act, char *name, int sk, int fd); #endif /* __CR_ACTION_SCRIPTS_H__ */ diff --git a/criu/include/aio.h b/criu/include/aio.h index 38e704020..f8a59dfdf 100644 --- a/criu/include/aio.h +++ b/criu/include/aio.h @@ -1,7 +1,7 @@ #ifndef __CR_AIO_H__ #define __CR_AIO_H__ -#include "linux/aio_abi.h" +#include #include "images/mm.pb-c.h" unsigned int aio_estimate_nr_reqs(unsigned int size); int dump_aio_ring(MmEntry *mme, struct vma_area *vma); @@ -13,8 +13,8 @@ struct task_restore_args; int prepare_aios(struct pstree_item *t, struct task_restore_args *ta); struct aio_ring { - unsigned id; /* kernel internal index number */ - unsigned nr; /* number of io_events */ + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ unsigned head; /* Written to by userland or under ring_lock * mutex by aio_read_events_ring(). 
*/ unsigned tail; diff --git a/criu/include/autofs.h b/criu/include/autofs.h index b158025c7..c4e0f23ed 100644 --- a/criu/include/autofs.h +++ b/criu/include/autofs.h @@ -96,7 +96,7 @@ struct args_ismountpoint { struct autofs_dev_ioctl { __u32 ver_major; __u32 ver_minor; - __u32 size; /* total size of data passed in + __u32 size; /* total size of data passed in * including this struct */ __s32 ioctlfd; /* automount command fd */ diff --git a/criu/include/bfd.h b/criu/include/bfd.h index 2846ec628..4268f74d4 100644 --- a/criu/include/bfd.h +++ b/criu/include/bfd.h @@ -5,8 +5,8 @@ struct bfd_buf; struct xbuf { - char *mem; /* buffer */ - char *data; /* position we see bytes at */ + char *mem; /* buffer */ + char *data; /* position we see bytes at */ unsigned int sz; /* bytes sitting after b->pos */ struct bfd_buf *buf; }; diff --git a/criu/include/cgroup-props.h b/criu/include/cgroup-props.h index 10a7061b8..11b677548 100644 --- a/criu/include/cgroup-props.h +++ b/criu/include/cgroup-props.h @@ -10,7 +10,6 @@ typedef struct { } cgp_t; extern cgp_t cgp_global; -extern cgp_t cgp_global_v2; extern const cgp_t *cgp_get_props(const char *name); extern bool cgp_should_skip_controller(const char *name); extern bool cgp_add_dump_controller(const char *name); diff --git a/criu/include/cgroup.h b/criu/include/cgroup.h index dc264032e..2e9b8933c 100644 --- a/criu/include/cgroup.h +++ b/criu/include/cgroup.h @@ -7,10 +7,9 @@ struct pstree_item; struct parasite_dump_cgroup_args; extern u32 root_cg_set; -int dump_thread_cgroup(const struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args, int id); +int dump_task_cgroup(struct pstree_item *, u32 *, struct parasite_dump_cgroup_args *args); int dump_cgroups(void); -int restore_task_cgroup(struct pstree_item *); -int prepare_cgroup_namespace(struct pstree_item *); +int prepare_task_cgroup(struct pstree_item *); int prepare_cgroup(void); /* Restore things like cpu_limit in known cgroups. 
*/ int prepare_cgroup_properties(void); @@ -61,9 +60,6 @@ struct cg_controller { /* for cgroup list in cgroup.c */ struct list_head l; - - /* controller is a threaded cgroup or not */ - int is_threaded; }; struct cg_controller *new_controller(const char *name); @@ -91,12 +87,9 @@ struct cg_ctl { */ struct list_head; struct parasite_dump_cgroup_args; -extern int parse_thread_cgroup(int pid, int tid, struct parasite_dump_cgroup_args *args, struct list_head *l, - unsigned int *n); +extern int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *l, unsigned int *n); extern void put_ctls(struct list_head *); int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups); -int stop_cgroupd(void); - #endif /* __CR_CGROUP_H__ */ diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 8c5707b41..a34f8dbbf 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -1,12 +1,10 @@ #ifndef __CR_OPTIONS_H__ #define __CR_OPTIONS_H__ +#include #include -#include #include "common/config.h" #include "common/list.h" -#include "int.h" -#include "image.h" /* Configuration and CLI parsing order defines */ #define PARSING_GLOBAL_CONF 1 @@ -67,18 +65,9 @@ struct cg_root_opt { enum NETWORK_LOCK_METHOD { NETWORK_LOCK_IPTABLES, NETWORK_LOCK_NFTABLES, - NETWORK_LOCK_SKIP, }; -/** - * CRIU currently defaults to the iptables locking backend. - * - * It is, however, possible to change this by defining - * NETWORK_LOCK_DEFAULT to a different value on the command-line. - */ -#ifndef NETWORK_LOCK_DEFAULT #define NETWORK_LOCK_DEFAULT NETWORK_LOCK_IPTABLES -#endif /* * Ghost file size we allow to carry by default. @@ -104,9 +93,6 @@ enum FILE_VALIDATION_OPTIONS { /* This constant dictates which file validation method should be tried by default. 
*/ #define FILE_VALIDATION_DEFAULT FILE_VALIDATION_BUILD_ID -/* This constant dictates that criu use fiemap to copy ghost file by default.*/ -#define FIEMAP_DEFAULT 1 - struct irmap; struct irmap_path_opt { @@ -114,23 +100,6 @@ struct irmap_path_opt { struct irmap *ir; }; -enum criu_mode { - CR_UNSET = 0, - CR_DUMP, - CR_PRE_DUMP, - CR_RESTORE, - CR_LAZY_PAGES, - CR_CHECK, - CR_PAGE_SERVER, - CR_SERVICE, - CR_SWRK, - CR_DEDUP, - CR_CPUINFO_DUMP, - CR_CPUINFO_CHECK, - CR_EXEC_DEPRECATED, - CR_SHOW_DEPRECATED, -}; - struct cr_options { int final_state; int check_extra_features; @@ -180,7 +149,6 @@ struct cr_options { int enable_external_masters; bool aufs; /* auto-detected, not via cli */ bool overlayfs; - int ghost_fiemap; #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED bool has_binfmt_misc; /* auto-detected */ #endif @@ -195,8 +163,6 @@ struct cr_options { bool lazy_pages; char *work_dir; int network_lock_method; - int skip_file_rwx_check; - int allow_uprobes; /* * When we scheduler for removal some functionality we first @@ -222,31 +188,6 @@ struct cr_options { /* This stores which method to use for file validation. */ int file_validation_method; - - /* Shows the mode criu is running at the moment: dump/pre-dump/restore/... */ - enum criu_mode mode; - - int mntns_compat_mode; - - /* Remember the program name passed to main() so we can use it in - * error messages elsewhere. - */ - char *argv_0; - /* - * This contains the eUID of the current CRIU user. It - * will only be set to a non-zero value if CRIU has - * the necessary capabilities to run as non root. - * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN - */ - uid_t uid; - /* This contains the value from capget()->effective */ - u32 cap_eff[_LINUX_CAPABILITY_U32S_3]; - /* - * If CRIU should be running as non-root with the help of - * CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN the user should - * explicitly request it as it comes with many limitations. 
- */ - int unprivileged; }; extern struct cr_options opts; diff --git a/criu/include/criu-log.h b/criu/include/criu-log.h index 9d52fbdb1..ae2f38489 100644 --- a/criu/include/criu-log.h +++ b/criu/include/criu-log.h @@ -26,6 +26,7 @@ extern int log_init(const char *output); extern void log_fini(void); extern int log_init_by_pid(pid_t pid); +extern void log_closedir(void); extern int log_keep_err(void); extern char *log_first_err(void); diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h index c3bea1385..897666ecd 100644 --- a/criu/include/criu-plugin.h +++ b/criu/include/criu-plugin.h @@ -22,8 +22,6 @@ #include #include -#include -#include #define CRIU_PLUGIN_GEN_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + (c)) #define CRIU_PLUGIN_VERSION_MAJOR 0 @@ -50,22 +48,6 @@ enum { CR_PLUGIN_HOOK__DUMP_EXT_LINK = 6, - CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA = 7, - - CR_PLUGIN_HOOK__UPDATE_VMA_MAP = 8, - - CR_PLUGIN_HOOK__RESUME_DEVICES_LATE = 9, - - CR_PLUGIN_HOOK__PAUSE_DEVICES = 10, - - CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11, - - CR_PLUGIN_HOOK__POST_FORKING = 12, - - CR_PLUGIN_HOOK__RESTORE_INIT = 13, - - CR_PLUGIN_HOOK__DUMP_DEVICES_LATE = 14, - CR_PLUGIN_HOOK__MAX }; @@ -74,19 +56,10 @@ enum { DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_UNIX_SK, int fd, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_UNIX_SK, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_FILE, int fd, int id); -DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id, bool *retry_needed); +DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_MOUNT, char *mountpoint, int id); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT, int id, char *mountpoint, char *old_root, int *is_file); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_LINK, int index, int type, char *kind); -DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, int fd, const struct stat *stat); 
-DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const uint64_t addr, - const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd); -DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid); -DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid); -DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid); -DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void); -DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_INIT, void); -DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, int id); enum { CR_PLUGIN_STAGE__DUMP, @@ -157,10 +130,5 @@ typedef int(cr_plugin_restore_file_t)(int id); typedef int(cr_plugin_dump_ext_mount_t)(char *mountpoint, int id); typedef int(cr_plugin_restore_ext_mount_t)(int id, char *mountpoint, char *old_root, int *is_file); typedef int(cr_plugin_dump_ext_link_t)(int index, int type, char *kind); -typedef int(cr_plugin_handle_device_vma_t)(int fd, const struct stat *stat); -typedef int(cr_plugin_update_vma_map_t)(const char *path, const uint64_t addr, const uint64_t old_pgoff, - uint64_t *new_pgoff, int *plugin_fd); -typedef int(cr_plugin_resume_devices_late_t)(int pid); -typedef int(cr_plugin_post_forking_t)(void); #endif /* __CRIU_PLUGIN_H__ */ diff --git a/criu/include/crtools.h b/criu/include/crtools.h index b54b9d929..b9309654f 100644 --- a/criu/include/crtools.h +++ b/criu/include/crtools.h @@ -26,7 +26,6 @@ extern int cr_pre_dump_tasks(pid_t pid); extern int cr_restore_tasks(void); extern int convert_to_elf(char *elf_path, int fd_core); extern int cr_check(void); -extern int check_caps(void); extern int cr_dedup(void); extern int cr_lazy_pages(bool daemon); diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index e987c18ce..f33918de8 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -19,15 +19,19 @@ enum faults { FI_HUGE_ANON_SHMEM_ID = 132, FI_CANNOT_MAP_VDSO = 133, FI_CORRUPT_EXTREGS = 134, - 
FI_DONT_USE_PAGEMAP_SCAN = 135, - FI_DUMP_CRASH = 136, - FI_COMPEL_INTERRUPT_ONLY_MODE = 137, - FI_PLUGIN_CUDA_FORCE_ENABLE = 138, FI_MAX, }; static inline bool __fault_injected(enum faults f, enum faults fi_strategy) { + /* + * Temporary workaround for Xen guests. Breakpoints degrade + * performance linearly, so until we find out the reason, + * let's disable them. + */ + if (f == FI_NO_BREAKPOINTS) + return true; + return fi_strategy == f; } diff --git a/criu/include/file-lock.h b/criu/include/file-lock.h index 9ab79b66b..0ce2fa340 100644 --- a/criu/include/file-lock.h +++ b/criu/include/file-lock.h @@ -30,12 +30,12 @@ #define LOCK_SH 1 /* shared lock */ #define LOCK_EX 2 /* exclusive lock */ #define LOCK_NB \ - 4 /* or'd with one of the above to prevent + 4 /* or'd with one of the above to prevent blocking */ #define LOCK_UN 8 /* remove lock */ -#define LOCK_MAND 32 /* This is a mandatory flock ... */ -#define LOCK_READ 64 /* which allows concurrent read operations */ +#define LOCK_MAND 32 /* This is a mandatory flock ... 
*/ +#define LOCK_READ 64 /* which allows concurrent read operations */ #define LOCK_WRITE 128 /* which allows concurrent write operations */ #define LOCK_RW 192 /* which allows concurrent read & write ops */ @@ -47,7 +47,7 @@ struct file_lock { int fl_kind; int fl_ltype; - pid_t fl_owner; /* process, which created the lock */ + pid_t fl_owner; /* process, which created the lock */ pid_t fl_holder; /* pid of fd on whose the lock is found */ int maj, min; unsigned long i_no; diff --git a/criu/include/files-reg.h b/criu/include/files-reg.h index d4934c4ae..953d50288 100644 --- a/criu/include/files-reg.h +++ b/criu/include/files-reg.h @@ -58,7 +58,4 @@ static inline int link_strip_deleted(struct fd_link *link) extern int dead_pid_conflict(void); -extern int rm_parent_dirs(int mntns_root, char *path, int count); -extern int make_parent_dirs_if_need(int mntns_root, char *path); - #endif /* __CR_FILES_REG_H__ */ diff --git a/criu/include/files.h b/criu/include/files.h index 31ebb0ca0..96face71b 100644 --- a/criu/include/files.h +++ b/criu/include/files.h @@ -82,8 +82,8 @@ enum { struct fdinfo_list_entry { struct list_head desc_list; /* To chain on @fd_info_head */ - struct file_desc *desc; /* Associated file descriptor */ - struct list_head ps_list; /* To chain per-task files */ + struct file_desc *desc; /* Associated file descriptor */ + struct list_head ps_list; /* To chain per-task files */ struct pstree_item *task; FdinfoEntry *fe; int pid; @@ -121,12 +121,12 @@ unsigned int find_unused_fd(struct pstree_item *, int hint_fd); struct fdinfo_list_entry *find_used_fd(struct pstree_item *, int fd); struct file_desc { - u32 id; /* File id, unique */ - struct hlist_node hash; /* Descriptor hashing and lookup */ - struct list_head fd_info_head; /* Chain of fdinfo_list_entry-s with same ID and type but different pids */ - struct file_desc_ops *ops; /* Associated operations */ + u32 id; /* File id, unique */ + struct hlist_node hash; /* Descriptor hashing and lookup */ + struct 
list_head fd_info_head; /* Chain of fdinfo_list_entry-s with same ID and type but different pids */ + struct file_desc_ops *ops; /* Associated operations */ struct list_head fake_master_list; /* To chain in the list of file_desc, which don't - * have a fle in a task, that having permissions */ + have a fle in a task, that having permissions */ }; struct fdtype_ops { @@ -195,5 +195,4 @@ extern int open_transport_socket(void); extern int set_fds_event(pid_t virt); extern void wait_fds_event(void); -int find_unused_fd_pid(pid_t pid); #endif /* __CR_FILES_H__ */ diff --git a/criu/include/fs-magic.h b/criu/include/fs-magic.h index ffc0455d5..46ac8aa27 100644 --- a/criu/include/fs-magic.h +++ b/criu/include/fs-magic.h @@ -53,12 +53,4 @@ #define AUTOFS_SUPER_MAGIC 0x0187 #endif -#ifndef OVERLAYFS_SUPER_MAGIC -#define OVERLAYFS_SUPER_MAGIC 0x794c7630 -#endif - -#ifndef PID_FS_MAGIC -#define PID_FS_MAGIC 0x50494446 -#endif - #endif /* __CR_FS_MAGIC_H__ */ diff --git a/criu/include/hugetlb.h b/criu/include/hugetlb.h deleted file mode 100644 index 9aee5bed3..000000000 --- a/criu/include/hugetlb.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef __CR_HUGETLB_H_ -#define __CR_HUGETLB_H_ - -#include -#include - -#include "vma.h" - -#define ANON_HUGEPAGE_PREFIX "/anon_hugepage" -#define ANON_HUGEPAGE_PREFIX_LEN (sizeof(ANON_HUGEPAGE_PREFIX) - 1) - -enum hugepage_size { - HUGETLB_16KB, - HUGETLB_64KB, - HUGETLB_512KB, - HUGETLB_1MB, - HUGETLB_2MB, - HUGETLB_8MB, - HUGETLB_16MB, - HUGETLB_32MB, - HUGETLB_256MB, - HUGETLB_512MB, - HUGETLB_1GB, - HUGETLB_2GB, - HUGETLB_16GB, - HUGETLB_MAX -}; - -#define MAP_HUGETLB_SHIFT 26 -#define MAP_HUGETLB_SIZE_MASK (0x3f << MAP_HUGETLB_SHIFT) - -#define MAP_HUGETLB_16KB (14 << MAP_HUGETLB_SHIFT) -#define MAP_HUGETLB_64KB (16 << MAP_HUGETLB_SHIFT) -#define MAP_HUGETLB_512KB (19 << MAP_HUGETLB_SHIFT) -#define MAP_HUGETLB_1MB (20 << MAP_HUGETLB_SHIFT) -#define MAP_HUGETLB_2MB (21 << MAP_HUGETLB_SHIFT) -#define MAP_HUGETLB_8MB (23 << MAP_HUGETLB_SHIFT) 
-#define MAP_HUGETLB_16MB (24 << MAP_HUGETLB_SHIFT) -#define MAP_HUGETLB_32MB (25 << MAP_HUGETLB_SHIFT) -#define MAP_HUGETLB_256MB (28 << MAP_HUGETLB_SHIFT) -#define MAP_HUGETLB_512MB (29 << MAP_HUGETLB_SHIFT) -#define MAP_HUGETLB_1GB (30 << MAP_HUGETLB_SHIFT) -#define MAP_HUGETLB_2GB (31 << MAP_HUGETLB_SHIFT) -#define MAP_HUGETLB_16GB (34 << MAP_HUGETLB_SHIFT) - -struct htlb_info { - unsigned long long size; - int flag; -}; - -extern struct htlb_info hugetlb_info[HUGETLB_MAX]; - -int is_hugetlb_dev(dev_t dev, int *hugetlb_size_flag); -int can_dump_with_memfd_hugetlb(dev_t dev, int *hugetlb_size_flag, const char *file_path, struct vma_area *vma); -unsigned long get_size_from_hugetlb_flag(int flag); - -#ifndef MFD_HUGETLB -#define MFD_HUGETLB 4 -#endif - -#endif diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h index 79e1ac111..5045baee8 100644 --- a/criu/include/image-desc.h +++ b/criu/include/image-desc.h @@ -113,7 +113,6 @@ enum { CR_FD_PIPES, CR_FD_TTY_FILES, CR_FD_MEMFD_FILE, - CR_FD_PIDFD, CR_FD_AUTOFS, @@ -123,8 +122,8 @@ enum { /* file descriptors template */ struct cr_fd_desc_tmpl { const char *fmt; /* format for the name */ - u32 magic; /* magic in the header */ - int oflags; /* flags for image_open */ + u32 magic; /* magic in the header */ + int oflags; /* flags for image_open */ }; extern struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX]; diff --git a/criu/include/image.h b/criu/include/image.h index 30e32323d..14659dbd2 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -35,15 +35,13 @@ * - stack * the memory area is used in application stack so we * should be careful about guard page here - * - shadow stack - * the memory area is used by shadow stack * - vsyscall * special memory area injected into the task memory * space by the kernel itself, represent virtual syscall * implementation and it is specific to every kernel version, * its contents should not be dumped ever * - vdso,vvar - * the vDSO area, it might require 
additional memory + * the vDSO area, it might require additional memory * contents modification especially when tasks are * migrating between different kernel versions * - heap @@ -68,18 +66,6 @@ * processing exiting with error; while the rest of bits * are part of image ABI, this particular one must never * be used in image. - * - guard - * stands for a fake VMA (not represented in the kernel - * by a struct vm_area_struct). Used to keep an information - * about virtual address space ranges covered by - * MADV_GUARD_INSTALL guards. These ones must be always at - * the end of the vma_area_list and properly skipped a.e. - * - uprobes - * stands for a "[uprobes]" vma that's automatically mapped by - * the kernel when an active uprobe is hit. Contents of this vma - * are not dumped and neither are its madvise bits restored, - * because the kernel is in complete control of this vma. This is - * just used to track the existence of the uprobes vma. */ #define VMA_AREA_NONE (0 << 0) #define VMA_AREA_REGULAR (1 << 0) @@ -98,11 +84,7 @@ #define VMA_AREA_VVAR (1 << 12) #define VMA_AREA_AIORING (1 << 13) #define VMA_AREA_MEMFD (1 << 14) -#define VMA_AREA_SHSTK (1 << 15) -#define VMA_AREA_GUARD (1 << 16) -#define VMA_AREA_UPROBES (1 << 17) -#define VMA_EXT_PLUGIN (1 << 27) #define VMA_CLOSE (1 << 28) #define VMA_NO_PROT_WRITE (1 << 29) #define VMA_PREMMAPED (1 << 30) @@ -114,8 +96,6 @@ #define CR_PARENT_LINK "parent" -#define OPT_ALLOW_UPROBES "allow-uprobes" - extern bool ns_per_id; extern bool img_common_magic; @@ -165,7 +145,7 @@ static inline int img_raw_fd(struct cr_img *img) extern off_t img_raw_size(struct cr_img *img); -extern int open_image_dir(const char *dir, int mode); +extern int open_image_dir(char *dir, int mode); extern void close_image_dir(void); /* * Return -1 -- parent symlink points to invalid target @@ -193,8 +173,4 @@ extern int read_img_str(struct cr_img *, char **pstr, int size); extern void close_image(struct cr_img *); -extern int
add_inventory_plugin(const char *name); -extern int check_inventory_plugins(void); -extern bool check_and_remove_inventory_plugin(const char *name, size_t n); - #endif /* __CR_IMAGE_H__ */ diff --git a/criu/include/inet_diag.h b/criu/include/inet_diag.h index 4996dd556..ea6f5e14e 100644 --- a/criu/include/inet_diag.h +++ b/criu/include/inet_diag.h @@ -31,7 +31,7 @@ struct inet_diag_req_compat { struct inet_diag_sockid id; __u32 idiag_states; /* States to dump */ - __u32 idiag_dbs; /* Tables to dump (NI) */ + __u32 idiag_dbs; /* Tables to dump (NI) */ }; struct inet_diag_req_v2 { diff --git a/criu/include/kcmp.h b/criu/include/kcmp.h index 575135f80..a6774be47 100644 --- a/criu/include/kcmp.h +++ b/criu/include/kcmp.h @@ -18,8 +18,8 @@ enum kcmp_type { /* Slot for KCMP_EPOLL_TFD */ typedef struct { - uint32_t efd; /* epoll file descriptor */ - uint32_t tfd; /* target file number */ + uint32_t efd; /* epoll file descriptor */ + uint32_t tfd; /* target file number */ uint32_t toff; /* target offset within same numbered sequence */ } kcmp_epoll_slot_t; diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index e4922f401..80bad7f11 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -6,8 +6,6 @@ #include "common/config.h" #include "asm/kerndat.h" #include "util-vdso.h" -#include "hugetlb.h" -#include struct stat; @@ -20,7 +18,7 @@ extern int kerndat_init(void); enum pagemap_func { PM_UNKNOWN, - PM_DISABLED, /* /proc/pid/pagemap doesn't open (user mode) */ + PM_DISABLED, /* /proc/pid/pagemap doesn't open (user mode) */ PM_FLAGS_ONLY, /* pagemap zeroes pfn part (user mode) */ PM_FULL, }; @@ -38,7 +36,6 @@ struct kerndat_s { u64 zero_page_pfn; bool has_dirty_track; bool has_memfd; - bool has_memfd_hugetlb; bool has_fdinfo_lock; unsigned long task_size; bool ipv6; @@ -77,22 +74,6 @@ struct kerndat_s { bool has_pidfd_getfd; bool has_nspid; bool has_nftables_concat; - bool has_sockopt_buf_lock; - dev_t hugetlb_dev[HUGETLB_MAX]; - bool 
has_move_mount_set_group; - bool has_openat2; - bool has_rseq; - bool has_ptrace_get_rseq_conf; - struct __ptrace_rseq_configuration libc_rseq_conf; - bool has_ipv6_freebind; - bool has_membarrier_get_registrations; - bool has_pagemap_scan; - bool has_shstk; - bool has_close_range; - bool has_timer_cr_ids; - bool has_breakpoints; - bool has_madv_guard; - bool has_pagemap_scan_guard_pages; }; extern struct kerndat_s kdat; @@ -115,6 +96,4 @@ extern int kerndat_fs_virtualized(unsigned int which, u32 kdev); extern int kerndat_has_nspid(void); -extern void kerndat_warn_about_madv_guards(void); - #endif /* __CR_KERNDAT_H__ */ diff --git a/criu/include/linux/aio_abi.h b/criu/include/linux/aio_abi.h deleted file mode 100644 index d9ce78720..000000000 --- a/criu/include/linux/aio_abi.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef __LINUX__AIO_ABI_H -#define __LINUX__AIO_ABI_H - -typedef __kernel_ulong_t aio_context_t; - -/* read() from /dev/aio returns these structures. */ -struct io_event { - __u64 data; /* the data field from the iocb */ - __u64 obj; /* what iocb this event came from */ - __s64 res; /* result code for this event */ - __s64 res2; /* secondary result */ -}; - -#endif /* __LINUX__AIO_ABI_H */ diff --git a/criu/include/linux/mount.h b/criu/include/linux/mount.h index fefafa89e..840d6277e 100644 --- a/criu/include/linux/mount.h +++ b/criu/include/linux/mount.h @@ -4,40 +4,32 @@ #include "common/config.h" #include "compel/plugins/std/syscall-codes.h" -/* Copied from /usr/include/sys/mount.h */ - -#ifndef FSOPEN_CLOEXEC -/* The type of fsconfig call made. 
*/ +#ifdef CONFIG_HAS_FSCONFIG +#include +#else enum fsconfig_command { FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ -#define FSCONFIG_SET_FLAG FSCONFIG_SET_FLAG FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ -#define FSCONFIG_SET_STRING FSCONFIG_SET_STRING FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ -#define FSCONFIG_SET_BINARY FSCONFIG_SET_BINARY FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ -#define FSCONFIG_SET_PATH FSCONFIG_SET_PATH FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ -#define FSCONFIG_SET_PATH_EMPTY FSCONFIG_SET_PATH_EMPTY FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ -#define FSCONFIG_SET_FD FSCONFIG_SET_FD FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ -#define FSCONFIG_CMD_CREATE FSCONFIG_CMD_CREATE FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ -#define FSCONFIG_CMD_RECONFIGURE FSCONFIG_CMD_RECONFIGURE }; - -#endif // FSOPEN_CLOEXEC - -/* fsopen flags. With the redundant definition, we check if the kernel, - * glibc value and our value still match. - */ -#define FSOPEN_CLOEXEC 0x00000001 - -#ifndef MS_MGC_VAL -/* Magic mount flag number. Has to be or-ed to the flag values. 
*/ -#define MS_MGC_VAL 0xc0ed0000 /* Magic flag number to indicate "new" flags */ -#define MS_MGC_MSK 0xffff0000 /* Magic flag number mask */ #endif +static inline int sys_fsopen(const char *fsname, unsigned int flags) +{ + return syscall(__NR_fsopen, fsname, flags); +} +static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) +{ + return syscall(__NR_fsconfig, fd, cmd, key, value, aux); +} +static inline int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) +{ + return syscall(__NR_fsmount, fd, flags, attr_flags); +} + #endif diff --git a/criu/include/linux/openat2.h b/criu/include/linux/openat2.h deleted file mode 100644 index 1e9ccff05..000000000 --- a/criu/include/linux/openat2.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef _CRIU_LINUX_OPENAT2_H -#define _CRIU_LINUX_OPENAT2_H - -#include - -#include "common/config.h" - -#ifdef CONFIG_HAS_OPENAT2 -#include -#else -struct open_how { - __u64 flags; - __u64 mode; - __u64 resolve; -}; -#endif - -#endif diff --git a/criu/include/linux/rseq.h b/criu/include/linux/rseq.h deleted file mode 100644 index 5ceefbf8e..000000000 --- a/criu/include/linux/rseq.h +++ /dev/null @@ -1,154 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ -#ifndef _UAPI_LINUX_RSEQ_H -#define _UAPI_LINUX_RSEQ_H - -#ifdef __has_include -#if __has_include("sys/rseq.h") -#include -#include "asm/thread_pointer.h" -#endif -#endif - -#include -#include - -#include "common/config.h" - -#ifdef CONFIG_HAS_NO_LIBC_RSEQ_DEFS -/* - * linux/rseq.h - * - * Restartable sequences system call API - * - * Copyright (c) 2015-2018 Mathieu Desnoyers - */ - -enum rseq_cpu_id_state { - RSEQ_CPU_ID_UNINITIALIZED = -1, - RSEQ_CPU_ID_REGISTRATION_FAILED = -2, -}; - -enum rseq_flags { - RSEQ_FLAG_UNREGISTER = (1 << 0), -}; - -enum rseq_cs_flags_bit { - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0, - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1, - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2, -}; - -enum 
rseq_cs_flags { - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT), - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), -}; -#endif /* CONFIG_HAS_NO_LIBC_RSEQ_DEFS */ - -/* - * Let's use our own definition of struct rseq_cs because some distros - * (for example Mariner GNU/Linux) declares this structure their-own way. - * This makes trouble with inconsistency between printf formatters and - * struct rseq_cs field types. - */ -/* - * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always - * contained within a single cache-line. It is usually declared as - * link-time constant data. - */ -struct criu_rseq_cs { - /* Version of this structure. */ - __u32 version; - /* enum rseq_cs_flags */ - __u32 flags; - __u64 start_ip; - /* Offset from start_ip. */ - __u64 post_commit_offset; - __u64 abort_ip; -} __attribute__((aligned(4 * sizeof(__u64)))); - -/* - * We have to have our own copy of struct rseq definition because - * of breaking UAPI change: - * https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/commit/?id=bfdf4e6208051ed7165b2e92035b4bf11f43eb63 - */ -/* - * struct rseq is aligned on 4 * 8 bytes to ensure it is always - * contained within a single cache-line. - * - * A single struct rseq per thread is allowed. - */ -struct criu_rseq { - /* - * Restartable sequences cpu_id_start field. Updated by the - * kernel. Read by user-space with single-copy atomicity - * semantics. This field should only be read by the thread which - * registered this data structure. Aligned on 32-bit. Always - * contains a value in the range of possible CPUs, although the - * value may not be the actual current CPU (e.g. if rseq is not - * initialized). 
This CPU number value should always be compared - * against the value of the cpu_id field before performing a rseq - * commit or returning a value read from a data structure indexed - * using the cpu_id_start value. - */ - __u32 cpu_id_start; - /* - * Restartable sequences cpu_id field. Updated by the kernel. - * Read by user-space with single-copy atomicity semantics. This - * field should only be read by the thread which registered this - * data structure. Aligned on 32-bit. Values - * RSEQ_CPU_ID_UNINITIALIZED and RSEQ_CPU_ID_REGISTRATION_FAILED - * have a special semantic: the former means "rseq uninitialized", - * and latter means "rseq initialization failed". This value is - * meant to be read within rseq critical sections and compared - * with the cpu_id_start value previously read, before performing - * the commit instruction, or read and compared with the - * cpu_id_start value before returning a value loaded from a data - * structure indexed using the cpu_id_start value. - */ - __u32 cpu_id; - /* - * Restartable sequences rseq_cs field. - * - * Contains NULL when no critical section is active for the current - * thread, or holds a pointer to the currently active struct rseq_cs. - * - * Updated by user-space, which sets the address of the currently - * active rseq_cs at the beginning of assembly instruction sequence - * block, and set to NULL by the kernel when it restarts an assembly - * instruction sequence block, as well as when the kernel detects that - * it is preempting or delivering a signal outside of the range - * targeted by the rseq_cs. Also needs to be set to NULL by user-space - * before reclaiming memory that contains the targeted struct rseq_cs. - * - * Read and set by the kernel. Set by user-space with single-copy - * atomicity semantics. This field should only be updated by the - * thread which registered this data structure. Aligned on 64-bit. 
- * - * 32-bit architectures should update the low order bits of the - * rseq_cs field, leaving the high order bits initialized to 0. - */ - __u64 rseq_cs; - - /* - * Restartable sequences flags field. - * - * This field should only be updated by the thread which - * registered this data structure. Read by the kernel. - * Mainly used for single-stepping through rseq critical sections - * with debuggers. - * - * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT - * Inhibit instruction sequence block restart on preemption - * for this thread. - * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL - * Inhibit instruction sequence block restart on signal - * delivery for this thread. - * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE - * Inhibit instruction sequence block restart on migration for - * this thread. - */ - __u32 flags; -} __attribute__((aligned(4 * sizeof(__u64)))); - -#endif /* _UAPI_LINUX_RSEQ_H */ diff --git a/criu/include/log.h b/criu/include/log.h index cbed33007..85e6dc2e7 100644 --- a/criu/include/log.h +++ b/criu/include/log.h @@ -60,8 +60,6 @@ void flush_early_log_buffer(int fd); #define pr_perror(fmt, ...) pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) -#define pr_pwarn(fmt, ...) pr_warn(fmt ": %s\n", ##__VA_ARGS__, strerror(errno)) - #endif /* CR_NOGLIBC */ #endif /* __CR_LOG_H__ */ diff --git a/criu/include/magic.h b/criu/include/magic.h index 6f0aff26d..22d7218e4 100644 --- a/criu/include/magic.h +++ b/criu/include/magic.h @@ -29,7 +29,7 @@ /* * The magic-s below correspond to coordinates - * of various towns in the NNNNEEEE form. + * of various Russian towns in the NNNNEEEE form. 
*/ #define INVENTORY_MAGIC 0x58313116 /* Veliky Novgorod */ @@ -100,7 +100,6 @@ #define BPFMAP_FILE_MAGIC 0x57506142 /* Alapayevsk */ #define BPFMAP_DATA_MAGIC 0x64324033 /* Arkhangelsk */ #define APPARMOR_MAGIC 0x59423047 /* Nikolskoye */ -#define PIDFD_MAGIC 0x54435556 /* Ufa */ #define IFADDR_MAGIC RAW_IMAGE_MAGIC #define ROUTE_MAGIC RAW_IMAGE_MAGIC diff --git a/criu/include/mem.h b/criu/include/mem.h index e9ce3518a..03574ea3d 100644 --- a/criu/include/mem.h +++ b/criu/include/mem.h @@ -7,7 +7,6 @@ #include "pid.h" #include "proc_parse.h" #include "inventory.pb-c.h" -#include "pagemap-cache.h" struct parasite_ctl; struct vm_area_list; @@ -31,12 +30,10 @@ extern int do_task_reset_dirty_track(int pid); extern unsigned long dump_pages_args_size(struct vm_area_list *vmas); extern int parasite_dump_pages_seized(struct pstree_item *item, struct vm_area_list *vma_area_list, struct mem_dump_ctl *mdc, struct parasite_ctl *ctl); -extern int collect_madv_guards(pid_t pid, struct vm_area_list *vma_area_list); #define PME_PRESENT (1ULL << 63) #define PME_SWAP (1ULL << 62) #define PME_FILE (1ULL << 61) -#define PME_GUARD_REGION (1ULL << 58) #define PME_SOFT_DIRTY (1ULL << 55) #define PME_PSHIFT_BITS (6) #define PME_STATUS_BITS (3) @@ -50,12 +47,5 @@ int open_vmas(struct pstree_item *t); int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta); int unmap_guard_pages(struct pstree_item *t); int prepare_mappings(struct pstree_item *t); - -struct page_info { - u64 next; - bool softdirty; -}; - -int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *page_info); - +bool should_dump_page(VmaEntry *vmae, u64 pme); #endif /* __CR_MEM_H__ */ diff --git a/criu/include/memfd.h b/criu/include/memfd.h index 78d810019..1b1dc79bb 100644 --- a/criu/include/memfd.h +++ b/criu/include/memfd.h @@ -1,9 +1,7 @@ #ifndef __CR_MEMFD_H__ #define __CR_MEMFD_H__ -#include #include - #include "int.h" #include "common/config.h" @@ -14,7 +12,7 @@ extern int 
is_memfd(dev_t dev); extern int dump_one_memfd_cond(int lfd, u32 *id, struct fd_parms *parms); extern const struct fdtype_ops memfd_dump_ops; -extern int memfd_open(struct file_desc *d, u32 *fdflags, bool filemap); +extern int memfd_open(struct file_desc *d, u32 *fdflags); extern struct collect_image_info memfd_cinfo; extern struct file_desc *collect_memfd(u32 id); extern int apply_memfd_seals(void); diff --git a/criu/include/mman.h b/criu/include/mman.h index 43e0b6cc7..8ca71fadf 100644 --- a/criu/include/mman.h +++ b/criu/include/mman.h @@ -4,9 +4,6 @@ #ifndef MAP_HUGETLB #define MAP_HUGETLB 0x40000 #endif -#ifndef MAP_DROPPABLE -#define MAP_DROPPABLE 0x08 -#endif #ifndef MADV_HUGEPAGE #define MADV_HUGEPAGE 14 #endif @@ -16,11 +13,5 @@ #ifndef MADV_DONTDUMP #define MADV_DONTDUMP 16 #endif -#ifndef MADV_WIPEONFORK -#define MADV_WIPEONFORK 18 -#endif -#ifndef MADV_GUARD_INSTALL -#define MADV_GUARD_INSTALL 102 -#endif #endif /* __CR_MMAN_H__ */ diff --git a/criu/include/mount-v2.h b/criu/include/mount-v2.h deleted file mode 100644 index 096f08f3b..000000000 --- a/criu/include/mount-v2.h +++ /dev/null @@ -1,95 +0,0 @@ -#ifndef __CR_MOUNT_V2_H__ -#define __CR_MOUNT_V2_H__ - -#include "linux/mount.h" -#include "linux/openat2.h" - -#include "common/list.h" - -#include - -#ifndef MOVE_MOUNT_SET_GROUP -#define MOVE_MOUNT_SET_GROUP 0x00000100 /* Set sharing group instead */ -#endif -#ifndef MOVE_MOUNT_F_EMPTY_PATH -#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */ -#endif -#ifndef MOVE_MOUNT_T_EMPTY_PATH -#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ -#endif - -static inline int sys_move_mount(int from_dirfd, const char *from_pathname, int to_dirfd, const char *to_pathname, - unsigned int flags) -{ - return syscall(__NR_move_mount, from_dirfd, from_pathname, to_dirfd, to_pathname, flags); -} - -#ifndef OPEN_TREE_CLONE -#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */ -#endif -#ifndef 
OPEN_TREE_CLOEXEC -#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */ -#endif -#ifndef AT_SYMLINK_NOFOLLOW -#define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */ -#endif -#ifndef AT_NO_AUTOMOUNT -#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */ -#endif -#ifndef AT_EMPTY_PATH -#define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */ -#endif -#ifndef AT_RECURSIVE -#define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ -#endif - -static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags) -{ - return syscall(__NR_open_tree, dfd, filename, flags); -} - -#ifndef RESOLVE_NO_XDEV -#define RESOLVE_NO_XDEV 0x01 /* Block mount-point crossings (includes bind-mounts). */ -#endif - -static inline long sys_openat2(int dirfd, const char *pathname, struct open_how *how, size_t size) -{ - return syscall(__NR_openat2, dirfd, pathname, how, size); -} - -extern int check_mount_v2(void); - -struct sharing_group { - /* This pair identifies the group */ - int shared_id; - int master_id; - - /* List of shared groups */ - struct list_head list; - - /* List of mounts in this group */ - struct list_head mnt_list; - - /* - * List of dependent shared groups: - * - all siblings have equal master_id - * - the parent has shared_id equal to children's master_id - * - * This is a bit tricky: parent pointer indicates if there is one - * parent sharing_group in list or only siblings. 
- * So for traversal if parent pointer is set we can do: - * list_for_each_entry(t, &sg->parent->children, siblings) - * and otherwise we can do: - * list_for_each_entry(t, &sg->siblings, siblings) - */ - struct list_head children; - struct list_head siblings; - struct sharing_group *parent; - - char *source; -}; - -extern int resolve_shared_mounts_v2(void); -extern int prepare_mnt_ns_v2(void); - -#endif /* __CR_MOUNT_V2_H__ */ diff --git a/criu/include/mount.h b/criu/include/mount.h index 6587c63b2..29d80c2a7 100644 --- a/criu/include/mount.h +++ b/criu/include/mount.h @@ -10,20 +10,6 @@ struct pstree_item; struct fstype; struct ns_id; -#define MS_PROPAGATE (MS_SHARED | MS_PRIVATE | MS_UNBINDABLE | MS_SLAVE) - -/* - * Here are a set of flags which we know how to handle for the one mount call. - * All of them except MS_RDONLY are set only as mnt flags. - * MS_RDONLY is set for both mnt and sb flags, so we can restore it for one - * mount call only if it set for both masks. - */ -#define MS_MNT_KNOWN_FLAGS (MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_RDONLY) - -#define BINFMT_MISC_HOME "proc/sys/fs/binfmt_misc" - -#define HELPER_MNT_ID 0 - #define MOUNT_INVALID_DEV (0) #define MNT_UNREACHABLE INT_MIN @@ -40,10 +26,6 @@ struct ns_id; */ #define REMOUNTED_RW_SERVICE 2 -struct rst_mount_info { - int remounted_rw; -}; - struct mount_info { int mnt_id; int parent_mnt_id; @@ -62,15 +44,6 @@ struct mount_info { */ char *mountpoint; char *ns_mountpoint; - - /* Mount-v2 specific */ - char *plain_mountpoint; - int is_dir; - int mp_fd_id; - int mnt_fd_id; - struct sharing_group *sg; - struct list_head mnt_sharing; - int fd; unsigned flags; unsigned sb_flags; @@ -87,8 +60,6 @@ struct mount_info { bool need_plugin; bool is_ns_root; bool deleted; - int deleted_level; - struct list_head deleted_list; struct mount_info *next; struct ns_id *nsid; @@ -101,49 +72,24 @@ struct mount_info { struct list_head children; struct list_head siblings; - 
struct list_head mnt_bind; /* circular list of derivatives of one real mount */ - bool mnt_bind_is_populated; /* indicate that mnt_bind list is ready to use */ - struct list_head mnt_share; /* circular list of shared mounts */ + struct list_head mnt_bind; /* circular list of derivatives of one real mount */ + struct list_head mnt_share; /* circular list of shared mounts */ struct list_head mnt_slave_list; /* list of slave mounts */ - struct list_head mnt_slave; /* slave list entry */ - struct list_head mnt_ext_slave; /* external slave list entry */ - struct mount_info *mnt_master; /* slave is on master->mnt_slave_list */ - struct list_head mnt_propagate; /* circular list of mounts which propagate from each other */ - struct list_head mnt_notprop; /* temporary list used in can_mount_now */ + struct list_head mnt_slave; /* slave list entry */ + struct mount_info *mnt_master; /* slave is on master->mnt_slave_list */ + struct list_head mnt_propagate; /* circular list of mounts which propagate from each other */ + struct list_head mnt_notprop; /* temporary list used in can_mount_now */ struct list_head mnt_unbindable; /* list of mounts with delayed unbindable */ struct list_head postpone; int is_overmounted; - - struct rst_mount_info *rmi; + int remounted_rw; void *private; /* associated filesystem data */ }; extern struct mount_info *mntinfo; - -extern void mntinfo_add_list_before(struct mount_info **head, struct mount_info *new); - -/* - * Put a : in here since those are invalid on - * the cli, so we know it's autogenerated in - * debugging. 
- */ -#define AUTODETECTED_MOUNT "CRIU:AUTOGENERATED" -#define EXTERNAL_DEV_MOUNT "CRIU:EXTERNAL_DEV" -#define NO_ROOT_MOUNT "CRIU:NO_ROOT" - -static inline bool mnt_is_dev_external(struct mount_info *mi) -{ - return mi->external && !strcmp(mi->external, EXTERNAL_DEV_MOUNT); -} - -static inline bool mnt_is_nodev_external(struct mount_info *mi) -{ - return mi->external && strcmp(mi->external, EXTERNAL_DEV_MOUNT); -} - extern struct ns_desc mnt_ns_desc; #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED extern int collect_binfmt_misc(void); @@ -154,7 +100,7 @@ static inline int collect_binfmt_misc(void) } #endif -extern struct mount_info *mnt_entry_alloc(bool rst); +extern struct mount_info *mnt_entry_alloc(void); extern void mnt_entry_free(struct mount_info *mi); extern int __mntns_get_root_fd(pid_t pid); @@ -163,9 +109,7 @@ extern int mntns_get_root_by_mnt_id(int mnt_id); extern struct ns_id *lookup_nsid_by_mnt_id(int mnt_id); extern int open_mount(unsigned int s_dev); -extern int __check_mountpoint_fd(struct mount_info *pm, int mnt_fd, bool parse_mountinfo); -extern int check_mountpoint_fd(struct mount_info *pm, int mnt_fd); -extern int __open_mountpoint(struct mount_info *pm); +extern int __open_mountpoint(struct mount_info *pm, int mnt_fd); extern int mnt_is_dir(struct mount_info *pm); extern int open_mountpoint(struct mount_info *pm); @@ -193,10 +137,8 @@ extern int read_mnt_ns_img(void); extern void cleanup_mnt_ns(void); extern void clean_cr_time_mounts(void); -extern char *get_plain_mountpoint(int mnt_id, char *name); - extern bool add_skip_mount(const char *mountpoint); -extern int get_sdev_from_fd(int fd, unsigned int *sdev, bool parse_mountinfo); +struct ns_id; extern struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump); extern int check_mnt_id(void); @@ -205,35 +147,4 @@ extern int remount_readonly_mounts(void); extern int try_remount_writable(struct mount_info *mi, bool ns); extern bool mnt_is_overmounted(struct mount_info *mi); -extern 
struct mount_info *mnt_get_external_bind(struct mount_info *mi); -extern bool mnt_is_external_bind(struct mount_info *mi); -extern bool has_mounted_external_bind(struct mount_info *mi); -extern bool rst_mnt_is_root(struct mount_info *mi); -extern struct mount_info *mnt_get_root_bind(struct mount_info *mi); -extern bool mnt_is_root_bind(struct mount_info *mi); -extern struct mount_info *mnt_get_external_bind_nodev(struct mount_info *mi); - -extern struct mount_info *mnt_bind_pick(struct mount_info *mi, - bool (*pick)(struct mount_info *mi, struct mount_info *bind)); - -extern int mnt_tree_for_each(struct mount_info *start, int (*fn)(struct mount_info *)); - -extern char *service_mountpoint(const struct mount_info *mi); - -extern int validate_mounts(struct mount_info *info, bool for_dump); -extern __maybe_unused struct mount_info *add_cr_time_mount(struct mount_info *root, char *fsname, const char *path, - unsigned int s_dev, bool rst); -extern char *resolve_source(struct mount_info *mi); -extern int fetch_rt_stat(struct mount_info *m, const char *where); -extern int do_simple_mount(struct mount_info *mi, const char *src, const char *fstype, unsigned long mountflags); -extern char *mnt_fsname(struct mount_info *mi); -extern int apply_sb_flags(void *args, int fd, pid_t pid); -extern int mount_root(void *args, int fd, pid_t pid); -extern int restore_ext_mount(struct mount_info *mi); -extern int cr_pivot_root(char *root); -extern int print_ns_root(struct ns_id *ns, int remap_id, char *buf, int bs); - -extern struct mount_info *root_yard_mp; -extern char *mnt_roots; - #endif /* __CR_MOUNT_H__ */ diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h index 183a3b852..034605917 100644 --- a/criu/include/namespaces.h +++ b/criu/include/namespaces.h @@ -1,8 +1,6 @@ #ifndef __CR_NS_H__ #define __CR_NS_H__ -#include - #include "common/compiler.h" #include "files.h" #include "common/list.h" @@ -130,9 +128,9 @@ struct ns_id { */ union { int nsfd_id; /* a namespace 
descriptor id in fdstore */ - int ns_fd; /* a namespace file descriptor */ + int ns_fd; /* a namespace file descriptor */ }; - int nlsk; /* for sockets collection */ + int nlsk; /* for sockets collection */ int seqsk; /* to talk to parasite daemons */ struct list_head ids; struct list_head links; @@ -226,19 +224,4 @@ extern int add_ns_shared_cb(int (*actor)(void *data), void *data); extern struct ns_id *get_socket_ns(int lfd); extern struct ns_id *lookup_ns_by_kid(unsigned int kid, struct ns_desc *nd); -struct unsc_msg { - struct msghdr h; - /* - * 0th is the call address - * 1st is the flags - * 2nd is the optional (NULL in response) arguments - */ - struct iovec iov[3]; - char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; -}; - -extern void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid); -extern void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd); -extern int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)); - #endif /* __CR_NS_H__ */ diff --git a/criu/include/net.h b/criu/include/net.h index 7c5ede21e..0da4cad13 100644 --- a/criu/include/net.h +++ b/criu/include/net.h @@ -31,7 +31,7 @@ extern int collect_net_namespaces(bool for_dump); extern int network_lock(void); extern void network_unlock(void); -extern int network_lock_internal(bool restore); +extern int network_lock_internal(void); extern struct ns_desc net_ns_desc; @@ -50,6 +50,7 @@ extern int kerndat_has_newifindex(void); extern int kerndat_link_nsid(void); extern int net_get_nsid(int rtsk, int fd, int *nsid); extern struct ns_id *net_get_root_ns(void); +extern int kerndat_nsid(void); extern void check_has_netns_ioc(int fd, bool *kdat_val, const char *name); extern int net_set_ext(struct ns_id *ns); extern struct ns_id *get_root_netns(void); diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h index 65292b7ab..a5f97678e 100644 --- a/criu/include/page-pipe.h +++ b/criu/include/page-pipe.h @@ -90,14 
+90,14 @@ struct kernel_pipe_buffer { */ struct page_pipe_buf { - int p[2]; /* pipe with pages */ + int p[2]; /* pipe with pages */ unsigned int pipe_size; /* how many pages can be fit into pipe */ - unsigned int nr_segs; /* how many iov-s are busy */ - unsigned long pipe_off; /* where this buf is started in a pipe */ - unsigned long pages_in; /* how many pages are there */ + unsigned int pipe_off; /* where this buf is started in a pipe */ + unsigned int pages_in; /* how many pages are there */ + unsigned int nr_segs; /* how many iov-s are busy */ #define PPB_LAZY (1 << 0) unsigned int flags; - struct iovec *iov; /* vaddr:len map */ + struct iovec *iov; /* vaddr:len map */ struct list_head l; /* links into page_pipe->bufs */ }; @@ -113,25 +113,27 @@ struct page_pipe_buf { #define PP_HOLE_PARENT (1 << 0) struct page_pipe { - unsigned int nr_pipes; /* how many page_pipe_bufs in there */ - struct list_head bufs; /* list of bufs */ - struct list_head free_bufs; /* list of bufs */ + unsigned int nr_pipes; /* how many page_pipe_bufs in there */ + struct list_head bufs; /* list of bufs */ + struct list_head free_bufs; /* list of bufs */ struct page_pipe_buf *prev[PP_PIPE_TYPES]; /* last ppb of each type for pipe sharing */ - unsigned int nr_iovs; /* number of iovs */ - unsigned int free_iov; /* first free iov */ + unsigned int nr_iovs; /* number of iovs */ + unsigned int free_iov; /* first free iov */ struct iovec *iovs; /* iovs. 
They are provided into create_page_pipe and all bufs have their iov-s in there */ - unsigned int nr_holes; /* number of holes allocated */ + unsigned int nr_holes; /* number of holes allocated */ unsigned int free_hole; /* number of holes in use */ - struct iovec *holes; /* holes */ + struct iovec *holes; /* holes */ unsigned int *hole_flags; unsigned int flags; /* PP_FOO flags below */ }; -#define PP_CHUNK_MODE 0x1 /* Restrict the maximum buffer size of pipes and dump memory for a few iterations */ -#define PP_OWN_IOVS 0x4 /* create_page_pipe allocated IOVs memory */ +#define PP_CHUNK_MODE \ + 0x1 /* Restrict the maximum buffer size of pipes + and dump memory for a few iterations */ +#define PP_OWN_IOVS 0x4 /* create_page_pipe allocated IOVs memory */ struct page_pipe *create_page_pipe(unsigned int nr_segs, struct iovec *iovs, unsigned flags); extern void destroy_page_pipe(struct page_pipe *p); @@ -149,7 +151,7 @@ struct pipe_read_dest { }; extern int pipe_read_dest_init(struct pipe_read_dest *prd); -extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned long *nr_pages, +extern int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned int *nr_pages, unsigned int ppb_flags); #endif /* __CR_PAGE_PIPE_H__ */ diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h index 0d9b35019..e0303dfe0 100644 --- a/criu/include/page-xfer.h +++ b/criu/include/page-xfer.h @@ -10,7 +10,7 @@ struct ps_info { extern int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd); /* User buffer for read-mode pre-dump*/ -#define PIPE_MAX_BUFFER_SIZE (PIPE_MAX_SIZE << PAGE_SHIFT) +#define BUFFER_SIZE (PIPE_MAX_SIZE << PAGE_SHIFT) /* * page_xfer -- transfer pages into image file. 
@@ -36,7 +36,7 @@ struct page_xfer { union { struct /* local */ { struct cr_img *pmi; /* pagemaps */ - struct cr_img *pi; /* pages */ + struct cr_img *pi; /* pages */ }; struct /* page-server */ { @@ -69,9 +69,9 @@ extern int check_parent_page_xfer(int fd_type, unsigned long id); */ /* async request/receive of remote pages */ -extern int request_remote_pages(unsigned long img_id, unsigned long addr, unsigned long nr_pages); +extern int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages); -typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, unsigned long nr_pages, void *); -extern int page_server_start_read(void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv, unsigned flags); +typedef int (*ps_async_read_complete)(unsigned long img_id, unsigned long vaddr, int nr_pages, void *); +extern int page_server_start_read(void *buf, int nr_pages, ps_async_read_complete complete, void *priv, unsigned flags); #endif /* __CR_PAGE_XFER__H__ */ diff --git a/criu/include/pagemap-cache.h b/criu/include/pagemap-cache.h index 875e69e56..7612ee0f4 100644 --- a/criu/include/pagemap-cache.h +++ b/criu/include/pagemap-cache.h @@ -1,31 +1,23 @@ #ifndef __CR_PAGEMAP_H__ #define __CR_PAGEMAP_H__ -#include #include #include "int.h" #include "common/list.h" -#include "pagemap_scan.h" struct vma_area; #define PAGEMAP_PFN_OFF(addr) (PAGE_PFN(addr) * sizeof(u64)) typedef struct { - pid_t pid; /* which process it belongs */ - unsigned long start; /* start of area */ - unsigned long end; /* end of area */ + pid_t pid; /* which process it belongs */ + unsigned long start; /* start of area */ + unsigned long end; /* end of area */ const struct list_head *vma_head; /* list head of VMAs we're serving */ - int fd; /* file to read PMs from */ - - u64 *map; /* local buffer */ - size_t map_len; /* length of a buffer */ - - struct page_region *regs; /* buffer for the PAGEMAP_SCAN ioctl */ - size_t regs_len; /* actual length of 
regs */ - size_t regs_max_len; /* maximum length of regs */ - size_t regs_idx; /* current index in the regs array */ + u64 *map; /* local buffer */ + size_t map_len; /* length of a buffer */ + int fd; /* file to read PMs from */ } pmc_t; #define PMC_INIT \ @@ -34,8 +26,7 @@ typedef struct { } extern int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size); -extern int pmc_get_map(pmc_t *pmc, const struct vma_area *vma); +extern u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma); extern void pmc_fini(pmc_t *pmc); -extern int pmc_fill(pmc_t *pmc, u64 start, u64 end); #endif /* __CR_PAGEMAP_H__ */ diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h index 4cbc87cc6..c39c25d0c 100644 --- a/criu/include/pagemap.h +++ b/criu/include/pagemap.h @@ -44,7 +44,7 @@ struct page_read { /* reads page from current pagemap */ - int (*read_pages)(struct page_read *, unsigned long vaddr, unsigned long nr, void *, unsigned flags); + int (*read_pages)(struct page_read *, unsigned long vaddr, int nr, void *, unsigned flags); /* Advance page_read to the next entry */ int (*advance)(struct page_read *pr); void (*close)(struct page_read *); @@ -52,28 +52,28 @@ struct page_read { int (*sync)(struct page_read *pr); int (*seek_pagemap)(struct page_read *pr, unsigned long vaddr); void (*reset)(struct page_read *pr); - int (*io_complete)(struct page_read *, unsigned long vaddr, unsigned long nr); - int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags); + int (*io_complete)(struct page_read *, unsigned long vaddr, int nr); + int (*maybe_read_page)(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags); /* Whether or not pages can be read in PIE code */ bool pieok; - /* Whether or not disable image deduplication*/ - bool disable_dedup; - /* Private data of reader */ struct cr_img *pmi; struct cr_img *pi; u32 pages_img_id; - PagemapEntry *pe; /* current pagemap we are on */ - 
struct page_read *parent; /* parent pagemap (if ->in_parent pagemap is met in image, - * then go to this guy for page, see read_pagemap_page */ - unsigned long cvaddr; /* vaddr we are on */ - off_t pi_off; /* current offset in pages file */ + PagemapEntry *pe; /* current pagemap we are on */ + struct page_read *parent; /* parent pagemap (if ->in_parent + pagemap is met in image, then + go to this guy for page, see + read_pagemap_page */ + unsigned long cvaddr; /* vaddr we are on */ + off_t pi_off; /* current offset in pages file */ - struct iovec bunch; /* record consequent neighbour iovecs to punch together */ - unsigned id; /* for logging */ + struct iovec bunch; /* record consequent neighbour + iovecs to punch together */ + unsigned id; /* for logging */ unsigned long img_id; /* pagemap image file ID */ PagemapEntry **pmes; @@ -115,8 +115,6 @@ int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta); */ extern void dup_page_read(struct page_read *src, struct page_read *dst); -extern void page_read_disable_dedup(struct page_read *pr); - extern int dedup_one_iovec(struct page_read *pr, unsigned long base, unsigned long len); static inline unsigned long pagemap_len(PagemapEntry *pe) diff --git a/criu/include/pagemap_scan.h b/criu/include/pagemap_scan.h deleted file mode 100644 index 9046e01ed..000000000 --- a/criu/include/pagemap_scan.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef __CR_PAGEMAP_SCAN_H__ -#define __CR_PAGEMAP_SCAN_H__ - -#ifndef PAGEMAP_SCAN -#include -#include "int.h" - -/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. 
*/ -#define PAGE_IS_WPALLOWED (1 << 0) -#define PAGE_IS_WRITTEN (1 << 1) -#define PAGE_IS_FILE (1 << 2) -#define PAGE_IS_PRESENT (1 << 3) -#define PAGE_IS_SWAPPED (1 << 4) -#define PAGE_IS_PFNZERO (1 << 5) -#define PAGE_IS_HUGE (1 << 6) -#define PAGE_IS_SOFT_DIRTY (1 << 7) -#define PAGE_IS_GUARD (1 << 8) - -/* - * struct page_region - Page region with flags - * @start: Start of the region - * @end: End of the region (exclusive) - * @categories: PAGE_IS_* category bitmask for the region - */ -struct page_region { - u64 start; - u64 end; - u64 categories; -}; - -#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) - -/* Flags for PAGEMAP_SCAN ioctl */ -#define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */ -#define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */ - -/* - * struct pm_scan_arg - Pagemap ioctl argument - * @size: Size of the structure - * @flags: Flags for the IOCTL - * @start: Starting address of the region - * @end: Ending address of the region - * @walk_end Address where the scan stopped (written by kernel). - * walk_end == end (address tags cleared) informs that the scan completed on entire range. 
- * @vec: Address of page_region struct array for output - * @vec_len: Length of the page_region struct array - * @max_pages: Optional limit for number of returned pages (0 = disabled) - * @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1 - * @category_mask: Skip pages for which any category doesn't match - * @category_anyof_mask: Skip pages for which no category matches - * @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned - */ -struct pm_scan_arg { - u64 size; - u64 flags; - u64 start; - u64 end; - u64 walk_end; - u64 vec; - u64 vec_len; - u64 max_pages; - u64 category_inverted; - u64 category_mask; - u64 category_anyof_mask; - u64 return_mask; -}; -#endif /* PAGEMAP_SCAN */ - -#endif /* __CR_PAGEMAP_SCAN_H__ */ diff --git a/criu/include/parasite-syscall.h b/criu/include/parasite-syscall.h index 4a8ec2fee..4540e11ee 100644 --- a/criu/include/parasite-syscall.h +++ b/criu/include/parasite-syscall.h @@ -21,6 +21,13 @@ struct rt_sigframe; struct parasite_ctl; struct parasite_thread_ctl; +extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *); +extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *); + +struct proc_posix_timers_stat; +extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, + struct pstree_item *); + extern int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc); extern int parasite_dump_creds(struct parasite_ctl *ctl, CredsEntry *ce); extern int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core); diff --git a/criu/include/parasite.h b/criu/include/parasite.h index 176357711..8107aa49d 100644 --- a/criu/include/parasite.h +++ b/criu/include/parasite.h @@ -10,8 +10,6 @@ #include #include -#include "linux/rseq.h" - #include "image.h" #include "util-pie.h" #include "common/lock.h" @@ -63,7 +61,7 @@ 
struct parasite_dump_pages_args { unsigned int add_prot; unsigned int off; unsigned int nr_segs; - unsigned long nr_pages; + unsigned int nr_pages; }; static inline struct parasite_vma_entry *pargs_vmas(struct parasite_dump_pages_args *a) @@ -118,8 +116,6 @@ static inline int posix_timers_dump_size(int timer_n) */ struct parasite_dump_misc { - bool has_membarrier_get_registrations; /* this is sent from criu to parasite. */ - unsigned long brk; u32 pid; @@ -130,7 +126,6 @@ struct parasite_dump_misc { int dumpable; int thp_disabled; int child_subreaper; - int membarrier_registration_mask; }; /* @@ -148,11 +143,9 @@ struct parasite_dump_creds { u32 cap_prm[CR_CAP_SIZE]; u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; - u32 cap_amb[CR_CAP_SIZE]; int uids[4]; int gids[4]; - int no_new_privs; unsigned int secbits; unsigned int ngroups; /* @@ -171,17 +164,10 @@ struct parasite_dump_creds { unsigned int groups[0]; }; -struct parasite_check_rseq { - bool has_rseq; - bool has_ptrace_get_rseq_conf; /* no need to check if supported */ - bool rseq_inited; -}; - struct parasite_dump_thread { unsigned int *tid_addr; pid_t tid; tls_t tls; - struct parasite_check_rseq rseq; stack_t sas; int pdeath_sig; char comm[TASK_COMM_LEN]; @@ -246,12 +232,7 @@ struct parasite_dump_cgroup_args { * * The string is null terminated. */ - char contents[(1 << 12) - 32]; - /* - * Contains the path to thread cgroup procfs. - * "self/task//cgroup" - */ - char thread_cgrp[32]; + char contents[1 << 12]; }; #endif /* !__ASSEMBLY__ */ diff --git a/criu/include/pid.h b/criu/include/pid.h index b2b7a361a..49cb2d322 100644 --- a/criu/include/pid.h +++ b/criu/include/pid.h @@ -31,10 +31,6 @@ struct pid { pid_t real; int state; /* TASK_XXX constants */ - /* If an item is in stopped state it has a signal number - * that caused task to stop. 
- */ - int stop_signo; /* * The @virt pid is one which used in the image itself and keeps diff --git a/criu/include/pidfd.h b/criu/include/pidfd.h deleted file mode 100644 index bcc0fb45a..000000000 --- a/criu/include/pidfd.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef __CR_PIDFD_H__ -#define __CR_PIDFD_H__ - -#include "files.h" -#include "pidfd.pb-c.h" - -extern const struct fdtype_ops pidfd_dump_ops; -extern struct collect_image_info pidfd_cinfo; -extern int is_pidfd_link(char *link); -extern void init_dead_pidfd_hash(void); -struct pidfd_dump_info { - PidfdEntry pidfe; - pid_t pid; -}; - -#endif /* __CR_PIDFD_H__ */ diff --git a/criu/include/pipes.h b/criu/include/pipes.h index f442d7f65..6e6310e14 100644 --- a/criu/include/pipes.h +++ b/criu/include/pipes.h @@ -49,8 +49,8 @@ extern int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst struct pipe_info { PipeEntry *pe; struct list_head pipe_list; /* All pipe_info with the same pipe_id - * This is pure circular list without head */ - struct list_head list; /* global list of pipes */ + * This is pure circular list without head */ + struct list_head list; /* global list of pipes */ struct file_desc d; unsigned int create : 1, reopen : 1; }; diff --git a/criu/include/plugin.h b/criu/include/plugin.h index 0115e6ea0..a1796b641 100644 --- a/criu/include/plugin.h +++ b/criu/include/plugin.h @@ -5,9 +5,7 @@ #include "common/compiler.h" #include "common/list.h" -#ifndef CR_PLUGIN_DEFAULT -#define CR_PLUGIN_DEFAULT "/usr/lib/criu/" -#endif +#define CR_PLUGIN_DEFAULT "/var/lib/criu/" void cr_plugin_fini(int stage, int err); int cr_plugin_init(int stage); diff --git a/criu/include/prctl.h b/criu/include/prctl.h index 2966659da..c843f40a7 100644 --- a/criu/include/prctl.h +++ b/criu/include/prctl.h @@ -30,21 +30,6 @@ #ifndef PR_SET_DUMPABLE #define PR_SET_DUMPABLE 4 #endif -#ifndef PR_GET_NO_NEW_PRIVS -#define PR_GET_NO_NEW_PRIVS 39 -#endif -#ifndef PR_SET_NO_NEW_PRIVS -#define PR_SET_NO_NEW_PRIVS 38 -#endif 
-#ifndef PR_CAP_AMBIENT -#define PR_CAP_AMBIENT 47 -#endif -#ifndef PR_CAP_AMBIENT_IS_SET -#define PR_CAP_AMBIENT_IS_SET 1 -#endif -#ifndef PR_CAP_AMBIENT_RAISE -#define PR_CAP_AMBIENT_RAISE 2 -#endif #ifndef PR_SET_MM #define PR_SET_MM 35 @@ -97,11 +82,4 @@ struct prctl_mm_map { #define PR_GET_THP_DISABLE 42 #endif -#ifndef PR_TIMER_CREATE_RESTORE_IDS -#define PR_TIMER_CREATE_RESTORE_IDS 77 -# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0 -# define PR_TIMER_CREATE_RESTORE_IDS_ON 1 -# define PR_TIMER_CREATE_RESTORE_IDS_GET 2 -#endif - #endif /* __CR_PRCTL_H__ */ diff --git a/criu/include/proc_parse.h b/criu/include/proc_parse.h index 76d3242d2..0c334a190 100644 --- a/criu/include/proc_parse.h +++ b/criu/include/proc_parse.h @@ -81,7 +81,6 @@ struct proc_status_creds { u32 cap_prm[PROC_CAP_SIZE]; u32 cap_eff[PROC_CAP_SIZE]; u32 cap_bnd[PROC_CAP_SIZE]; - u32 cap_amb[PROC_CAP_SIZE]; }; #define INVALID_UID ((uid_t)-1) @@ -105,6 +104,4 @@ extern int parse_uptime(uint64_t *upt); extern int parse_timens_offsets(struct timespec *boff, struct timespec *moff); -extern bool found_uprobes_vma(void); - #endif /* __CR_PROC_PARSE_H__ */ diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h index c4241be55..3824de101 100644 --- a/criu/include/protobuf-desc.h +++ b/criu/include/protobuf-desc.h @@ -70,7 +70,6 @@ enum { PB_BPFMAP_FILE, PB_BPFMAP_DATA, PB_APPARMOR, - PB_PIDFD, /* PB_AUTOGEN_STOP */ diff --git a/criu/include/pstree.h b/criu/include/pstree.h index b750a919e..c5b0fa7ea 100644 --- a/criu/include/pstree.h +++ b/criu/include/pstree.h @@ -15,14 +15,14 @@ struct pstree_item { struct pstree_item *parent; struct list_head children; /* list of my children */ - struct list_head sibling; /* linkage in my parent's children list */ + struct list_head sibling; /* linkage in my parent's children list */ struct pid *pid; pid_t pgid; pid_t sid; pid_t born_sid; - int nr_threads; /* number of threads */ + int nr_threads; /* number of threads */ struct pid *threads; /* array 
of threads */ CoreEntry **core; TaskKobjIdsEntry *ids; @@ -63,7 +63,6 @@ struct dmp_info { struct parasite_ctl *parasite_ctl; struct parasite_thread_ctl **thread_ctls; uint64_t *thread_sp; - struct criu_rseq_cs *thread_rseq_cs; /* * Although we don't support dumping different struct creds in general, @@ -104,7 +103,6 @@ extern void pstree_insert_pid(struct pid *pid_node); extern struct pid *pstree_pid_by_virt(pid_t pid); extern struct pstree_item *root_item; -extern bool has_children(struct pstree_item *item); extern struct pstree_item *pstree_item_next(struct pstree_item *item); #define for_each_pstree_item(pi) for (pi = root_item; pi != NULL; pi = pstree_item_next(pi)) diff --git a/criu/include/rbtree.h b/criu/include/rbtree.h index 6981aa8f9..ba0a8100e 100644 --- a/criu/include/rbtree.h +++ b/criu/include/rbtree.h @@ -14,7 +14,7 @@ #define RB_MASK 3 struct rb_node { - unsigned long rb_parent_color; /* Keeps both parent and color */ + unsigned long rb_parent_color; /* Keeps both parent anc color */ struct rb_node *rb_right; struct rb_node *rb_left; } __aligned(sizeof(long)); diff --git a/criu/include/restore.h b/criu/include/restore.h index 189051826..8ef0dbddf 100644 --- a/criu/include/restore.h +++ b/criu/include/restore.h @@ -7,57 +7,4 @@ extern int arch_set_thread_regs_nosigrt(struct pid *pid); -struct task_restore_args; -struct pstree_item; -struct rst_shstk_info; - -#ifndef arch_shstk_prepare -static inline int arch_shstk_prepare(struct pstree_item *item, - CoreEntry *core, - struct task_restore_args *ta) -{ - return 0; -} -#define arch_shstk_prepare arch_shstk_prepare -#endif - -#ifndef arch_shstk_unlock -static inline int arch_shstk_unlock(struct pstree_item *item, - CoreEntry *core, pid_t pid) -{ - return 0; -} -#define arch_shstk_unlock arch_shstk_unlock -#endif - -#ifndef arch_shstk_trampoline -static inline int arch_shstk_trampoline(struct pstree_item *item, CoreEntry *core, - int (*func)(void *arg), void *arg) -{ - return func(arg); -} -#define 
arch_shstk_trampoline arch_shstk_trampoline -#endif - -#ifndef shstk_restorer_stack_size -static always_inline long shstk_restorer_stack_size(void) -{ - return 0; -} -#endif - -#ifndef shstk_set_restorer_stack -static always_inline long shstk_set_restorer_stack(struct rst_shstk_info *info, void *ptr) -{ - return 0; -} -#endif - -#ifndef shstk_min_mmap_addr -static always_inline long shstk_min_mmap_addr(struct rst_shstk_info *info, unsigned long def) -{ - return def; -} -#endif - #endif diff --git a/criu/include/restorer.h b/criu/include/restorer.h index 14c0a3768..934d60cf9 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -44,22 +44,12 @@ struct rst_sched_param { int prio; }; -struct rst_rseq_param { - u64 rseq_abi_pointer; - u32 rseq_abi_size; - u32 signature; -}; - struct restore_posix_timer { struct str_posix_timer spt; struct itimerspec val; int overrun; }; -#ifndef rst_shstk_info -struct rst_shstk_info {}; -#endif - /* * We should be able to construct fpu sigframe in sigreturn_prep_fpu_frame, * so the mem_zone.rt_sigframe should be 64-bytes aligned. 
To make things @@ -75,8 +65,8 @@ struct thread_creds_args { u32 cap_prm[CR_CAP_SIZE]; u32 cap_eff[CR_CAP_SIZE]; u32 cap_bnd[CR_CAP_SIZE]; - u32 cap_amb[CR_CAP_SIZE]; + unsigned int secbits; char *lsm_profile; unsigned int *groups; char *lsm_sockcreate; @@ -108,7 +98,6 @@ struct thread_restore_args { struct task_restore_args *ta; tls_t tls; - struct rst_rseq_param rseq; siginfo_t *siginfo; unsigned int siginfo_n; @@ -124,11 +113,7 @@ struct thread_restore_args { unsigned int seccomp_filters_n; bool seccomp_force_tsync; - struct rst_shstk_info shstk; - char comm[TASK_COMM_LEN]; - int cg_set; - int cgroupd_sk; } __aligned(64); typedef long (*thread_restore_fcall_t)(struct thread_restore_args *args); @@ -150,10 +135,10 @@ struct task_restore_args { struct timeval logstart; int uffd; - bool thp_disabled; + bool has_thp_enabled; /* threads restoration */ - int nr_threads; /* number of threads */ + int nr_threads; /* number of threads */ thread_restore_fcall_t clone_restore_fn; /* helper address for clone() call */ struct thread_restore_args *thread_args; /* array of thread arguments */ struct task_entries *task_entries; @@ -170,7 +155,6 @@ struct task_restore_args { struct restore_posix_timer *posix_timers; unsigned int posix_timers_n; - bool posix_timer_cr_ids; struct restore_timerfd *timerfd; unsigned int timerfd_n; @@ -227,7 +211,7 @@ struct task_restore_args { bool can_map_vdso; bool auto_dedup; unsigned long vdso_rt_size; - struct vdso_maps vdso_maps_rt; /* runtime vdso symbols */ + struct vdso_maps vdso_maps_rt; /* runtime vdso symbols */ unsigned long vdso_rt_parked_at; /* safe place to keep vdso */ void **breakpoint; @@ -237,19 +221,7 @@ struct task_restore_args { #endif int lsm_type; int child_subreaper; - int membarrier_registration_mask; bool has_clone3_set_tid; - - /* - * info about rseq from libc used to - * unregister it before memory restoration procedure - */ - struct rst_rseq_param libc_rseq; - - uid_t uid; - u32 cap_eff[CR_CAP_SIZE]; - - struct 
rst_shstk_info shstk; } __aligned(64); /* @@ -341,27 +313,4 @@ enum { #define __r_sym(name) restorer_sym##name #define restorer_sym(rblob, name) (void *)(rblob + __r_sym(name)) -#ifndef arch_shstk_switch_to_restorer -static inline int arch_shstk_switch_to_restorer(struct rst_shstk_info *shstk) -{ - return 0; -} -#define arch_shstk_switch_to_restorer arch_shstk_switch_to_restorer -#endif - -#ifndef arch_shstk_restore -static inline int arch_shstk_restore(struct rst_shstk_info *shstk) -{ - return 0; -} -#define arch_shstk_restore arch_shstk_restore -#endif - -#ifndef shstk_vma_restore -static always_inline int shstk_vma_restore(VmaEntry *vma_entry) -{ - return -1; -} -#endif - #endif /* __CR_RESTORER_H__ */ diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index deb297e5f..2e2107b0e 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -1,13 +1,11 @@ #ifndef __CR_RST_INFO_H__ #define __CR_RST_INFO_H__ -#include "asm/restore.h" #include "common/lock.h" #include "common/list.h" #include "vma.h" #include "kerndat.h" #include "images/mm.pb-c.h" -#include "images/core.pb-c.h" struct task_entries { int nr_threads, nr_tasks, nr_helpers; @@ -15,30 +13,19 @@ struct task_entries { futex_t start; atomic_t cr_err; mutex_t userns_sync_lock; - mutex_t cgroupd_sync_lock; mutex_t last_pid_mutex; }; struct fdt { - int nr; /* How many tasks share this fd table */ + int nr; /* How many tasks share this fd table */ pid_t pid; /* Who should restore this fd table */ /* * The fd table is ready for restoing, if fdt_lock is equal to nr - * The fdt table was restored, if fdt_lock is equal to nr + 1 + * The fdt table was restrored, if fdt_lock is equal to nr + 1 */ futex_t fdt_lock; }; -struct rst_rseq { - uint64_t rseq_abi_pointer; - uint64_t rseq_cs_pointer; -}; - -#ifndef ARCH_RST_INFO -struct rst_arch_info { -}; -#endif - struct rst_info { struct list_head fds; @@ -80,14 +67,9 @@ struct rst_info { */ bool has_old_seccomp_filter; - struct rst_rseq *rseqe; - - 
futex_t shstk_enable; - futex_t shstk_unlock; + bool has_thp_enabled; void *breakpoint; - - struct rst_arch_info arch_info; }; extern struct task_entries *task_entries; diff --git a/criu/include/seize.h b/criu/include/seize.h index fc7facad3..cf7366cb0 100644 --- a/criu/include/seize.h +++ b/criu/include/seize.h @@ -2,14 +2,8 @@ #define __CR_SEIZE_H__ extern int collect_pstree(void); -extern int checkpoint_devices(void); -struct pstree_item; extern void pstree_switch_state(struct pstree_item *root_item, int st); extern const char *get_real_freezer_state(void); extern bool alarm_timeouted(void); -extern char *task_comm_info(pid_t pid, char *comm, size_t size); -extern char *__task_comm_info(pid_t pid); -extern void set_compel_interrupt_only_mode(void); - #endif diff --git a/criu/include/servicefd.h b/criu/include/servicefd.h index 4265d94ed..e75e8444c 100644 --- a/criu/include/servicefd.h +++ b/criu/include/servicefd.h @@ -22,11 +22,10 @@ enum sfd_type { * - For dump -- target ns' proc * - For restore -- CRIU ns' proc */ - ROOT_FD_OFF, /* Root of the namespace we dump/restore */ + ROOT_FD_OFF, /* Root of the namespace we dump/restore */ CGROUP_YARD, - CGROUPD_SK, /* Socket for cgroupd to fix up thread's cgroup controller */ - USERNSD_SK, /* Socket for usernsd */ - NS_FD_OFF, /* Node's net namespace fd */ + USERNSD_SK, /* Socket for usernsd */ + NS_FD_OFF, /* Node's net namespace fd */ TRANSPORT_FD_OFF, /* to transfer file descriptors */ RPC_SK_OFF, FDSTORE_SK_OFF, diff --git a/criu/include/setproctitle.h b/criu/include/setproctitle.h index a4873578a..bc634331b 100644 --- a/criu/include/setproctitle.h +++ b/criu/include/setproctitle.h @@ -1,7 +1,19 @@ #ifndef __CR_SETPROCTITLE_H__ #define __CR_SETPROCTITLE_H__ -extern void __setproctitle_init(int argc, char *argv[], char *envp[]); -extern void __setproctitle(const char *fmt, ...); +#ifdef CONFIG_HAS_LIBBSD +#include +#else + +/* + * setproctitle_init is in the libbsd since v0.6.0. 
This macro allows to + * compile criu with libbsd<0.6.0. + */ +#ifndef CONFIG_HAS_SETPROCTITLE_INIT +#define setproctitle_init(argc, argv, envp) +#endif + +#define setproctitle(fmt, ...) +#endif #endif /* __CR_SETPROCTITLE_H__ */ diff --git a/criu/include/shmem.h b/criu/include/shmem.h index 15cab1146..813ef630e 100644 --- a/criu/include/shmem.h +++ b/criu/include/shmem.h @@ -4,14 +4,13 @@ #include "int.h" #include "common/lock.h" #include "images/vma.pb-c.h" -#include "pagemap-cache.h" struct vma_area; extern int collect_shmem(int pid, struct vma_area *vma); extern int collect_sysv_shmem(unsigned long shmid, unsigned long size); extern int cr_dump_shmem(void); -extern int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc); +extern int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map); extern int fixup_sysv_shmems(void); extern int dump_one_memfd_shmem(int fd, unsigned long shmid, unsigned long size); extern int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid); diff --git a/criu/include/sigact.h b/criu/include/sigact.h deleted file mode 100644 index 4df011f96..000000000 --- a/criu/include/sigact.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef __CR_SIGACT_H__ -#define __CR_SIGACT_H__ - -#include "images/core.pb-c.h" - -extern rt_sigaction_t sigchld_act; - -struct parasite_ctl; -struct pstree_item; - -extern int prepare_sigactions(CoreEntry *core); -extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *); - -#endif diff --git a/criu/include/sizes.h b/criu/include/sizes.h deleted file mode 100644 index 0ec977fc0..000000000 --- a/criu/include/sizes.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef __CR_SIZES_H__ -#define __CR_SIZES_H__ - -/* - * Copied from the Linux kernel header include/linux/sizes.h - */ - -#define SZ_1 0x00000001 -#define SZ_2 0x00000002 -#define SZ_4 0x00000004 -#define SZ_8 0x00000008 -#define SZ_16 0x00000010 -#define SZ_32 0x00000020 -#define SZ_64 0x00000040 -#define SZ_128 0x00000080 -#define 
SZ_256 0x00000100 -#define SZ_512 0x00000200 - -#define SZ_1K 0x00000400 -#define SZ_2K 0x00000800 -#define SZ_4K 0x00001000 -#define SZ_8K 0x00002000 -#define SZ_16K 0x00004000 -#define SZ_32K 0x00008000 -#define SZ_64K 0x00010000 -#define SZ_128K 0x00020000 -#define SZ_256K 0x00040000 -#define SZ_512K 0x00080000 - -#define SZ_1M 0x00100000 -#define SZ_2M 0x00200000 -#define SZ_4M 0x00400000 -#define SZ_8M 0x00800000 -#define SZ_16M 0x01000000 -#define SZ_32M 0x02000000 -#define SZ_64M 0x04000000 -#define SZ_128M 0x08000000 -#define SZ_256M 0x10000000 -#define SZ_512M 0x20000000 - -#define SZ_1G 0x40000000 -#define SZ_2G 0x80000000 - -#define SZ_4G 0x100000000ULL -#define SZ_8G 0x200000000ULL -#define SZ_16G 0x400000000ULL -#define SZ_32G 0x800000000ULL -#define SZ_64T 0x400000000000ULL - -#endif /* __CR_SIZES_H__ */ diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h index 69ee8589e..c832d6387 100644 --- a/criu/include/sk-inet.h +++ b/criu/include/sk-inet.h @@ -35,7 +35,7 @@ struct inet_sk_desc { unsigned int dst_port; unsigned int state; unsigned int rqlen; - unsigned int wqlen; /* sent + unsent data */ + unsigned int wqlen; /* sent + unsent data */ unsigned int uwqlen; /* unsent data */ unsigned int src_addr[4]; unsigned int dst_addr[4]; @@ -69,7 +69,6 @@ extern int inet_connect(int sk, struct inet_sk_info *); #ifdef CR_NOGLIBC #define setsockopt sys_setsockopt -#define pr_perror(fmt, ...) 
pr_err(fmt ": errno %d\n", ##__VA_ARGS__, -ret) #endif static inline void tcp_repair_off(int fd) { @@ -77,7 +76,7 @@ static inline void tcp_repair_off(int fd) ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux)); if (ret < 0) - pr_perror("Failed to turn off repair mode on socket %d", fd); + pr_err("Failed to turn off repair mode on socket: %m\n"); } extern void tcp_locked_conn_add(struct inet_sk_info *); @@ -87,9 +86,6 @@ extern void cpt_unlock_tcp_connections(void); extern int dump_one_tcp(int sk, struct inet_sk_desc *sd, SkOptsEntry *soe); extern int restore_one_tcp(int sk, struct inet_sk_info *si); -extern int dump_tcp_opts(int sk, TcpOptsEntry *toe); -extern int restore_tcp_opts(int sk, TcpOptsEntry *toe); - #define SK_EST_PARAM "tcp-established" #define SK_INFLIGHT_PARAM "skip-in-flight" #define SK_CLOSE_PARAM "tcp-close" diff --git a/criu/include/sockets.h b/criu/include/sockets.h index 6c81d3edd..3e8f3d601 100644 --- a/criu/include/sockets.h +++ b/criu/include/sockets.h @@ -25,9 +25,8 @@ struct socket_desc { }; extern int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *); -extern int dump_socket_opts(int sk, int family, SkOptsEntry *soe); +extern int dump_socket_opts(int sk, SkOptsEntry *soe); extern int restore_socket_opts(int sk, SkOptsEntry *soe); -extern int sk_setbufs(int sk, uint32_t *bufs); extern void release_skopts(SkOptsEntry *); extern int restore_prepare_socket(int sk); extern void preload_socket_modules(void); @@ -124,8 +123,4 @@ extern const char *socket_proto_name(unsigned int proto, char *nm, size_t size); #define ___socket_family_name(family) __socket_info_helper(socket_family_name, family) #define ___socket_proto_name(proto) __socket_info_helper(socket_proto_name, proto) -#ifndef SO_BUF_LOCK -#define SO_BUF_LOCK 72 -#endif - #endif /* __CR_SOCKETS_H__ */ diff --git a/criu/include/string.h b/criu/include/string.h index 4c71d961c..e11a42058 100644 --- a/criu/include/string.h +++ b/criu/include/string.h @@ -3,9 +3,18 @@ #include 
+#ifdef CONFIG_HAS_LIBBSD +#include +#endif + #include "common/config.h" -extern size_t __strlcpy(char *dest, const char *src, size_t size); -extern size_t __strlcat(char *dest, const char *src, size_t count); +#ifndef CONFIG_HAS_STRLCPY +extern size_t strlcpy(char *dest, const char *src, size_t size); +#endif + +#ifndef CONFIG_HAS_STRLCAT +extern size_t strlcat(char *dest, const char *src, size_t count); +#endif #endif /* __CR_STRING_H__ */ diff --git a/criu/include/sysctl.h b/criu/include/sysctl.h index 2d689a9a0..ac7924dcd 100644 --- a/criu/include/sysctl.h +++ b/criu/include/sysctl.h @@ -34,9 +34,8 @@ enum { /* * Some entries might be missing mark them as optional. */ -#define CTL_FLAGS_OPTIONAL 1 -#define CTL_FLAGS_HAS 2 -#define CTL_FLAGS_READ_EIO_SKIP 4 -#define CTL_FLAGS_IPC_EACCES_SKIP 8 +#define CTL_FLAGS_OPTIONAL 1 +#define CTL_FLAGS_HAS 2 +#define CTL_FLAGS_READ_EIO_SKIP 4 #endif /* __CR_SYSCTL_H__ */ diff --git a/criu/include/sysfs_parse.h b/criu/include/sysfs_parse.h index f987d622f..ff0e61148 100644 --- a/criu/include/sysfs_parse.h +++ b/criu/include/sysfs_parse.h @@ -2,9 +2,9 @@ #define __CR_SYSFS_PARSE_H__ #define SYSFS_AUFS "/sys/fs/aufs/" -#define SBINFO_LEN (3 + 16 + 1) /* si_%lx */ +#define SBINFO_LEN (3 + 16 + 1) /* si_%lx */ #define SBINFO_PATH_LEN (sizeof SYSFS_AUFS + SBINFO_LEN) /* /sys/fs/aufs/ */ -#define AUFSBR_PATH_LEN (SBINFO_PATH_LEN + 6 + 1) /* /sys/fs/aufs//br%3d */ +#define AUFSBR_PATH_LEN (SBINFO_PATH_LEN + 6 + 1) /* /sys/fs/aufs//br%3d */ struct mount_info; struct vma_area; diff --git a/criu/include/timer.h b/criu/include/timer.h deleted file mode 100644 index d1deb6051..000000000 --- a/criu/include/timer.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef __CR_TIMER_H__ -#define __CR_TIMER_H__ - -#include "images/core.pb-c.h" - -struct task_restore_args; -struct pstree_item; -struct parasite_ctl; -struct proc_posix_timers_stat; - -extern int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core); -extern int 
prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core); - -extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item); -extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, - struct pstree_item *item); -#endif diff --git a/criu/include/tls.h b/criu/include/tls.h index f563c092c..26f9976fd 100644 --- a/criu/include/tls.h +++ b/criu/include/tls.h @@ -4,7 +4,7 @@ #ifdef CONFIG_GNUTLS int tls_x509_init(int sockfd, bool is_server); -void tls_terminate_session(bool async); +void tls_terminate_session(void); ssize_t tls_send(const void *buf, size_t len, int flags); ssize_t tls_recv(void *buf, size_t len, int flags); @@ -19,7 +19,7 @@ int tls_recv_data_to_fd(int fd, unsigned long len); #define tls_recv(buf, len, flags) (-1) #define tls_send_data_from_fd(fd, len) (-1) #define tls_recv_data_to_fd(fd, len) (-1) -#define tls_terminate_session(async) +#define tls_terminate_session() #endif /* CONFIG_HAS_GNUTLS */ diff --git a/criu/include/util-caps.h b/criu/include/util-caps.h deleted file mode 100644 index 7ccd162f5..000000000 --- a/criu/include/util-caps.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef __CR_UTIL_CAPS_H__ -#define __CR_UTIL_CAPS_H__ - -#include - -#ifndef CAP_CHECKPOINT_RESTORE -#define CAP_CHECKPOINT_RESTORE 40 -#endif - -static inline bool has_capability(int cap, u32 *cap_eff) -{ - int mask = CAP_TO_MASK(cap); - int index = CAP_TO_INDEX(cap); - u32 effective; - - effective = cap_eff[index]; - - if (!(mask & effective)) { - pr_debug("Effective capability %d missing\n", cap); - return false; - } - - return true; -} - -static inline bool has_cap_checkpoint_restore(u32 *cap_eff) -{ - /* - * Everything guarded by CAP_CHECKPOINT_RESTORE is also - * guarded by CAP_SYS_ADMIN. Check for both capabilities. 
- */ - if (has_capability(CAP_CHECKPOINT_RESTORE, cap_eff) || has_capability(CAP_SYS_ADMIN, cap_eff)) - return true; - - return false; -} - -static inline bool has_cap_net_admin(u32 *cap_eff) -{ - return has_capability(CAP_NET_ADMIN, cap_eff); -} - -static inline bool has_cap_sys_chroot(u32 *cap_eff) -{ - return has_capability(CAP_SYS_CHROOT, cap_eff); -} - -static inline bool has_cap_setuid(u32 *cap_eff) -{ - return has_capability(CAP_SETUID, cap_eff); -} - -static inline bool has_cap_sys_resource(u32 *cap_eff) -{ - return has_capability(CAP_SYS_RESOURCE, cap_eff); -} - -#endif /* __CR_UTIL_CAPS_H__ */ diff --git a/criu/include/util-vdso.h b/criu/include/util-vdso.h index 9fd9a6de4..c4386cf8e 100644 --- a/criu/include/util-vdso.h +++ b/criu/include/util-vdso.h @@ -30,7 +30,6 @@ struct vdso_symbol { struct vdso_symtable { unsigned long vdso_size; unsigned long vvar_size; - unsigned long vvar_vclock_size; struct vdso_symbol symbols[VDSO_SYMBOL_MAX]; bool vdso_before_vvar; /* order of vdso/vvar pair */ }; diff --git a/criu/include/util.h b/criu/include/util.h index 55ad5b63c..a2dac2233 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -21,8 +21,6 @@ #include "log.h" #include "common/err.h" -#include "compel/infect-util.h" - #define PREF_SHIFT_OP(pref, op, size) ((size)op(pref##BYTES_SHIFT)) #define KBYTES_SHIFT 10 #define MBYTES_SHIFT 20 @@ -168,11 +166,9 @@ extern int is_anon_link_type(char *link, char *type); extern int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned flags); extern int cr_system_userns(int in, int out, int err, char *cmd, char *const argv[], unsigned flags, int userns_pid); -extern pid_t fork_and_ptrace_attach(int (*child_setup)(void)); extern int cr_daemon(int nochdir, int noclose, int close_fd); extern int status_ready(void); extern int is_root_user(void); -extern int close_fds(int minfd); extern int set_proc_self_fd(int fd); @@ -245,10 +241,6 @@ static inline bool issubpath(const char *path, const char 
*sub_path) return strstartswith2(path, sub_path, &end) && (end == '/' || end == '\0'); } -extern char *get_relative_path(char *path, char *sub_path); -extern bool is_sub_path(char *path, char *sub_path); -extern bool is_same_path(char *path1, char *path2); - int strip_deleted(char *path, int len); int cut_path_ending(char *path, char *sub_path); @@ -266,10 +258,6 @@ bool is_path_prefix(const char *path, const char *prefix); FILE *fopenat(int dirfd, char *path, char *cflags); void split(char *str, char token, char ***out, int *n); -int cr_fchown(int fd, uid_t new_uid, gid_t new_gid); -int cr_fchperm(int fd, uid_t new_uid, gid_t new_gid, mode_t new_mode); -int cr_fchpermat(int dirfd, const char *path, uid_t new_uid, gid_t new_gid, mode_t new_mode, int flags); - int fd_has_data(int lfd); int make_yard(char *path); @@ -281,6 +269,8 @@ static inline int sk_wait_data(int sk) } void fd_set_nonblocking(int fd, bool on); +void tcp_nodelay(int sk, bool on); +void tcp_cork(int sk, bool on); const char *ns_to_string(unsigned int ns); @@ -294,8 +284,8 @@ int setup_tcp_server(char *type, char *addr, unsigned short *port); int run_tcp_server(bool daemon_mode, int *ask, int cfd, int sk); int setup_tcp_client(char *hostname); -/* path should be writable and no more than PATH_MAX long */ -int rmrf(char *path); +/* *dir should be writable and at least PATH_MAX long */ +int rm_rf(char *dir); #define LAST_PID_PATH "sys/kernel/ns_last_pid" #define PID_MAX_PATH "sys/kernel/pid_max" @@ -389,14 +379,7 @@ static inline void print_stack_trace(pid_t pid) extern int mount_detached_fs(const char *fsname); -extern int cr_fsopen(const char *fsname, unsigned int flags); -extern int cr_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux); -extern int cr_fsmount(int fd, unsigned int flags, unsigned int attr_flags); -extern void fsfd_dump_messages(int fd); - -extern char *get_legacy_iptables_bin(bool ipv6, bool restore); - -extern int set_opts_cap_eff(void); +extern char 
*get_legacy_iptables_bin(bool ipv6); extern ssize_t read_all(int fd, void *buf, size_t size); extern ssize_t write_all(int fd, const void *buf, size_t size); @@ -408,27 +391,6 @@ static inline void cleanup_freep(void *p) free(*pp); } -#define cleanup_file __attribute__((cleanup(cleanup_filep))) -static inline void cleanup_filep(FILE **f) -{ - FILE *file = *f; - if (file) - (void)fclose(file); -} - extern int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args); -/* - * criu_run_id is a unique value of the current run. It can be used to - * generate resource ID-s to avoid conflicts with other CRIU processes. - */ -extern char criu_run_id[RUN_ID_HASH_LENGTH]; -extern void util_init(void); -#define NO_DUMP_CRIU_RUN_ID 0x7f -extern char dump_criu_run_id[RUN_ID_HASH_LENGTH]; - -extern char *resolve_mountpoint(char *path); - -extern int cr_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); - #endif /* __CR_UTIL_H__ */ diff --git a/criu/include/vma.h b/criu/include/vma.h index b8ddfc142..ed9f31ef6 100644 --- a/criu/include/vma.h +++ b/criu/include/vma.h @@ -10,14 +10,14 @@ #include struct vm_area_list { - struct list_head h; /* list of VMAs */ - unsigned nr; /* nr of all VMAs in the list */ + struct list_head h; /* list of VMAs */ + unsigned nr; /* nr of all VMAs in the list */ unsigned int nr_aios; /* nr of AIOs VMAs in the list */ union { unsigned long nr_priv_pages; /* dmp: nr of pages in private VMAs */ unsigned long rst_priv_size; /* rst: size of private VMAs */ }; - unsigned long nr_priv_pages_longest; /* nr of pages in longest private VMA */ + unsigned long nr_priv_pages_longest; /* nr of pages in longest private VMA */ unsigned long nr_shared_pages_longest; /* nr of pages in longest shared VMA */ }; @@ -53,8 +53,8 @@ struct vma_area { struct /* for restore */ { int (*vm_open)(int pid, struct vma_area *vma); struct file_desc *vmfd; - struct vma_area *pvma; /* parent for inherited VMAs */ - unsigned long *page_bitmap; /* 
existent pages */ + struct vma_area *pvma; /* parent for inherited VMAs */ + unsigned long *page_bitmap; /* existent pages */ unsigned long premmaped_addr; /* restore only */ /* @@ -106,7 +106,6 @@ static inline bool vma_entry_is_private(VmaEntry *entry, unsigned long task_size return (vma_entry_is(entry, VMA_AREA_REGULAR) && (vma_entry_is(entry, VMA_ANON_PRIVATE) || vma_entry_is(entry, VMA_FILE_PRIVATE)) && (entry->end <= task_size)) || - vma_entry_is(entry, VMA_AREA_SHSTK) || vma_entry_is(entry, VMA_AREA_AIORING); } @@ -123,8 +122,7 @@ static inline struct vma_area *vma_next(struct vma_area *vma) static inline bool vma_entry_can_be_lazy(VmaEntry *e) { return ((e->flags & MAP_ANONYMOUS) && (e->flags & MAP_PRIVATE) && !(e->flags & MAP_LOCKED) && - !(vma_entry_is(e, VMA_AREA_VDSO)) && !(vma_entry_is(e, VMA_AREA_VVAR)) && - !(vma_entry_is(e, VMA_AREA_VSYSCALL)) && !(e->flags & MAP_HUGETLB)); + !(vma_entry_is(e, VMA_AREA_VDSO)) && !(vma_entry_is(e, VMA_AREA_VSYSCALL))); } #endif /* __CR_VMA_H__ */ diff --git a/criu/ipc_ns.c b/criu/ipc_ns.c index 7e95be8c5..a2eb72f28 100644 --- a/criu/ipc_ns.c +++ b/criu/ipc_ns.c @@ -15,7 +15,6 @@ #include "sysctl.h" #include "ipc_ns.h" #include "shmem.h" -#include "types.h" #include "protobuf.h" #include "images/ipc-var.pb-c.h" @@ -292,8 +291,6 @@ static void pr_info_ipc_shm(const IpcShmEntry *shm) static int ipc_sysctl_req(IpcVarEntry *e, int op) { - int i; - struct sysctl_req req[] = { { "kernel/sem", e->sem_ctls, CTL_U32A(e->n_sem_ctls) }, { "kernel/msgmax", &e->msg_ctlmax, CTL_U32 }, @@ -334,9 +331,6 @@ static int ipc_sysctl_req(IpcVarEntry *e, int op) if (e->has_shm_next_id) req[nr++] = req[16]; - for (i = 0; i < nr; i++) - req[i].flags = CTL_FLAGS_IPC_EACCES_SKIP; - return sysctl_op(req, nr, op, CLONE_NEWIPC); } @@ -360,42 +354,6 @@ static int dump_ipc_shm_pages(const IpcShmEntry *shm) return ret; } -static int dump_shm_hugetlb_flag(IpcShmEntry *shm, int id, unsigned long size) -{ - void *addr; - int ret, hugetlb_flag, exit_code 
= -1; - struct stat st; - char path[64]; - - addr = shmat(id, NULL, SHM_RDONLY); - if (addr == (void *)-1) { - pr_perror("Failed to attach shm"); - return -1; - } - - /* The shm segment size may not be aligned, - * we need to align it up to next page size - */ - size = (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); - snprintf(path, sizeof(path), "/proc/self/map_files/%lx-%lx", (unsigned long)addr, (unsigned long)addr + size); - - ret = stat(path, &st); - if (ret < 0) { - pr_perror("Can't stat map_files"); - goto detach; - } - - if (is_hugetlb_dev(st.st_dev, &hugetlb_flag)) { - shm->has_hugetlb_flag = true; - shm->hugetlb_flag = hugetlb_flag | SHM_HUGETLB; - } - - exit_code = 0; -detach: - shmdt(addr); - return exit_code; -} - static int dump_ipc_shm_seg(struct cr_img *img, int id, const struct shmid_ds *ds) { IpcShmEntry shm = IPC_SHM_ENTRY__INIT; @@ -406,10 +364,6 @@ static int dump_ipc_shm_seg(struct cr_img *img, int id, const struct shmid_ds *d shm.size = ds->shm_segsz; shm.has_in_pagemaps = true; shm.in_pagemaps = true; - - if (dump_shm_hugetlb_flag(&shm, id, ds->shm_segsz)) - return -1; - fill_ipc_desc(id, shm.desc, &ds->shm_perm); pr_info_ipc_shm(&shm); @@ -575,7 +529,7 @@ static int prepare_ipc_sem_desc(struct cr_img *img, const IpcSemEntry *sem) { int ret, id; struct sysctl_req req[] = { - { "kernel/sem_next_id", &sem->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, + { "kernel/sem_next_id", &sem->desc->id, CTL_U32 }, }; struct semid_ds semid; @@ -708,7 +662,7 @@ static int prepare_ipc_msg_queue(struct cr_img *img, const IpcMsgEntry *msq) { int ret, id; struct sysctl_req req[] = { - { "kernel/msg_next_id", &msq->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, + { "kernel/msg_next_id", &msq->desc->id, CTL_U32 }, }; struct msqid_ds msqid; @@ -844,9 +798,9 @@ static int prepare_ipc_shm_pages(struct cr_img *img, const IpcShmEntry *shm) static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm) { - int ret, id, hugetlb_flag = 0; + int ret, id; struct 
sysctl_req req[] = { - { "kernel/shm_next_id", &shm->desc->id, CTL_U32, CTL_FLAGS_IPC_EACCES_SKIP }, + { "kernel/shm_next_id", &shm->desc->id, CTL_U32 }, }; struct shmid_ds shmid; @@ -859,10 +813,7 @@ static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm) return ret; } - if (shm->has_hugetlb_flag) - hugetlb_flag = shm->hugetlb_flag; - - id = shmget(shm->desc->key, shm->size, hugetlb_flag | shm->desc->mode | IPC_CREAT | IPC_EXCL); + id = shmget(shm->desc->key, shm->size, shm->desc->mode | IPC_CREAT | IPC_EXCL); if (id == -1) { pr_perror("Failed to create shm set"); return -errno; diff --git a/criu/irmap.c b/criu/irmap.c index d2c5d588a..09570c593 100644 --- a/criu/irmap.c +++ b/criu/irmap.c @@ -67,7 +67,6 @@ static struct irmap hints[] = { .path = "/var/log", .nr_kids = -1, }, - { .path = "/usr/share/dbus-1/services", .nr_kids = -1 }, { .path = "/usr/share/dbus-1/system-services", .nr_kids = -1 }, { .path = "/var/lib/polkit-1/localauthority", .nr_kids = -1 }, { .path = "/usr/share/polkit-1/actions", .nr_kids = -1 }, @@ -102,7 +101,7 @@ static int irmap_update_stat(struct irmap *i) pr_debug("Refresh stat for %s\n", i->path); if (fstatat(mntns_root, i->path + 1, &st, AT_SYMLINK_NOFOLLOW)) { - pr_pwarn("Can't stat %s", i->path); + pr_perror("Can't stat %s", i->path); return -1; } @@ -137,7 +136,7 @@ static int irmap_update_dir(struct irmap *t) pr_debug("Refilling %s dir\n", t->path); fd = openat(mntns_root, t->path + 1, O_RDONLY); if (fd < 0) { - pr_pwarn("Can't open %s", t->path); + pr_perror("Can't open %s", t->path); return -1; } @@ -161,8 +160,8 @@ static int irmap_update_dir(struct irmap *t) k = &t->kids[nr - 1]; - k->kids = NULL; /* for xrealloc above */ - k->ino = 0; /* for irmap_update_stat */ + k->kids = NULL; /* for xrealloc above */ + k->ino = 0; /* for irmap_update_stat */ k->nr_kids = -1; /* for irmap_update_dir */ k->path = xsprintf("%s/%s", t->path, de->d_name); if (!k->path) @@ -500,13 +499,8 @@ int irmap_scan_path_add(char *path) 
return -1; } - o->ir->path = xstrdup(path); - if (!o->ir->path) { - xfree(o->ir); - xfree(o); - return -1; - } + o->ir->path = path; o->ir->nr_kids = -1; - list_add_tail(&o->node, &opts.irmap_scan_paths); + list_add(&o->node, &opts.irmap_scan_paths); return 0; } diff --git a/criu/kerndat.c b/criu/kerndat.c index 2dc2f77d5..0e88ba43e 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -4,25 +4,20 @@ #include #include #include -#include -#include #include #include #include #include #include #include -#include +#include /* for sockaddr_in and inet_ntoa() */ #include #include #include -#include -#include #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) #include #endif -#include #include "common/config.h" #include "int.h" @@ -31,7 +26,6 @@ #include "kerndat.h" #include "fs-magic.h" #include "mem.h" -#include "mman.h" #include "common/compiler.h" #include "sysctl.h" #include "cr_options.h" @@ -42,7 +36,6 @@ #include "sockets.h" #include "net.h" #include "tun.h" -#include #include #include "netfilter.h" #include "fsnotify.h" @@ -53,25 +46,13 @@ #include "kcmp.h" #include "sched.h" #include "memfd.h" -#include "mount-v2.h" -#include "util-caps.h" -#include "pagemap_scan.h" struct kerndat_s kdat = {}; -volatile int dummy_var; static int check_pagemap(void) { - int ret, fd, retry; + int ret, fd; u64 pfn = 0; - struct pm_scan_arg args = { - .size = sizeof(struct pm_scan_arg), - .flags = 0, - .category_inverted = PAGE_IS_PFNZERO | PAGE_IS_FILE, - .category_mask = PAGE_IS_PFNZERO | PAGE_IS_FILE, - .category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED, - .return_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_SOFT_DIRTY, - }; fd = __open_proc(PROC_SELF, EPERM, O_RDONLY, "pagemap"); if (fd < 0) { @@ -84,44 +65,11 @@ static int check_pagemap(void) return -1; } - if (ioctl(fd, PAGEMAP_SCAN, &args) == 0) { - pr_debug("PAGEMAP_SCAN is supported\n"); - kdat.has_pagemap_scan = true; - - args.return_mask |= PAGE_IS_GUARD; - if (ioctl(fd, 
PAGEMAP_SCAN, &args) == 0) - kdat.has_pagemap_scan_guard_pages = true; - } else { - switch (errno) { - case EINVAL: - case ENOTTY: - pr_debug("PAGEMAP_SCAN isn't supported\n"); - break; - default: - pr_perror("PAGEMAP_SCAN failed with unexpected errno"); - return -1; - } - } - - retry = 3; - while (retry--) { - ++dummy_var; - /* Get the PFN of a page likely to be present. */ - ret = pread(fd, &pfn, sizeof(pfn), PAGE_PFN((uintptr_t)&dummy_var) * sizeof(pfn)); - if (ret != sizeof(pfn)) { - pr_perror("Can't read pagemap"); - close(fd); - return -1; - } - /* The page can be swapped out by the time the read occurs, - * in which case the rest of the bits are a swap type + offset - * (which could be zero even if not hidden). - * Retry if this happens. */ - if (pfn & PME_PRESENT) - break; - pr_warn("got non-present PFN %#lx for the dummy data page; %s\n", (unsigned long)pfn, - retry ? "retrying" : "giving up"); - pfn = 0; + /* Get the PFN of some present page. Stack is here, so try it :) */ + ret = pread(fd, &pfn, sizeof(pfn), (((unsigned long)&ret) / page_size()) * sizeof(pfn)); + if (ret != sizeof(pfn)) { + pr_perror("Can't read pagemap"); + return -1; } close(fd); @@ -235,38 +183,11 @@ static int kerndat_files_stat(void) return 0; } -static int kerndat_get_dev(dev_t *dev, char *map, size_t size) -{ - char maps[128]; - struct stat buf; - - sprintf(maps, "/proc/self/map_files/%lx-%lx", (unsigned long)map, (unsigned long)map + size); - if (stat(maps, &buf) < 0) { - int e = errno; - if (errno == EPERM) { - /* - * Kernel disables messing with map_files. - * OK, let's go the slower route. 
- */ - - if (parse_self_maps((unsigned long)map, dev) < 0) { - pr_err("Can't read self maps\n"); - return -1; - } - } else { - pr_perror("Can't stat self map_files %d", e); - return -1; - } - } else { - *dev = buf.st_dev; - } - - return 0; -} - static int kerndat_get_shmemdev(void) { void *map; + char maps[128]; + struct stat buf; dev_t dev; map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); @@ -275,8 +196,25 @@ static int kerndat_get_shmemdev(void) return -1; } - if (kerndat_get_dev(&dev, map, PAGE_SIZE)) - goto err; + sprintf(maps, "/proc/self/map_files/%lx-%lx", (unsigned long)map, (unsigned long)map + page_size()); + if (stat(maps, &buf) < 0) { + int e = errno; + if (errno == EPERM) { + /* + * Kernel disables messing with map_files. + * OK, let's go the slower route. + */ + + if (parse_self_maps((unsigned long)map, &dev) < 0) { + pr_err("Can't read self maps\n"); + goto err; + } + } else { + pr_perror("Can't stat self map_files %d", e); + goto err; + } + } else + dev = buf.st_dev; munmap(map, PAGE_SIZE); kdat.shmem_dev = dev; @@ -288,60 +226,6 @@ err: return -1; } -/* Return -1 -- error - * Return 0 -- successful but can't get any new device's numbers - * Return 1 -- successful and get new device's numbers - * - * At first, all kdat.hugetlb_dev elements are initialized to 0. 
- * When the function finishes, - * kdat.hugetlb_dev[i] == -1 -- this hugetlb page size is not supported - * kdat.hugetlb_dev[i] == 0 -- this hugetlb page size is supported but can't collect device's number - * Otherwise, kdat.hugetlb_dev[i] contains the corresponding device's number - * - * Next time the function is called, it only tries to collect the device's number of hugetlb page size - * that is supported but can't be collected in the previous call (kdat.hugetlb_dev[i] == 0) - */ -static int kerndat_get_hugetlb_dev(void) -{ - void *map; - int i, flag, ret = 0; - unsigned long long size; - dev_t dev; - - for (i = 0; i < HUGETLB_MAX; i++) { - /* Skip if this hugetlb size is not supported or the device's number has been collected */ - if (kdat.hugetlb_dev[i]) - continue; - - size = hugetlb_info[i].size; - flag = hugetlb_info[i].flag; - map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | flag, 0, 0); - if (map == MAP_FAILED) { - if (errno == EINVAL) { - kdat.hugetlb_dev[i] = (dev_t)-1; - continue; - } else if (errno == ENOMEM) { - pr_info("Hugetlb size %llu Mb is supported but cannot get dev's number\n", size >> 20); - continue; - } else { - pr_perror("Unexpected result when get hugetlb dev"); - return -1; - } - } - - if (kerndat_get_dev(&dev, map, size)) { - munmap(map, size); - return -1; - } - - munmap(map, size); - kdat.hugetlb_dev[i] = dev; - ret = 1; - pr_info("Found hugetlb device at %" PRIx64 "\n", kdat.hugetlb_dev[i]); - } - return ret; -} - static dev_t get_host_dev(unsigned int which) { static struct kst { @@ -467,6 +351,10 @@ static int kerndat_get_dirty_track(void) } else { no_dt: pr_info("Dirty tracking support is OFF\n"); + if (opts.track_mem) { + pr_err("Tracking memory is not available\n"); + return -1; + } } return 0; @@ -510,15 +398,8 @@ static int get_last_cap(void) struct sysctl_req req[] = { { "kernel/cap_last_cap", &kdat.last_cap, CTL_U32 }, }; - int ret; - ret = sysctl_op(req, ARRAY_SIZE(req), 
CTL_READ, 0); - if (ret || kdat.last_cap < 32 * CR_CAP_SIZE) - return ret; - - pr_err("Kernel reports more capabilities than this CRIU supports: %u > %u\n", - kdat.last_cap, 32 * CR_CAP_SIZE - 1); - return -1; + return sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0); } static bool kerndat_has_memfd_create(void) @@ -539,29 +420,6 @@ static bool kerndat_has_memfd_create(void) return 0; } -static bool kerndat_has_memfd_hugetlb(void) -{ - int ret; - - if (!kdat.has_memfd) { - kdat.has_memfd_hugetlb = false; - return 0; - } - - ret = memfd_create("", MFD_HUGETLB); - if (ret >= 0) { - kdat.has_memfd_hugetlb = true; - close(ret); - } else if (ret == -1 && (errno == EINVAL || errno == ENOENT || errno == ENOSYS)) { - kdat.has_memfd_hugetlb = false; - } else { - pr_perror("Unexpected error from memfd_create(\"\", MFD_HUGETLB)"); - return -1; - } - - return 0; -} - static int get_task_size(void) { kdat.task_size = compel_task_size(); @@ -651,7 +509,7 @@ static int kerndat_loginuid(void) static int kerndat_iptables_has_xtlocks(void) { int fd; - char *argv[4] = { "sh", "-c", "iptables -n -w -L", NULL }; + char *argv[4] = { "sh", "-c", "iptables -w -L", NULL }; fd = open("/dev/null", O_RDWR); if (fd < 0) { @@ -667,52 +525,29 @@ static int kerndat_iptables_has_xtlocks(void) return 0; } -/* - * Unfortunately in C htonl() is not constexpr and cannot be used in a static - * initialization below. - */ -#define constant_htonl(x) \ - (__BYTE_ORDER == __BIG_ENDIAN ? 
(x) : \ - (((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \ - (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) - -static int kerndat_tcp_repair(void) +int kerndat_tcp_repair(void) { - static const struct sockaddr_in loopback_ip4 = { - .sin_family = AF_INET, - .sin_port = 0, - .sin_addr = { constant_htonl(INADDR_LOOPBACK) }, - }; - static const struct sockaddr_in6 loopback_ip6 = { - .sin6_family = AF_INET6, - .sin6_port = 0, - .sin6_addr = IN6ADDR_LOOPBACK_INIT, - }; int sock, clnt = -1, yes = 1, exit_code = -1; - const struct sockaddr *addr; - struct sockaddr_storage listener_addr; - socklen_t addrlen; + struct sockaddr_in addr; + socklen_t aux; - addr = (const struct sockaddr *)&loopback_ip4; - addrlen = sizeof(loopback_ip4); + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + inet_pton(AF_INET, "127.0.0.1", &(addr.sin_addr)); + addr.sin_port = 0; sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); - if (sock < 0 && errno == EAFNOSUPPORT) { - addr = (const struct sockaddr *)&loopback_ip6; - addrlen = sizeof(loopback_ip6); - sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); - } if (sock < 0) { pr_perror("Unable to create a socket"); return -1; } - if (bind(sock, addr, addrlen)) { + if (bind(sock, (struct sockaddr *)&addr, sizeof(addr))) { pr_perror("Unable to bind a socket"); goto err; } - addrlen = sizeof(listener_addr); - if (getsockname(sock, (struct sockaddr *)&listener_addr, &addrlen)) { + aux = sizeof(addr); + if (getsockname(sock, (struct sockaddr *)&addr, &aux)) { pr_perror("Unable to get a socket name"); goto err; } @@ -722,13 +557,13 @@ static int kerndat_tcp_repair(void) goto err; } - clnt = socket(addr->sa_family, SOCK_STREAM, IPPROTO_TCP); + clnt = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); if (clnt < 0) { pr_perror("Unable to create a socket"); goto err; } - if (connect(clnt, (const struct sockaddr *)&listener_addr, addrlen)) { + if (connect(clnt, (struct sockaddr *)&addr, sizeof(addr))) { pr_perror("Unable to connect a 
socket"); goto err; } @@ -755,22 +590,20 @@ err: return exit_code; } -static int kerndat_nsid(void) +int kerndat_nsid(void) { int nsid, sk; - kdat.has_nsid = false; - sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sk < 0) { - pr_pwarn("Unable to create a netlink socket: NSID can't be used."); - return 0; + pr_perror("Unable to create a netlink socket"); + return -1; } if (net_get_nsid(sk, getpid(), &nsid) < 0) { - pr_warn("NSID is not supported\n"); + pr_err("NSID is not supported\n"); close(sk); - return 0; + return -1; } kdat.has_nsid = true; @@ -834,14 +667,14 @@ static int kerndat_detect_stack_guard_gap(void) /* * When reading /proc/$pid/[s]maps the - * start/end addresses might be cut off + * start/end addresses might be cutted off * with PAGE_SIZE on kernels prior 4.12 * (see kernel commit 1be7107fbe18ee). * * Same time there was semi-complete - * patch released which hit a number + * patch released which hitted a number * of repos (Ubuntu, Fedora) where instead - * of PAGE_SIZE the 1M gap is cut off. + * of PAGE_SIZE the 1M gap is cutted off. 
*/ if (start == (unsigned long)mem) { kdat.stack_guard_gap_hidden = false; @@ -983,257 +816,19 @@ static int kerndat_x86_has_ptrace_fpu_xsave_bug(void) return 0; } -static int kerndat_has_rseq(void) -{ - if (syscall(__NR_rseq, NULL, 0, 0, 0) != -1) { - pr_err("rseq should fail\n"); - return -1; - } - if (errno == ENOSYS) - pr_info("rseq syscall isn't supported\n"); - else - kdat.has_rseq = true; +#define KERNDAT_CACHE_FILE KDAT_RUNDIR "/criu.kdat" +#define KERNDAT_CACHE_FILE_TMP KDAT_RUNDIR "/.criu.kdat" - return 0; -} - -static int kerndat_has_ptrace_get_rseq_conf(void) -{ - pid_t pid; - int len; - struct __ptrace_rseq_configuration rseq; - int ret = 0; - - pid = fork_and_ptrace_attach(NULL); - if (pid < 0) - return -1; - - len = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, pid, sizeof(rseq), &rseq); - if (len != sizeof(rseq)) { - if (kdat.has_ptrace_get_rseq_conf) - ret = 1; /* we should update kdat */ - - kdat.has_ptrace_get_rseq_conf = false; - pr_info("ptrace(PTRACE_GET_RSEQ_CONFIGURATION) is not supported\n"); - goto out; - } - - /* - * flags is always zero from the kernel side, if it will be changed - * we need to pay attention to that and, possibly, make changes on the CRIU side. 
- */ - if (rseq.flags != 0) { - if (kdat.has_ptrace_get_rseq_conf) - ret = 1; /* we should update kdat */ - - kdat.has_ptrace_get_rseq_conf = false; - pr_err("ptrace(PTRACE_GET_RSEQ_CONFIGURATION): rseq.flags != 0\n"); - } else { - if (!kdat.has_ptrace_get_rseq_conf) - ret = 1; /* we should update kdat */ - - kdat.has_ptrace_get_rseq_conf = true; - - if (memcmp(&kdat.libc_rseq_conf, &rseq, sizeof(rseq))) - ret = 1; /* we should update kdat */ - - kdat.libc_rseq_conf = rseq; - } - -out: - kill(pid, SIGKILL); - waitpid(pid, NULL, 0); - return ret; -} - -int kerndat_sockopt_buf_lock(void) -{ - int exit_code = -1; - socklen_t len; - u32 buf_lock; - int sock; - - sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); - if (sock < 0 && errno == EAFNOSUPPORT) - sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP); - if (sock < 0) { - pr_perror("Unable to create a socket"); - return -1; - } - - len = sizeof(buf_lock); - if (getsockopt(sock, SOL_SOCKET, SO_BUF_LOCK, &buf_lock, &len)) { - if (errno != ENOPROTOOPT) { - pr_perror("Unable to get SO_BUF_LOCK with getsockopt"); - goto err; - } - kdat.has_sockopt_buf_lock = false; - } else - kdat.has_sockopt_buf_lock = true; - - exit_code = 0; -err: - close(sock); - return exit_code; -} - -static int kerndat_has_move_mount_set_group(void) -{ - char tmpdir[] = "/tmp/.criu.move_mount_set_group.XXXXXX"; - char subdir[64]; - int exit_code = -1; - - if (mkdtemp(tmpdir) == NULL) { - pr_perror("Fail to make dir %s", tmpdir); - return -1; - } - - if (mount("criu.move_mount_set_group", tmpdir, "tmpfs", 0, NULL)) { - pr_perror("Fail to mount tmfps to %s", tmpdir); - rmdir(tmpdir); - return -1; - } - - if (mount(NULL, tmpdir, NULL, MS_PRIVATE, NULL)) { - pr_perror("Fail to make %s private", tmpdir); - goto out; - } - - if (snprintf(subdir, sizeof(subdir), "%s/subdir", tmpdir) >= sizeof(subdir)) { - pr_err("Fail to snprintf subdir\n"); - goto out; - } - - if (mkdir(subdir, 0700)) { - pr_perror("Fail to make dir %s", subdir); - goto out; - } - - if 
(mount(subdir, subdir, NULL, MS_BIND, NULL)) { - pr_perror("Fail to make bind-mount %s", subdir); - goto out; - } - - if (mount(NULL, tmpdir, NULL, MS_SHARED, NULL)) { - pr_perror("Fail to make %s private", tmpdir); - goto out; - } - - if (sys_move_mount(AT_FDCWD, tmpdir, AT_FDCWD, subdir, MOVE_MOUNT_SET_GROUP)) { - if (errno == EINVAL || errno == ENOSYS) { - pr_debug("No MOVE_MOUNT_SET_GROUP kernel feature\n"); - kdat.has_move_mount_set_group = false; - exit_code = 0; - goto out; - } - pr_perror("Fail to MOVE_MOUNT_SET_GROUP"); - goto out; - } - - kdat.has_move_mount_set_group = true; - exit_code = 0; -out: - if (umount2(tmpdir, MNT_DETACH)) - pr_warn("Fail to umount2 %s: %s\n", tmpdir, strerror(errno)); - if (rmdir(tmpdir)) - pr_warn("Fail to rmdir %s: %s\n", tmpdir, strerror(errno)); - return exit_code; -} - -static int kerndat_has_openat2(void) -{ - if (sys_openat2(AT_FDCWD, ".", NULL, 0) != -1) { - pr_err("openat2 should fail\n"); - return -1; - } - if (errno == ENOSYS) { - pr_debug("No openat2 syscall support\n"); - kdat.has_openat2 = false; - } else { - kdat.has_openat2 = true; - } - - return 0; -} - -int __attribute__((weak)) kdat_has_shstk(void) -{ - return 0; -} - -static int kerndat_has_shstk(void) -{ - int ret = kdat_has_shstk(); - - if (ret < 0) { - pr_err("kdat_has_shstk failed\n"); - return ret; - } - - kdat.has_shstk = !!ret; - return 0; -} - -#define KERNDAT_CACHE_NAME "criu.kdat" -#define KERNDAT_CACHE_FILE KDAT_RUNDIR "/" KERNDAT_CACHE_NAME - -/* - * Returns: - * -1 if kdat_file was not written due to error - * 0 if kdat_file was written - * 1 if kdat_file was not written because cache directory undefined in env (non-root mode) - */ -static int get_kerndat_filename(char **kdat_file) -{ - int ret; - - /* - * Running as non-root, even with CAP_CHECKPOINT_RESTORE, does not - * allow to write to KDAT_RUNDIR which usually is only writable by root. - * Let's write criu.kdat file to XDG_RUNTIME_DIR for non-root cases. 
- * Note that XDG_RUNTIME_DIR is not always defined (e.g. when executing - * via su/sudo). - */ - if (opts.unprivileged) { - const char *cache_dir = getenv("XDG_RUNTIME_DIR"); - if (!cache_dir) { - pr_warn("$XDG_RUNTIME_DIR not set. Cannot find location for kerndat file\n"); - return 1; - } - ret = asprintf(kdat_file, "%s/%s", cache_dir, KERNDAT_CACHE_NAME); - } else { - ret = asprintf(kdat_file, "%s", KERNDAT_CACHE_FILE); - } - - if (unlikely(ret < 0)) { - pr_warn("Cannot allocate memory for kerndat file name\n"); - return -1; - } - - return 0; -} - -/* - * Returns: - * -1 if error - * 0 if cache was loaded - * 1 if cache does not exist or is stale or cache directory undefined in env (non-root mode) - */ static int kerndat_try_load_cache(void) { - cleanup_free char *kdat_file = NULL; int fd, ret; - ret = get_kerndat_filename(&kdat_file); - if (ret) - return ret; - - fd = open(kdat_file, O_RDONLY); + fd = open(KERNDAT_CACHE_FILE, O_RDONLY); if (fd < 0) { if (ENOENT == errno) - pr_debug("File %s does not exist\n", kdat_file); + pr_debug("File %s does not exist\n", KERNDAT_CACHE_FILE); else - pr_warn("Can't load %s\n", kdat_file); + pr_warn("Can't load %s\n", KERNDAT_CACHE_FILE); return 1; } @@ -1247,12 +842,12 @@ static int kerndat_try_load_cache(void) close(fd); if (ret != sizeof(kdat) || kdat.magic1 != KDAT_MAGIC || kdat.magic2 != KDAT_MAGIC_2) { - pr_warn("Stale %s file\n", kdat_file); - unlink(kdat_file); + pr_warn("Stale %s file\n", KERNDAT_CACHE_FILE); + unlink(KERNDAT_CACHE_FILE); return 1; } - pr_info("Loaded kdat cache from %s\n", kdat_file); + pr_info("Loaded kdat cache from %s\n", KERNDAT_CACHE_FILE); return 0; } @@ -1260,20 +855,8 @@ static void kerndat_save_cache(void) { int fd, ret; struct statfs s; - cleanup_free char *kdat_file = NULL; - cleanup_free char *kdat_file_tmp = NULL; - if (get_kerndat_filename(&kdat_file)) - return; - - ret = asprintf(&kdat_file_tmp, "%s.tmp", kdat_file); - - if (unlikely(ret < 0)) { - pr_warn("Cannot allocate memory for 
kerndat file name\n"); - return; - } - - fd = open(kdat_file_tmp, O_CREAT | O_EXCL | O_WRONLY, 0600); + fd = open(KERNDAT_CACHE_FILE_TMP, O_CREAT | O_EXCL | O_WRONLY, 0600); if (fd < 0) /* * It can happen that we race with some other criu @@ -1282,10 +865,6 @@ static void kerndat_save_cache(void) */ return; - /* - * If running as root we store the cache file on a tmpfs (/run), - * because the file should be gone after reboot. - */ if (fstatfs(fd, &s) < 0 || s.f_type != TMPFS_MAGIC) { pr_warn("Can't keep kdat cache on non-tempfs\n"); close(fd); @@ -1299,21 +878,20 @@ static void kerndat_save_cache(void) */ kdat.magic1 = KDAT_MAGIC; kdat.magic2 = KDAT_MAGIC_2; - ret = write(fd, &kdat, sizeof(kdat)); close(fd); if (ret == sizeof(kdat)) - ret = rename(kdat_file_tmp, kdat_file); + ret = rename(KERNDAT_CACHE_FILE_TMP, KERNDAT_CACHE_FILE); else { ret = -1; errno = EIO; } if (ret < 0) { - pr_perror("Couldn't save %s", kdat_file); + pr_perror("Couldn't save %s", KERNDAT_CACHE_FILE); unl: - unlink(kdat_file); + unlink(KERNDAT_CACHE_FILE_TMP); } } @@ -1321,14 +899,6 @@ static int kerndat_uffd(void) { int uffd, err = 0; - if (opts.unprivileged) - /* - * If running as non-root uffd_open() fails with - * 'Operation not permitted'. Just ignore uffd for - * non-root for now. 
- */ - return 0; - kdat.uffd_features = 0; uffd = uffd_open(0, &kdat.uffd_features, &err); @@ -1344,7 +914,7 @@ static int kerndat_uffd(void) if (err == ENOSYS) return 0; if (err == EPERM) { - pr_info("Lazy pages are not permitted\n"); + pr_info("Lazy pages are not permited\n"); return 0; } pr_err("Lazy pages are not available\n"); @@ -1421,8 +991,6 @@ int kerndat_has_thp_disable(void) parse_vmflags(str, &flags, &madv, &io_pf); kdat.has_thp_disable = !(madv & (1 << MADV_NOHUGEPAGE)); - if (!kdat.has_thp_disable) - pr_warn("prctl PR_SET_THP_DISABLE sets MADV_NOHUGEPAGE\n"); break; } } @@ -1466,20 +1034,17 @@ static bool kerndat_has_clone3_set_tid(void) */ pid = syscall(__NR_clone3, &args, sizeof(args)); - if (pid != -1) { - pr_err("Unexpected success: clone3() returned %d\n", pid); + if (pid == -1 && (errno == ENOSYS || errno == E2BIG)) { + kdat.has_clone3_set_tid = false; + return 0; + } + if (pid == -1 && errno == EINVAL) { + kdat.has_clone3_set_tid = true; + } else { + pr_perror("Unexpected error from clone3"); return -1; } - if (errno == ENOSYS || errno == E2BIG) - return 0; - - if (errno != EINVAL) { - pr_pwarn("Unexpected error from clone3"); - return 0; - } - - kdat.has_clone3_set_tid = true; return 0; } @@ -1544,7 +1109,7 @@ static int kerndat_has_pidfd_getfd(void) if (val_b == val_a) { kdat.has_pidfd_getfd = true; } else { - /* If val_b != val_a, something unexpected happened. */ + /* If val_b != val_a then something unexpected happend. 
*/ pr_err("Unexpected value read from socket\n"); ret = -1; } @@ -1607,9 +1172,7 @@ static int __has_nftables_concat(void *arg) return 1; if (NFT_RUN_CMD(nft, "create table inet CRIU")) { - pr_warn("Can't create nftables table\n"); - *has = false; /* kdat.has_nftables_concat = false */ - ret = 0; + pr_err("Can't create nftables table\n"); goto nft_ctx_free_out; } @@ -1645,318 +1208,20 @@ static int kerndat_has_nftables_concat(void) #endif } -#ifndef IPV6_FREEBIND -#define IPV6_FREEBIND 78 -#endif - -static int __kerndat_has_ipv6_freebind(int sk) -{ - int val = 1; - - if (setsockopt(sk, SOL_IPV6, IPV6_FREEBIND, &val, sizeof(int)) == -1) { - if (errno == ENOPROTOOPT) { - kdat.has_ipv6_freebind = false; - return 0; - } - pr_perror("Unable to setsockopt ipv6_freebind"); - return -1; - } - - kdat.has_ipv6_freebind = true; - return 0; -} - -static int kerndat_has_ipv6_freebind(void) -{ - int sk, ret; - - if (!kdat.ipv6) { - kdat.has_ipv6_freebind = false; - return 0; - } - - sk = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP); - if (sk == -1) { - pr_perror("Unable to create a ipv6 dgram socket"); - return -1; - } - - ret = __kerndat_has_ipv6_freebind(sk); - close(sk); - return ret; -} - -#define MEMBARRIER_CMDBIT_GET_REGISTRATIONS 9 - -static int kerndat_has_membarrier_get_registrations(void) -{ - int ret = syscall(__NR_membarrier, 1 << MEMBARRIER_CMDBIT_GET_REGISTRATIONS, 0); - if (ret < 0) { - if (errno != EINVAL) { - return ret; - } - - kdat.has_membarrier_get_registrations = false; - } else { - kdat.has_membarrier_get_registrations = true; - } - - return 0; -} - -static int kerndat_has_close_range(void) -{ - /* fd is greater than max_fd, so close_range should return EINVAL. 
*/ - if (cr_close_range(2, 1, 0) == 0) { - pr_err("close_range succeeded unexpectedly\n"); - return -1; - } - - if (errno == ENOSYS) { - pr_debug("close_range isn't supported\n"); - return 0; - } - if (errno != EINVAL) { - pr_perror("close_range returned unexpected error code"); - return -1; - } - - kdat.has_close_range = true; - return 0; -} - -static int kerndat_has_timer_cr_ids(void) -{ - if (prctl(PR_TIMER_CREATE_RESTORE_IDS, - PR_TIMER_CREATE_RESTORE_IDS_GET, 0, 0, 0) == -1) { - if (errno == EINVAL) { - pr_debug("PR_TIMER_CREATE_RESTORE_IDS isn't supported\n"); - return 0; - } - pr_perror("prctl returned unexpected error code"); - return -1; - } - - kdat.has_timer_cr_ids = true; - return 0; -} - -static void breakpoint_func(void) -{ - if (raise(SIGSTOP)) - pr_perror("Unable to kill itself with SIGSTOP"); - exit(1); -} - -/* - * kerndat_breakpoints checks that hardware breakpoints work as they should. - * In some cases, they might not work in virtual machines if the hypervisor - * doesn't virtualize them. For example, they don't work in AMD SEV virtual - * machines if the Debug Virtualization extension isn't supported or isn't - * enabled in SEV_FEATURES. 
- */ -static int kerndat_breakpoints(void) -{ - int status, ret, exit_code = -1; - pid_t pid; - - pid = fork(); - if (pid == -1) { - pr_perror("fork"); - return -1; - } - if (pid == 0) { - if (ptrace(PTRACE_TRACEME, 0, 0, 0)) { - pr_perror("ptrace(PTRACE_TRACEME)"); - exit(1); - } - raise(SIGSTOP); - breakpoint_func(); - exit(1); - } - if (waitpid(pid, &status, 0) == -1) { - pr_perror("waitpid for initial stop"); - goto err; - } - if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGSTOP) { - pr_err("Child didn't stop as expected: status=%x\n", status); - goto err; - } - ret = ptrace_set_breakpoint(pid, &breakpoint_func); - if (ret < 0) { - pr_err("Failed to set breakpoint\n"); - goto err; - } - if (ret == 0) { - pr_debug("Hardware breakpoints appear to be disabled\n"); - goto out; - } - if (waitpid(pid, &status, 0) == -1) { - pr_perror("waitpid for breakpoint trigger"); - goto err; - } - if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP) { - pr_warn("Hardware breakpoints don't seem to work (status=%x)\n", status); - goto out; - } - kdat.has_breakpoints = true; -out: - exit_code = 0; -err: - if (kill(pid, SIGKILL)) { - pr_perror("Failed to kill the child process"); - exit_code = -1; - } - if (waitpid(pid, &status, 0) == -1) { - pr_perror("Failed to wait for the child process"); - exit_code = -1; - } - if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL) { - pr_err("The child exited with unexpected code: %x\n", status); - exit_code = -1; - } - return exit_code; -} - -static int kerndat_has_madv_guard(void) -{ - void *map; - - map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); - if (map == MAP_FAILED) { - pr_perror("Can't mmap a page for has_madv_guard feature test"); - return -1; - } - - if (madvise(map, PAGE_SIZE, MADV_GUARD_INSTALL)) { - if (errno != EINVAL) { - pr_perror("madvise failed (has_madv_guard check)"); - goto mmap_cleanup; - } - } else { - kdat.has_madv_guard = true; - } - - munmap(map, PAGE_SIZE); - return 0; 
- -mmap_cleanup: - munmap(map, PAGE_SIZE); - return -1; -} - -void kerndat_warn_about_madv_guards(void) -{ - if (kdat.has_madv_guard && !kdat.has_pagemap_scan_guard_pages) - pr_warn("ioctl(PAGEMAP_SCAN) doesn't support PAGE_IS_GUARD flag. " - "CRIU dump will fail if dumped processes use madvise(MADV_GUARD_INSTALL). " - "Please, consider updating your kernel.\n"); -} - -/* - * Some features depend on resource that can be dynamically changed - * at the OS runtime. There are cases that we cannot determine the - * availability of those features at the first time we run kerndat - * check. So in later kerndat checks, we need to retry to get those - * information. This function contains calls to those kerndat checks. - * - * Those kerndat checks must - * Return -1 on error - * Return 0 when the check is successful but no new information - * Return 1 when the check is successful and there is new information - */ -int kerndat_try_load_new(void) -{ - int ret; - - ret = kerndat_get_hugetlb_dev(); - if (ret < 0) - return ret; - - ret = kerndat_has_ptrace_get_rseq_conf(); - if (ret < 0) { - pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); - return ret; - } - - ret = kerndat_has_shstk(); - if (ret < 0) { - pr_err("kerndat_has_shstk failed when initializing kerndat.\n"); - return ret; - } - - /* New information is found, we need to save to the cache */ - if (ret) - kerndat_save_cache(); - return 0; -} - -static int root_only_init(void) -{ - int ret = 0; - - if (opts.unprivileged) - return 0; - - if (!ret && kerndat_loginuid()) { - pr_err("kerndat_loginuid failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_tun_netns()) { - pr_err("kerndat_tun_netns failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_socket_unix_file()) { - pr_err("kerndat_socket_unix_file failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_link_nsid()) { - pr_err("kerndat_link_nsid failed when initializing 
kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_socket_netns()) { - pr_err("kerndat_socket_netns failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_has_nftables_concat()) { - pr_err("kerndat_has_nftables_concat failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_has_move_mount_set_group()) { - pr_err("kerndat_has_move_mount_set_group failed when initializing kerndat.\n"); - ret = -1; - } - - return ret; -} - int kerndat_init(void) { int ret; ret = kerndat_try_load_cache(); - if (ret < 0) + if (ret <= 0) return ret; - - if (ret == 0) - return kerndat_try_load_new(); - ret = 0; /* kerndat_try_load_cache can leave some trash in kdat */ memset(&kdat, 0, sizeof(kdat)); preload_socket_modules(); - if (!opts.unprivileged) - /* - * This uses 'iptables -L' to implicitly load necessary modules. - * If the non nft backed iptables is used it does a - * openat(AT_FDCWD, "/run/xtables.lock", O_RDONLY|O_CREAT, 0600) = -1 EACCES - * which will fail as non-root. There are no capabilities to - * change this. 
The iptables nft backend fails with - * openat(AT_FDCWD, "/proc/net/ip_tables_names", O_RDONLY) = -1 EACCES - */ - preload_netfilter_modules(); + preload_netfilter_modules(); if (check_pagemap()) { pr_err("check_pagemap failed when initializing kerndat.\n"); @@ -1966,10 +1231,6 @@ int kerndat_init(void) pr_err("kerndat_get_shmemdev failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_get_hugetlb_dev() < 0) { - pr_err("kerndat_get_hugetlb_dev failed when initializing kerndat.\n"); - ret = -1; - } if (!ret && kerndat_get_dirty_track()) { pr_err("kerndat_get_dirty_track failed when initializing kerndat.\n"); ret = -1; @@ -1994,14 +1255,10 @@ int kerndat_init(void) pr_err("get_ipv6 failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_nsid()) { - pr_err("kerndat_nsid failed when initializing kerndat.\n"); + if (!ret && kerndat_loginuid()) { + pr_err("kerndat_loginuid failed when initializing kerndat.\n"); ret = -1; } - - if (!ret && root_only_init()) - ret = -1; - if (!ret && kerndat_iptables_has_xtlocks()) { pr_err("kerndat_iptables_has_xtlocks failed when initializing kerndat.\n"); ret = -1; @@ -2014,12 +1271,24 @@ int kerndat_init(void) pr_err("kerndat_compat_restore failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_has_memfd_create()) { - pr_err("kerndat_has_memfd_create failed when initializing kerndat.\n"); + if (!ret && kerndat_tun_netns()) { + pr_err("kerndat_tun_netns failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_has_memfd_hugetlb()) { - pr_err("kerndat_has_memfd_hugetlb failed when initializing kerndat.\n"); + if (!ret && kerndat_socket_unix_file()) { + pr_err("kerndat_socket_unix_file failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_nsid()) { + pr_err("kerndat_nsid failed when initializing kerndat.\n"); + ret = -1; + } + if (!ret && kerndat_link_nsid()) { + pr_err("kerndat_link_nsid failed when initializing kerndat.\n"); + ret = -1; + } + if 
(!ret && kerndat_has_memfd_create()) { + pr_err("kerndat_has_memfd_create failed when initializing kerndat.\n"); ret = -1; } if (!ret && kerndat_detect_stack_guard_gap()) { @@ -2044,6 +1313,10 @@ int kerndat_init(void) pr_err("kerndat_vdso_preserves_hint failed when initializing kerndat.\n"); ret = -1; } + if (!ret && kerndat_socket_netns()) { + pr_err("kerndat_socket_netns failed when initializing kerndat.\n"); + ret = -1; + } if (!ret && kerndat_x86_has_ptrace_fpu_xsave_bug()) { pr_err("kerndat_x86_has_ptrace_fpu_xsave_bug failed when initializing kerndat.\n"); ret = -1; @@ -2068,7 +1341,7 @@ int kerndat_init(void) pr_err("has_time_namespace failed when initializing kerndat.\n"); ret = -1; } - if (!ret && (!opts.unprivileged || has_cap_net_admin(opts.cap_eff)) && kerndat_has_newifindex()) { + if (!ret && kerndat_has_newifindex()) { pr_err("kerndat_has_newifindex failed when initializing kerndat.\n"); ret = -1; } @@ -2082,48 +1355,8 @@ int kerndat_init(void) pr_err("kerndat_has_nspid failed when initializing kerndat.\n"); ret = -1; } - if (!ret && kerndat_sockopt_buf_lock()) { - pr_err("kerndat_sockopt_buf_lock failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_has_openat2()) { - pr_err("kerndat_has_openat2 failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_has_rseq()) { - pr_err("kerndat_has_rseq failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && (kerndat_has_ptrace_get_rseq_conf() < 0)) { - pr_err("kerndat_has_ptrace_get_rseq_conf failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && (kerndat_has_ipv6_freebind() < 0)) { - pr_err("kerndat_has_ipv6_freebind failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_has_membarrier_get_registrations()) { - pr_err("kerndat_has_membarrier_get_registrations failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_has_shstk()) { - pr_err("kerndat_has_shstk failed when initializing kerndat.\n"); - 
ret = -1; - } - if (!ret && kerndat_has_close_range()) { - pr_err("kerndat_has_close_range has failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_has_timer_cr_ids()) { - pr_err("kerndat_has_timer_cr_ids has failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_breakpoints()) { - pr_err("kerndat_breakpoints has failed when initializing kerndat.\n"); - ret = -1; - } - if (!ret && kerndat_has_madv_guard()) { - pr_err("kerndat_has_madv_guard has failed when initializing kerndat.\n"); + if (!ret && kerndat_has_nftables_concat()) { + pr_err("kerndat_has_nftables_concat failed when initializing kerndat.\n"); ret = -1; } diff --git a/criu/libnetlink.c b/criu/libnetlink.c index c7a84a44d..6dad7551a 100644 --- a/criu/libnetlink.c +++ b/criu/libnetlink.c @@ -45,7 +45,7 @@ static int nlmsg_receive(char *buf, int len, int (*cb)(struct nlmsghdr *, struct } /* - * Default error handler: just point our an error + * Default errror handler: just point our an error * and pass up to caller. 
*/ static int rtnl_return_err(int err, struct ns_id *ns, void *arg) @@ -214,3 +214,8 @@ int __wrap_nlmsg_parse(struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], in return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), policy); } + +int32_t nla_get_s32(const struct nlattr *nla) +{ + return *(const int32_t *)nla_data(nla); +} diff --git a/criu/log.c b/criu/log.c index bf6f657f2..c4ce90ec0 100644 --- a/criu/log.c +++ b/criu/log.c @@ -10,7 +10,6 @@ #include #include #include -#include #include @@ -72,8 +71,7 @@ static void print_ts(void) gettimeofday(&t, NULL); timediff(&start, &t); - snprintf(buffer, TS_BUF_OFF, "(%02u.%06u", (unsigned)t.tv_sec, (unsigned)t.tv_usec); - buffer[TS_BUF_OFF - 2] = ')'; /* this will overwrite the last digit if tv_sec>=100 */ + snprintf(buffer, TS_BUF_OFF, "(%02u.%06u)", (unsigned)t.tv_sec, (unsigned)t.tv_usec); buffer[TS_BUF_OFF - 1] = ' '; /* kill the '\0' produced by snprintf */ } @@ -115,9 +113,6 @@ static struct str_and_lock *first_err; int log_keep_err(void) { - if (first_err) - return 0; - first_err = shmalloc(sizeof(struct str_and_lock)); if (first_err == NULL) return -1; @@ -136,11 +131,10 @@ static void log_note_err(char *msg) * anyway, so it doesn't make much sense to try hard * and optimize this out. */ - if (mutex_trylock(&first_err->l)) { - if (first_err->s[0] == '\0') - __strlcpy(first_err->s, msg, sizeof(first_err->s)); - mutex_unlock(&first_err->l); - } + mutex_lock(&first_err->l); + if (first_err->s[0] == '\0') + strlcpy(first_err->s, msg, sizeof(first_err->s)); + mutex_unlock(&first_err->l); } } @@ -190,7 +184,7 @@ void flush_early_log_buffer(int fd) * with reading the log_level. 
*/ struct early_log_hdr *hdr = (void *)early_log_buffer + pos; - pos += sizeof(*hdr); + pos += sizeof(hdr); if (hdr->level <= current_loglevel) { size_t size = 0; while (size < hdr->len) { @@ -202,7 +196,7 @@ void flush_early_log_buffer(int fd) } pos += hdr->len; } - if ((early_log_buf_off + sizeof(struct early_log_hdr)) >= EARLY_LOG_BUF_LEN) + if (early_log_buf_off == EARLY_LOG_BUF_LEN) pr_warn("The early log buffer is full, some messages may have been lost\n"); early_log_buf_off = 0; } @@ -320,10 +314,10 @@ unsigned int log_get_loglevel(void) static void early_vprint(const char *format, unsigned int loglevel, va_list params) { - int log_size = 0, log_space; + unsigned int log_size = 0; struct early_log_hdr *hdr; - if ((early_log_buf_off + sizeof(*hdr)) >= EARLY_LOG_BUF_LEN) + if ((early_log_buf_off + sizeof(hdr)) >= EARLY_LOG_BUF_LEN) return; /* Save loglevel */ @@ -331,8 +325,7 @@ static void early_vprint(const char *format, unsigned int loglevel, va_list para hdr = (void *)early_log_buffer + early_log_buf_off; hdr->level = loglevel; /* Skip the log entry size */ - early_log_buf_off += sizeof(*hdr); - log_space = EARLY_LOG_BUF_LEN - early_log_buf_off; + early_log_buf_off += sizeof(hdr); if (loglevel >= LOG_TIMESTAMP) { /* * If logging is not yet setup we just write zeros @@ -340,17 +333,12 @@ static void early_vprint(const char *format, unsigned int loglevel, va_list para * keep the same format as the other messages on * log levels with timestamps (>=LOG_TIMESTAMP). */ - log_size = snprintf(early_log_buffer + early_log_buf_off, log_space, + log_size = snprintf(early_log_buffer + early_log_buf_off, sizeof(early_log_buffer) - early_log_buf_off, "(00.000000) "); } - if (log_size < log_space) - log_size += vsnprintf(early_log_buffer + early_log_buf_off + log_size, - log_space - log_size, format, params); - if (log_size > log_space) { - /* vsnprintf always add the terminating null byte. 
*/ - log_size = log_space - 1; - } + log_size += vsnprintf(early_log_buffer + early_log_buf_off + log_size, + sizeof(early_log_buffer) - early_log_buf_off - log_size, format, params); /* Save log entry size */ hdr->len = log_size; @@ -409,28 +397,15 @@ void print_on_level(unsigned int loglevel, const char *format, ...) int write_pidfile(int pid) { - int fd, ret, exit_code = -1; + int fd; fd = open(opts.pidfile, O_WRONLY | O_EXCL | O_CREAT, 0600); if (fd == -1) { - pr_perror("pidfile: Can't open %s", opts.pidfile); + pr_perror("Can't open %s", opts.pidfile); return -1; } - ret = dprintf(fd, "%d", pid); - if (ret < 0) { - pr_perror("pidfile: Can't write pid %d to %s", pid, opts.pidfile); - goto close; - } - - if (ret == 0) { - pr_err("pidfile: Can't write pid %d to %s\n", pid, opts.pidfile); - goto close; - } - - pr_debug("pidfile: Wrote pid %d to %s (%d bytes)\n", pid, opts.pidfile, ret); - exit_code = 0; -close: + dprintf(fd, "%d", pid); close(fd); - return exit_code; + return 0; } diff --git a/criu/lsm.c b/criu/lsm.c index 5faf3e5b2..d1b73cc79 100644 --- a/criu/lsm.c +++ b/criu/lsm.c @@ -29,9 +29,7 @@ static int apparmor_get_label(pid_t pid, char **profile_name) FILE *f; char *space; - f = fopen_proc(pid, "attr/apparmor/current"); - if (!f) - f = fopen_proc(pid, "attr/current"); + f = fopen_proc(pid, "attr/current"); if (!f) return -1; @@ -372,7 +370,7 @@ int render_lsm_profile(char *profile, char **val) case LSMTYPE__APPARMOR: return render_aa_profile(val, profile); case LSMTYPE__SELINUX: - if (asprintf(val, "%s", opts.lsm_supplied ? 
opts.lsm_profile : profile) < 0) { + if (asprintf(val, "%s", profile) < 0) { *val = NULL; return -1; } diff --git a/criu/mem.c b/criu/mem.c index 9e8740c07..ca74bfbb6 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -10,7 +10,6 @@ #include "cr_options.h" #include "servicefd.h" #include "mem.h" -#include "mman.h" #include "parasite-syscall.h" #include "parasite.h" #include "page-pipe.h" @@ -100,7 +99,7 @@ static inline bool __page_in_parent(bool dirty) return opts.track_mem && opts.img_parent && !dirty; } -static bool should_dump_entire_vma(VmaEntry *vmae) +bool should_dump_page(VmaEntry *vmae, u64 pme) { /* * vDSO area must be always dumped because on restore @@ -108,83 +107,30 @@ static bool should_dump_entire_vma(VmaEntry *vmae) */ if (vma_entry_is(vmae, VMA_AREA_VDSO)) return true; + /* + * In turn VVAR area is special and referenced from + * vDSO area by IP addressing (at least on x86) thus + * never ever dump its content but always use one provided + * by the kernel on restore, ie runtime VVAR area must + * be remapped into proper place.. + */ + if (vma_entry_is(vmae, VMA_AREA_VVAR)) + return false; + + /* + * Optimisation for private mapping pages, that haven't + * yet being COW-ed + */ + if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) + return false; if (vma_entry_is(vmae, VMA_AREA_AIORING)) return true; + if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) + return true; return false; } -/* - * should_dump_page writes vaddr in page_info->next if an addressed page has to be dumped. - * Otherwise, it writes an address that has to be inspected next. 
- */ -int should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, struct page_info *page_info) -{ - if (!page_info) - goto err; - - if (vaddr >= pmc->end && pmc_fill(pmc, vaddr, vmae->end)) - goto err; - - if (pmc->regs) { - while (1) { - if (pmc->regs_idx == pmc->regs_len) { - page_info->next = pmc->end; - return 0; - } - - if (vaddr < pmc->regs[pmc->regs_idx].end) - break; - pmc->regs_idx++; - } - - if (vaddr < pmc->regs[pmc->regs_idx].start) { - page_info->next = pmc->regs[pmc->regs_idx].start; - return 0; - } - - if (pmc->regs[pmc->regs_idx].categories & PAGE_IS_GUARD) - goto skip_guard_page; - - page_info->softdirty = pmc->regs[pmc->regs_idx].categories & PAGE_IS_SOFT_DIRTY; - page_info->next = vaddr; - return 0; - } else { - u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)]; - - if (pme & PME_GUARD_REGION) - goto skip_guard_page; - - /* - * Optimisation for private mapping pages, that haven't - * yet being COW-ed - */ - if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE)) { - page_info->next = vaddr + PAGE_SIZE; - return 0; - } - - if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) { - page_info->softdirty = pme & PME_SOFT_DIRTY; - page_info->next = vaddr; - return 0; - } - - page_info->next = vaddr + PAGE_SIZE; - return 0; - } - -err: - pr_err("should_dump_page failed on vma " - "%#016" PRIx64 "-%#016" PRIx64 " vaddr=%#016" PRIx64 "\n", - vmae->start, vmae->end, vaddr); - return -1; - -skip_guard_page: - page_info->next = vaddr + PAGE_SIZE; - return 0; -} - bool page_is_zero(u64 pme) { return __page_is_zero(pme); @@ -215,34 +161,28 @@ static bool is_stack(struct pstree_item *item, unsigned long vaddr) * put the memory into the page-pipe's pipe. * * "Holes" in page-pipe are regions, that should be dumped, but - * the memory contents is present in the parent image set. + * the memory contents is present in the pagent image set. 
*/ -static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, pmc_t *pmc, u64 *pvaddr, +static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off, bool has_parent) { - unsigned long nr_scanned; + u64 *at = &map[PAGE_PFN(*off)]; + unsigned long pfn, nr_to_scan; unsigned long pages[3] = {}; - unsigned long vaddr; - bool dump_all_pages; int ret = 0; - dump_all_pages = should_dump_entire_vma(vma->e); + nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE; - nr_scanned = 0; - for (vaddr = *pvaddr; vaddr < vma->e->end; vaddr += PAGE_SIZE, nr_scanned++) { + for (pfn = 0; pfn < nr_to_scan; pfn++) { + unsigned long vaddr; unsigned int ppb_flags = 0; - struct page_info page_info = {}; int st; - /* If dump_all_pages is true, should_dump_page is called to get pme. */ - if (should_dump_page(pmc, vma->e, vaddr, &page_info)) - return -1; - - if (!dump_all_pages && page_info.next != vaddr) { - vaddr = page_info.next - PAGE_SIZE; + if (!should_dump_page(vma->e, at[pfn])) continue; - } + + vaddr = vma->e->start + *off + pfn * PAGE_SIZE; if (vma_entry_can_be_lazy(vma->e) && !is_stack(item, vaddr)) ppb_flags |= PPB_LAZY; @@ -254,7 +194,7 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct * page. The latter would be checked in page-xfer. 
*/ - if (has_parent && page_in_parent(page_info.softdirty)) { + if (has_parent && page_in_parent(at[pfn] & PME_SOFT_DIRTY)) { ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT); st = 0; } else { @@ -274,8 +214,9 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct pages[st]++; } - *pvaddr = vaddr; - cnt_add(CNT_PAGES_SCANNED, nr_scanned); + *off += pfn * PAGE_SIZE; + + cnt_add(CNT_PAGES_SCANNED, nr_to_scan); cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]); cnt_add(CNT_PAGES_LAZY, pages[1]); cnt_add(CNT_PAGES_WRITTEN, pages[2]); @@ -305,12 +246,6 @@ prep_dump_pages_args(struct parasite_ctl *ctl, struct vm_area_list *vma_area_lis */ if (vma_entry_is(vma->e, VMA_AREA_AIORING) && skip_non_trackable) continue; - /* - * We totally ignore MAP_HUGETLB on pre-dump. - * See also generate_vma_iovs() comment. - */ - if ((vma->e->flags & MAP_HUGETLB) && skip_non_trackable) - continue; if (vma->e->prot & PROT_READ) continue; @@ -336,7 +271,7 @@ static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl, struct pa list_for_each_entry(ppb, &pp->bufs, l) { args->nr_segs = ppb->nr_segs; args->nr_pages = ppb->pages_in; - pr_debug("PPB: %ld pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, + pr_debug("PPB: %d pages %d segs %u pipe %d off\n", args->nr_pages, args->nr_segs, ppb->pipe_size, args->off); ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl); @@ -415,31 +350,12 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str struct page_xfer *xfer, struct parasite_dump_pages_args *args, struct parasite_ctl *ctl, pmc_t *pmc, bool has_parent, bool pre_dump, int parent_predump_mode) { - u64 vaddr; + u64 off = 0; + u64 *map; int ret; if (!vma_area_is_private(vma, kdat.task_size) && !vma_area_is(vma, VMA_ANON_SHARED)) return 0; - /* - * In turn VVAR area is special and referenced from - * vDSO area by IP addressing (at least on x86) thus - * never ever dump its content but always use one provided 
- * by the kernel on restore, ie runtime VVAR area must - * be remapped into proper place.. - */ - if (vma_entry_is(vma->e, VMA_AREA_VVAR)) - return 0; - - /* - * 9651fcedf7b9 ("mm: add MAP_DROPPABLE for designating always lazily freeable mappings") - * tells us that: - * Under memory pressure, mm can just drop the pages (so that they're - * zero when read back again). - * - * Let's just skip MAP_DROPPABLE mappings pages dump logic. - */ - if (vma->e->flags & MAP_DROPPABLE) - return 0; /* * To facilitate any combination of pre-dump modes to run after @@ -486,27 +402,21 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str has_parent = false; } - /* - * We want to completely ignore these VMA types on the pre-dump: - * 1. VMA_AREA_AIORING because it is not soft-dirty trackable (kernel writes) - * 2. MAP_HUGETLB mappings because they are not premapped and we can't use - * parent images from pre-dump stages. Instead, the content is restored from - * the parasite context using full memory image. 
- */ - if (vma_entry_is(vma->e, VMA_AREA_AIORING) || vma->e->flags & MAP_HUGETLB) { + if (vma_entry_is(vma->e, VMA_AREA_AIORING)) { if (pre_dump) return 0; has_parent = false; } - if (pmc_get_map(pmc, vma)) + map = pmc_get_map(pmc, vma); + if (!map) return -1; if (vma_area_is(vma, VMA_ANON_SHARED)) - return add_shmem_area(item->pid->real, vma->e, pmc); - vaddr = vma->e->start; + return add_shmem_area(item->pid->real, vma->e, map); + again: - ret = generate_iovs(item, vma, pp, pmc, &vaddr, has_parent); + ret = generate_iovs(item, vma, pp, map, &off, has_parent); if (ret == -EAGAIN) { BUG_ON(!(pp->flags & PP_CHUNK_MODE)); @@ -599,9 +509,6 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, struct parasit parent_predump_mode = mdc->parent_ie->pre_dump_mode; list_for_each_entry(vma_area, &vma_area_list->h, list) { - if (vma_area_is(vma_area, VMA_AREA_GUARD)) - continue; - ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, &pmc, has_parent, mdc->pre_dump, parent_predump_mode); if (ret < 0) @@ -826,9 +733,6 @@ static inline bool check_cow_vmas(struct vma_area *vma, struct vma_area *pvma) return false; if (!vma_area_is_private(pvma, kdat.task_size)) return false; - /* ... but not hugetlb mappings */ - if (vma->e->flags & MAP_HUGETLB || pvma->e->flags & MAP_HUGETLB) - return false; /* ... have growsdown and anon flags coincide */ if ((vma->e->flags ^ pvma->e->flags) & (MAP_GROWSDOWN | MAP_ANONYMOUS)) return false; @@ -862,14 +766,14 @@ static void prepare_cow_vmas_for(struct vm_area_list *vmas, struct vm_area_list /* <= here to shift from matching VMAs and ... */ while (vma->e->start <= pvma->e->start) { vma = vma_next(vma); - if ((&vma->list == &vmas->h) || vma_area_is(vma, VMA_AREA_GUARD)) + if (&vma->list == &vmas->h) return; } /* ... 
no == here since we must stop on matching pair */ while (pvma->e->start < vma->e->start) { pvma = vma_next(pvma); - if ((&pvma->list == &pvmas->h) || vma_area_is(pvma, VMA_AREA_GUARD)) + if (&pvma->list == &pvmas->h) return; } } @@ -928,7 +832,6 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void vma->e->start -= PAGE_SIZE; size = vma_entry_len(vma->e); - if (!vma_inherited(vma)) { int flag = 0; /* @@ -1004,15 +907,6 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void static inline bool vma_force_premap(struct vma_area *vma, struct list_head *head) { - /* - * Shadow stack VMAs cannot be mmap()ed, they must be created using - * map_shadow_stack() system call. - * Premap them to reserve virtual address space and populate them - * to have there contents available for later copying. - */ - if (vma_area_is(vma, VMA_AREA_SHSTK)) - return true; - /* * On kernels with 4K guard pages, growsdown VMAs * always have one guard page at the @@ -1063,9 +957,6 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, vo filemap_ctx_init(true); list_for_each_entry(vma, &vmas->h, list) { - if (vma_area_is(vma, VMA_AREA_GUARD)) - continue; - if (task_size_check(vpid(t), vma->e)) { ret = -1; break; @@ -1080,13 +971,6 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, vo if (!vma_area_is_private(vma, kdat.task_size)) continue; - if (vma->e->flags & MAP_HUGETLB) - continue; - - /* VMA offset may change due to plugin so we cannot premap */ - if (vma->e->status & VMA_EXT_PLUGIN) - continue; - if (vma->pvma == NULL && pr->pieok && !vma_force_premap(vma, &vmas->h)) { /* * VMA in question is not shared with anyone. 
We'll @@ -1097,7 +981,7 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas, vo do { if (pr->pe->vaddr + pr->pe->nr_pages * PAGE_SIZE <= vma->e->start) continue; - if (pr->pe->vaddr >= vma->e->end) + if (pr->pe->vaddr > vma->e->end) vma->e->status |= VMA_NO_PROT_WRITE; break; } while (pr->advance(pr)); @@ -1127,7 +1011,6 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) unsigned int nr_shared = 0; unsigned int nr_dropped = 0; unsigned int nr_compared = 0; - unsigned int nr_enqueued = 0; unsigned int nr_lazy = 0; unsigned long va; @@ -1203,8 +1086,7 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr) len >>= PAGE_SHIFT; nr_restored += len; i += len - 1; - - nr_enqueued++; + pr_debug("Enqueue page-read\n"); continue; } @@ -1273,9 +1155,6 @@ err_read: unsigned long size, i = 0; void *addr = decode_pointer(vma->premmaped_addr); - if (vma_area_is(vma, VMA_AREA_GUARD)) - continue; - if (!vma_inherited(vma)) continue; @@ -1303,8 +1182,7 @@ err_read: pr_info("nr_restored_pages: %d\n", nr_restored); pr_info("nr_shared_pages: %d\n", nr_shared); - pr_info("nr_dropped_pages: %d\n", nr_dropped); - pr_info("nr_enqueued: %d\n", nr_enqueued); + pr_info("nr_dropped_pages: %d\n", nr_dropped); pr_info("nr_lazy: %d\n", nr_lazy); return 0; @@ -1316,6 +1194,8 @@ err_addr: static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) { + MmEntry *mm = rsti(t)->mm; + /* * There is no need to disable it if the page read doesn't * have parent. 
In this case VMA will be empty until @@ -1338,6 +1218,8 @@ static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr) pr_perror("Cannot disable THP"); return -1; } + if (!(mm->has_thp_disabled && mm->thp_disabled)) + rsti(t)->has_thp_enabled = true; return 0; } @@ -1539,72 +1421,3 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta) return prepare_vma_ios(t, ta); } - -int collect_madv_guards(pid_t pid, struct vm_area_list *vma_area_list) -{ - int pagemap_fd = -1; - struct page_region *regs = NULL; - long regs_len = 0; - int i, ret = -1; - - struct pm_scan_arg args = { - .size = sizeof(struct pm_scan_arg), - .flags = 0, - .start = 0, - .end = kdat.task_size, - .walk_end = 0, - .vec_len = 1000, /* this should be enough for most cases */ - .max_pages = 0, - .category_mask = PAGE_IS_GUARD, - .return_mask = PAGE_IS_GUARD, - }; - - if (!kdat.has_pagemap_scan_guard_pages) { - ret = 0; - goto out; - } - - pagemap_fd = open_proc(pid, "pagemap"); - if (pagemap_fd < 0) - goto out; - - regs = xmalloc(args.vec_len * sizeof(struct page_region)); - if (!regs) - goto out; - args.vec = (long)regs; - - do { - /* start from where we finished the last time */ - args.start = args.walk_end; - regs_len = ioctl(pagemap_fd, PAGEMAP_SCAN, &args); - if (regs_len == -1) { - pr_perror("PAGEMAP_SCAN"); - goto out; - } - - for (i = 0; i < regs_len; i++) { - struct vma_area *vma; - - BUG_ON(!(regs[i].categories & PAGE_IS_GUARD)); - - vma = alloc_vma_area(); - if (!vma) - goto out; - - vma->e->start = regs[i].start; - vma->e->end = regs[i].end; - vma->e->status = VMA_AREA_GUARD; - - list_add_tail(&vma->list, &vma_area_list->h); - vma_area_list->nr++; - } - } while (args.walk_end != kdat.task_size); - - ret = 0; - -out: - xfree(regs); - if (pagemap_fd >= 0) - close(pagemap_fd); - return ret; -} diff --git a/criu/memfd.c b/criu/memfd.c index 9d9f0621f..cb3704499 100644 --- a/criu/memfd.c +++ b/criu/memfd.c @@ -18,7 +18,6 @@ #include "file-ids.h" #include 
"namespaces.h" #include "shmem.h" -#include "hugetlb.h" #include "protobuf.h" #include "images/memfd.pb-c.h" @@ -46,7 +45,6 @@ struct memfd_restore_inode { int fdstore_id; unsigned int pending_seals; MemfdInodeEntry *mie; - bool was_opened_rw; }; static LIST_HEAD(memfd_inodes); @@ -59,17 +57,22 @@ static u32 memfd_inode_ids = 1; int is_memfd(dev_t dev) { + /* + * TODO When MAP_HUGETLB is used, the file device is not shmem_dev, + * Note that other parts of CRIU have similar issues, see + * is_anon_shmem_map(). + */ return dev == kdat.shmem_dev; } static int dump_memfd_inode(int fd, struct memfd_dump_inode *inode, const char *name, const struct stat *st) { MemfdInodeEntry mie = MEMFD_INODE_ENTRY__INIT; - int ret = -1, flag; + int ret = -1; u32 shmid; /* - * shmids are chosen as the inode number of the corresponding mmapped + * shmids are chosen as the inode number of the corresponding mmaped * file. See handle_vma() in proc_parse.c. * It works for memfd too, because we share the same device as the * shmem device. @@ -88,25 +91,10 @@ static int dump_memfd_inode(int fd, struct memfd_dump_inode *inode, const char * mie.name = (char *)name; mie.size = st->st_size; mie.shmid = shmid; - if (is_hugetlb_dev(inode->dev, &flag)) { - mie.has_hugetlb_flag = true; - mie.hugetlb_flag = flag | MFD_HUGETLB; - } - mie.mode = st->st_mode; - mie.has_mode = true; mie.seals = fcntl(fd, F_GET_SEALS); - if (mie.seals == -1) { - if (errno != EINVAL || ~mie.hugetlb_flag & MFD_HUGETLB) { - pr_perror("fcntl(F_GET_SEALS)"); - goto out; - } - /* Kernels before 4.16 don't allow MFD_HUGETLB | - * MFD_ALLOW_SEALING and return EINVAL for - * fcntl(MFD_HUGETLB-enabled fd). 
- */ - mie.seals = F_SEAL_SEAL; - } + if (mie.seals == -1) + goto out; if (pb_write_one(img_from_set(glob_imgset, CR_FD_MEMFD_INODE), &mie, PB_MEMFD_INODE)) goto out; @@ -234,7 +222,6 @@ static int collect_one_memfd_inode(void *o, ProtobufCMessage *base, struct cr_im mutex_init(&inode->lock); inode->fdstore_id = -1; inode->pending_seals = 0; - inode->was_opened_rw = false; list_add_tail(&inode->list, &memfd_inodes); @@ -271,9 +258,6 @@ static int memfd_open_inode_nocache(struct memfd_restore_inode *inode) flags = MFD_ALLOW_SEALING; } - if (mie->has_hugetlb_flag) - flags |= mie->hugetlb_flag; - fd = memfd_create(mie->name, flags); if (fd < 0) { pr_perror("Can't create memfd:%s", mie->name); @@ -283,13 +267,8 @@ static int memfd_open_inode_nocache(struct memfd_restore_inode *inode) if (restore_memfd_shmem_content(fd, mie->shmid, mie->size)) goto out; - if (mie->has_mode) - ret = cr_fchperm(fd, mie->uid, mie->gid, mie->mode); - else - ret = cr_fchown(fd, mie->uid, mie->gid); - if (ret) { - pr_perror("Can't set permissions { uid %d gid %d mode %#o } of memfd:%s", (int)mie->uid, - (int)mie->gid, mie->has_mode ? (int)mie->mode : -1, mie->name); + if (fchown(fd, mie->uid, mie->gid)) { + pr_perror("Can't change uid %d gid %d of memfd:%s", (int)mie->uid, (int)mie->gid, mie->name); goto out; } @@ -323,7 +302,7 @@ static int memfd_open_inode(struct memfd_restore_inode *inode) return fd; } -int memfd_open(struct file_desc *d, u32 *fdflags, bool filemap) +int memfd_open(struct file_desc *d, u32 *fdflags) { struct memfd_info *mfi; MemfdFileEntry *mfe; @@ -333,80 +312,57 @@ int memfd_open(struct file_desc *d, u32 *fdflags, bool filemap) mfi = container_of(d, struct memfd_info, d); mfe = mfi->mfe; + if (inherited_fd(d, &fd)) + return fd; + pr_info("Restoring memfd id=%d\n", mfe->id); fd = memfd_open_inode(mfi->inode); if (fd < 0) - return -1; + goto err; /* Reopen the fd with original permissions */ flags = fdflags ? 
*fdflags : mfe->flags; - - if (filemap && (flags & O_ACCMODE) == O_RDWR) - return fd; - - if (!mfi->inode->was_opened_rw && (flags & O_ACCMODE) == O_RDWR) { - /* - * If there is only a single RW-opened fd for a memfd, it can - * be used to pass it to execveat() with AT_EMPTY_PATH to have - * its contents executed. This currently works only for the - * original fd from memfd_create() so return the original fd - * once -- in case the caller expects to be the sole opener - * and does execveat() from this memfd. - */ - if (!fcntl(fd, F_SETFL, flags)) { - mfi->inode->was_opened_rw = true; - return fd; - } - - pr_pwarn("Can't change fd flags to %#o for memfd id=%d", flags, mfe->id); - } - /* * Ideally we should call compat version open() to not force the * O_LARGEFILE file flag with regular open(). It doesn't seem that * important though. */ _fd = __open_proc(PROC_SELF, 0, flags, "fd/%d", fd); - if (_fd < 0) + if (_fd < 0) { pr_perror("Can't reopen memfd id=%d", mfe->id); - else if (!filemap && (flags & O_ACCMODE) == O_RDWR) - pr_warn("execveat(fd=%d, ..., AT_EMPTY_PATH) might fail after restore; memfd id=%d\n", _fd, mfe->id); - + goto err; + } close(fd); - return _fd; -} - -static int memfd_open_fe_fd(struct file_desc *d, int *new_fd) -{ - MemfdFileEntry *mfe; - int fd; - - if (inherited_fd(d, new_fd)) - return 0; - - fd = memfd_open(d, NULL, false); - if (fd < 0) - return -1; - - mfe = container_of(d, struct memfd_info, d)->mfe; + fd = _fd; if (restore_fown(fd, mfe->fown) < 0) goto err; if (lseek(fd, mfe->pos, SEEK_SET) < 0) { - pr_perror("Can't restore file position of %d for memfd id=%d", fd, mfe->id); + pr_perror("Can't restore file position of memfd id=%d", mfe->id); goto err; } - *new_fd = fd; - return 0; + return fd; err: - close(fd); + if (fd >= 0) + close(fd); return -1; } +static int memfd_open_fe_fd(struct file_desc *fd, int *new_fd) +{ + int tmp; + + tmp = memfd_open(fd, NULL); + if (tmp < 0) + return -1; + *new_fd = tmp; + return 0; +} + static char 
*memfd_d_name(struct file_desc *d, char *buf, size_t s) { MemfdInodeEntry *mie = NULL; diff --git a/criu/mount-v2.c b/criu/mount-v2.c deleted file mode 100644 index 1e33ac12a..000000000 --- a/criu/mount-v2.c +++ /dev/null @@ -1,1321 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include "kerndat.h" -#include "log.h" -#include "cr_options.h" -#include "xmalloc.h" -#include "util.h" -#include "filesystems.h" -#include "mount.h" -#include "mount-v2.h" -#include "namespaces.h" -#include "fs-magic.h" -#include "path.h" -#include "files-reg.h" -#include "fdstore.h" -#include "common/list.h" -#include "common/bug.h" -#include "common/compiler.h" - -#include "images/mnt.pb-c.h" - -#undef LOG_PREFIX -#define LOG_PREFIX "mnt-v2: " - -LIST_HEAD(sharing_groups); - -int check_mount_v2(void) -{ - if (!kdat.has_move_mount_set_group) { - pr_debug("Mounts-v2 requires MOVE_MOUNT_SET_GROUP support\n"); - return -1; - } - - if (!kdat.has_openat2) { - pr_debug("Mounts-v2 requires openat2 support\n"); - return -1; - } - - return 0; -} - -static struct sharing_group *get_sharing_group(int shared_id, int master_id) -{ - struct sharing_group *sg; - - list_for_each_entry(sg, &sharing_groups, list) { - if (sg->shared_id == shared_id && sg->master_id == master_id) - return sg; - } - - return NULL; -} - -static struct sharing_group *alloc_sharing_group(int shared_id, int master_id) -{ - struct sharing_group *sg; - - sg = xzalloc(sizeof(struct sharing_group)); - if (!sg) - return NULL; - - sg->shared_id = shared_id; - sg->master_id = master_id; - - INIT_LIST_HEAD(&sg->list); - INIT_LIST_HEAD(&sg->mnt_list); - INIT_LIST_HEAD(&sg->children); - INIT_LIST_HEAD(&sg->siblings); - - list_add(&sg->list, &sharing_groups); - - return sg; -} - -int resolve_shared_mounts_v2(void) -{ - struct sharing_group *sg; - struct mount_info *mi; - - /* - * Create sharing groups for each unique shared_id+master_id pair and - * link each mount to the corresponding sharing group. 
- */ - for (mi = mntinfo; mi; mi = mi->next) { - if (!mi->shared_id && !mi->master_id) - continue; - - pr_debug("Inspecting sharing on %2d shared_id %d master_id %d (@%s)\n", mi->mnt_id, mi->shared_id, - mi->master_id, mi->ns_mountpoint); - - sg = get_sharing_group(mi->shared_id, mi->master_id); - if (!sg) { - sg = alloc_sharing_group(mi->shared_id, mi->master_id); - if (!sg) - return -1; - } - - list_add(&mi->mnt_sharing, &sg->mnt_list); - mi->sg = sg; - } - - /* - * Collect sharing groups tree. Mount propagation between sharing - * groups only goes down this tree, meaning that only mounts of same or - * descendant sharing groups receive mount propagation. - */ - list_for_each_entry(sg, &sharing_groups, list) { - if (sg->master_id) { - struct sharing_group *p; - - /* - * Lookup parent sharing group. If one sharing group - * has master_id equal to shared_id of another sharing - * group than the former is a child (slave) of the - * latter. Also sharing groups should not have two - * parents so we check this here too. - */ - list_for_each_entry(p, &sharing_groups, list) { - if (p->shared_id != sg->master_id) - continue; - - if (sg->parent) { - pr_err("Sharing group (%d, %d) parent collision (%d, %d) (%d, %d)\n", - sg->shared_id, sg->master_id, p->shared_id, p->master_id, - sg->parent->shared_id, sg->parent->master_id); - return -1; - } - sg->parent = p; - - if (!list_empty(&sg->siblings)) { - pr_err("External slavery sharing group (%d, %d) has parent (%d, %d)\n", - sg->shared_id, sg->master_id, p->shared_id, p->master_id); - return -1; - } - list_add(&sg->siblings, &p->children); - /* Don't break to check for parent collision */ - } - - /* - * If sharing group has master_id but we did't find - * parent for it inside the dumped container yet, this - * means that the master_id is external and a mount on - * host should exist with corresponding shared_id. 
- */ - if (!sg->parent && list_empty(&sg->siblings)) { - struct mount_info *ext; - struct sharing_group *s; - char *source = NULL; - - /* - * Though we don't have parent sharing group - * (inaccessible sharing), we can still have - * siblings, sharing groups with same master_id - * but different shared_id, let's collect them - * to the list. - */ - list_for_each_entry(s, &sharing_groups, list) { - if (s->master_id != sg->master_id) - continue; - - if (s->parent) { - pr_err("External slavery sharing group (%d, %d) has parent (%d, %d)\n", - sg->shared_id, sg->master_id, s->parent->shared_id, - s->parent->master_id); - return -1; - } - - if (!list_empty(&s->siblings)) { - pr_err("External slavery sharing group collision (%d, %d) (%d, %d)\n", - sg->shared_id, sg->master_id, s->shared_id, s->master_id); - return -1; - } - list_add(&s->siblings, &sg->siblings); - } - - BUG_ON(list_empty(&sg->mnt_list)); - mi = list_entry(sg->mnt_list.next, struct mount_info, mnt_sharing); - - /* - * We need to know from which mount on host we - * can get this external master_id. There are - * two options: mountpoint external mount or - * root mount of container. - */ - if ((ext = mnt_get_external_bind_nodev(mi))) - source = ext->external; - else if (mnt_is_root_bind(mi)) - source = opts.root; - - if (!source) { - pr_err("Sharing group (%d, %d) " - "has unreachable sharing. Try --enable-external-masters.\n", - sg->shared_id, sg->master_id); - return -1; - } - - sg->source = source; - list_for_each_entry(s, &sg->siblings, siblings) - s->source = sg->source; - - pr_debug("Detected external slavery for shared group (%d, %d) with source %s\n", - sg->shared_id, sg->master_id, source); - } - } - } - - return 0; -} - -/* - * When first mount from superblock is mounted, give other mounts - * a hint that they can now just bindmount from the first one. 
- */ -static int propagate_mount_v2(struct mount_info *mi) -{ - struct mount_info *t; - - list_for_each_entry(t, &mi->mnt_bind, mnt_bind) { - if (t->mounted) - continue; - if (t->bind) - continue; - if (!issubpath(t->root, mi->root)) - continue; - pr_debug("\t\tPropagate %d to %d\n", mi->mnt_id, t->mnt_id); - t->bind = mi; - t->s_dev_rt = mi->s_dev_rt; - } - - return 0; -} - -/* - * Mounts first mount of superblock - */ -static int do_new_mount_v2(struct mount_info *mi) -{ - unsigned long sflags = mi->sb_flags; - unsigned long mflags = mi->flags & (~MS_PROPAGATE); - char *src; - struct fstype *tp = mi->fstype; - bool remount_ro = (tp->restore && mi->sb_flags & MS_RDONLY); - mount_fn_t do_mount = (tp->mount) ? tp->mount : do_simple_mount; - - src = resolve_source(mi); - if (!src) - return -1; - - /* Merge superblock and mount flags if it's possible */ - if (!(mflags & ~MS_MNT_KNOWN_FLAGS) && !((sflags ^ mflags) & MS_RDONLY)) { - sflags |= mflags; - mflags = 0; - } - - if (remount_ro) - sflags &= ~MS_RDONLY; - - if (do_mount(mi, src, mnt_fsname(mi), sflags) < 0) { - pr_perror("Can't mount at %s", mi->plain_mountpoint); - return -1; - } - - /* - * Mount-v2 relies that before mount tree is constructed all mounts - * should remain private. Newly created mounts can become non-private - * initially depending on parent/source sharing, let's be as explicit - * as possible here and make it obvious that mount becomes private. 
- */ - if (mount(NULL, mi->plain_mountpoint, NULL, MS_PRIVATE, NULL)) { - pr_perror("Can't remount %s with MS_PRIVATE", mi->plain_mountpoint); - return -1; - } - - if (tp->restore && tp->restore(mi)) - return -1; - - if (remount_ro) { - int fd; - - fd = open(mi->plain_mountpoint, O_PATH); - if (fd < 0) { - pr_perror("Unable to open %s", mi->plain_mountpoint); - return -1; - } - sflags |= MS_RDONLY | MS_REMOUNT; - if (userns_call(apply_sb_flags, 0, &sflags, sizeof(sflags), fd)) { - pr_perror("Unable to apply mount flags %d for %s", mi->sb_flags, mi->plain_mountpoint); - close(fd); - return -1; - } - close(fd); - } - - if (mflags && mount(NULL, mi->plain_mountpoint, NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { - pr_perror("Unable to apply bind-mount options"); - return -1; - } - - mi->mounted = true; - return 0; -} - -/* - * Does simple bindmount, but via new kernel mount api, - * which also handles autofs and symlink without resolving. - */ -static int __do_bind_mount_v2(char *from, char *to) -{ - int detached_fd; - - detached_fd = sys_open_tree(AT_FDCWD, from, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE); - if (detached_fd == -1) { - pr_perror("Failed to open_tree %s", from); - return -1; - } - - if (sys_move_mount(detached_fd, "", AT_FDCWD, to, MOVE_MOUNT_F_EMPTY_PATH)) { - pr_perror("Failed to move_mount from %s to %s", from, to); - close(detached_fd); - return -1; - } - close(detached_fd); - - return 0; -} - -LIST_HEAD(deleted_mounts); - -/* - * Bind-mounts all later mounts of superblock from first one, - * also handles first mounts of mountpoint external mounts. 
- */ -static int do_bind_mount_v2(struct mount_info *mi) -{ - char *root = NULL, *cut_root, rpath[PATH_MAX]; - unsigned long mflags; - int exit_code = -1; - char *mnt_path = NULL; - int level = 0; - - if (mi->need_plugin) { - if (restore_ext_mount(mi)) - return -1; - goto out; - } - - if (mnt_is_nodev_external(mi)) { - root = mi->external; - goto do_bind; - } - - cut_root = get_relative_path(mi->root, mi->bind->root); - if (!cut_root) { - pr_err("Failed to find root for %d in our supposed bind %d\n", mi->mnt_id, mi->bind->mnt_id); - return -1; - } - - /* - * Mount ->private can be initialized on fstype->mount() callback, - * which is called for first mount of superblock in do_new_mount(). - * Also ->private have to be copied to all other mounts of superblock - * to provide users of it with actual data. - */ - mi->private = mi->bind->private; - - mnt_path = mi->bind->plain_mountpoint; - - if (cut_root[0]) { - snprintf(rpath, sizeof(rpath), "%s/%s", mnt_path, cut_root); - root = rpath; - } else { - root = mnt_path; - } -do_bind: - pr_info("\tBind %s to %s\n", root, mi->plain_mountpoint); - - if (unlikely(mi->deleted)) { - level = make_parent_dirs_if_need(-1, root); - if (level < 0) - goto err; - - if (mi->is_dir) { - if (mkdir(root, 0600)) { - pr_perror("Can't re-create deleted directory %s", root); - goto err; - } - } else { - int fd = open(root, O_WRONLY | O_CREAT | O_EXCL, 0600); - if (fd < 0) { - pr_perror("Can't re-create deleted file %s", root); - goto err; - } - close(fd); - } - } - - if (__do_bind_mount_v2(root, mi->plain_mountpoint)) - goto err; - - /* - * Mount-v2 relies that before mount tree is constructed all mounts - * should remain private. Newly created mounts can become non-private - * initially depending on parent/source sharing, let's be as explicit - * as possible here and make it obvious that mount becomes private. 
- */ - if (mount(NULL, mi->plain_mountpoint, NULL, MS_PRIVATE, NULL)) { - pr_perror("Can't remount %s with MS_PRIVATE", mi->plain_mountpoint); - goto err; - } - - mflags = mi->flags & (~MS_PROPAGATE); - if (!mi->bind || mflags != (mi->bind->flags & (~MS_PROPAGATE))) - if (mount(NULL, mi->plain_mountpoint, NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) { - pr_perror("Can't bind remount 0x%lx at %s", mflags, mi->plain_mountpoint); - goto err; - } - - if (mi->deleted) { - /* - * Deleted mounts can't be moved, will delete source after - * moving to proper position in the mount tree FIXME. - */ - mi->deleted_level = level; - level = 0; - list_add(&mi->deleted_list, &deleted_mounts); - } -out: - mi->mounted = true; - exit_code = 0; -err: - if (level) - rm_parent_dirs(-1, root, level); - - return exit_code; -} - -/* Mounts root container mount. */ -static int do_mount_root_v2(struct mount_info *mi) -{ - unsigned long mflags = mi->flags & (~MS_PROPAGATE); - unsigned long flags = MS_BIND; - int fd; - - if (root_ns_mask & CLONE_NEWUSER) { - fd = open(mi->plain_mountpoint, O_PATH); - if (fd < 0) { - pr_perror("Unable to open %s", mi->plain_mountpoint); - return -1; - } - - if (userns_call(mount_root, 0, &flags, sizeof(flags), fd)) { - pr_err("Unable to mount %s\n", mi->plain_mountpoint); - close(fd); - return -1; - } - close(fd); - } else { - if (mount(opts.root, mi->plain_mountpoint, NULL, flags, NULL)) { - pr_perror("Unable to mount %s %s (id=%d)", opts.root, mi->plain_mountpoint, mi->mnt_id); - return -1; - } - } - - /* - * Mount-v2 relies that before mount tree is constructed all mounts - * should remain private. Newly created mounts can become non-private - * initially depending on parent/source sharing, let's be as explicit - * as possible here and make it obvious that mount becomes private. 
- */ - if (mount(NULL, mi->plain_mountpoint, NULL, MS_PRIVATE, NULL)) { - pr_perror("Can't remount %s with MS_PRIVATE", mi->plain_mountpoint); - return -1; - } - - if (mflags && mount(NULL, mi->plain_mountpoint, NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { - pr_perror("Unable to apply root mount options"); - return -1; - } - - mi->mounted = true; - - return 0; -} - -/* Check if mount is ready to be mounted. */ -static bool can_mount_now_v2(struct mount_info *mi) -{ - struct mount_info *root, *ext; - - /* Parent should be mounted already, that's how mnt_tree_for_each works */ - BUG_ON(mi->parent && !mi->parent->mounted); - - /* Root mounts can be mounted at any moment */ - if (rst_mnt_is_root(mi)) { - pr_debug("%s: true as %d is global root\n", __func__, mi->mnt_id); - return true; - } - - /* External mounts can be mounted at any moment */ - if (mi->external) { - pr_debug("%s: true as %d is external\n", __func__, mi->mnt_id); - return true; - } - - /* - * Container root and external mounts should go before - * anything which should be bindmounted from them. 
- */ - if (!mi->bind) { - root = mnt_get_root_bind(mi); - if (root) { - pr_debug("%s: false as %d is bind of not mounted global root %d\n", __func__, mi->mnt_id, - root->mnt_id); - return false; - } - - ext = mnt_get_external_bind(mi); - if (ext) { - pr_debug("%s: false as %d is a bind of not mounted external %d\n", __func__, mi->mnt_id, - ext->mnt_id); - return false; - } - } - - /* Non fsroot mounts can not be mounted without bind-mount */ - if (!fsroot_mounted(mi) && !mi->bind && !mi->need_plugin) { - pr_debug("%s: false as %d is non-root without bind or plugin\n", __func__, mi->mnt_id); - return false; - } - - return true; -} - -static int __set_unbindable_v2(struct mount_info *mi) -{ - if (mi->flags & MS_UNBINDABLE) { - if (mount(NULL, service_mountpoint(mi), NULL, MS_UNBINDABLE, NULL)) { - pr_perror("Failed to set mount %d unbindable", mi->mnt_id); - return -1; - } - } - return 0; -} - -/* - * Setting MS_UNBINDABLE flag is slightly delayed, - * obviousely until we finish bind-mounting everything. - */ -static int set_unbindable_v2(void) -{ - int orig_nsfd = -1, nsfd = -1, exit_code = -1; - struct mount_info *mi; - struct ns_id *nsid; - - for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { - bool ns_has_unbindable = false; - - if (nsid->nd != &mnt_ns_desc) - continue; - - for (mi = mntinfo; mi != NULL; mi = mi->next) - if (mi->nsid == nsid && mi->flags & MS_UNBINDABLE) - ns_has_unbindable = true; - - if (!ns_has_unbindable) - continue; - - nsfd = fdstore_get(nsid->mnt.nsfd_id); - if (nsfd < 0) - goto err; - - if (switch_ns_by_fd(nsfd, &mnt_ns_desc, orig_nsfd == -1 ? 
&orig_nsfd : NULL)) - goto err; - close_safe(&nsfd); - - if (mnt_tree_for_each(nsid->mnt.mntinfo_tree, __set_unbindable_v2)) - goto err; - } - - exit_code = 0; -err: - if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) - exit_code = -1; - close_safe(&nsfd); - return exit_code; -} - -/* - * Detects if mount is a directory mount or file mount based on stat on - * its mountpoint inside already mounted parent mount. This is deeply - * integrated in plain mount creation process because before mounting - * something plain we need to create right type of mountpoint for it. - */ -static int detect_is_dir(struct mount_info *mi) -{ - static char mountpoint[PATH_MAX]; - char *rel_path; - struct stat st; - - if (mi->is_dir != -1) - return 0; - - if (mi->mnt_id == HELPER_MNT_ID) { - pr_err("Helper %s should have is_dir pre-set\n", mi->ns_mountpoint); - return -1; - } - - if (!mi->parent || mi->parent == root_yard_mp) { - pr_err("Mount namespace root mount %d should have is_dir pre-set\n", mi->mnt_id); - return -1; - } - - if (!mi->parent->mounted) { - pr_err("Parent mount %d of %d should be mounted\n", mi->parent->mnt_id, mi->mnt_id); - return -1; - } - - rel_path = get_relative_path(mi->ns_mountpoint, mi->parent->ns_mountpoint); - if (!rel_path) { - pr_err("Child-parent mountpoint mismatch %d:%s %d:%s\n", mi->mnt_id, mi->ns_mountpoint, - mi->parent->mnt_id, mi->parent->ns_mountpoint); - return -1; - } - - snprintf(mountpoint, sizeof(mountpoint), "%s%s%s", mi->parent->plain_mountpoint, rel_path[0] ? "/" : "", - rel_path); - if (stat(mountpoint, &st)) { - pr_perror("Can't stat mountpoint %s", mountpoint); - return -1; - } - - if (S_ISDIR(st.st_mode)) - mi->is_dir = true; - else - mi->is_dir = false; - - pr_debug("Mount %d is detected as %s-mount\n", mi->mnt_id, mi->is_dir ? 
"dir" : "file"); - return 0; -} - -static int create_plain_mountpoint(struct mount_info *mi) -{ - BUG_ON(mi->is_dir == -1); - - pr_debug("Create plain mountpoint %s for %d\n", mi->plain_mountpoint, mi->mnt_id); - if (mi->is_dir) { - if (mkdir(mi->plain_mountpoint, 0600)) { - pr_perror("Unable to mkdir mountpoint %s", mi->plain_mountpoint); - return -1; - } - } else { - int fd; - - fd = creat(mi->plain_mountpoint, 0600); - if (fd < 0) { - pr_perror("Unable to create mountpoint %s", mi->plain_mountpoint); - return -1; - } - close(fd); - } - - return 0; -} - -/* - * At this point we already have a mount in service mount namespace now we - * bind-mount it to the final restored mount namespace via new kernel mount - * API. - */ -static int do_mount_in_right_mntns(struct mount_info *mi) -{ - int nsfd = -1, orig_nsfd = -1, detached_fd = -1, exit_code = -1; - - if (!mi->nsid) - return 0; - - detached_fd = - sys_open_tree(AT_FDCWD, mi->plain_mountpoint, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE); - if (detached_fd == -1) { - pr_perror("Failed to open_tree %s", mi->plain_mountpoint); - goto err; - } - - nsfd = fdstore_get(mi->nsid->mnt.nsfd_id); - if (nsfd < 0) - goto err; - - if (switch_ns_by_fd(nsfd, &mnt_ns_desc, &orig_nsfd)) - goto err; - - if (create_plain_mountpoint(mi)) - goto err; - - if (sys_move_mount(detached_fd, "", AT_FDCWD, mi->plain_mountpoint, MOVE_MOUNT_F_EMPTY_PATH)) { - pr_perror("Failed to cross-mntns move_mount plain mount %d", mi->mnt_id); - goto err; - } - - exit_code = 0; -err: - if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) - exit_code = -1; - close_safe(&nsfd); - close_safe(&detached_fd); - return exit_code; -} - -static int do_mount_one_v2(struct mount_info *mi) -{ - int ret; - - if (mi->mounted) - return 0; - - if (!can_mount_now_v2(mi)) { - pr_debug("Postpone mount %d\n", mi->mnt_id); - return 1; - } - - if (detect_is_dir(mi)) - return -1; - - if (create_plain_mountpoint(mi)) - return -1; - - pr_debug("\tMounting %s @%d 
(%d)\n", mi->fstype->name, mi->mnt_id, mi->need_plugin); - - if (rst_mnt_is_root(mi)) { - if (opts.root == NULL) { - pr_err("The --root option is required to restore a mount namespace\n"); - return -1; - } - ret = do_mount_root_v2(mi); - } else if (!mi->bind && !mi->need_plugin && (!mi->external || !strcmp(mi->external, EXTERNAL_DEV_MOUNT))) { - ret = do_new_mount_v2(mi); - } else { - ret = do_bind_mount_v2(mi); - } - - if (ret == 0 && fetch_rt_stat(mi, mi->plain_mountpoint)) - return -1; - - if (ret == 0 && propagate_mount_v2(mi)) - return -1; - - if (mi->fstype->code == FSTYPE__UNSUPPORTED) { - struct statfs st; - - if (statfs(mi->plain_mountpoint, &st)) { - pr_perror("Unable to statfs %s", mi->plain_mountpoint); - return -1; - } - if (st.f_type == BTRFS_SUPER_MAGIC) - mi->fstype = find_fstype_by_name("btrfs"); - } - - if (ret == 0 && do_mount_in_right_mntns(mi)) - return -1; - - return ret; -} - -static int populate_mnt_ns_v2(void) -{ - if (make_yard(mnt_roots)) - return -1; - - if (mnt_tree_for_each(root_yard_mp, do_mount_one_v2)) - return -1; - - return set_unbindable_v2(); -} - -/* - * This function moves plain mounts into actual mount tree. - * - * Mounts in children list are sorted the way that sibling overmount goes after - * all siblings which it overmounts (see __mnt_resort_children). The function - * mnt_tree_for_each is effectively DFS (in case we don't postpone), thus all - * descendants of all mounts which we sibling-overmount are mounted before us. - * Be careful, we can't postpone (return >0) from this function because of it. 
- */ -static int move_mount_to_tree(struct mount_info *mi) -{ - int fd; - - fd = open(mi->mountpoint, O_PATH); - if (fd < 0) { - pr_perror("Failed to open real mountpoint of %d", mi->mnt_id); - return -1; - } - - mi->mp_fd_id = fdstore_add(fd); - close(fd); - if (mi->mp_fd_id < 0) { - pr_err("Can't add mountpoint of mount %d to fdstore\n", mi->mnt_id); - return -1; - } - - pr_info("Move mount %d from %s to %s\n", mi->mnt_id, mi->plain_mountpoint, mi->mountpoint); - if (sys_move_mount(AT_FDCWD, mi->plain_mountpoint, AT_FDCWD, mi->mountpoint, 0)) { - pr_perror("Failed to move mount %d from %s to %s", mi->mnt_id, mi->plain_mountpoint, mi->mountpoint); - return -1; - } - - fd = open(mi->mountpoint, O_PATH); - if (fd < 0) { - pr_perror("Failed to open real mountpoint of %d", mi->mnt_id); - return -1; - } - - mi->mnt_fd_id = fdstore_add(fd); - close(fd); - if (mi->mnt_fd_id < 0) { - pr_err("Can't add mount %d fd to fdstore\n", mi->mnt_id); - return -1; - } - - return 0; -} - -static int assemble_tree_from_plain_mounts(struct ns_id *nsid) -{ - return mnt_tree_for_each(nsid->mnt.mntinfo_tree, move_mount_to_tree); -} - -/* - * With MOVE_MOUNT_SET_GROUP source mount should have wider root than - * destination, thus let's choose widest mount from group as first. 
- */ -static struct mount_info *get_first_mount(struct sharing_group *sg) -{ - struct mount_info *first = NULL, *tmp; - int min_len = 0; - - list_for_each_entry(tmp, &sg->mnt_list, mnt_sharing) { - int len = strlen(tmp->root); - - if (!first || len < min_len) { - first = tmp; - min_len = len; - } - } - - return first; -} - -struct set_group_arg { - int src_id; - char source[PATH_MAX]; - int dst_id; -}; - -static int __move_mount_set_group(void *arg, int dfd, int pid) -{ - struct set_group_arg *sga = (struct set_group_arg *)arg; - int src_fd, dst_fd, exit_code = -1; - - if (sga->src_id != -1) { - src_fd = fdstore_get(sga->src_id); - BUG_ON(src_fd < 0); - } else { - char *source_mp; - - BUG_ON(sga->source[0] == '\0'); - /* - * Source path should not always be a mountpoint as we - * automatically resolve it to mountpoint below. - */ - source_mp = resolve_mountpoint(sga->source); - if (!source_mp) { - pr_err("Failed to find %s mountpoint\n", sga->source); - return -1; - } - - src_fd = open(source_mp, O_PATH); - if (src_fd < 0) { - pr_perror("Failed to open %s mountpoint", source_mp); - xfree(source_mp); - return -1; - } - xfree(source_mp); - } - - dst_fd = fdstore_get(sga->dst_id); - BUG_ON(dst_fd < 0); - - /* Copy shared_id of the source */ - if (sys_move_mount(src_fd, "", dst_fd, "", - MOVE_MOUNT_SET_GROUP | MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH)) { - pr_perror("Failed to copy sharing from %d:%s to %d", sga->src_id, sga->source ?: "", sga->dst_id); - goto err; - } - - exit_code = 0; -err: - close(src_fd); - close(dst_fd); - return exit_code; -} - -/* - * Copy sharing between mounts passing mountpoint fds via fdstore ids. Also it - * is possible (for external mounts) to pass path on mountpoint via source path, - * it would resolve to mountpoint automatically. 
- */ -static int move_mount_set_group(int src_id, char *source, int dst_id) -{ - struct set_group_arg sga = { - .src_id = src_id, - .dst_id = dst_id, - }; - - sga.source[0] = '\0'; - if (source) { - if (snprintf(sga.source, sizeof(sga.source), "%s", source) >= sizeof(sga.source)) { - pr_err("Source %s is too long\n", source); - return -1; - } - } - - if (userns_call(__move_mount_set_group, 0, &sga, sizeof(sga), -1)) - return -1; - - return 0; -} - -static int restore_one_sharing(struct sharing_group *sg, struct mount_info *target) -{ - int nsfd = -1, orig_nsfd = -1, exit_code = -1; - char target_path[PATH_MAX]; - int target_fd = -1; - - if (!sg->master_id && !sg->shared_id) - return 0; - - target_fd = fdstore_get(target->mnt_fd_id); - BUG_ON(target_fd < 0); - snprintf(target_path, sizeof(target_path), "/proc/self/fd/%d", target_fd); - - /* Restore target's master_id from shared_id of the source */ - if (sg->master_id) { - if (sg->parent) { - struct mount_info *first; - - /* Get shared_id from parent sharing group */ - first = get_first_mount(sg->parent); - if (move_mount_set_group(first->mnt_fd_id, NULL, target->mnt_fd_id)) { - pr_err("Failed to copy sharing from %d to %d\n", first->mnt_id, target->mnt_id); - goto err; - } - } else { - /* - * External slavery. We rely on the user to give us the - * right source for external mount with all proper - * sharing options setup (it should be either shared - * or non-shared slave). If source is a private mount - * we would fail. 
- */ - if (move_mount_set_group(-1, sg->source, target->mnt_fd_id)) { - pr_err("Failed to copy sharing from source %s to %d\n", sg->source, target->mnt_id); - goto err; - } - } - } - - nsfd = fdstore_get(target->nsid->mnt.nsfd_id); - if (nsfd < 0) - goto err; - - if (switch_ns_by_fd(nsfd, &mnt_ns_desc, &orig_nsfd)) - goto err; - - if (sg->master_id) { - /* Convert shared_id to master_id */ - if (mount(NULL, target_path, NULL, MS_SLAVE, NULL)) { - pr_perror("Failed to make mount %d slave", target->mnt_id); - goto err; - } - } - - /* Restore target's shared_id */ - if (sg->shared_id) { - if (mount(NULL, target_path, NULL, MS_SHARED, NULL)) { - pr_perror("Failed to make mount %d shared", target->mnt_id); - goto err; - } - } - exit_code = 0; -err: - close_safe(&target_fd); - close_safe(&nsfd); - if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) - exit_code = -1; - return exit_code; -} - -static int restore_one_sharing_group(struct sharing_group *sg) -{ - struct mount_info *first, *other; - - first = get_first_mount(sg); - - if (restore_one_sharing(sg, first)) - return -1; - - /* Restore sharing for other mounts from the sharing group */ - list_for_each_entry(other, &sg->mnt_list, mnt_sharing) { - if (other == first) - continue; - - if (is_sub_path(other->root, first->root)) { - if (move_mount_set_group(first->mnt_fd_id, NULL, other->mnt_fd_id)) { - pr_err("Failed to copy sharing from %d to %d\n", first->mnt_id, other->mnt_id); - return -1; - } - } else { - /* - * Case where mounts of this sharing group don't have common root. - * For instance we can create two sub-directories .a and .b in some - * shared mount, bindmount them separately somethere and umount the - * original mount. Now we have both bindmounts shared between each - * other. Kernel only allows to copy sharing between mounts when - * source root contains destination root, which is not true for - * these two, so we can't just copy from first to other. 
- * - * For external sharing (!sg->parent) with only master_id (shared_id - * == 0) we can workaround this by copying from their external source - * instead (same as we did for a first mount). - * - * This is a w/a runc usecase, see https://github.com/opencontainers/runc/pull/3442 - */ - if (!sg->parent && !sg->shared_id) { - if (restore_one_sharing(sg, other)) - return -1; - } else { - pr_err("Can't copy sharing from %d[%s] to %d[%s]\n", first->mnt_id, first->root, - other->mnt_id, other->root); - return -1; - } - } - } - - return 0; -} - -static struct sharing_group *sharing_group_next(struct sharing_group *sg) -{ - if (!list_empty(&sg->children)) - return list_entry(sg->children.next, struct sharing_group, siblings); - - while (sg->parent) { - if (sg->siblings.next == &sg->parent->children) - sg = sg->parent; - else - return list_entry(sg->siblings.next, struct sharing_group, siblings); - } - - return NULL; -} - -static int restore_mount_sharing_options(void) -{ - struct sharing_group *sg; - - list_for_each_entry(sg, &sharing_groups, list) { - struct sharing_group *t; - - if (sg->parent) - continue; - - /* Handle dependent sharing groups in tree order */ - for (t = sg; t != NULL; t = sharing_group_next(t)) { - if (restore_one_sharing_group(t)) - return -1; - } - } - - return 0; -} - -static int remove_source_of_deleted_mount(struct mount_info *mi) -{ - char *cut_root, path[PATH_MAX], *root; - - BUG_ON(!mi->deleted || !mi->bind); - - cut_root = get_relative_path(mi->root, mi->bind->root); - if (!cut_root) { - pr_err("Failed to find root for %d in our supposed bind %d\n", mi->mnt_id, mi->bind->mnt_id); - return -1; - } - - if (cut_root[0]) { - snprintf(path, sizeof(path), "%s/%s", mi->bind->plain_mountpoint, cut_root); - root = path; - } else { - root = mi->bind->plain_mountpoint; - } - - if (mi->is_dir) { - if (rmdir(root)) { - pr_perror("Can't remove deleted directory %s", root); - return -1; - } - } else { - if (unlink(root)) { - pr_perror("Can't unlink deleted 
file %s", root); - return -1; - } - } - - if (mi->deleted_level) - rm_parent_dirs(-1, root, mi->deleted_level); - - return 0; -} - -/* Delay making mounts deleted until we've restored sharing groups */ -static int remove_sources_of_deleted_mounts(void) -{ - struct mount_info *mi; - int ret = 0; - - list_for_each_entry(mi, &deleted_mounts, deleted_list) { - if (remove_source_of_deleted_mount(mi)) - ret = -1; - } - - return ret; -} - -static int get_empty_mntns(void) -{ - int orig_nsfd, nsfd = -1; - - orig_nsfd = open_proc(PROC_SELF, "ns/mnt"); - if (orig_nsfd < 0) - return -1; - - /* Create the new mount namespace */ - if (unshare(CLONE_NEWNS)) { - pr_perror("Unable to create a new mntns"); - close(orig_nsfd); - return -1; - } - - if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL)) { - pr_perror("Can't remount \"/\" with MS_PRIVATE"); - goto err; - } - - if (make_yard(mnt_roots)) - goto err; - - if (cr_pivot_root(mnt_roots)) - goto err; - - if (mkdirpat(AT_FDCWD, mnt_roots, 0777)) { - pr_err("Failed to setup root yard in empty mntns\n"); - goto err; - } - - nsfd = open_proc(PROC_SELF, "ns/mnt"); -err: - if (restore_ns(orig_nsfd, &mnt_ns_desc)) - close_safe(&nsfd); - return nsfd; -} - -/* Create almost empty mount namespaces only with root yard precreated */ -static int pre_create_mount_namespaces(void) -{ - int orig_nsfd = -1, nsfd = -1, empty_mntns, exit_code = -1; - char path[PATH_MAX]; - struct ns_id *nsid; - - empty_mntns = get_empty_mntns(); - if (empty_mntns == -1) { - pr_err("Failed to create empty mntns\n"); - goto err; - } - - /* restore mount namespaces */ - for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { - if (nsid->nd != &mnt_ns_desc) - continue; - - if (switch_ns_by_fd(empty_mntns, &mnt_ns_desc, orig_nsfd == -1 ? 
&orig_nsfd : NULL)) - goto err; - - /* Create the new mount namespace */ - if (unshare(CLONE_NEWNS)) { - pr_perror("Unable to create a new mntns"); - goto err; - } - - nsfd = open_proc(PROC_SELF, "ns/mnt"); - if (nsfd < 0) - goto err; - - /* Pin new mntns with a file descriptor */ - nsid->mnt.nsfd_id = fdstore_add(nsfd); - close(nsfd); - if (nsid->mnt.nsfd_id < 0) { - pr_err("Can't add mntns fd to fdstore\n"); - goto err; - } - - if (make_yard(mnt_roots)) - goto err; - - print_ns_root(nsid, 0, path, sizeof(path)); - if (mkdir(path, 0600)) { - pr_perror("Unable to create %s", path); - goto err; - } - } - - exit_code = 0; -err: - if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) - exit_code = -1; - close_safe(&empty_mntns); - return exit_code; -} - -/* - * Assemble the mount tree for each restored mount namespace - * from pre-created plain mounts. - */ -static int assemble_mount_namespaces(void) -{ - int orig_nsfd = -1, nsfd = -1, rootfd = -1, exit_code = -1; - char path[PATH_MAX]; - struct ns_id *nsid; - - for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { - if (nsid->nd != &mnt_ns_desc) - continue; - - nsfd = fdstore_get(nsid->mnt.nsfd_id); - if (nsfd < 0) - goto err; - - if (switch_ns_by_fd(nsfd, &mnt_ns_desc, orig_nsfd == -1 ? 
&orig_nsfd : NULL)) { - close(nsfd); - goto err; - } - close(nsfd); - - if (assemble_tree_from_plain_mounts(nsid)) - goto err; - - /* Set its root */ - print_ns_root(nsid, 0, path, sizeof(path) - 1); - if (cr_pivot_root(path)) - goto err; - - /* root fd is used to restore file mappings */ - rootfd = open_proc(PROC_SELF, "root"); - if (rootfd < 0) - goto err; - nsid->mnt.root_fd_id = fdstore_add(rootfd); - if (nsid->mnt.root_fd_id < 0) { - pr_err("Can't add root fd to fdstore\n"); - close(rootfd); - goto err; - } - close(rootfd); - } - - exit_code = 0; -err: - if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) - exit_code = -1; - return exit_code; -} - -/* The main entry point of mount-v2 for creating mounts */ -int prepare_mnt_ns_v2(void) -{ - if (!(root_ns_mask & CLONE_NEWNS)) - return 0; - -#ifdef CONFIG_BINFMT_MISC_VIRTUALIZED - if (!opts.has_binfmt_misc && !list_empty(&binfmt_misc_list)) { - /* - * Add to root yard along with other plain mounts and mntns - * directories. This mount would be created and restored by - * generic mount creation code, but it would never be moved to - * any restored mount namespaces. 
- */ - if (!add_cr_time_mount(root_yard_mp, "binfmt_misc", "binfmt_misc", 0, true)) - return -1; - } -#endif - - if (validate_mounts(mntinfo, false)) - return -1; - - if (pre_create_mount_namespaces()) - return -1; - - if (populate_mnt_ns_v2()) - return -1; - - if (assemble_mount_namespaces()) - return -1; - - if (restore_mount_sharing_options()) - return -1; - - return remove_sources_of_deleted_mounts(); -} diff --git a/criu/mount.c b/criu/mount.c index b643a7f26..ec31f02c2 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -17,7 +17,6 @@ #include "plugin.h" #include "filesystems.h" #include "mount.h" -#include "mount-v2.h" #include "pstree.h" #include "image.h" #include "namespaces.h" @@ -28,32 +27,34 @@ #include "external.h" #include "clone-noasan.h" #include "fdstore.h" -#include "rst-malloc.h" #include "images/mnt.pb-c.h" +/* + * Put a : in here since those are invalid on + * the cli, so we know it's autogenerated in + * debugging. + */ +#define AUTODETECTED_MOUNT "CRIU:AUTOGENERATED" +#define NO_ROOT_MOUNT "CRIU:NO_ROOT" +#define MS_PROPAGATE (MS_SHARED | MS_PRIVATE | MS_UNBINDABLE | MS_SLAVE) + #undef LOG_PREFIX #define LOG_PREFIX "mnt: " +#define BINFMT_MISC_HOME "proc/sys/fs/binfmt_misc" +#define CRTIME_MNT_ID 0 + #define CONTEXT_OPT "context=" /* A helper mount_info entry for the roots yard */ -struct mount_info *root_yard_mp = NULL; +static struct mount_info *root_yard_mp = NULL; static LIST_HEAD(delayed_unbindable); -char *service_mountpoint(const struct mount_info *mi) -{ - if (!opts.mntns_compat_mode && opts.mode == CR_RESTORE) { - BUG_ON(!mi->plain_mountpoint); - return mi->plain_mountpoint; - } - return mi->mountpoint; -} - int ext_mount_add(char *key, char *val) { - cleanup_free char *e_str = NULL; + char *e_str; e_str = xmalloc(strlen(key) + strlen(val) + 8); if (!e_str) @@ -98,7 +99,7 @@ static char *ext_mount_lookup(char *key) int len = strlen(key); char mkey[len + 6]; - snprintf(mkey, sizeof(mkey), "mnt[%s]", key); + sprintf(mkey, "mnt[%s]", key); 
v = external_lookup_by_key(mkey); if (IS_ERR(v)) v = NULL; @@ -125,12 +126,6 @@ static void mntinfo_add_list(struct mount_info *new) } } -void mntinfo_add_list_before(struct mount_info **head, struct mount_info *new) -{ - new->next = *head; - *head = new; -} - static struct mount_info *__lookup_overlayfs(struct mount_info *list, char *rpath, unsigned int st_dev, unsigned int st_ino, unsigned int mnt_id) { @@ -163,18 +158,15 @@ static struct mount_info *__lookup_overlayfs(struct mount_info *list, char *rpat } } - /* - * Concatenates m->ns_mountpoint with rpath and attempts - * to stat the resulting path at mntns_root - */ + /* Concatenates m->mountpoint with rpath and attempts to stat the resulting path */ if (is_root_mount(m)) { ret_stat = fstatat(mntns_root, rpath, &f_stat, 0); } else { char _full_path[PATH_MAX]; - int n = snprintf(_full_path, PATH_MAX, "%s/%s", m->ns_mountpoint, rpath); + int n = snprintf(_full_path, PATH_MAX, "%s/%s", m->mountpoint, rpath); if (n >= PATH_MAX) { - pr_err("Not enough space to concatenate %s and %s\n", m->ns_mountpoint, rpath); + pr_err("Not enough space to concatenate %s and %s\n", m->mountpoint, rpath); return ERR_PTR(-ENOSPC); } ret_stat = fstatat(mntns_root, _full_path, &f_stat, 0); @@ -256,11 +248,11 @@ static struct mount_info *mount_resolve_path(struct mount_info *mntinfo_tree, co list_for_each_entry(c, &m->children, siblings) { size_t n; - n = strlen(c->ns_mountpoint + 1); + n = strlen(c->mountpoint + 1); if (n > pathlen) continue; - if (strncmp(c->ns_mountpoint + 1, path, min(n, pathlen))) + if (strncmp(c->mountpoint + 1, path, min(n, pathlen))) continue; if (n < pathlen && path[n] != '/') continue; @@ -272,7 +264,7 @@ static struct mount_info *mount_resolve_path(struct mount_info *mntinfo_tree, co break; } - pr_debug("Path `%s' resolved to `%s' mountpoint\n", path, m->ns_mountpoint); + pr_debug("Path `%s' resolved to `%s' mountpoint\n", path, m->mountpoint); return m; } @@ -302,30 +294,17 @@ bool phys_stat_dev_match(dev_t 
st_dev, dev_t phys_dev, struct ns_id *ns, const c */ static bool mounts_sb_equal(struct mount_info *a, struct mount_info *b) { + if (a->fstype != b->fstype) + return false; + if (a->s_dev != b->s_dev) return false; - /* - * If one of compared mounts is external its mount info can have fstype - * and source fields changed by resolve_external_mounts() or - * try_resolve_ext_mount(), but we still want to detect bindmounts of - * this external mount, so let's skip source and fstype checks for it. - */ - if (!a->external && !b->external) { - if (strcmp(a->source, b->source) != 0) - return false; + if (strcmp(a->source, b->source) != 0) + return false; - if (a->fstype != b->fstype) - return false; - - if (a->fstype->sb_equal) - return a->fstype->sb_equal(a, b); - } else { - if (a->fstype->sb_equal) - return a->fstype->sb_equal(a, b); - else if (b->fstype->sb_equal) - return b->fstype->sb_equal(a, b); - } + if (a->fstype->sb_equal) /* :) */ + return b->fstype->sb_equal(a, b); if (strcmp(a->options, b->options)) return false; @@ -350,7 +329,7 @@ static bool mounts_equal(struct mount_info *a, struct mount_info *b) * mnt_roots is a temporary directory for restoring sub-trees of * non-root namespaces. 
*/ -char *mnt_roots; +static char *mnt_roots; static struct mount_info *mnt_build_ids_tree(struct mount_info *list) { @@ -378,7 +357,7 @@ static struct mount_info *mnt_build_ids_tree(struct mount_info *list) continue; } - pr_err("No parent found for mountpoint %d (@%s)\n", m->mnt_id, m->ns_mountpoint); + pr_err("No parent found for mountpoint %d (@%s)\n", m->mnt_id, m->mountpoint); return NULL; } @@ -399,7 +378,7 @@ static unsigned int mnt_depth(struct mount_info *m) unsigned int depth = 0; char *c; - for (c = m->ns_mountpoint; *c != '\0'; c++) + for (c = m->mountpoint; *c != '\0'; c++) if (*c == '/') depth++; @@ -467,7 +446,7 @@ static void mnt_tree_show(struct mount_info *tree, int off) { struct mount_info *m; - pr_info("%*s[%s](%d->%d)\n", off, "", tree->ns_mountpoint, tree->mnt_id, tree->parent_mnt_id); + pr_info("%*s[%s](%d->%d)\n", off, "", tree->mountpoint, tree->mnt_id, tree->parent_mnt_id); list_for_each_entry(m, &tree->children, siblings) mnt_tree_show(m, off + 1); @@ -478,28 +457,19 @@ static void mnt_tree_show(struct mount_info *tree, int off) /* Returns -1 on error, 1 if external mount resolved, 0 otherwise */ static int try_resolve_ext_mount(struct mount_info *info) { + char *ext; char devstr[64]; - /* - * Only allow mountpoint-external mounts in root mntns. Their lookup is - * based on mountpoint path, but in nested mntns we can have completely - * different mount tree and at same mountpoint we can have completely - * different mount. - */ - if (info->nsid->type == NS_ROOT) { - char *ext; - - ext = ext_mount_lookup(info->ns_mountpoint + 1 /* trim the . */); - if (ext) { - pr_info("Found %s mapping for %s mountpoint\n", ext, info->ns_mountpoint); - info->external = ext; - return 1; - } + ext = ext_mount_lookup(info->mountpoint + 1 /* trim the . 
*/); + if (ext) { + pr_info("Found %s mapping for %s mountpoint\n", ext, info->mountpoint); + info->external = ext; + return 1; } snprintf(devstr, sizeof(devstr), "dev[%d/%d]", kdev_major(info->s_dev), kdev_minor(info->s_dev)); - if (info->fstype->code == FSTYPE__UNSUPPORTED && fsroot_mounted(info)) { + if (info->fstype->code == FSTYPE__UNSUPPORTED) { char *val; val = external_lookup_by_key(devstr); @@ -507,9 +477,6 @@ static int try_resolve_ext_mount(struct mount_info *info) char *source; int len; - pr_info("Found %s dev-mapping for %s(%d) mountpoint\n", val, info->ns_mountpoint, info->mnt_id); - info->external = EXTERNAL_DEV_MOUNT; - len = strlen(val) + sizeof("dev[]"); source = xrealloc(info->source, len); if (source == NULL) @@ -548,13 +515,13 @@ static bool mnt_needs_remap(struct mount_info *m) { struct mount_info *t; - if (!m->parent || m->parent == root_yard_mp) + if (!m->parent) return false; list_for_each_entry(t, &m->parent->children, siblings) { if (m == t) continue; - if (issubpath(t->ns_mountpoint, m->ns_mountpoint)) + if (issubpath(t->mountpoint, m->mountpoint)) return true; } @@ -563,107 +530,40 @@ static bool mnt_needs_remap(struct mount_info *m) * remapped too, else fixup_remap_mounts() won't be able to move parent * to it's real place, it will move child instead. */ - if (!strcmp(m->parent->ns_mountpoint, m->ns_mountpoint)) + if (!strcmp(m->parent->mountpoint, m->mountpoint)) return mnt_needs_remap(m->parent); return false; } -static bool __mnt_is_external_bind(struct mount_info *mi, struct mount_info *bind) -{ - if (bind->external && is_sub_path(mi->root, bind->root)) - return true; - - return false; -} - /* - * Say mount is external if it was explicitly specified as an external or it - * can be bind-mounted from such an explicit external mount. 
+ * Say mount is external if it was explicitly specified as an + * external or it will be bind from such an explicit external + * mount, we set bind in propagate_mount and propagate_siblings */ -struct mount_info *mnt_get_external_bind(struct mount_info *mi) + +static bool mnt_is_external(struct mount_info *m) { - return mnt_bind_pick(mi, __mnt_is_external_bind); -} + struct mount_info *t; -bool mnt_is_external_bind(struct mount_info *mi) -{ - return mnt_get_external_bind(mi); -} + while (m) { + if (m->external) + return 1; -static bool __can_receive_master_from_external(struct mount_info *mi, struct mount_info *bind) -{ - if (mnt_is_nodev_external(bind) && bind->master_id == mi->master_id && is_sub_path(mi->root, bind->root)) - return true; + if (!list_empty(&m->mnt_share)) + list_for_each_entry(t, &m->mnt_share, mnt_share) + if (t->external) + return 1; - return false; -} + if (m->master_id <= 0 && !list_empty(&m->mnt_bind)) + list_for_each_entry(t, &m->mnt_bind, mnt_bind) + if (issubpath(m->root, t->root) && t->external) + return 1; -static struct mount_info *can_receive_master_from_external(struct mount_info *mi) -{ - return mnt_bind_pick(mi, __can_receive_master_from_external); -} + m = m->mnt_master; + } -static bool __has_mounted_external_bind(struct mount_info *mi, struct mount_info *bind) -{ - if (bind->external && bind->mounted && is_sub_path(mi->root, bind->root)) - return true; - - return false; -} - -bool has_mounted_external_bind(struct mount_info *mi) -{ - return mnt_bind_pick(mi, __has_mounted_external_bind); -} - -bool rst_mnt_is_root(struct mount_info *mi) -{ - return (mi->is_ns_root && mi->nsid->id == root_item->ids->mnt_ns_id); -} - -static bool __mnt_is_root_bind(struct mount_info *mi, struct mount_info *bind) -{ - if (rst_mnt_is_root(bind) && is_sub_path(mi->root, bind->root)) - return true; - - return false; -} - -struct mount_info *mnt_get_root_bind(struct mount_info *mi) -{ - return mnt_bind_pick(mi, __mnt_is_root_bind); -} - -bool 
mnt_is_root_bind(struct mount_info *mi) -{ - return mnt_get_root_bind(mi); -} - -static bool __can_receive_master_from_root(struct mount_info *mi, struct mount_info *bind) -{ - if (rst_mnt_is_root(bind) && bind->master_id == mi->master_id && is_sub_path(mi->root, bind->root)) - return true; - - return false; -} - -static struct mount_info *can_receive_master_from_root(struct mount_info *mi) -{ - return mnt_bind_pick(mi, __can_receive_master_from_root); -} - -static bool __mnt_is_external_bind_nodev(struct mount_info *mi, struct mount_info *bind) -{ - if (bind->external && !mnt_is_dev_external(bind) && is_sub_path(mi->root, bind->root)) - return true; - - return false; -} - -struct mount_info *mnt_get_external_bind_nodev(struct mount_info *mi) -{ - return mnt_bind_pick(mi, __mnt_is_external_bind_nodev); + return 0; } /* @@ -683,7 +583,7 @@ static int validate_children_collision(struct mount_info *mnt) list_for_each_entry(chj, &mnt->children, siblings) { if (chj == chi) break; - if (!strcmp(chj->ns_mountpoint, chi->ns_mountpoint)) { + if (!strcmp(chj->mountpoint, chi->mountpoint)) { pr_err("Mount %d has two children with same " "mountpoint: %d %d\n", mnt->mnt_id, chj->mnt_id, chi->mnt_id); @@ -694,18 +594,19 @@ static int validate_children_collision(struct mount_info *mnt) return 0; } -int validate_mounts(struct mount_info *info, bool for_dump) +static int validate_mounts(struct mount_info *info, bool for_dump) { struct mount_info *m, *t; for (m = info; m; m = m->next) { + if (m->parent == NULL || m->is_ns_root) + /* root mount can be any */ + continue; + if (validate_children_collision(m)) return -1; - if (mnt_is_external_bind(m)) - continue; - - if (mnt_is_root_bind(m)) + if (mnt_is_external(m)) continue; /* @@ -721,7 +622,7 @@ int validate_mounts(struct mount_info *info, bool for_dump) if (fsroot_mounted(m)) { if (m->fstype->code == FSTYPE__UNSUPPORTED) { - pr_err("FS mnt %s dev %#x root %s unsupported id %d\n", m->ns_mountpoint, m->s_dev, + pr_err("FS mnt %s dev 
%#x root %s unsupported id %d\n", m->mountpoint, m->s_dev, m->root, m->mnt_id); return -1; } @@ -737,7 +638,7 @@ int validate_mounts(struct mount_info *info, bool for_dump) */ if (for_dump) { - ret = run_plugins(DUMP_EXT_MOUNT, m->ns_mountpoint, m->mnt_id); + ret = run_plugins(DUMP_EXT_MOUNT, m->mountpoint, m->mnt_id); if (ret == 0) m->need_plugin = true; } else @@ -751,7 +652,7 @@ int validate_mounts(struct mount_info *info, bool for_dump) if (ret < 0) { if (ret == -ENOTSUP) pr_err("%d:%s doesn't have a proper root mount\n", m->mnt_id, - m->ns_mountpoint); + m->mountpoint); return -1; } } @@ -826,7 +727,7 @@ static struct ns_id *find_ext_ns_id(void) for (ns = ns_ids; ns->next; ns = ns->next) if (ns->type == NS_CRIU && ns->nd == &mnt_ns_desc) { - if (!ns->mnt.mntinfo_list && !collect_mntinfo(ns, false)) + if (!ns->mnt.mntinfo_list && !collect_mntinfo(ns, true)) break; return ns; } @@ -888,11 +789,7 @@ static int resolve_external_mounts(struct mount_info *info) cut_root = cut_root_for_bind(m->root, match->root); - if (cut_root[0] == '\0') { - p = xstrdup(match->ns_mountpoint + 1); - } else { - p = xsprintf("%s/%s", match->ns_mountpoint + 1, cut_root); - } + p = xsprintf("%s/%s", match->mountpoint + 1, cut_root); if (!p) return -1; @@ -905,7 +802,7 @@ static int resolve_external_mounts(struct mount_info *info) xfree(m->source); m->source = p; - pr_info("autodetected external mount %s for %s(%d)\n", p, m->ns_mountpoint, m->mnt_id); + pr_info("autodetected external mount %s for %s\n", p, m->mountpoint); } return 0; @@ -916,11 +813,11 @@ static int root_path_from_parent(struct mount_info *m, char *buf, int size) bool head_slash = false, tail_slash = false; int p_len, m_len, len; - if (!m->parent || m->parent == root_yard_mp) + if (!m->parent) return -1; - p_len = strlen(m->parent->ns_mountpoint); - m_len = strlen(m->ns_mountpoint); + p_len = strlen(m->parent->mountpoint); + m_len = strlen(m->mountpoint); len = snprintf(buf, size, "%s", m->parent->root); if (len >= size) 
@@ -936,11 +833,11 @@ static int root_path_from_parent(struct mount_info *m, char *buf, int size) len = m_len - p_len; BUG_ON(len < 0); if (len) { - if (m->ns_mountpoint[p_len] == '/') + if (m->mountpoint[p_len] == '/') head_slash = true; len = snprintf(buf, size, "%s%s", (!tail_slash && !head_slash) ? "/" : "", - m->ns_mountpoint + p_len + (tail_slash && head_slash)); + m->mountpoint + p_len + (tail_slash && head_slash)); if (len >= size) return -1; } @@ -979,66 +876,7 @@ static int same_propagation_group(struct mount_info *a, struct mount_info *b) return 0; } -/* - * Note: Only valid if called consequently on all mounts in mntinfo list. - * - * Note: We may want to iterate over all bindmounts of some mount, and we would - * use ->mnt_bind list for this, but iterating over ->mnt_bind list is - * obviously meaningless before search_bindmounts had actually put bindmounts - * in it. That's why we have ->mnt_bind_is_populated to protect from misuse of - * ->mnt_bind. (As ->mnt_bind list can validly be empty when mount has no - * bindmounts we need separate field to indicate population.) 
- */ -static void __search_bindmounts(struct mount_info *mi) -{ - struct mount_info *t; - - if (mi->mnt_bind_is_populated) - return; - - for (t = mi->next; t; t = t->next) { - if (mounts_sb_equal(mi, t)) { - list_add(&t->mnt_bind, &mi->mnt_bind); - t->mnt_bind_is_populated = true; - pr_debug("\t" - "The mount %3d is bind for %3d (@%s -> @%s)\n", - t->mnt_id, mi->mnt_id, t->ns_mountpoint, mi->ns_mountpoint); - } - } - - mi->mnt_bind_is_populated = true; -} - -static void search_bindmounts(void) -{ - struct mount_info *mi; - - for (mi = mntinfo; mi; mi = mi->next) - __search_bindmounts(mi); -} - -struct mount_info *mnt_bind_pick(struct mount_info *mi, bool (*pick)(struct mount_info *mi, struct mount_info *bind)) -{ - struct mount_info *bind; - - BUG_ON(!mi); - - if (pick(mi, mi)) - return mi; - - /* - * Shouldn't use mnt_bind list before it was populated in search_bindmounts - */ - BUG_ON(!mi->mnt_bind_is_populated); - - list_for_each_entry(bind, &mi->mnt_bind, mnt_bind) - if (pick(mi, bind)) - return bind; - - return NULL; -} - -static int resolve_shared_mounts(struct mount_info *info) +static int resolve_shared_mounts(struct mount_info *info, int root_master_id) { struct mount_info *m, *t; @@ -1051,19 +889,22 @@ static int resolve_shared_mounts(struct mount_info *info) for (m = info; m; m = m->next) { bool need_share, need_master; + /* the root master_id can be ignored, because it's already created */ + if (root_master_id && root_master_id == m->master_id) + m->master_id = -1; + need_share = m->shared_id && list_empty(&m->mnt_share); - need_master = m->master_id; + need_master = m->master_id > 0; pr_debug("Inspecting sharing on %2d shared_id %d master_id %d (@%s)\n", m->mnt_id, m->shared_id, - m->master_id, m->ns_mountpoint); + m->master_id, m->mountpoint); for (t = info; t && (need_share || need_master); t = t->next) { if (t == m) continue; if (need_master && t->shared_id == m->master_id) { - pr_debug("\t" - "The mount %3d is slave for %3d (@%s -> @%s)\n", - 
m->mnt_id, t->mnt_id, m->ns_mountpoint, t->ns_mountpoint); + pr_debug("\tThe mount %3d is slave for %3d (@%s -> @%s)\n", m->mnt_id, t->mnt_id, + m->mountpoint, t->mountpoint); list_add(&m->mnt_slave, &t->mnt_slave_list); m->mnt_master = t; need_master = false; @@ -1071,29 +912,37 @@ static int resolve_shared_mounts(struct mount_info *info) /* Collect all mounts from this group */ if (need_share && t->shared_id == m->shared_id) { - pr_debug("\t" - "Mount %3d is shared with %3d group %3d (@%s -> @%s)\n", - m->mnt_id, t->mnt_id, m->shared_id, t->ns_mountpoint, m->ns_mountpoint); + pr_debug("\tMount %3d is shared with %3d group %3d (@%s -> @%s)\n", m->mnt_id, + t->mnt_id, m->shared_id, t->mountpoint, m->mountpoint); list_add(&t->mnt_share, &m->mnt_share); } } /* - * External master detected + * If we haven't already determined this mount is external, + * or bind of external, then we don't know where it came from. */ - if (need_master) { - if ((t = can_receive_master_from_external(m)) || (t = can_receive_master_from_root(m))) { - pr_debug("Detected external slavery for %d via %d\n", m->mnt_id, t->mnt_id); - if (m != t) - list_add(&m->mnt_ext_slave, &t->mnt_ext_slave); - continue; - } - + if (need_master && m->parent && !mnt_is_external(m)) { pr_err("Mount %d %s (master_id: %d shared_id: %d) " "has unreachable sharing. Try --enable-external-masters.\n", - m->mnt_id, m->ns_mountpoint, m->master_id, m->shared_id); + m->mnt_id, m->mountpoint, m->master_id, m->shared_id); return -1; } + + /* Search bind-mounts */ + if (list_empty(&m->mnt_bind)) { + /* + * A first mounted point will be set up as a source point + * for others. 
Look at propagate_mount() + */ + for (t = m->next; t; t = t->next) { + if (mounts_sb_equal(m, t)) { + list_add(&t->mnt_bind, &m->mnt_bind); + pr_debug("\tThe mount %3d is bind for %3d (@%s -> @%s)\n", t->mnt_id, m->mnt_id, + t->mountpoint, m->mountpoint); + } + } + } } /* Search propagation groups */ @@ -1118,7 +967,7 @@ static int resolve_shared_mounts(struct mount_info *info) else if (ret) { BUG_ON(!mounts_equal(m, schild)); pr_debug("\tMount %3d is in same propagation group with %3d (@%s ~ @%s)\n", - m->mnt_id, schild->mnt_id, m->ns_mountpoint, schild->ns_mountpoint); + m->mnt_id, schild->mnt_id, m->mountpoint, schild->mountpoint); list_add(&schild->mnt_propagate, &m->mnt_propagate); } } @@ -1168,21 +1017,39 @@ int mnt_is_dir(struct mount_info *pm) return 0; } -int __check_mountpoint_fd(struct mount_info *pm, int mnt_fd, bool parse_mountinfo) +/* + * mnt_fd is a file descriptor on the mountpoint, which is closed in an error case. + * If mnt_fd is -1, the mountpoint will be opened by this function. + */ +int __open_mountpoint(struct mount_info *pm, int mnt_fd) { struct stat st; - unsigned int dev; + int dev; int ret; + if (mnt_fd == -1) { + int mntns_root; + + mntns_root = mntns_get_root_fd(pm->nsid); + if (mntns_root < 0) + return -1; + + mnt_fd = openat(mntns_root, pm->ns_mountpoint, O_RDONLY); + if (mnt_fd < 0) { + pr_perror("Can't open %s", pm->ns_mountpoint); + return -1; + } + } + ret = fstat(mnt_fd, &st); if (ret < 0) { pr_perror("fstat(%s) failed", pm->ns_mountpoint); - return -1; + goto err; } if (pm->s_dev_rt == MOUNT_INVALID_DEV) { pr_err("Resolving over invalid device for %#x %s %s\n", pm->s_dev, pm->fstype->name, pm->ns_mountpoint); - return -1; + goto err; } dev = MKKDEV(major(st.st_dev), minor(st.st_dev)); @@ -1193,66 +1060,26 @@ int __check_mountpoint_fd(struct mount_info *pm, int mnt_fd, bool parse_mountinf * allocates new device ID). 
*/ if (dev != pm->s_dev_rt) { - /* - * For btrfs device numbers in stat and mountinfo can be - * different, fallback to get_sdev_from_fd to get right dev. - */ - if (!strcmp(pm->fstype->name, "btrfs") && !get_sdev_from_fd(mnt_fd, &dev, parse_mountinfo) && - dev == pm->s_dev_rt) - return 0; - - pr_warn("The file system %#x %#x (%#x) %s %s is inaccessible\n", pm->s_dev, pm->s_dev_rt, dev, - pm->fstype->name, pm->ns_mountpoint); - return -1; - } - - return 0; -} - -int check_mountpoint_fd(struct mount_info *pm, int mnt_fd) -{ - return __check_mountpoint_fd(pm, mnt_fd, false); -} - -/* - * mnt_fd is a file descriptor on the mountpoint, which is closed in an error case. - * If mnt_fd is -1, the mountpoint will be opened by this function. - */ -int __open_mountpoint(struct mount_info *pm) -{ - int mntns_root, mnt_fd; - - mntns_root = mntns_get_root_fd(pm->nsid); - if (mntns_root < 0) - return -1; - - mnt_fd = openat(mntns_root, pm->ns_mountpoint, O_RDONLY); - if (mnt_fd < 0) { - pr_perror("Can't open %s", pm->ns_mountpoint); - return -1; - } - - if (check_mountpoint_fd(pm, mnt_fd)) { - close(mnt_fd); - return -1; + pr_err("The file system %#x %#x (%#x) %s %s is inaccessible\n", pm->s_dev, pm->s_dev_rt, dev, + pm->fstype->name, pm->ns_mountpoint); + goto err; } return mnt_fd; +err: + close(mnt_fd); + return -1; } int open_mount(unsigned int s_dev) { struct mount_info *m; - int mnt_fd; m = lookup_mnt_sdev(s_dev); if (!m) return -ENOENT; - mnt_fd = __open_mountpoint(m); - if (mnt_fd < 0) - pr_err("Can't open mount %#x\n", s_dev); - return mnt_fd; + return __open_mountpoint(m, -1); } /* Bind-mount a mount point in a temporary place without children */ @@ -1268,8 +1095,8 @@ static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_ return NULL; } - if (mount(mi->ns_mountpoint, mnt_path, NULL, MS_BIND, NULL)) { - pr_perror("Can't bind-mount %d:%s to %s", mi->mnt_id, mi->ns_mountpoint, mnt_path); + if (mount(mi->mountpoint, mnt_path, NULL, MS_BIND, NULL)) 
{ + pr_warn("Can't bind-mount %d:%s to %s: %s\n", mi->mnt_id, mi->mountpoint, mnt_path, strerror(errno)); rmdir(mnt_path); return NULL; } @@ -1282,34 +1109,12 @@ static int get_clean_fd(struct mount_info *mi) char *mnt_path = NULL; char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX"; char mnt_path_root[] = "/cr-tmpfs.XXXXXX"; - int fd; mnt_path = get_clean_mnt(mi, mnt_path_tmp, mnt_path_root); if (!mnt_path) return -1; - fd = open(mnt_path, O_RDONLY | O_DIRECTORY, 0); - if (fd < 0) { - pr_perror("Can't open directory %s", mnt_path); - } else { - if (__check_mountpoint_fd(mi, fd, true)) - goto err_close; - } - - if (umount2(mnt_path, MNT_DETACH)) { - pr_perror("Can't detach mount %s", mnt_path); - goto err_close; - } - - if (rmdir(mnt_path)) { - pr_perror("Can't remove tmp dir %s", mnt_path); - goto err_close; - } - - return fd; -err_close: - close_safe(&fd); - return -1; + return open_detach_mount(mnt_path); } /* @@ -1340,7 +1145,7 @@ bool mnt_is_overmounted(struct mount_info *mi) list_for_each_entry(t, &m->parent->children, siblings) { if (m == t) continue; - if (issubpath(m->ns_mountpoint, t->ns_mountpoint)) { + if (issubpath(m->mountpoint, t->mountpoint)) { mi->is_overmounted = 1; goto exit; } @@ -1356,7 +1161,7 @@ bool mnt_is_overmounted(struct mount_info *mi) /* Check there is no children-overmount */ list_for_each_entry(c, &mi->children, siblings) - if (!strcmp(c->ns_mountpoint, mi->ns_mountpoint)) { + if (!strcmp(c->mountpoint, mi->mountpoint)) { mi->is_overmounted = 1; goto exit; } @@ -1365,35 +1170,13 @@ exit: return mi->is_overmounted; } -static int __set_is_overmounted(struct mount_info *mi) +static int set_is_overmounted(struct mount_info *mi) { /* coverity[check_return] */ mnt_is_overmounted(mi); return 0; } -/* - * mnt_is_overmounted is intended to detect overmounts in original dumped mount - * tree, so we pre-save it just after loading mount tree from images, so that - * it does not mess up with any helper mounts or tree changes we can do. 
- */ -static void prepare_is_overmounted(void) -{ - struct ns_id *nsid; - - for (nsid = ns_ids; nsid; nsid = nsid->next) { - struct mount_info *root; - - if (nsid->nd != &mnt_ns_desc) - continue; - - root = nsid->mnt.mntinfo_tree; - - BUG_ON(root->parent); - mnt_tree_for_each(root, __set_is_overmounted); - } -} - /* * __umount_children_overmounts() assumes that the mountpoint and * it's ancestors have no sibling-overmounts, so we can see children @@ -1410,7 +1193,7 @@ static int __umount_children_overmounts(struct mount_info *mi) */ again: list_for_each_entry(c, &m->children, siblings) { - if (!strcmp(c->ns_mountpoint, m->ns_mountpoint)) { + if (!strcmp(c->mountpoint, m->mountpoint)) { m = c; goto again; } @@ -1418,8 +1201,8 @@ again: /* Unmout children-overmounts in the order of visibility */ while (m != mi) { - if (umount2(m->ns_mountpoint, MNT_DETACH)) { - pr_perror("Unable to umount child-overmount %s", m->ns_mountpoint); + if (umount2(m->mountpoint, MNT_DETACH)) { + pr_perror("Unable to umount child-overmount %s", m->mountpoint); return -1; } BUG_ON(!m->parent); @@ -1449,12 +1232,12 @@ static int __umount_overmounts(struct mount_info *m) /* Unmount sibling-overmounts in visibility order */ next: ovm = NULL; - ovm_len = strlen(m->ns_mountpoint) + 1; + ovm_len = strlen(m->mountpoint) + 1; list_for_each_entry(t, &m->parent->children, siblings) { if (m == t) continue; - if (issubpath(m->ns_mountpoint, t->ns_mountpoint)) { - int t_len = strlen(t->ns_mountpoint); + if (issubpath(m->mountpoint, t->mountpoint)) { + int t_len = strlen(t->mountpoint); if (t_len < ovm_len && t_len > ovm_len_min) { ovm = t; @@ -1470,8 +1253,8 @@ next: if (__umount_children_overmounts(ovm)) return -1; - if (umount2(ovm->ns_mountpoint, MNT_DETACH)) { - pr_perror("Unable to umount %s", ovm->ns_mountpoint + 1); + if (umount2(ovm->mountpoint, MNT_DETACH)) { + pr_perror("Unable to umount %s", ovm->mountpoint); return -1; } @@ -1543,14 +1326,9 @@ int ns_open_mountpoint(void *arg) * explicitly as 
when last process exits mntns all mounts in it are * cleaned from their children, and we are exactly the last process. */ - *fd = open(mi->ns_mountpoint, O_DIRECTORY | O_RDONLY); + *fd = open(mi->mountpoint, O_DIRECTORY | O_RDONLY); if (*fd < 0) { - pr_perror("Unable to open %s(%d)", mi->ns_mountpoint, mi->mnt_id); - goto err; - } - - if (__check_mountpoint_fd(mi, *fd, true)) { - close(*fd); + pr_perror("Unable to open %s", mi->mountpoint); goto err; } @@ -1565,9 +1343,9 @@ int open_mountpoint(struct mount_info *pm) /* No overmounts and children - the entire mount is visible */ if (list_empty(&pm->children) && !mnt_is_overmounted(pm)) - return __open_mountpoint(pm); + return __open_mountpoint(pm, -1); - pr_info("Mount is not fully visible %s(%d)\n", pm->ns_mountpoint, pm->mnt_id); + pr_info("Mount is not fully visible %s\n", pm->mountpoint); /* * We do two things below: @@ -1585,7 +1363,7 @@ int open_mountpoint(struct mount_info *pm) goto err; if (!mnt_is_overmounted(pm)) { - pr_info("\tmount has children %s(%d)\n", pm->ns_mountpoint, pm->mnt_id); + pr_info("\tmount has children %s\n", pm->mountpoint); fd = get_clean_fd(pm); } @@ -1597,7 +1375,7 @@ int open_mountpoint(struct mount_info *pm) int pid, status; struct clone_arg ca = { .mi = pm, .fd = &fd }; - pr_info("\tmount is overmounted or has children %s(%d)\n", pm->ns_mountpoint, pm->mnt_id); + pr_info("\tmount is overmounted or has children %s\n", pm->mountpoint); /* * We are overmounted - not accessible in a regular way. We @@ -1628,7 +1406,7 @@ int open_mountpoint(struct mount_info *pm) goto err; } - return fd < 0 ? 
__open_mountpoint(pm) : fd; + return __open_mountpoint(pm, fd); err: if (ns_old >= 0) /* coverity[check_return] */ @@ -1637,38 +1415,28 @@ err: return -1; } -/* - * Helper for getting a path to mount's plain mountpoint - */ -char *get_plain_mountpoint(int mnt_id, char *name) -{ - static char tmp[PATH_MAX]; - int ret; - - if (!mnt_roots) - return NULL; - - if (name) - ret = snprintf(tmp, sizeof(tmp), "%s/mnt-%s", mnt_roots, name); - else - ret = snprintf(tmp, sizeof(tmp), "%s/mnt-%010d", mnt_roots, mnt_id); - - if (ret >= sizeof(tmp)) - return NULL; - - return xstrdup(tmp); -} - -struct mount_info __maybe_unused *add_cr_time_mount(struct mount_info *root, char *fsname, const char *path, - unsigned int s_dev, bool rst) +static __maybe_unused int add_cr_time_mount(struct mount_info *root, char *fsname, const char *path, unsigned int s_dev) { struct mount_info *mi, *t, *parent; bool add_slash = false; int len; - mi = mnt_entry_alloc(rst); + if (!root->nsid) { + /* On restore we have fake top mount_info. 
Find real NS_ROOT */ + list_for_each_entry(t, &root->children, siblings) + if (t->nsid->type == NS_ROOT) { + root = t; + break; + } + if (!root->nsid) { + pr_err("Can't find NS_ROOT\n"); + return -1; + } + } + + mi = mnt_entry_alloc(); if (!mi) - return NULL; + return -1; len = strlen(root->mountpoint); /* It may be "./" or "./path/to/dir" */ @@ -1680,19 +1448,12 @@ struct mount_info __maybe_unused *add_cr_time_mount(struct mount_info *root, cha mi->mountpoint = xmalloc(len + strlen(path) + 1); if (!mi->mountpoint) goto err; - if (!rst) - mi->ns_mountpoint = mi->mountpoint; + mi->ns_mountpoint = mi->mountpoint; if (!add_slash) sprintf(mi->mountpoint, "%s%s", root->mountpoint, path); else sprintf(mi->mountpoint, "%s/%s", root->mountpoint, path); - if (rst) { - mi->plain_mountpoint = get_plain_mountpoint(-1, "crtime"); - if (!mi->plain_mountpoint) - goto err; - } - mi->mnt_id = HELPER_MNT_ID; - mi->is_dir = true; + mi->mnt_id = CRTIME_MNT_ID; mi->flags = mi->sb_flags = 0; mi->root = xstrdup("/"); mi->fsname = xstrdup(fsname); @@ -1707,7 +1468,7 @@ struct mount_info __maybe_unused *add_cr_time_mount(struct mount_info *root, cha parent = root; while (1) { list_for_each_entry(t, &parent->children, siblings) { - if (strstartswith(service_mountpoint(mi), service_mountpoint(t))) { + if (strstartswith(mi->mountpoint, t->mountpoint)) { parent = t; break; } @@ -1716,64 +1477,52 @@ struct mount_info __maybe_unused *add_cr_time_mount(struct mount_info *root, cha break; } - mi->mnt_bind_is_populated = true; - mi->is_overmounted = false; mi->nsid = parent->nsid; mi->parent = parent; mi->parent_mnt_id = parent->mnt_id; + mi->next = parent->next; + parent->next = mi; list_add(&mi->siblings, &parent->children); - pr_info("Add cr-time mountpoint %s with parent %s(%u)\n", service_mountpoint(mi), service_mountpoint(parent), - parent->mnt_id); - return mi; + pr_info("Add cr-time mountpoint %s with parent %s(%u)\n", mi->mountpoint, parent->mountpoint, parent->mnt_id); + return 0; err: 
mnt_entry_free(mi); - return NULL; + return -1; } -/* - * Returns: - * 0 - success - * -1 - error - * 1 - skip - */ +/* Returns 1 in case of success, -errno in case of mount fail, and 0 on other errors */ static __maybe_unused int mount_cr_time_mount(struct ns_id *ns, unsigned int *s_dev, const char *source, const char *target, const char *type) { - int mnt_fd, cwd_fd, exit_code = -1; + int mnt_fd, cwd_fd, ret, exit_code = 0; struct stat st; - if (switch_mnt_ns(ns->ns_pid, &mnt_fd, &cwd_fd)) { + ret = switch_mnt_ns(ns->ns_pid, &mnt_fd, &cwd_fd); + if (ret < 0) { pr_err("Can't switch mnt_ns\n"); - return -1; + goto out; } - if (mount(source, target, type, 0, NULL)) { - switch (errno) { - case EPERM: - case EBUSY: - case ENODEV: - case ENOENT: - pr_debug("Skipping %s as was unable to mount it: %s\n", type, strerror(errno)); + ret = mount(source, target, type, 0, NULL); + if (ret < 0) { + pr_perror("Unable to mount %s %s", source, target); + exit_code = -errno; + goto restore_ns; + } else { + if (stat(target, &st) < 0) { + pr_perror("Can't stat %s", target); + exit_code = 0; + } else { + *s_dev = MKKDEV(major(st.st_dev), minor(st.st_dev)); exit_code = 1; - break; - default: - pr_perror("Unable to mount %s %s %s", type, source, target); } - goto restore_ns; } - if (stat(target, &st)) { - pr_perror("Can't stat %s", target); - goto restore_ns; - } - - *s_dev = MKKDEV(major(st.st_dev), minor(st.st_dev)); - exit_code = 0; restore_ns: - if (restore_mnt_ns(mnt_fd, &cwd_fd)) - exit_code = -1; - return exit_code; + ret = restore_mnt_ns(mnt_fd, &cwd_fd); +out: + return ret < 0 ? 
0 : exit_code; } static int dump_one_fs(struct mount_info *mi) @@ -1782,7 +1531,7 @@ static int dump_one_fs(struct mount_info *mi) struct mount_info *t; bool first = true; - if (mnt_is_root_bind(mi) || mi->need_plugin || mnt_is_external_bind(mi) || !mi->fstype->dump) + if (mi->is_ns_root || mi->need_plugin || mnt_is_external(mi) || !mi->fstype->dump) return 0; /* mnt_bind is a cycled list, so list_for_each can't be used here. */ @@ -1806,7 +1555,7 @@ static int dump_one_fs(struct mount_info *mi) return 0; } - pr_err("Unable to dump a file system for %d:%s\n", mi->mnt_id, mi->ns_mountpoint); + pr_err("Unable to dump a file system for %d:%s\n", mi->mnt_id, mi->mountpoint); return -1; } @@ -1814,22 +1563,23 @@ static int dump_one_mountpoint(struct mount_info *pm, struct cr_img *img) { MntEntry me = MNT_ENTRY__INIT; - pr_info("\t%d: %x:%s @ %s\n", pm->mnt_id, pm->s_dev, pm->root, pm->ns_mountpoint); + pr_info("\t%d: %x:%s @ %s\n", pm->mnt_id, pm->s_dev, pm->root, pm->mountpoint); me.fstype = pm->fstype->code; if (me.fstype == FSTYPE__AUTO) me.fsname = pm->fsname; - if (!pm->dumped && dump_one_fs(pm)) - return -1; + if (!pm->external) { + if (!pm->dumped && dump_one_fs(pm)) + return -1; - if (!mnt_is_external_bind(pm) && !fsroot_mounted(pm) && pm->fstype->check_bindmount && - pm->fstype->check_bindmount(pm)) - return -1; + if (!fsroot_mounted(pm) && pm->fstype->check_bindmount && pm->fstype->check_bindmount(pm)) + return -1; + } - if (pm->mnt_id == HELPER_MNT_ID) { - pr_info("Skip dumping helper mountpoint: %s\n", pm->ns_mountpoint); + if (pm->mnt_id == CRTIME_MNT_ID) { + pr_info("Skip dumping cr-time mountpoint: %s\n", pm->mountpoint); return 0; } @@ -1839,7 +1589,7 @@ static int dump_one_mountpoint(struct mount_info *pm, struct cr_img *img) me.flags = pm->flags; me.sb_flags = pm->sb_flags; me.has_sb_flags = true; - me.mountpoint = pm->ns_mountpoint + 1; + me.mountpoint = pm->mountpoint + 1; me.source = pm->source; me.options = pm->options; me.shared_id = 
pm->shared_id; @@ -1974,14 +1724,14 @@ err: #define MNT_WALK_NONE 0 && -int mnt_tree_for_each(struct mount_info *start, int (*fn)(struct mount_info *)) +static int mnt_tree_for_each(struct mount_info *start, int (*fn)(struct mount_info *)) { struct mount_info *tmp; LIST_HEAD(postpone); LIST_HEAD(postpone2); int progress; - pr_debug("Start with %d:%s\n", start->mnt_id, start->ns_mountpoint); + pr_debug("Start with %d:%s\n", start->mnt_id, start->mountpoint); list_add(&start->postpone, &postpone); again: @@ -1995,7 +1745,7 @@ again: pr_err("A few mount points can't be mounted\n"); list_for_each_entry(m, &postpone2, postpone) { - pr_err("%d:%d %s %s %s\n", m->mnt_id, m->parent_mnt_id, m->root, m->ns_mountpoint, m->source); + pr_err("%d:%d %s %s %s\n", m->mnt_id, m->parent_mnt_id, m->root, m->mountpoint, m->source); } return -1; } @@ -2013,12 +1763,11 @@ static int mnt_tree_for_each_reverse(struct mount_info *m, int (*fn)(struct moun int progress = 0; MNT_TREE_WALK(m, prev, MNT_WALK_NONE, fn, (struct list_head *)NULL, progress); - (void)progress; // Suppress -Wused-but-unset-variable for clang>=15 return 0; } -char *resolve_source(struct mount_info *mi) +static char *resolve_source(struct mount_info *mi) { if (kdev_major(mi->s_dev) == 0) /* @@ -2027,11 +1776,7 @@ char *resolve_source(struct mount_info *mi) */ return mi->source; - /* - * FSTYPE__AUTO check is a fallback for old images which do not have - * explicit EXTERNAL_DEV_MOUNT mark, but still have "dev[key]" in source. 
- */ - if (mnt_is_dev_external(mi) || mi->fstype->code == FSTYPE__AUTO) { + if (mi->fstype->code == FSTYPE__AUTO) { struct stat st; char *val; @@ -2044,29 +1789,29 @@ char *resolve_source(struct mount_info *mi) return mi->source; } - pr_err("No device for %s(%d) mount\n", mi->ns_mountpoint, mi->mnt_id); + pr_err("No device for %s mount\n", mi->mountpoint); return NULL; } static int restore_shared_options(struct mount_info *mi, bool private, bool shared, bool slave) { - pr_debug("%d:%s private %d shared %d slave %d\n", mi->mnt_id, service_mountpoint(mi), private, shared, slave); + pr_debug("%d:%s private %d shared %d slave %d\n", mi->mnt_id, mi->mountpoint, private, shared, slave); if (mi->flags & MS_UNBINDABLE) { if (shared || slave) { - pr_warn("%s has both unbindable and sharing, ignoring unbindable\n", service_mountpoint(mi)); + pr_warn("%s has both unbindable and sharing, ignoring unbindable\n", mi->mountpoint); } else { if (!mnt_is_overmounted(mi)) { /* Someone may still want to bind from us, let them do it. 
*/ - pr_debug("Temporary leave unbindable mount %s as private\n", service_mountpoint(mi)); - if (mount(NULL, service_mountpoint(mi), NULL, MS_PRIVATE, NULL)) { + pr_debug("Temporary leave unbindable mount %s as private\n", mi->mountpoint); + if (mount(NULL, mi->mountpoint, NULL, MS_PRIVATE, NULL)) { pr_perror("Unable to make %d private", mi->mnt_id); return -1; } list_add(&mi->mnt_unbindable, &delayed_unbindable); return 0; } - if (mount(NULL, service_mountpoint(mi), NULL, MS_UNBINDABLE, NULL)) { + if (mount(NULL, mi->mountpoint, NULL, MS_UNBINDABLE, NULL)) { pr_perror("Unable to make %d unbindable", mi->mnt_id); return -1; } @@ -2074,15 +1819,15 @@ static int restore_shared_options(struct mount_info *mi, bool private, bool shar } } - if (private && mount(NULL, service_mountpoint(mi), NULL, MS_PRIVATE, NULL)) { + if (private && mount(NULL, mi->mountpoint, NULL, MS_PRIVATE, NULL)) { pr_perror("Unable to make %d private", mi->mnt_id); return -1; } - if (slave && mount(NULL, service_mountpoint(mi), NULL, MS_SLAVE, NULL)) { + if (slave && mount(NULL, mi->mountpoint, NULL, MS_SLAVE, NULL)) { pr_perror("Unable to make %d slave", mi->mnt_id); return -1; } - if (shared && mount(NULL, service_mountpoint(mi), NULL, MS_SHARED, NULL)) { + if (shared && mount(NULL, mi->mountpoint, NULL, MS_SHARED, NULL)) { pr_perror("Unable to make %d shared", mi->mnt_id); return -1; } @@ -2099,8 +1844,6 @@ static int umount_from_slaves(struct mount_info *mi) struct mount_info *t; char *mpath, buf[PATH_MAX]; - BUG_ON(mi->parent == root_yard_mp); - list_for_each_entry(t, &mi->parent->mnt_slave_list, mnt_slave) { if (!t->mounted) continue; @@ -2138,7 +1881,7 @@ static int propagate_siblings(struct mount_info *mi) continue; if (t->bind && t->bind->shared_id == t->shared_id) continue; - pr_debug("\t\tBind share %s(%d)\n", t->ns_mountpoint, t->mnt_id); + pr_debug("\t\tBind share %s\n", t->mountpoint); t->bind = mi; t->s_dev_rt = mi->s_dev_rt; } @@ -2146,15 +1889,7 @@ static int 
propagate_siblings(struct mount_info *mi) list_for_each_entry(t, &mi->mnt_slave_list, mnt_slave) { if (t->mounted || t->bind) continue; - pr_debug("\t\tBind slave %s(%d)\n", t->ns_mountpoint, t->mnt_id); - t->bind = mi; - t->s_dev_rt = mi->s_dev_rt; - } - - list_for_each_entry(t, &mi->mnt_ext_slave, mnt_ext_slave) { - if (t->mounted || t->bind) - continue; - pr_debug("\t\tBind ext-slave %s(%d)\n", t->ns_mountpoint, t->mnt_id); + pr_debug("\t\tBind slave %s\n", t->mountpoint); t->bind = mi; t->s_dev_rt = mi->s_dev_rt; } @@ -2168,7 +1903,7 @@ static int propagate_mount(struct mount_info *mi) propagate_siblings(mi); - if (!mi->parent || mi->parent == root_yard_mp) + if (!mi->parent) goto skip_parent; umount_from_slaves(mi); @@ -2177,7 +1912,7 @@ static int propagate_mount(struct mount_info *mi) list_for_each_entry(p, &mi->mnt_propagate, mnt_propagate) { /* Should not propagate the same mount twice */ BUG_ON(p->mounted); - pr_debug("\t\tPropagate %s(%d)\n", p->ns_mountpoint, p->mnt_id); + pr_debug("\t\tPropagate %s\n", p->mountpoint); /* * When a mount is propagated, the result mount @@ -2203,11 +1938,11 @@ skip_parent: continue; if (t->bind) continue; - if (t->master_id) + if (t->master_id > 0) continue; if (!issubpath(t->root, mi->root)) continue; - pr_debug("\t\tBind private %s(%d)\n", t->ns_mountpoint, t->mnt_id); + pr_debug("\t\tBind private %s\n", t->mountpoint); t->bind = mi; t->s_dev_rt = mi->s_dev_rt; } @@ -2216,7 +1951,7 @@ skip_parent: return 0; } -int fetch_rt_stat(struct mount_info *m, const char *where) +static int fetch_rt_stat(struct mount_info *m, const char *where) { struct stat st; @@ -2229,15 +1964,23 @@ int fetch_rt_stat(struct mount_info *m, const char *where) return 0; } -int do_simple_mount(struct mount_info *mi, const char *src, const char *fstype, unsigned long mountflags) +/* + * Here are a set of flags which we know how to handle for the one mount call. + * All of them except MS_RDONLY are set only as mnt flags. 
+ * MS_RDONLY is set for both mnt ans sb flags, so we can restore it for one + * mount call only if it set for both masks. + */ +#define MS_MNT_KNOWN_FLAGS (MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_RDONLY) + +static int do_simple_mount(struct mount_info *mi, const char *src, const char *fstype, unsigned long mountflags) { - int ret = mount(src, service_mountpoint(mi), fstype, mountflags, mi->options); + int ret = mount(src, mi->mountpoint, fstype, mountflags, mi->options); if (ret) - pr_perror("Unable to mount %s %s (id=%d)", src, service_mountpoint(mi), mi->mnt_id); + pr_perror("Unable to mount %s %s (id=%d)", src, mi->mountpoint, mi->mnt_id); return ret; } -char *mnt_fsname(struct mount_info *mi) +static char *mnt_fsname(struct mount_info *mi) { if (mi->fstype->code == FSTYPE__AUTO) return mi->fsname; @@ -2265,12 +2008,12 @@ static int userns_mount(char *src, void *args, int fd, pid_t pid) return err; } -int apply_sb_flags(void *args, int fd, pid_t pid) +static int apply_sb_flags(void *args, int fd, pid_t pid) { return userns_mount(NULL, args, fd, pid); } -int mount_root(void *args, int fd, pid_t pid) +static int mount_root(void *args, int fd, pid_t pid) { return userns_mount(opts.root, args, fd, pid); } @@ -2298,31 +2041,40 @@ static int do_new_mount(struct mount_info *mi) sflags &= ~MS_RDONLY; if (do_mount(mi, src, mnt_fsname(mi), sflags) < 0) { - pr_perror("Can't mount at %s", service_mountpoint(mi)); + pr_perror("Can't mount at %s", mi->mountpoint); return -1; } if (tp->restore && tp->restore(mi)) return -1; - if (remount_ro) { + if (mi->mnt_id == CRTIME_MNT_ID) { + /* C-r time mountpoint, umount it */ + if (umount(mi->mountpoint) < 0) { + pr_perror("Can't umount %s", mi->mountpoint); + return -1; + } + goto out; + } + + if (!mi->is_ns_root && remount_ro) { int fd; - fd = open(service_mountpoint(mi), O_PATH); + fd = open(mi->mountpoint, O_PATH); if (fd < 0) { - pr_perror("Unable to open %s", service_mountpoint(mi)); + 
pr_perror("Unable to open %s", mi->mountpoint); return -1; } sflags |= MS_RDONLY | MS_REMOUNT; if (userns_call(apply_sb_flags, 0, &sflags, sizeof(sflags), fd)) { - pr_err("Unable to apply mount flags %d for %s\n", mi->sb_flags, service_mountpoint(mi)); + pr_err("Unable to apply mount flags %d for %s\n", mi->sb_flags, mi->mountpoint); close(fd); return -1; } close(fd); } - if (mflags && mount(NULL, service_mountpoint(mi), NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { + if (mflags && mount(NULL, mi->mountpoint, NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { pr_perror("Unable to apply bind-mount options"); return -1; } @@ -2334,18 +2086,18 @@ static int do_new_mount(struct mount_info *mi) BUG_ON(mi->master_id); if (restore_shared_options(mi, !mi->shared_id, mi->shared_id, 0)) return -1; - +out: mi->mounted = true; return 0; } -int restore_ext_mount(struct mount_info *mi) +static int restore_ext_mount(struct mount_info *mi) { int ret; - pr_debug("Restoring external bind mount %s\n", service_mountpoint(mi)); - ret = run_plugins(RESTORE_EXT_MOUNT, mi->mnt_id, service_mountpoint(mi), "/", NULL); + pr_debug("Restoring external bind mount %s\n", mi->mountpoint); + ret = run_plugins(RESTORE_EXT_MOUNT, mi->mnt_id, mi->mountpoint, "/", NULL); if (ret) pr_err("Can't restore ext mount (%d)\n", ret); return ret; @@ -2414,7 +2166,7 @@ static int do_bind_mount(struct mount_info *mi) goto out; } - if (mnt_is_nodev_external(mi)) { + if (mi->external) { /* * We have / pointing to criu's ns root still, * so just use the mapping's path. 
The mountpoint @@ -2438,9 +2190,9 @@ static int do_bind_mount(struct mount_info *mi) */ mi->private = mi->bind->private; - mnt_path = service_mountpoint(mi->bind); + mnt_path = mi->bind->mountpoint; - /* Access a mount by fd if service_mountpoint(mi->bind) is overmounted */ + /* Access a mount by fd if mi->bind->mountpoint is overmounted */ if (mi->bind->fd >= 0) { snprintf(mnt_fd_path, sizeof(mnt_fd_path), "/proc/self/fd/%d", mi->bind->fd); mnt_path = mnt_fd_path; @@ -2453,14 +2205,14 @@ static int do_bind_mount(struct mount_info *mi) * The target path may be over-mounted by one of child mounts * and we need to create a new bind-mount to get access to the path. */ - mp_len = strlen(service_mountpoint(mi->bind)); - if (mp_len > 1) /* skip a joining / if service_mountpoint(mi->bind) isn't "/" */ + mp_len = strlen(mi->bind->mountpoint); + if (mp_len > 1) /* skip a joining / if mi->bind->mountpoint isn't "/" */ mp_len++; list_for_each_entry(c, &mi->bind->children, siblings) { if (!c->mounted) continue; - if (issubpath(cut_root, service_mountpoint(c) + mp_len)) + if (issubpath(cut_root, c->mountpoint + mp_len)) break; /* a source path is overmounted */ } @@ -2481,11 +2233,11 @@ skip_overmount_check: snprintf(rpath, sizeof(rpath), "%s/%s", mnt_path, cut_root); root = rpath; do_bind: - pr_info("\tBind %s to %s\n", root, service_mountpoint(mi)); + pr_info("\tBind %s to %s\n", root, mi->mountpoint); if (unlikely(mi->deleted)) { - if (stat(service_mountpoint(mi), &st)) { - pr_perror("Can't fetch stat on %s", service_mountpoint(mi)); + if (stat(mi->mountpoint, &st)) { + pr_perror("Can't fetch stat on %s", mi->mountpoint); goto err; } @@ -2507,15 +2259,15 @@ do_bind: } } - if (mount(root, service_mountpoint(mi), NULL, MS_BIND | (mi->flags & MS_REC), NULL) < 0) { - pr_perror("Can't bind-mount at %s", service_mountpoint(mi)); + if (mount(root, mi->mountpoint, NULL, MS_BIND | (mi->flags & MS_REC), NULL) < 0) { + pr_perror("Can't mount at %s", mi->mountpoint); goto err; } mflags 
= mi->flags & (~MS_PROPAGATE); if (!mi->bind || mflags != (mi->bind->flags & (~MS_PROPAGATE))) - if (mount(NULL, service_mountpoint(mi), NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) { - pr_perror("Can't re-mount at %s", service_mountpoint(mi)); + if (mount(NULL, mi->mountpoint, NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) { + pr_perror("Can't mount at %s", mi->mountpoint); goto err; } @@ -2560,26 +2312,22 @@ err: return exit_code; } +static bool rst_mnt_is_root(struct mount_info *m) +{ + return (m->is_ns_root && m->nsid->id == root_item->ids->mnt_ns_id); +} + static bool can_mount_now(struct mount_info *mi) { - struct mount_info *ext; - - if (rst_mnt_is_root(mi)) { - pr_debug("%s: true as %d is mntns root\n", __func__, mi->mnt_id); + if (rst_mnt_is_root(mi)) return true; - } /* Parent should be mounted already, that's how mnt_tree_for_each works */ BUG_ON(mi->parent && !mi->parent->mounted); - if (mnt_is_nodev_external(mi)) + if (mi->external) goto shared; - if (!mi->bind && !mi->external && (ext = mnt_get_external_bind(mi)) && !has_mounted_external_bind(mi)) { - pr_debug("%s: false as %d's external %d is not mounted\n", __func__, mi->mnt_id, ext->mnt_id); - return false; - } - /* * We're the slave peer: * - Make sure the master peer is already mounted @@ -2589,35 +2337,21 @@ static bool can_mount_now(struct mount_info *mi) if (mi->mnt_master) { struct mount_info *c, *s; - if (mi->bind == NULL) { - pr_debug("%s: false as %d is slave with unmounted master %d\n", __func__, mi->mnt_id, - mi->mnt_master->mnt_id); + if (mi->bind == NULL) return false; - } - list_for_each_entry(c, &mi->mnt_master->children, siblings) { - if (!c->mounted) { - pr_debug("%s: false as %d is slave with unmounted master's children %d\n", __func__, - mi->mnt_id, c->mnt_id); + list_for_each_entry(c, &mi->mnt_master->children, siblings) + if (!c->mounted) return false; - } - } - list_for_each_entry(s, &mi->mnt_master->mnt_share, mnt_share) { - list_for_each_entry(c, &s->children, siblings) { - if 
(!c->mounted) { - pr_debug("%s: false as %d is slave with unmounted children of master's share\n", - __func__, mi->mnt_id); + list_for_each_entry(s, &mi->mnt_master->mnt_share, mnt_share) + list_for_each_entry(c, &s->children, siblings) + if (!c->mounted) return false; - } - } - } } - if (!fsroot_mounted(mi) && (mi->bind == NULL && !mi->need_plugin)) { - pr_debug("%s: false as %d is non-root without bind or plugin\n", __func__, mi->mnt_id); + if (!fsroot_mounted(mi) && (mi->bind == NULL && !mi->need_plugin)) return false; - } shared: /* Mount only after all parents of our propagation group mounted */ @@ -2626,11 +2360,8 @@ shared: list_for_each_entry(p, &mi->mnt_propagate, mnt_propagate) { BUG_ON(!p->parent); - if (!p->parent->mounted) { - pr_debug("%s: false as %d has unmounted parent %d of its propagation group\n", __func__, - mi->mnt_id, p->parent->mnt_id); + if (!p->parent->mounted) return false; - } } } @@ -2677,11 +2408,8 @@ shared: /* Check not propagated mounts mounted and cleanup list */ list_for_each_entry_safe(p, t, &mi_notprop, mnt_notprop) { - if (!p->mounted) { - pr_debug("%s: false as %d has unmounted 'anti'-propagation mount %d\n", __func__, - mi->mnt_id, p->mnt_id); + if (!p->mounted) can = false; - } list_del_init(&p->mnt_notprop); } @@ -2694,17 +2422,10 @@ shared: static int do_mount_root(struct mount_info *mi) { - unsigned long mflags = mi->flags & (~MS_PROPAGATE); - if (restore_shared_options(mi, !mi->shared_id && !mi->master_id, mi->shared_id, mi->master_id)) return -1; - if (mflags && mount(NULL, service_mountpoint(mi), NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { - pr_perror("Unable to apply root mount options"); - return -1; - } - - return fetch_rt_stat(mi, service_mountpoint(mi)); + return fetch_rt_stat(mi, mi->mountpoint); } static int do_close_one(struct mount_info *mi) @@ -2715,7 +2436,7 @@ static int do_close_one(struct mount_info *mi) static int set_unbindable(struct mount_info *mi) { - if (mount(NULL, service_mountpoint(mi), NULL, 
MS_UNBINDABLE, NULL)) { + if (mount(NULL, mi->mountpoint, NULL, MS_UNBINDABLE, NULL)) { pr_perror("Failed setting unbindable flag on %d", mi->mnt_id); return -1; } @@ -2731,19 +2452,19 @@ static int do_mount_one(struct mount_info *mi) return 0; if (!can_mount_now(mi)) { - pr_debug("Postpone mount %s(%d)\n", mi->ns_mountpoint, mi->mnt_id); + pr_debug("Postpone slave %s\n", mi->mountpoint); return 1; } - if ((mi->parent && mi->parent != root_yard_mp) && !strcmp(mi->parent->ns_mountpoint, mi->ns_mountpoint)) { - mi->parent->fd = open(service_mountpoint(mi->parent), O_PATH); + if (!strcmp(mi->parent->mountpoint, mi->mountpoint)) { + mi->parent->fd = open(mi->parent->mountpoint, O_PATH); if (mi->parent->fd < 0) { - pr_perror("Unable to open %s", service_mountpoint(mi)); + pr_perror("Unable to open %s", mi->mountpoint); return -1; } } - pr_debug("\tMounting %s %d@%s (%d)\n", mi->fstype->name, mi->mnt_id, service_mountpoint(mi), mi->need_plugin); + pr_debug("\tMounting %s @%s (%d)\n", mi->fstype->name, mi->mountpoint, mi->need_plugin); if (rst_mnt_is_root(mi)) { int fd; @@ -2756,22 +2477,21 @@ static int do_mount_one(struct mount_info *mi) /* do_mount_root() is called from populate_mnt_ns() */ if (root_ns_mask & CLONE_NEWUSER) { - fd = open(service_mountpoint(mi), O_PATH); + fd = open(mi->mountpoint, O_PATH); if (fd < 0) { - pr_perror("Unable to open %s", service_mountpoint(mi)); + pr_perror("Unable to open %s", mi->mountpoint); return -1; } if (userns_call(mount_root, 0, &flags, sizeof(flags), fd)) { - pr_err("Unable to mount %s\n", service_mountpoint(mi)); + pr_err("Unable to mount %s\n", mi->mountpoint); close(fd); return -1; } close(fd); } else { - if (mount(opts.root, service_mountpoint(mi), NULL, flags, NULL)) { - pr_perror("Unable to mount %s %s (id=%d)", opts.root, service_mountpoint(mi), - mi->mnt_id); + if (mount(opts.root, mi->mountpoint, NULL, flags, NULL)) { + pr_perror("Unable to mount %s %s (id=%d)", opts.root, mi->mountpoint, mi->mnt_id); return -1; } } @@ 
-2780,13 +2500,12 @@ static int do_mount_one(struct mount_info *mi) return -1; mi->mounted = true; ret = 0; - } else if (!mi->bind && !mi->need_plugin && !mnt_is_nodev_external(mi)) { + } else if (!mi->bind && !mi->need_plugin && !mi->external) ret = do_new_mount(mi); - } else { + else ret = do_bind_mount(mi); - } - if (ret == 0 && fetch_rt_stat(mi, service_mountpoint(mi))) + if (ret == 0 && fetch_rt_stat(mi, mi->mountpoint)) return -1; if (ret == 0 && propagate_mount(mi)) @@ -2795,8 +2514,8 @@ static int do_mount_one(struct mount_info *mi) if (mi->fstype->code == FSTYPE__UNSUPPORTED) { struct statfs st; - if (statfs(service_mountpoint(mi), &st)) { - pr_perror("Unable to statfs %s", service_mountpoint(mi)); + if (statfs(mi->mountpoint, &st)) { + pr_perror("Unable to statfs %s", mi->mountpoint); return -1; } if (st.f_type == BTRFS_SUPER_MAGIC) @@ -2811,17 +2530,17 @@ static int do_umount_one(struct mount_info *mi) if (!mi->parent) return 0; - if (mount("none", service_mountpoint(mi->parent), "none", MS_REC | MS_PRIVATE, NULL)) { - pr_perror("Can't mark %s as private", service_mountpoint(mi->parent)); + if (mount("none", mi->parent->mountpoint, "none", MS_REC | MS_PRIVATE, NULL)) { + pr_perror("Can't mark %s as private", mi->parent->mountpoint); return -1; } - if (umount(service_mountpoint(mi))) { - pr_perror("Can't umount at %s", service_mountpoint(mi)); + if (umount(mi->mountpoint)) { + pr_perror("Can't umount at %s", mi->mountpoint); return -1; } - pr_info("Umounted at %s\n", service_mountpoint(mi)); + pr_info("Umounted at %s\n", mi->mountpoint); return 0; } @@ -2834,11 +2553,15 @@ static int do_umount_one(struct mount_info *mi) * roots_yard where it will be restored. The remapped mount will be * moved to the right places after restoring all mounts. 
*/ + +static inline int print_ns_root(struct ns_id *ns, int remap_id, char *buf, int bs); +static int get_mp_mountpoint(char *mountpoint, struct mount_info *mi, char *root, int root_len); + static LIST_HEAD(mnt_remap_list); static int remap_id; struct mnt_remap_entry { - struct mount_info *mi; /* child is remapped into the root yards */ + struct mount_info *mi; /* child is remaped into the root yards */ struct mount_info *parent; /* the origin parent for the child*/ struct list_head node; }; @@ -2932,7 +2655,7 @@ static int fixup_remap_mounts(void) return 0; } -int cr_pivot_root(char *root) +static int cr_pivot_root(char *root) { char tmp_dir_tmpl[] = "crtools-put-root.XXXXXX"; bool tmp_dir = false; @@ -3000,7 +2723,7 @@ err_root: return exit_code; } -struct mount_info *mnt_entry_alloc(bool rst) +struct mount_info *mnt_entry_alloc() { struct mount_info *new; @@ -3011,30 +2734,17 @@ struct mount_info *mnt_entry_alloc(bool rst) new = xzalloc(sizeof(struct mount_info)); if (new) { - if (rst) { - new->rmi = shmalloc(sizeof(struct rst_mount_info)); - if (!new->rmi) { - xfree(new); - return NULL; - } - memset(new->rmi, 0, sizeof(struct rst_mount_info)); - } - new->mp_fd_id = -1; - new->mnt_fd_id = -1; - new->is_dir = -1; new->fd = -1; new->is_overmounted = -1; INIT_LIST_HEAD(&new->children); INIT_LIST_HEAD(&new->siblings); INIT_LIST_HEAD(&new->mnt_slave_list); - INIT_LIST_HEAD(&new->mnt_ext_slave); INIT_LIST_HEAD(&new->mnt_share); INIT_LIST_HEAD(&new->mnt_bind); INIT_LIST_HEAD(&new->mnt_propagate); INIT_LIST_HEAD(&new->mnt_notprop); INIT_LIST_HEAD(&new->mnt_unbindable); INIT_LIST_HEAD(&new->postpone); - INIT_LIST_HEAD(&new->deleted_list); } return new; } @@ -3044,7 +2754,6 @@ void mnt_entry_free(struct mount_info *mi) if (mi) { xfree(mi->root); xfree(mi->mountpoint); - xfree(mi->plain_mountpoint); xfree(mi->source); xfree(mi->options); xfree(mi->fsname); @@ -3056,7 +2765,7 @@ void mnt_entry_free(struct mount_info *mi) * Helper for getting a path to where the namespace's 
root * is re-constructed. */ -int print_ns_root(struct ns_id *ns, int remap_id, char *buf, int bs) +static inline int print_ns_root(struct ns_id *ns, int remap_id, char *buf, int bs) { return snprintf(buf, bs, "%s/%d-%010d", mnt_roots, ns->id, remap_id); } @@ -3095,7 +2804,7 @@ static int get_mp_root(MntEntry *me, struct mount_info *mi) me->ext_key = me->root; /* * Putting the id of external mount which is provided by user, - * to ->root can confuse mnt_is_external_bind and other functions + * to ->root can confuse mnt_is_external and other functions * which expect to see the path in the file system to the root * of these mount (mounts_equal, mnt_build_ids_tree, * find_fsroot_mount_for, find_best_external_match, etc.) @@ -3115,11 +2824,10 @@ static int get_mp_root(MntEntry *me, struct mount_info *mi) * from the command line and put into root's place */ - if (!strcmp(me->ext_key, AUTODETECTED_MOUNT)) { + ext = ext_mount_lookup(me->ext_key); + if (!ext) { if (!opts.autodetect_ext_mounts) { - pr_err("Mount %d:%s is autodetected external mount. 
" - "Try \"--ext-mount-map auto\" to allow them.\n", - mi->mnt_id, mi->ns_mountpoint); + pr_err("No mapping for %s mountpoint\n", me->mountpoint); return -1; } @@ -3133,14 +2841,6 @@ static int get_mp_root(MntEntry *me, struct mount_info *mi) */ ext = mi->source; - } else if (!strcmp(me->ext_key, EXTERNAL_DEV_MOUNT)) { - ext = EXTERNAL_DEV_MOUNT; - } else { - ext = ext_mount_lookup(me->ext_key); - if (!ext) { - pr_err("No mapping for %d:%s mountpoint\n", mi->mnt_id, mi->ns_mountpoint); - return -1; - } } mi->external = ext; @@ -3170,11 +2870,7 @@ static int get_mp_mountpoint(char *mountpoint, struct mount_info *mi, char *root mi->ns_mountpoint = mi->mountpoint + root_len; - mi->plain_mountpoint = get_plain_mountpoint(mi->mnt_id, NULL); - if (!mi->plain_mountpoint) - return -1; - - pr_debug("\t\tWill mount %d @ %s %s\n", mi->mnt_id, service_mountpoint(mi), mi->ns_mountpoint); + pr_debug("\t\tWill mount %d @ %s\n", mi->mnt_id, mi->mountpoint); return 0; } @@ -3260,12 +2956,13 @@ static int collect_mnt_from_image(struct mount_info **head, struct mount_info ** if (ret <= 0) break; - pm = mnt_entry_alloc(true); + pm = mnt_entry_alloc(); if (!pm) goto err; pm->nsid = nsid; - mntinfo_add_list_before(head, pm); + pm->next = *head; + *head = pm; if (!*tail) *tail = pm; @@ -3324,9 +3021,7 @@ static int collect_mnt_from_image(struct mount_info **head, struct mount_info ** if (get_mp_mountpoint(me->mountpoint, pm, root, root_len)) goto err; - pr_debug("\t" - "Read %d mp @ %s\n", - pm->mnt_id, pm->ns_mountpoint); + pr_debug("\tRead %d mp @ %s\n", pm->mnt_id, pm->mountpoint); } if (me) @@ -3340,41 +3035,6 @@ err: return -1; } -static int merge_mount_trees(void) -{ - struct ns_id *nsid; - - root_yard_mp = mnt_entry_alloc(true); - if (!root_yard_mp) - return -1; - - root_yard_mp->mountpoint = mnt_roots; - root_yard_mp->plain_mountpoint = xstrdup(mnt_roots); - if (!root_yard_mp->plain_mountpoint) - return -1; - root_yard_mp->is_dir = true; - root_yard_mp->mounted = true; - 
root_yard_mp->mnt_bind_is_populated = true; - root_yard_mp->is_overmounted = false; - root_yard_mp->mnt_id = HELPER_MNT_ID; - - /* Merge mount trees together under root_yard_mp */ - for (nsid = ns_ids; nsid; nsid = nsid->next) { - struct mount_info *root; - - if (nsid->nd != &mnt_ns_desc) - continue; - - root = nsid->mnt.mntinfo_tree; - - pr_debug("Mountpoint %d (@%s) moved to the root yard\n", root->mnt_id, root->ns_mountpoint); - root->parent = root_yard_mp; - list_add(&root->siblings, &root_yard_mp->children); - } - - return 0; -} - int read_mnt_ns_img(void) { struct mount_info *pms = NULL; @@ -3398,24 +3058,11 @@ int read_mnt_ns_img(void) if (!nsid->mnt.mntinfo_tree) return -1; - /* mntns root mounts are always directories */ - nsid->mnt.mntinfo_tree->is_dir = true; - tail->next = pms; pms = head; } mntinfo = pms; - - search_bindmounts(); - prepare_is_overmounted(); - - if (!opts.mntns_compat_mode && resolve_shared_mounts_v2()) - return -1; - - if (merge_mount_trees()) - return -1; - return 0; } @@ -3514,10 +3161,42 @@ void fini_restore_mntns(void) } } +static int merge_mount_trees(struct mount_info *root_yard) +{ + struct mount_info *first = NULL; + struct ns_id *nsid; + + /* Merge mount trees together under root_yard */ + for (nsid = ns_ids; nsid; nsid = nsid->next) { + struct mount_info *root; + + if (nsid->nd != &mnt_ns_desc) + continue; + + root = nsid->mnt.mntinfo_tree; + + if (!first) + first = root; + else if (!mounts_sb_equal(root, first) || strcmp(root->root, first->root)) { + pr_err("Nested mount namespaces with different " + "roots %d (@%s %s) %d (@%s %s) are not supported yet\n", + root->mnt_id, root->mountpoint, root->root, first->mnt_id, first->mountpoint, + first->root); + return -1; + } + + pr_debug("Mountpoint %d (@%s) moved to the root yard\n", root->mnt_id, root->mountpoint); + root->parent = root_yard; + list_add(&root->siblings, &root_yard->children); + } + + return 0; +} + /* * All nested mount namespaces are restore as sub-trees of the 
root namespace. */ -static int populate_roots_yard(struct mount_info *cr_time) +static int populate_roots_yard(void) { struct mnt_remap_entry *r; char path[PATH_MAX]; @@ -3542,44 +3221,50 @@ static int populate_roots_yard(struct mount_info *cr_time) * contains mounts which has to be restored separately */ list_for_each_entry(r, &mnt_remap_list, node) { - if (mkdirpat(AT_FDCWD, service_mountpoint(r->mi), 0755)) { - pr_perror("Unable to create %s", service_mountpoint(r->mi)); + if (mkdirpat(AT_FDCWD, r->mi->mountpoint, 0755)) { + pr_perror("Unable to create %s", r->mi->mountpoint); return -1; } } - if (cr_time && mkdirpat(AT_FDCWD, service_mountpoint(cr_time), 0755)) { - pr_perror("Unable to create %s", service_mountpoint(cr_time)); - return -1; - } - return 0; } static int populate_mnt_ns(void) { - struct mount_info *cr_time = NULL; int ret; + root_yard_mp = mnt_entry_alloc(); + if (!root_yard_mp) + return -1; + + root_yard_mp->mountpoint = mnt_roots; + root_yard_mp->mounted = true; + + if (merge_mount_trees(root_yard_mp)) + return -1; + #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED if (!opts.has_binfmt_misc && !list_empty(&binfmt_misc_list)) { /* Add to mount tree. 
Generic code will mount it later */ - cr_time = add_cr_time_mount(root_yard_mp, "binfmt_misc", "binfmt_misc", 0, true); - if (!cr_time) + ret = add_cr_time_mount(root_yard_mp, "binfmt_misc", BINFMT_MISC_HOME, 0); + if (ret) return -1; } #endif - if (resolve_shared_mounts(mntinfo)) + if (resolve_shared_mounts(mntinfo, 0)) return -1; if (validate_mounts(mntinfo, false)) return -1; + mnt_tree_for_each(root_yard_mp, set_is_overmounted); + if (find_remap_mounts(root_yard_mp)) return -1; - if (populate_roots_yard(cr_time)) + if (populate_roots_yard()) return -1; if (mount_clean_path()) @@ -3736,9 +3421,6 @@ int prepare_mnt_ns(void) free_mntinfo(old); } - if (!opts.mntns_compat_mode) - return prepare_mnt_ns_v2(); - ret = populate_mnt_ns(); if (ret) return -1; @@ -3959,6 +3641,7 @@ int mntns_get_root_by_mnt_id(int mnt_id) struct collect_mntns_arg { bool need_to_validate; bool for_dump; + int root_master_id; }; static int collect_mntns(struct ns_id *ns, void *__arg) @@ -3975,6 +3658,9 @@ static int collect_mntns(struct ns_id *ns, void *__arg) mntinfo_add_list(pms); + if (arg->need_to_validate && ns->id == root_item->ids->mnt_ns_id) + arg->root_master_id = ns->mnt.mntinfo_tree->master_id; + return 0; } @@ -3990,8 +3676,6 @@ int collect_mnt_namespaces(bool for_dump) if (ret) goto err; - search_bindmounts(); - #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED if (for_dump && !opts.has_binfmt_misc) { unsigned int s_dev = 0; @@ -4004,10 +3688,16 @@ int collect_mnt_namespaces(bool for_dump) if (ns) { ret = mount_cr_time_mount(ns, &s_dev, "binfmt_misc", "/" BINFMT_MISC_HOME, "binfmt_misc"); - if (ret == -1) { + if (ret == -EPERM) + pr_info("Can't mount binfmt_misc: EPERM. 
Running in user_ns?\n"); + else if (ret < 0 && ret != -EBUSY && ret != -ENODEV && ret != -ENOENT) { + pr_err("Can't mount binfmt_misc: %d %s\n", ret, strerror(-ret)); goto err; - } else if (ret == 0 && !add_cr_time_mount(ns->mnt.mntinfo_tree, "binfmt_misc", BINFMT_MISC_HOME, - s_dev, false)) { + } else if (ret == 0) { + ret = -1; + goto err; + } else if (ret > 0 && add_cr_time_mount(ns->mnt.mntinfo_tree, "binfmt_misc", BINFMT_MISC_HOME, + s_dev) < 0) { ret = -1; goto err; } @@ -4022,7 +3712,7 @@ int collect_mnt_namespaces(bool for_dump) if (arg.need_to_validate) { ret = -1; - if (resolve_shared_mounts(mntinfo)) + if (resolve_shared_mounts(mntinfo, arg.root_master_id)) goto err; if (validate_mounts(mntinfo, true)) goto err; @@ -4065,7 +3755,7 @@ void clean_cr_time_mounts(void) for (mi = mntinfo; mi; mi = mi->next) { int cwd_fd; - if (mi->mnt_id != HELPER_MNT_ID) + if (mi->mnt_id != CRTIME_MNT_ID) continue; ret = switch_mnt_ns(mi->nsid->ns_pid, &ns_old, &cwd_fd); if (ret) { @@ -4073,8 +3763,8 @@ void clean_cr_time_mounts(void) continue; } - if (umount(mi->ns_mountpoint) < 0) - pr_perror("Can't umount forced mount %s", mi->ns_mountpoint); + if (umount(mi->mountpoint) < 0) + pr_perror("Can't umount forced mount %s", mi->mountpoint); if (restore_mnt_ns(ns_old, &cwd_fd)) { pr_err("cleanup_forced_mounts exiting with wrong mnt_ns\n"); @@ -4131,7 +3821,7 @@ static int ns_remount_writable(void *arg) if (mount(NULL, mi->ns_mountpoint, NULL, MS_REMOUNT | MS_BIND | (mi->flags & ~(MS_PROPAGATE | MS_RDONLY)), NULL) == -1) { - pr_perror("Failed to remount %d:%s writable", mi->mnt_id, mi->ns_mountpoint); + pr_perror("Failed to remount %d:%s writable", mi->mnt_id, mi->mountpoint); return 1; } return 0; @@ -4148,10 +3838,7 @@ int try_remount_writable(struct mount_info *mi, bool ns) if (!ns) remounted = REMOUNTED_RW_SERVICE; - /* All mounts in mntinfo list should have it on restore */ - BUG_ON(mi->rmi == NULL); - - if (mi->flags & MS_RDONLY && !(mi->rmi->remounted_rw & remounted)) { + 
if (mi->flags & MS_RDONLY && !(mi->remounted_rw & remounted)) { if (mnt_is_overmounted(mi)) { pr_err("The mount %d is overmounted so paths are invisible\n", mi->mnt_id); return -1; @@ -4163,18 +3850,18 @@ int try_remount_writable(struct mount_info *mi, bool ns) return -1; } - pr_info("Remount %d:%s writable\n", mi->mnt_id, service_mountpoint(mi)); + pr_info("Remount %d:%s writable\n", mi->mnt_id, mi->mountpoint); if (!ns) { - if (mount(NULL, service_mountpoint(mi), NULL, + if (mount(NULL, mi->mountpoint, NULL, MS_REMOUNT | MS_BIND | (mi->flags & ~(MS_PROPAGATE | MS_RDONLY)), NULL) == -1) { - pr_perror("Failed to remount %d:%s writable", mi->mnt_id, service_mountpoint(mi)); + pr_perror("Failed to remount %d:%s writable", mi->mnt_id, mi->mountpoint); return -1; } } else { if (call_helper_process(ns_remount_writable, mi)) return -1; } - mi->rmi->remounted_rw |= remounted; + mi->remounted_rw |= remounted; } return 0; @@ -4189,7 +3876,7 @@ static int __remount_readonly_mounts(struct ns_id *ns) if (ns && mi->nsid != ns) continue; - if (!(mi->rmi->remounted_rw & REMOUNTED_RW)) + if (!(mi->remounted_rw && REMOUNTED_RW)) continue; /* @@ -4204,9 +3891,9 @@ static int __remount_readonly_mounts(struct ns_id *ns) pr_debug("Switched to mntns %u:%u\n", ns->id, ns->kid); } - pr_info("Remount %d:%s back to readonly\n", mi->mnt_id, mi->ns_mountpoint); + pr_info("Remount %d:%s back to readonly\n", mi->mnt_id, mi->mountpoint); if (mount(NULL, mi->ns_mountpoint, NULL, MS_REMOUNT | MS_BIND | (mi->flags & ~MS_PROPAGATE), NULL)) { - pr_perror("Failed to restore %d:%s mount flags %x", mi->mnt_id, mi->ns_mountpoint, mi->flags); + pr_perror("Failed to restore %d:%s mount flags %x", mi->mnt_id, mi->mountpoint, mi->flags); return -1; } } diff --git a/criu/namespaces.c b/criu/namespaces.c index 0c9b16a87..7fa58682b 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include "cgroup.h" #include 
"fdstore.h" #include "kerndat.h" -#include "util-caps.h" #include "protobuf.h" #include "util.h" @@ -284,6 +284,7 @@ int restore_ns(int rst, struct ns_desc *nd) int switch_mnt_ns(int pid, int *rst, int *cwd_fd) { + int ret; int fd; if (!cwd_fd) @@ -292,12 +293,13 @@ int switch_mnt_ns(int pid, int *rst, int *cwd_fd) fd = open(".", O_PATH); if (fd < 0) { pr_perror("unable to open current directory"); - return -1; + return fd; } - if (switch_ns(pid, &mnt_ns_desc, rst)) { + ret = switch_ns(pid, &mnt_ns_desc, rst); + if (ret < 0) { close(fd); - return -1; + return ret; } *cwd_fd = fd; @@ -306,22 +308,23 @@ int switch_mnt_ns(int pid, int *rst, int *cwd_fd) int restore_mnt_ns(int rst, int *cwd_fd) { - int exit_code = -1; + int ret = -1; - if (restore_ns(rst, &mnt_ns_desc)) + ret = restore_ns(rst, &mnt_ns_desc); + if (ret < 0) goto err_restore; - if (cwd_fd && fchdir(*cwd_fd)) { - pr_perror("Unable to restore current directory"); - goto err_restore; + if (cwd_fd) { + ret = fchdir(*cwd_fd); + if (ret) + pr_perror("unable to restore current directory"); } - exit_code = 0; err_restore: if (cwd_fd) close_safe(cwd_fd); - return exit_code; + return ret; } struct ns_id *ns_ids = NULL; @@ -888,7 +891,7 @@ int collect_user_ns(struct ns_id *ns, void *oarg) { /* * User namespace is dumped before files to get uid and gid - * mappings, which are used for converting local id-s to + * mappings, which are used for convirting local id-s to * userns id-s (userns_uid(), userns_gid()) */ if (dump_user_ns(root_item->pid->real, root_item->ids->user_ns_id)) @@ -1009,31 +1012,36 @@ int dump_user_ns(pid_t pid, int ns_id) ret = parse_id_map(pid, "uid_map", &e->uid_map); if (ret < 0) - /* - * The uid_map and gid_map is clean up in free_userns_maps - * later, so we don't need to clean these up in error cases. 
- */ - return -1; - + goto err; e->n_uid_map = ret; ret = parse_id_map(pid, "gid_map", &e->gid_map); if (ret < 0) - return -1; + goto err; e->n_gid_map = ret; if (check_user_ns(pid)) - return -1; + goto err; img = open_image(CR_FD_USERNS, O_DUMP, ns_id); if (!img) - return -1; + goto err; ret = pb_write_one(img, e, PB_USERNS); close_image(img); if (ret < 0) - return -1; + goto err; return 0; +err: + if (e->uid_map) { + xfree(e->uid_map[0]); + xfree(e->uid_map); + } + if (e->gid_map) { + xfree(e->gid_map[0]); + xfree(e->gid_map); + } + return -1; } void free_userns_maps(void) @@ -1209,9 +1217,20 @@ static int write_id_map(pid_t pid, UidGidExtent **extents, int n, char *id_map) return 0; } +struct unsc_msg { + struct msghdr h; + /* + * 0th is the call address + * 1st is the flags + * 2nd is the optional (NULL in response) arguments + */ + struct iovec iov[3]; + char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))]; +}; + static int usernsd_pid; -inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd, pid_t *pid) +static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, size_t asize, int fd) { struct cmsghdr *ch; struct ucred *ucred; @@ -1249,10 +1268,7 @@ inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, ch->cmsg_type = SCM_CREDENTIALS; ucred = (struct ucred *)CMSG_DATA(ch); - if (pid) - ucred->pid = *pid; - else - ucred->pid = getpid(); + ucred->pid = getpid(); ucred->uid = getuid(); ucred->gid = getgid(); @@ -1267,7 +1283,7 @@ inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c, int *x, void *arg, } } -void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) +static void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd) { struct cmsghdr *ch; struct ucred *ucred; @@ -1305,7 +1321,7 @@ static int usernsd(int sk) int flags, fd, ret; pid_t pid; - unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0, NULL); + unsc_msg_init(&um, 
&call, &flags, msg, sizeof(msg), 0); if (recvmsg(sk, &um.h, 0) <= 0) { pr_perror("uns: recv req error"); return -1; @@ -1314,6 +1330,11 @@ static int usernsd(int sk) unsc_msg_pid_fd(&um, &pid, &fd); pr_debug("uns: daemon calls %p (%d, %d, %x)\n", call, pid, fd, flags); + if (fd < 0 && flags & UNS_FDOUT) { + pr_err("uns: bad flags/fd %p %d %x\n", call, fd, flags); + BUG(); + } + /* * Caller has sent us bare address of the routine it * wants to call. Since the caller is fork()-ed from the @@ -1350,7 +1371,7 @@ static int usernsd(int sk) else fd = -1; - unsc_msg_init(&um, &call, &ret, NULL, 0, fd, NULL); + unsc_msg_init(&um, &call, &ret, NULL, 0, fd); if (sendmsg(sk, &um.h, 0) <= 0) { pr_perror("uns: send resp error"); return -1; @@ -1401,7 +1422,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, /* Send the request */ - unsc_msg_init(&um, &call, &flags, arg, arg_size, fd, NULL); + unsc_msg_init(&um, &call, &flags, arg, arg_size, fd); ret = sendmsg(sk, &um.h, 0); if (ret <= 0) { pr_perror("uns: send req error"); @@ -1416,7 +1437,7 @@ int __userns_call(const char *func_name, uns_call_t call, int flags, void *arg, /* Get the response back */ - unsc_msg_init(&um, &call, &res, NULL, 0, 0, NULL); + unsc_msg_init(&um, &call, &res, NULL, 0, 0); ret = recvmsg(sk, &um.h, 0); if (ret <= 0) { pr_perror("uns: recv resp error"); @@ -1437,11 +1458,14 @@ out: return ret; } -int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)) +static int start_usernsd(void) { int sk[2]; int one = 1; + if (!(root_ns_mask & CLONE_NEWUSER)) + return 0; + /* * Seqpacket to * @@ -1449,7 +1473,7 @@ int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)) * each other easily. Stream socket require manual * messages boundaries. * - * b) Make callers note the daemon death by seeing the + * b) Make callers note the damon death by seeing the * disconnected socket. In case of dgram socket * callers would just get stuck in receiving the * response. 
@@ -1470,39 +1494,24 @@ int start_unix_cred_daemon(pid_t *pid, int (*daemon_func)(int sk)) return -1; } - *pid = fork(); - if (*pid < 0) { - pr_perror("Can't unix daemon"); + usernsd_pid = fork(); + if (usernsd_pid < 0) { + pr_perror("Can't fork usernsd"); close(sk[0]); close(sk[1]); return -1; } - if (*pid == 0) { + if (usernsd_pid == 0) { int ret; + close(sk[0]); - ret = daemon_func(sk[1]); + ret = usernsd(sk[1]); exit(ret); } + close(sk[1]); - - return sk[0]; -} - -static int start_usernsd(void) -{ - int sk; - - if (!(root_ns_mask & CLONE_NEWUSER)) - return 0; - - sk = start_unix_cred_daemon(&usernsd_pid, usernsd); - if (sk < 0) { - pr_err("failed to start usernsd\n"); - return -1; - } - - if (install_service_fd(USERNSD_SK, sk) < 0) { + if (install_service_fd(USERNSD_SK, sk[0]) < 0) { kill(usernsd_pid, SIGKILL); waitpid(usernsd_pid, NULL, 0); return -1; @@ -1619,12 +1628,10 @@ int collect_namespaces(bool for_dump) int prepare_userns_creds(void) { - if (!opts.unprivileged || has_cap_setuid(opts.cap_eff)) { - /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */ - if (setuid(0) || setgid(0) || setgroups(0, NULL)) { - pr_perror("Unable to initialize id-s"); - return -1; - } + /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */ + if (setuid(0) || setgid(0) || setgroups(0, NULL)) { + pr_perror("Unable to initialize id-s"); + return -1; } /* diff --git a/criu/net.c b/criu/net.c index e5775a328..7b45f0633 100644 --- a/criu/net.c +++ b/criu/net.c @@ -51,9 +51,6 @@ #include "images/netdev.pb-c.h" #include "images/inventory.pb-c.h" -#undef LOG_PREFIX -#define LOG_PREFIX "net: " - #ifndef IFLA_NEW_IFINDEX #define IFLA_NEW_IFINDEX 49 #endif @@ -111,18 +108,15 @@ int read_ns_sys_file(char *path, char *buf, int len) } rlen = read(fd, buf, len); - if (rlen == -1) - pr_perror("Can't read ns' %s", path); close(fd); if (rlen == len) { - buf[0] = '\0'; pr_err("Too small buffer to read ns sys file %s\n", path); return -1; } - if (rlen >= 0) - 
buf[rlen] = '\0'; + if (rlen > 0) + buf[rlen - 1] = '\0'; return rlen; } @@ -359,23 +353,22 @@ static int ipv6_conf_op(char *tgt, SysctlEntry **conf, int n, int op, SysctlEntr return net_conf_op(tgt, conf, n, op, "ipv6", req, path, ARRAY_SIZE(devconfs6), devconfs6, def_conf); } -static int unix_conf_op(SysctlEntry ***rconf, size_t *pn, int op) +static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) { int i, ret = -1, flags = 0; char path[ARRAY_SIZE(unix_conf_entries)][MAX_CONF_UNIX_PATH] = {}; struct sysctl_req req[ARRAY_SIZE(unix_conf_entries)] = {}; SysctlEntry **conf = *rconf; - size_t n = *pn; - if (n != ARRAY_SIZE(unix_conf_entries)) { - pr_err("unix: Unexpected entries in config (%zu %zu)\n", n, ARRAY_SIZE(unix_conf_entries)); + if (*n != ARRAY_SIZE(unix_conf_entries)) { + pr_err("unix: Unexpected entries in config (%zu %zu)\n", *n, ARRAY_SIZE(unix_conf_entries)); return -EINVAL; } if (opts.weak_sysctls || op == CTL_READ) flags = CTL_FLAGS_OPTIONAL; - for (i = 0; i < n; i++) { + for (i = 0; i < *n; i++) { snprintf(path[i], MAX_CONF_UNIX_PATH, CONF_UNIX_FMT, unix_conf_entries[i]); req[i].name = path[i]; req[i].flags = flags; @@ -391,7 +384,7 @@ static int unix_conf_op(SysctlEntry ***rconf, size_t *pn, int op) } } - ret = sysctl_op(req, n, op, CLONE_NEWNET); + ret = sysctl_op(req, *n, op, CLONE_NEWNET); if (ret < 0) { pr_err("unix: Failed to %s %s/\n", (op == CTL_READ) ? "read" : "write", CONF_UNIX_BASE); return -1; @@ -400,7 +393,7 @@ static int unix_conf_op(SysctlEntry ***rconf, size_t *pn, int op) if (op == CTL_READ) { bool has_entries = false; - for (i = 0; i < n; i++) { + for (i = 0; i < *n; i++) { if (req[i].flags & CTL_FLAGS_HAS) { conf[i]->has_iarg = true; if (!has_entries) @@ -413,7 +406,7 @@ static int unix_conf_op(SysctlEntry ***rconf, size_t *pn, int op) * Unix conf is optional. 
*/ if (!has_entries) { - *pn = 0; + *n = 0; *rconf = NULL; } } @@ -669,7 +662,7 @@ static int dump_macvlan(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nl ret = nla_parse_nested(data, IFLA_MACVLAN_FLAGS, info[IFLA_INFO_DATA], NULL); if (ret < 0) { - pr_err("failed to parse macvlan data\n"); + pr_err("failed ot parse macvlan data\n"); return -1; } @@ -779,7 +772,7 @@ static int dump_sit(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nlattr pr_info("Some data for SIT provided\n"); ret = nla_parse_nested(data, IFLA_IPTUN_MAX, info[IFLA_INFO_DATA], NULL); if (ret < 0) { - pr_err("failed to parse sit data\n"); + pr_err("failed ot parse sit data\n"); return -1; } @@ -1179,7 +1172,7 @@ struct newlink_req { * request. */ struct newlink_extras { - int link; /* IFLA_LINK */ + int link; /* IFLA_LINK */ int target_netns; /* IFLA_NET_NS_FD */ }; @@ -1405,7 +1398,7 @@ static int move_veth(const char *netdev, struct ns_id *ns, struct net_link *link len_val = strlen(netdev); if (len_val >= IFNAMSIZ) return -1; - __strlcpy(mvreq.ifnam, netdev, IFNAMSIZ); + strlcpy(mvreq.ifnam, netdev, IFNAMSIZ); ret = userns_call(move_veth_cb, 0, &mvreq, sizeof(mvreq), ns->net.ns_fd); if (ret < 0) @@ -1535,7 +1528,7 @@ static int changeflags(int s, char *name, short flags) { struct ifreq ifr; - __strlcpy(ifr.ifr_name, name, IFNAMSIZ); + strlcpy(ifr.ifr_name, name, IFNAMSIZ); ifr.ifr_flags = flags; if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) { @@ -1751,7 +1744,7 @@ static int __restore_link(struct ns_id *ns, struct net_link *link, int nlsk) switch (nde->type) { case ND_TYPE__LOOPBACK: /* fallthrough */ - case ND_TYPE__EXTLINK: /* see comment in images/netdev.proto */ + case ND_TYPE__EXTLINK: /* see comment in images/netdev.proto */ return restore_link_parms(link, nlsk); case ND_TYPE__VENET: return restore_one_link(ns, link, nlsk, venet_link_info, NULL); @@ -2046,10 +2039,10 @@ static inline int dump_iptables(struct cr_imgset *fds) * and iptables backend is nft to prevent duplicate dumps. 
*/ #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) - iptables_cmd = get_legacy_iptables_bin(false, false); + iptables_cmd = get_legacy_iptables_bin(false); if (kdat.ipv6) - ip6tables_cmd = get_legacy_iptables_bin(true, false); + ip6tables_cmd = get_legacy_iptables_bin(true); #endif if (!iptables_cmd) { @@ -2128,117 +2121,6 @@ nft_ctx_free_out: } #endif -static const char *ipv4_sysctl_entries[] = { - "ping_group_range", -}; - -#define IPV4_SYSCTL_BASE "net/ipv4" -#define IPV4_SYSCTL_FMT IPV4_SYSCTL_BASE"/%s" -#define MAX_IPV4_SYSCTL_OPT 32 -#define MAX_IPV4_SYSCTL_PATH (sizeof(IPV4_SYSCTL_FMT) + MAX_IPV4_SYSCTL_OPT - 2) -#define MAX_STR_IPV4_SYSCTL_LEN 200 - -static int ipv4_sysctls_op(SysctlEntry ***rsysctl, size_t *pn, int op) -{ - int i, ret = -1, flags = 0; - char path[ARRAY_SIZE(ipv4_sysctl_entries)][MAX_IPV4_SYSCTL_PATH] = {}; - struct sysctl_req req[ARRAY_SIZE(ipv4_sysctl_entries)] = {}; - SysctlEntry **sysctl = *rsysctl; - size_t n = *pn, ri; - - if (n != ARRAY_SIZE(ipv4_sysctl_entries)) { - pr_err("ipv4: Unexpected entries in sysctl (%zu %zu)\n", n, ARRAY_SIZE(ipv4_sysctl_entries)); - return -EINVAL; - } - - if (opts.weak_sysctls || op == CTL_READ) - flags = CTL_FLAGS_OPTIONAL; - - for (i = 0, ri = 0; i < n; i++) { - snprintf(path[ri], MAX_IPV4_SYSCTL_PATH, IPV4_SYSCTL_FMT, ipv4_sysctl_entries[i]); - req[ri].name = path[ri]; - req[ri].flags = flags; - - switch (sysctl[i]->type) { - case SYSCTL_TYPE__CTL_STR: - req[ri].type = CTL_STR(MAX_STR_IPV4_SYSCTL_LEN); - - /* skip write if have no value */ - if (op == CTL_WRITE && !sysctl[i]->sarg) - continue; - - req[ri].arg = sysctl[i]->sarg; - break; - default: - pr_err("ipv4: Unknown sysctl type %d\n", sysctl[i]->type); - return -1; - } - ri++; - } - - ret = sysctl_op(req, ri, op, CLONE_NEWNET); - if (ret < 0) { - pr_err("ipv4: Failed to %s %s/\n", (op == CTL_READ) ? 
"read" : "write", IPV4_SYSCTL_BASE); - return -1; - } - - if (op == CTL_READ) { - bool has_entries = false; - - BUG_ON(ri != n); - for (i = 0; i < n; i++) { - if (req[i].flags & CTL_FLAGS_HAS) { - has_entries = true; - } else { - sysctl[i]->sarg = NULL; - } - } - - if (!has_entries) { - *pn = 0; - *rsysctl = NULL; - } - } - - return 0; -} - -static int ipv4_sysctls_ping_group_range_map_gid(SysctlEntry *ent, size_t size) -{ - int start, end, ustart, uend, ret; - - if (sscanf(ent->sarg, "%d %d", &start, &end) != 2) { - pr_err("Failed to parse ping_group_range: %s\n", ent->sarg); - return -1; - } - - /* - * The default is "1 0", which means no group - * is allowed to create ICMP Echo sockets. - */ - if (start == 1 && end == 0) { - pr_debug("The ping_group_range is set to default, skipping it.\n"); - ent->sarg = NULL; - return 0; - } - - if (!(root_ns_mask & CLONE_NEWUSER)) - return 0; - - ustart = userns_gid(start); - uend = userns_gid(end); - pr_debug("Mapping ping_group_range %d %d to userns -> %d %d\n", - start, end, ustart, uend); - - ret = snprintf(ent->sarg, size, "%d\t%d\n", ustart, uend); - if (ret < 0 || ret >= size) { - pr_err("Failed to map ping_group_range: %d\t%d\n", ustart, uend); - return -1; - } - - return 0; -} - static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) { void *buf, *o_buf; @@ -2253,10 +2135,6 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) int size6 = ARRAY_SIZE(devconfs6); char def_stable_secret[MAX_STR_CONF_LEN + 1] = {}; char all_stable_secret[MAX_STR_CONF_LEN + 1] = {}; - SysctlEntry *ipv4_sysctls = NULL; - size_t ipv4_sysctl_size = ARRAY_SIZE(ipv4_sysctl_entries); - char ping_group_range[MAX_STR_IPV4_SYSCTL_LEN + 1] = {}; - int ping_group_range_id = -1; NetnsId *ids; struct netns_id *p; @@ -2264,16 +2142,10 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) list_for_each_entry(p, &ns->net.ids, node) i++; - /* - * Here we allocate one single big buffer for storing multiple 
arrays - * of protobuf entries and pointers to entries in it and we later use - * xptr_pull_s to claim a part of this buffer of proper size for each - * particular array. Next we read data from sysctl files to those - * arrays and then finally save them into images. - */ o_buf = buf = xmalloc(i * (sizeof(NetnsId *) + sizeof(NetnsId)) + - (2 * size4 + 2 * size6 + sizex + ipv4_sysctl_size) * - (sizeof(SysctlEntry *) + sizeof(SysctlEntry))); + size4 * (sizeof(SysctlEntry *) + sizeof(SysctlEntry)) * 2 + + size6 * (sizeof(SysctlEntry *) + sizeof(SysctlEntry)) * 2 + + sizex * (sizeof(SysctlEntry *) + sizeof(SysctlEntry))); if (!buf) goto out; @@ -2338,22 +2210,6 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) netns.unix_conf[i]->type = SYSCTL_TYPE__CTL_32; } - netns.n_ipv4_sysctl = ipv4_sysctl_size; - netns.ipv4_sysctl = xptr_pull_s(&buf, ipv4_sysctl_size * sizeof(SysctlEntry *)); - ipv4_sysctls = xptr_pull_s(&buf, ipv4_sysctl_size * sizeof(SysctlEntry)); - for (i = 0; i < ipv4_sysctl_size; i++) { - sysctl_entry__init(&ipv4_sysctls[i]); - netns.ipv4_sysctl[i] = &ipv4_sysctls[i]; - if (!strcmp(ipv4_sysctl_entries[i], "ping_group_range")) { - netns.ipv4_sysctl[i]->type = SYSCTL_TYPE__CTL_STR; - netns.ipv4_sysctl[i]->sarg = ping_group_range; - ping_group_range_id = i; - } else { - /* Need to handle this case when we have more sysctls */ - BUG(); - } - } - ret = ipv4_conf_op("default", netns.def_conf4, size4, CTL_READ, NULL); if (ret < 0) goto err_free; @@ -2372,16 +2228,6 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) if (ret < 0) goto err_free; - ret = ipv4_sysctls_op(&netns.ipv4_sysctl, &netns.n_ipv4_sysctl, CTL_READ); - if (ret < 0) - goto err_free; - - BUG_ON(ping_group_range_id == -1); - ret = ipv4_sysctls_ping_group_range_map_gid(netns.ipv4_sysctl[ping_group_range_id], - MAX_STR_IPV4_SYSCTL_LEN + 1); - if (ret < 0) - goto err_free; - ret = pb_write_one(img_from_set(fds, CR_FD_NETNS), &netns, PB_NETNS); err_free: 
xfree(o_buf); @@ -2404,12 +2250,12 @@ static int restore_ip_dump(int type, int pid, char *cmd) sockfd = img_raw_fd(img); if (sockfd < 0) { pr_err("Getting raw FD failed\n"); - goto out_image; + return -1; } tmp_file = tmpfile(); if (!tmp_file) { pr_perror("Failed to open tmpfile"); - goto out_image; + return -1; } while ((n = read(sockfd, buf, 1024)) > 0) { @@ -2418,34 +2264,25 @@ static int restore_ip_dump(int type, int pid, char *cmd) pr_perror("Failed to write to tmpfile " "[written: %d; total: %d]", written, n); - goto out_tmp_file; + goto close; } } if (fseek(tmp_file, 0, SEEK_SET)) { pr_perror("Failed to set file position to beginning of tmpfile"); - goto out_tmp_file; + goto close; } - if (type == CR_FD_RULE) { - /* - * Delete 3 default rules to prevent duplicates. See kernel's - * function fib_default_rules_init() for the details. - */ - run_ip_tool("rule", "flush", NULL, NULL, -1, -1, 0); - run_ip_tool("rule", "delete", "table", "local", -1, -1, 0); + if (img) { + ret = run_ip_tool(cmd, "restore", NULL, NULL, fileno(tmp_file), -1, 0); + close_image(img); } - ret = run_ip_tool(cmd, "restore", NULL, NULL, fileno(tmp_file), -1, 0); - -out_tmp_file: +close: if (fclose(tmp_file)) { pr_perror("Failed to close tmpfile"); } -out_image: - close_image(img); - return ret; } @@ -2467,7 +2304,31 @@ static inline int restore_route(int pid) static inline int restore_rule(int pid) { - return restore_ip_dump(CR_FD_RULE, pid, "rule"); + struct cr_img *img; + int ret = 0; + + img = open_image(CR_FD_RULE, O_RSTR, pid); + if (!img) { + ret = -1; + goto out; + } + + if (empty_image(img)) + goto close; + + /* + * Delete 3 default rules to prevent duplicates. See kernel's + * function fib_default_rules_init() for the details. 
+ */ + run_ip_tool("rule", "flush", NULL, NULL, -1, -1, 0); + run_ip_tool("rule", "delete", "table", "local", -1, -1, 0); + + if (restore_ip_dump(CR_FD_RULE, pid, "rule")) + ret = -1; +close: + close_image(img); +out: + return ret; } /* @@ -2494,7 +2355,7 @@ static int prepare_xtable_lock(void) } if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL)) { - pr_perror("Unable to convert mounts to slave mounts"); + pr_perror("Unable to conver mounts to slave mounts"); return -1; } /* @@ -2514,19 +2375,9 @@ static int prepare_xtable_lock(void) static inline int restore_iptables(int pid) { - char *iptables_cmd = "iptables-restore"; - char *ip6tables_cmd = "ip6tables-restore"; - char comm[32]; int ret = -1; struct cr_img *img; -#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) - iptables_cmd = get_legacy_iptables_bin(false, true); - - if (kdat.ipv6) - ip6tables_cmd = get_legacy_iptables_bin(true, true); -#endif - img = open_image(CR_FD_IPTABLES, O_RSTR, pid); if (img == NULL) return -1; @@ -2536,19 +2387,7 @@ static inline int restore_iptables(int pid) goto ipt6; } - if (!iptables_cmd) { - pr_err("Can't restore iptables dump - no legacy version present\n"); - close_image(img); - return -1; - } - - if (snprintf(comm, sizeof(comm), "%s -w", iptables_cmd) >= sizeof(comm)) { - pr_err("Can't fit '%s -w' to buffer\n", iptables_cmd); - close_image(img); - return -1; - } - - ret = run_iptables_tool(comm, img_raw_fd(img), -1); + ret = run_iptables_tool("iptables-restore -w", img_raw_fd(img), -1); close_image(img); if (ret) return ret; @@ -2559,19 +2398,7 @@ ipt6: if (empty_image(img)) goto out; - if (!ip6tables_cmd) { - pr_err("Can't restore ip6tables dump - no legacy version present\n"); - close_image(img); - return -1; - } - - if (snprintf(comm, sizeof(comm), "%s -w", ip6tables_cmd) >= sizeof(comm)) { - pr_err("Can't fit '%s -w' to buffer\n", ip6tables_cmd); - close_image(img); - return -1; - } - - ret = run_iptables_tool(comm, img_raw_fd(img), 
-1); + ret = run_iptables_tool("ip6tables-restore -w", img_raw_fd(img), -1); out: close_image(img); @@ -2579,85 +2406,58 @@ out: } #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) -static inline int do_restore_nftables(struct cr_img *img) +static inline int restore_nftables(int pid) { - int exit_code = -1; + int ret = -1; + struct cr_img *img; struct nft_ctx *nft; off_t img_data_size; char *buf; - if ((img_data_size = img_raw_size(img)) < 0) { - pr_err("image size mismatch\n"); - goto out; - } - - if (read_img_str(img, &buf, img_data_size) < 0) { - pr_err("Failed to read nftables data\n"); - goto out; - } - - nft = nft_ctx_new(NFT_CTX_DEFAULT); - if (!nft) { - pr_err("Failed to create nft context object\n"); - goto buf_free_out; - } - - if (nft_ctx_buffer_output(nft) || nft_ctx_buffer_error(nft)) { - pr_err("Failed to enable std/err output buffering\n"); - goto nft_ctx_free_out; - } - -#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) - if (nft_run_cmd_from_buffer(nft, buf, strlen(buf))) -#elif defined(CONFIG_HAS_NFTABLES_LIB_API_1) - if (nft_run_cmd_from_buffer(nft, buf)) -#else - BUILD_BUG_ON(1); -#endif - { - pr_err("nft command error:\n%s\n%s\n", - nft_ctx_get_error_buffer(nft), buf); - goto nft_ctx_free_out; - } - - exit_code = 0; - -nft_ctx_free_out: - nft_ctx_free(nft); -buf_free_out: - xfree(buf); -out: - return exit_code; -} -#endif - -static inline int restore_nftables(int pid) -{ - int exit_code = -1; - struct cr_img *img; - img = open_image(CR_FD_NFTABLES, O_RSTR, pid); if (img == NULL) return -1; if (empty_image(img)) { /* Backward compatibility */ pr_info("Skipping nft restore, no image\n"); - exit_code = 0; + ret = 0; goto image_close_out; } -#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) - if (!do_restore_nftables(img)) - exit_code = 0; -#else - pr_err("Unable to restore nftables. 
CRIU was built without libnftables support\n"); -#endif + if ((img_data_size = img_raw_size(img)) < 0) + goto image_close_out; + if (read_img_str(img, &buf, img_data_size) < 0) + goto image_close_out; + + nft = nft_ctx_new(NFT_CTX_DEFAULT); + if (!nft) + goto buf_free_out; + + if (nft_ctx_buffer_output(nft) || nft_ctx_buffer_error(nft) || +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) + nft_run_cmd_from_buffer(nft, buf, strlen(buf))) +#elif defined(CONFIG_HAS_NFTABLES_LIB_API_1) + nft_run_cmd_from_buffer(nft, buf)) +#else + { + BUILD_BUG_ON(1); + } +#endif + goto nft_ctx_free_out; + + ret = 0; + +nft_ctx_free_out: + nft_ctx_free(nft); +buf_free_out: + xfree(buf); image_close_out: close_image(img); - return exit_code; + return ret; } +#endif int read_net_ns_img(void) { @@ -2734,12 +2534,6 @@ static int restore_netns_conf(struct ns_id *ns) goto out; } - if ((netns)->ipv4_sysctl) { - ret = ipv4_sysctls_op(&(netns)->ipv4_sysctl, &(netns)->n_ipv4_sysctl, CTL_WRITE); - if (ret) - goto out; - } - ns->net.netns = netns; out: return ret; @@ -2992,8 +2786,10 @@ static int prepare_net_ns_second_stage(struct ns_id *ns) ret = restore_rule(nsid); if (!ret) ret = restore_iptables(nsid); +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) if (!ret) ret = restore_nftables(nsid); +#endif } if (!ret) @@ -3219,45 +3015,11 @@ err: return ret; } -#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) -static inline FILE *redirect_nftables_output(struct nft_ctx *nft) -{ - FILE *fp; - int fd; - - fd = dup(log_get_fd()); - if (fd < 0) { - pr_perror("dup() to redirect nftables output failed"); - return NULL; - } - - fp = fdopen(fd, "w"); - if (!fp) { - pr_perror("fdopen() to redirect nftables output failed"); - return NULL; - } - - /** - * Without setvbuf() the output from libnftables will be - * somewhere in the log file, probably at the end. - * With setvbuf() potential output will be at the correct - * position. 
- */ - setvbuf(fp, NULL, _IONBF, 0); - - nft_ctx_set_output(nft, fp); - nft_ctx_set_error(nft, fp); - - return fp; -} -#endif - -static inline int nftables_lock_network_internal(bool restore) +static inline int nftables_lock_network_internal(void) { #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) - cleanup_file FILE *fp = NULL; struct nft_ctx *nft; - int ret = 0, exit_code = -1; + int ret = 0; char table[32]; char buf[128]; @@ -3268,18 +3030,9 @@ static inline int nftables_lock_network_internal(bool restore) if (!nft) return -1; - fp = redirect_nftables_output(nft); - if (!fp) - goto err2; - snprintf(buf, sizeof(buf), "create table %s", table); - ret = NFT_RUN_CMD(nft, buf); - if (ret) { - /* The network has been locked on dump. */ - if (restore && errno == EEXIST) - return 0; + if (NFT_RUN_CMD(nft, buf)) goto err2; - } snprintf(buf, sizeof(buf), "add chain %s output { type filter hook output priority 0; policy drop; }", table); if (NFT_RUN_CMD(nft, buf)) @@ -3297,16 +3050,17 @@ static inline int nftables_lock_network_internal(bool restore) if (NFT_RUN_CMD(nft, buf)) goto err1; - exit_code = 0; -out: - nft_ctx_free(nft); - return exit_code; + goto out; + err1: snprintf(buf, sizeof(buf), "delete table %s", table); NFT_RUN_CMD(nft, buf); err2: + ret = -1; pr_err("Locking network failed using nftables\n"); - goto out; +out: + nft_ctx_free(nft); + return ret; #else pr_err("CRIU was built without libnftables support\n"); return -1; @@ -3338,20 +3092,17 @@ static int iptables_network_lock_internal(void) return ret; } -int network_lock_internal(bool restore) +int network_lock_internal(void) { int ret = 0, nsret; - if (opts.network_lock_method == NETWORK_LOCK_SKIP) - return 0; - if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) return -1; if (opts.network_lock_method == NETWORK_LOCK_IPTABLES) ret = iptables_network_lock_internal(); else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) - ret = 
nftables_lock_network_internal(restore); + ret = nftables_lock_network_internal(); if (restore_ns(nsret, &net_ns_desc)) ret = -1; @@ -3363,7 +3114,6 @@ static inline int nftables_network_unlock(void) { #if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) int ret = 0; - cleanup_file FILE *fp = NULL; struct nft_ctx *nft; char table[32]; char buf[128]; @@ -3375,10 +3125,6 @@ static inline int nftables_network_unlock(void) if (!nft) return -1; - fp = redirect_nftables_output(nft); - if (!fp) - return -1; - snprintf(buf, sizeof(buf), "delete table %s", table); if (NFT_RUN_CMD(nft, buf)) ret = -1; @@ -3391,53 +3137,19 @@ static inline int nftables_network_unlock(void) #endif } -static bool iptables_has_criu_jump_target(void) -{ - int fd, ret; - char *argv[4] = { "sh", "-c", "iptables -C INPUT -j CRIU", NULL }; - - fd = open("/dev/null", O_RDWR); - if (fd < 0) { - fd = -1; - pr_perror("failed to open /dev/null, using log fd"); - } - - ret = cr_system(fd, fd, fd, "sh", argv, CRS_CAN_FAIL); - close_safe(&fd); - return !ret; -} - static int iptables_network_unlock_internal(void) { - char delete_jump_targets[] = "*filter\n" - ":CRIU - [0:0]\n" - "-D INPUT -j CRIU\n" - "-D OUTPUT -j CRIU\n" - "COMMIT\n"; - - char delete_criu_chain[] = "*filter\n" - ":CRIU - [0:0]\n" - "-X CRIU\n" - "COMMIT\n"; - + char conf[] = "*filter\n" + ":CRIU - [0:0]\n" + "-D INPUT -j CRIU\n" + "-D OUTPUT -j CRIU\n" + "-X CRIU\n" + "COMMIT\n"; int ret = 0; - ret |= iptables_restore(false, delete_jump_targets, sizeof(delete_jump_targets) - 1); + ret |= iptables_restore(false, conf, sizeof(conf) - 1); if (kdat.ipv6) - ret |= iptables_restore(true, delete_jump_targets, sizeof(delete_jump_targets) - 1); - - /* For compatibility with iptables-nft backend, we need to make sure that all jump - * targets have been removed before deleting the CRIU chain. 
- */ - if (iptables_has_criu_jump_target()) { - ret |= iptables_restore(false, delete_jump_targets, sizeof(delete_jump_targets) - 1); - if (kdat.ipv6) - ret |= iptables_restore(true, delete_jump_targets, sizeof(delete_jump_targets) - 1); - } - - ret |= iptables_restore(false, delete_criu_chain, sizeof(delete_criu_chain) - 1); - if (kdat.ipv6) - ret |= iptables_restore(true, delete_criu_chain, sizeof(delete_criu_chain) - 1); + ret |= iptables_restore(true, conf, sizeof(conf) - 1); return ret; } @@ -3446,9 +3158,6 @@ static int network_unlock_internal(void) { int ret = 0, nsret; - if (opts.network_lock_method == NETWORK_LOCK_SKIP) - return 0; - if (switch_ns(root_item->pid->real, &net_ns_desc, &nsret)) return -1; @@ -3477,7 +3186,7 @@ int network_lock(void) if (run_scripts(ACT_NET_LOCK)) return -1; - return network_lock_internal(false); + return network_lock_internal(); } void network_unlock(void) @@ -3498,7 +3207,7 @@ void network_unlock(void) int veth_pair_add(char *in, char *out) { - cleanup_free char *e_str = NULL; + char *e_str; e_str = xmalloc(200); /* For 3 IFNAMSIZ + 8 service characters */ if (!e_str) @@ -3521,7 +3230,7 @@ int macvlan_ext_add(struct external *ext) /* * The setns() syscall (called by switch_ns()) can be extremely * slow. If we call it two or more times from the same task the - * kernel will synchronously go on a very slow routine called + * kernel will synchonously go on a very slow routine called * synchronize_rcu() trying to put a reference on old namespaces. 
* * To avoid doing this more than once we pre-create all the @@ -3672,7 +3381,7 @@ int collect_net_namespaces(bool for_dump) struct ns_desc net_ns_desc = NS_DESC_ENTRY(CLONE_NEWNET, "net"); -struct ns_id *net_get_root_ns(void) +struct ns_id *net_get_root_ns() { static struct ns_id *root_netns = NULL; @@ -3689,7 +3398,7 @@ struct ns_id *net_get_root_ns(void) /* * socket_diag doesn't report unbound and unconnected sockets, - * so we have to get their network namespaces explicitly + * so we have to get their network namesapces explicitly */ struct ns_id *get_socket_ns(int lfd) { @@ -3789,7 +3498,7 @@ static int move_to_bridge(struct external *ext, void *arg) ret = -1; goto out; } - __strlcpy(ifr.ifr_name, br, IFNAMSIZ); + strlcpy(ifr.ifr_name, br, IFNAMSIZ); ret = ioctl(s, SIOCBRADDIF, &ifr); if (ret < 0) { pr_perror("Can't add interface %s to bridge %s", out, br); @@ -3801,7 +3510,7 @@ static int move_to_bridge(struct external *ext, void *arg) * $ ip link set dev up */ ifr.ifr_ifindex = 0; - __strlcpy(ifr.ifr_name, out, IFNAMSIZ); + strlcpy(ifr.ifr_name, out, IFNAMSIZ); ret = ioctl(s, SIOCGIFFLAGS, &ifr); if (ret < 0) { pr_perror("Can't get flags of interface %s", out); diff --git a/criu/netfilter.c b/criu/netfilter.c index e2c82764f..2212fd9f2 100644 --- a/criu/netfilter.c +++ b/criu/netfilter.c @@ -48,8 +48,8 @@ void preload_netfilter_modules(void) fd = -1; pr_perror("failed to open /dev/null, using log fd for net module preload"); } - cr_system(fd, fd, fd, iptable_cmd_ipv4, (char *[]){ iptable_cmd_ipv4, "-L", "-n", NULL }, CRS_CAN_FAIL); - cr_system(fd, fd, fd, iptable_cmd_ipv6, (char *[]){ iptable_cmd_ipv6, "-L", "-n", NULL }, CRS_CAN_FAIL); + cr_system(fd, fd, fd, iptable_cmd_ipv4, (char *[]){ iptable_cmd_ipv4, "-L", "-n", NULL }, 0); + cr_system(fd, fd, fd, iptable_cmd_ipv6, (char *[]){ iptable_cmd_ipv6, "-L", "-n", NULL }, 0); close_safe(&fd); } @@ -299,25 +299,7 @@ int nftables_lock_connection(struct inet_sk_desc *sk) int nftables_get_table(char *table, int 
n) { - int ret; - - switch(dump_criu_run_id[0]) { - case 0: - /* This is not a restore.*/ - ret = snprintf(table, n, "inet CRIU-%s", criu_run_id); - break; - case NO_DUMP_CRIU_RUN_ID: - /** - * This is a restore from an older image with no - * dump_criu_run_id available. Let's use the old ID. - */ - ret = snprintf(table, n, "inet CRIU-%d", root_item->pid->real); - break; - default: - ret = snprintf(table, n, "inet CRIU-%s", dump_criu_run_id); - } - - if (ret < 0) { + if (snprintf(table, n, "inet CRIU-%d", root_item->pid->real) < 0) { pr_err("Cannot generate CRIU's nftables table name\n"); return -1; } diff --git a/criu/page-pipe.c b/criu/page-pipe.c index 4601d8f9c..5a7e50bc1 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -56,7 +56,7 @@ static inline int ppb_resize_pipe(struct page_pipe_buf *ppb) if (new_size > PIPE_MAX_SIZE) { if (ppb->pipe_size < PIPE_MAX_SIZE) - new_size = PIPE_MAX_SIZE; + ppb->pipe_size = PIPE_MAX_SIZE; else return 1; } @@ -99,7 +99,6 @@ static struct page_pipe_buf *ppb_alloc(struct page_pipe *pp, unsigned int ppb_fl { struct page_pipe_buf *prev = pp_prev_ppb(pp, ppb_flags); struct page_pipe_buf *ppb; - int ppb_size = 0; ppb = xmalloc(sizeof(*ppb)); if (!ppb) @@ -121,13 +120,7 @@ static struct page_pipe_buf *ppb_alloc(struct page_pipe *pp, unsigned int ppb_fl cnt_add(CNT_PAGE_PIPES, 1); ppb->pipe_off = 0; - ppb_size = fcntl(ppb->p[0], F_GETPIPE_SZ, 0); - if (ppb_size < 0) { - xfree(ppb); - pr_perror("Can't get pipe size"); - return NULL; - } - ppb->pipe_size = ppb_size / PAGE_SIZE; + ppb->pipe_size = fcntl(ppb->p[0], F_GETPIPE_SZ, 0) / PAGE_SIZE; pp->nr_pipes++; } @@ -381,7 +374,7 @@ int pipe_read_dest_init(struct pipe_read_dest *prd) return 0; } -int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned long int *nr_pages, +int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned long addr, unsigned int *nr_pages, unsigned int ppb_flags) { struct page_pipe_buf *ppb; @@ 
-406,7 +399,7 @@ int page_pipe_read(struct page_pipe *pp, struct pipe_read_dest *prd, unsigned lo } /* clamp the request if it passes the end of iovec */ - len = min((unsigned long)iov->iov_base + iov->iov_len - addr, *nr_pages * PAGE_SIZE); + len = min((unsigned long)iov->iov_base + iov->iov_len - addr, (unsigned long)(*nr_pages) * PAGE_SIZE); *nr_pages = len / PAGE_SIZE; skip += ppb->pipe_off * PAGE_SIZE; @@ -446,17 +439,17 @@ void debug_show_page_pipe(struct page_pipe *pp) pr_debug("Page pipe:\n"); pr_debug("* %u pipes %u/%u iovs:\n", pp->nr_pipes, pp->free_iov, pp->nr_iovs); list_for_each_entry(ppb, &pp->bufs, l) { - pr_debug("\tbuf %lx pages, %u iovs, flags: %x pipe_off: %lx :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, + pr_debug("\tbuf %u pages, %u iovs, flags: %x pipe_off: %x :\n", ppb->pages_in, ppb->nr_segs, ppb->flags, ppb->pipe_off); for (i = 0; i < ppb->nr_segs; i++) { iov = &ppb->iov[i]; - pr_debug("\t\t%p - %p\n", iov->iov_base, iov->iov_base + iov->iov_len); + pr_debug("\t\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE); } } pr_debug("* %u holes:\n", pp->free_hole); for (i = 0; i < pp->free_hole; i++) { iov = &pp->holes[i]; - pr_debug("\t%p - %p\n", iov->iov_base, iov->iov_base + iov->iov_len); + pr_debug("\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE); } } diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 463d4c506..9adf2c8b2 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include @@ -32,7 +31,7 @@ static int page_server_sk = -1; struct page_server_iov { u32 cmd; - u64 nr_pages; + u32 nr_pages; u64 vaddr; u64 dst_id; }; @@ -51,8 +50,8 @@ static void psi2iovec(struct page_server_iov *ps, struct iovec *iov) #define PS_IOV_ADD_F 6 #define PS_IOV_GET 7 -#define PS_IOV_CLOSE 0x1023 -#define PS_IOV_FORCE_CLOSE 0x1024 +#define PS_IOV_FLUSH 0x1023 +#define PS_IOV_FLUSH_N_CLOSE 0x1024 #define PS_CMD_BITS 16 #define PS_CMD_MASK ((1 << PS_CMD_BITS) - 1) @@ -158,32 
+157,18 @@ static inline int send_psi(int sk, struct page_server_iov *pi) return send_psi_flags(sk, pi, 0); } -static void tcp_cork(int sk, bool on) -{ - int val = on ? 1 : 0; - if (setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val))) - pr_pwarn("Unable to set TCP_CORK=%d", val); -} - -static void tcp_nodelay(int sk, bool on) -{ - int val = on ? 1 : 0; - if (setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val))) - pr_pwarn("Unable to set TCP_NODELAY=%d", val); -} - /* page-server xfer */ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long len) { ssize_t ret, left = len; if (opts.tls) { - pr_debug("Sending %lx bytes\n", len); + pr_debug("Sending %lu bytes / %lu pages\n", len, len / PAGE_SIZE); if (tls_send_data_from_fd(p, len)) return -1; } else { - pr_debug("Splicing %lx bytes into socket\n", len); + pr_debug("Splicing %lu bytes / %lu pages into socket\n", len, len / PAGE_SIZE); while (left > 0) { ret = splice(p, NULL, xfer->sk, NULL, left, SPLICE_F_MOVE); @@ -192,7 +177,7 @@ static int write_pages_to_server(struct page_xfer *xfer, int p, unsigned long le return -1; } - pr_debug("\tSpliced: %lx bytes sent\n", (unsigned long)ret); + pr_debug("\tSpliced: %lu bytes sent\n", (unsigned long)ret); left -= ret; } } @@ -288,7 +273,7 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov) * read_pagemap_page routine. 
*/ - pr_debug("Checking %p - %p hole\n", iov->iov_base, iov->iov_base + iov->iov_len); + pr_debug("Checking %p/%zu hole\n", iov->iov_base, iov->iov_len); off = (unsigned long)iov->iov_base; end = off + iov->iov_len; while (1) { @@ -300,8 +285,7 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov) return -1; } - pr_debug("\tFound %" PRIx64 " - %" PRIx64 "\n", - p->pe->vaddr, p->pe->vaddr + pagemap_len(p->pe)); + pr_debug("\tFound %" PRIx64 "/%lu\n", p->pe->vaddr, pagemap_len(p->pe)); /* * The pagemap entry in parent may happen to be @@ -327,7 +311,6 @@ static int write_pagemap_loc(struct page_xfer *xfer, struct iovec *iov, u32 flag pe.nr_pages = iov->iov_len / PAGE_SIZE; pe.has_flags = true; pe.flags = flags; - pe.has_nr_pages = true; if (flags & PE_PRESENT) { if (opts.auto_dedup && xfer->parent != NULL) { @@ -341,8 +324,7 @@ static int write_pagemap_loc(struct page_xfer *xfer, struct iovec *iov, u32 flag if (xfer->parent != NULL) { ret = check_pagehole_in_parent(xfer->parent, iov); if (ret) { - pr_err("Hole %p - %p not found in parent\n", - iov->iov_base, iov->iov_base + iov->iov_len); + pr_err("Hole %p/%zu not found in parent\n", iov->iov_base, iov->iov_len); return -1; } } @@ -624,7 +606,7 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p * * Since, iov-C is not processed completely, we need to find * "partial_read_byte" count to place out dummy-iov for - * remaining processing of iov-C. This function is performed by + * remainig processing of iov-C. This function is performed by * analyze_iov function. * * dummy-iov will be(2): {C+3,1}. 
dummy-iov will be placed @@ -635,18 +617,31 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p */ unsigned long handle_faulty_iov(int pid, struct iovec *riov, unsigned long faulty_index, struct iovec *bufvec, - struct iovec *aux_iov, unsigned long *aux_len) + struct iovec *aux_iov, unsigned long *aux_len, unsigned long partial_read_bytes) { struct iovec dummy; ssize_t bytes_read; + unsigned long offset = 0; unsigned long final_read_cnt = 0; + /* Handling Case 2*/ + if (riov[faulty_index].iov_len == PAGE_SIZE) { + cnt_sub(CNT_PAGES_WRITTEN, 1); + return 0; + } + /* Handling Case 3-Part 3.2*/ - dummy.iov_base = riov[faulty_index].iov_base; - dummy.iov_len = riov[faulty_index].iov_len; + offset = (partial_read_bytes) ? partial_read_bytes : PAGE_SIZE; + + dummy.iov_base = riov[faulty_index].iov_base + offset; + dummy.iov_len = riov[faulty_index].iov_len - offset; + + if (!partial_read_bytes) + cnt_sub(CNT_PAGES_WRITTEN, 1); while (dummy.iov_len) { bytes_read = process_vm_readv(pid, bufvec, 1, &dummy, 1, 0); + if (bytes_read == -1) { /* Handling faulty page read in faulty iov */ cnt_sub(CNT_PAGES_WRITTEN, 1); @@ -676,12 +671,14 @@ unsigned long handle_faulty_iov(int pid, struct iovec *riov, unsigned long fault /* * This function will position start pointer to the latest - * successfully read iov in iovec. + * successfully read iov in iovec. In case of partial read it + * returns partial_read_bytes, otherwise 0. 
*/ static unsigned long analyze_iov(ssize_t bytes_read, struct iovec *riov, unsigned long *index, struct iovec *aux_iov, unsigned long *aux_len) { ssize_t processed_bytes = 0; + unsigned long partial_read_bytes = 0; /* correlating iovs with read bytes */ while (processed_bytes < bytes_read) { @@ -695,17 +692,13 @@ static unsigned long analyze_iov(ssize_t bytes_read, struct iovec *riov, unsigne /* handling partially processed faulty iov*/ if (processed_bytes - bytes_read) { - unsigned long partial_read_bytes = 0; - (*index) -= 1; partial_read_bytes = riov[*index].iov_len - (processed_bytes - bytes_read); aux_iov[*aux_len - 1].iov_len = partial_read_bytes; - riov[*index].iov_base += partial_read_bytes; - riov[*index].iov_len -= partial_read_bytes; } - return 0; + return partial_read_bytes; } /* @@ -730,36 +723,40 @@ static long fill_userbuf(int pid, struct page_pipe_buf *ppb, struct iovec *bufve ssize_t bytes_read; unsigned long total_read = 0; unsigned long start = 0; + unsigned long partial_read_bytes = 0; while (start < ppb->nr_segs) { bytes_read = process_vm_readv(pid, bufvec, 1, &riov[start], ppb->nr_segs - start, 0); + if (bytes_read == -1) { - if (errno == ESRCH) { - pr_debug("Target process PID:%d not found\n", pid); - return -ESRCH; - } - if (errno != EFAULT) { - pr_perror("process_vm_readv failed"); - return -1; - } /* Handling Case 1*/ if (riov[start].iov_len == PAGE_SIZE) { cnt_sub(CNT_PAGES_WRITTEN, 1); start += 1; continue; + } else if (errno == ESRCH) { + pr_debug("Target process PID:%d not found\n", pid); + return ESRCH; } - total_read += handle_faulty_iov(pid, riov, start, bufvec, aux_iov, aux_len); - start += 1; - continue; } + partial_read_bytes = 0; + if (bytes_read > 0) { - if (analyze_iov(bytes_read, riov, &start, aux_iov, aux_len) < 0) - return -1; + partial_read_bytes = analyze_iov(bytes_read, riov, &start, aux_iov, aux_len); bufvec->iov_base += bytes_read; bufvec->iov_len -= bytes_read; total_read += bytes_read; } + + /* + * If all iovs not 
processed in one go, + * it means some iov in between has failed. + */ + if (start < ppb->nr_segs) + total_read += handle_faulty_iov(pid, riov, start, bufvec, aux_iov, aux_len, partial_read_bytes); + + start += 1; } return total_read; @@ -780,62 +777,40 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p struct page_pipe_buf *ppb; unsigned int cur_hole = 0, i; unsigned long ret, bytes_read; - unsigned long userbuf_len; struct iovec bufvec; - struct iovec *aux_iov; + struct iovec aux_iov[PIPE_MAX_SIZE]; unsigned long aux_len; - void *userbuf; - userbuf_len = PIPE_MAX_BUFFER_SIZE; - userbuf = mmap(NULL, userbuf_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + char *userbuf = mmap(NULL, BUFFER_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (userbuf == MAP_FAILED) { pr_perror("Unable to mmap a buffer"); return -1; } - aux_iov = xmalloc(userbuf_len / PAGE_SIZE * sizeof(aux_iov[0])); - if (!aux_iov) - goto err; list_for_each_entry(ppb, &pp->bufs, l) { - if (ppb->pipe_size * PAGE_SIZE > userbuf_len) { - void *addr; - - addr = mremap(userbuf, userbuf_len, ppb->pipe_size * PAGE_SIZE, MREMAP_MAYMOVE); - if (addr == MAP_FAILED) { - pr_perror("Unable to mmap a buffer"); - goto err; - } - userbuf_len = ppb->pipe_size * PAGE_SIZE; - userbuf = addr; - addr = xrealloc(aux_iov, ppb->pipe_size * sizeof(aux_iov[0])); - if (!addr) - goto err; - aux_iov = addr; - } timing_start(TIME_MEMDUMP); aux_len = 0; - bufvec.iov_len = userbuf_len; + bufvec.iov_len = BUFFER_SIZE; bufvec.iov_base = userbuf; bytes_read = fill_userbuf(pid, ppb, &bufvec, aux_iov, &aux_len); - if (bytes_read == -ESRCH) { - timing_stop(TIME_MEMDUMP); - munmap(userbuf, userbuf_len); - xfree(aux_iov); - return 0; + + if (bytes_read == ESRCH) { + munmap(userbuf, BUFFER_SIZE); + return -1; } - if (bytes_read < 0) - goto err; bufvec.iov_base = userbuf; bufvec.iov_len = bytes_read; - ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK | 
SPLICE_F_GIFT); + ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK); if (ret == -1 || ret != bytes_read) { pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret); - goto err; + munmap(userbuf, BUFFER_SIZE); + return -1; } timing_stop(TIME_MEMDUMP); @@ -847,34 +822,35 @@ int page_xfer_predump_pages(int pid, struct page_xfer *xfer, struct page_pipe *p u32 flags; ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base); - if (ret) - goto err; + if (ret) { + munmap(userbuf, BUFFER_SIZE); + return ret; + } BUG_ON(iov.iov_base < (void *)xfer->offset); iov.iov_base -= xfer->offset; - pr_debug("\t p %p - %p\n", iov.iov_base, iov.iov_base + iov.iov_len); + pr_debug("\t p %p [%u]\n", iov.iov_base, (unsigned int)(iov.iov_len / PAGE_SIZE)); flags = ppb_xfer_flags(xfer, ppb); - if (xfer->write_pagemap(xfer, &iov, flags)) - goto err; + if (xfer->write_pagemap(xfer, &iov, flags)) { + munmap(userbuf, BUFFER_SIZE); + return -1; + } - if (xfer->write_pages(xfer, ppb->p[0], iov.iov_len)) - goto err; + if (xfer->write_pages(xfer, ppb->p[0], iov.iov_len)) { + munmap(userbuf, BUFFER_SIZE); + return -1; + } } timing_stop(TIME_MEMWRITE); } - munmap(userbuf, userbuf_len); - xfree(aux_iov); + munmap(userbuf, BUFFER_SIZE); timing_start(TIME_MEMWRITE); return dump_holes(xfer, pp, &cur_hole, NULL); -err: - munmap(userbuf, userbuf_len); - xfree(aux_iov); - return -1; } int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) @@ -888,7 +864,7 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) list_for_each_entry(ppb, &pp->bufs, l) { unsigned int i; - pr_debug("\tbuf %lx/%d\n", ppb->pages_in, ppb->nr_segs); + pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs); for (i = 0; i < ppb->nr_segs; i++) { struct iovec iov = ppb->iov[i]; @@ -900,7 +876,7 @@ int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp) BUG_ON(iov.iov_base < (void *)xfer->offset); iov.iov_base -= xfer->offset; - pr_debug("\tp %p - %p\n", iov.iov_base, 
iov.iov_base + iov.iov_len); + pr_debug("\tp %p [%u]\n", iov.iov_base, (unsigned int)(iov.iov_len / PAGE_SIZE)); flags = ppb_xfer_flags(xfer, ppb); @@ -1073,8 +1049,7 @@ static int page_server_add(int sk, struct page_server_iov *pi, u32 flags) struct page_xfer *lxfer = &cxfer.loc_xfer; struct iovec iov; - pr_debug("Adding %" PRIx64 " - %" PRIx64 "\n", - pi->vaddr, pi->vaddr + pi->nr_pages * PAGE_SIZE); + pr_debug("Adding %" PRIx64 "/%u\n", pi->vaddr, pi->nr_pages); if (prep_loc_xfer(pi)) return -1; @@ -1139,17 +1114,13 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi) { struct pstree_item *item; struct page_pipe *pp; - unsigned long len, nr_pages; + unsigned long len; int ret; item = pstree_item_by_virt(pi->dst_id); pp = dmpi(item)->mem_pp; - /* page_pipe_read() uses 'unsigned long *' but pi->nr_pages is u64. - * Use a temporary variable to fix the incompatible pointer type - * on 32-bit platforms (e.g. armv7). */ - nr_pages = pi->nr_pages; - ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, &nr_pages, PPB_LAZY); + ret = page_pipe_read(pp, &pipe_read_dest, pi->vaddr, &pi->nr_pages, PPB_LAZY); if (ret) return ret; @@ -1158,7 +1129,6 @@ static int page_server_get_pages(int sk, struct page_server_iov *pi) * .dst_id all remain intact. 
*/ - pi->nr_pages = nr_pages; if (pi->nr_pages == 0) { pr_debug("no iovs found, zero pages\n"); return -1; @@ -1253,8 +1223,8 @@ static int page_server_serve(int sk) ret = page_server_add(sk, &pi, flags); break; } - case PS_IOV_CLOSE: - case PS_IOV_FORCE_CLOSE: { + case PS_IOV_FLUSH: + case PS_IOV_FLUSH_N_CLOSE: { int32_t status = 0; ret = 0; @@ -1280,9 +1250,7 @@ static int page_server_serve(int sk) break; } - if (ret) - break; - if (pi.cmd == PS_IOV_CLOSE || pi.cmd == PS_IOV_FORCE_CLOSE) + if (ret || (pi.cmd == PS_IOV_FLUSH_N_CLOSE)) break; } @@ -1291,8 +1259,6 @@ static int page_server_serve(int sk) ret = -1; } - tls_terminate_session(ret != 0); - if (ret == 0 && opts.ps_socket == -1) { char c; @@ -1306,6 +1272,7 @@ static int page_server_serve(int sk) } } + tls_terminate_session(); page_server_close(); pr_info("Session over\n"); @@ -1356,7 +1323,7 @@ static int fill_page_pipe(struct page_read *pr, struct page_pipe *pp) static int page_pipe_from_pagemap(struct page_pipe **pp, int pid) { struct page_read pr; - unsigned long nr_pages = 0; + int nr_pages = 0; if (open_page_read(pid, &pr, PR_TASK) <= 0) { pr_err("Failed to open page read for %d\n", pid); @@ -1430,7 +1397,7 @@ int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd) if (opts.ps_socket != -1) { ask = opts.ps_socket; - pr_info("Reusing ps socket %d\n", ask); + pr_info("Re-using ps socket %d\n", ask); goto no_server; } @@ -1476,7 +1443,7 @@ static int connect_to_page_server(void) if (opts.ps_socket != -1) { page_server_sk = opts.ps_socket; - pr_info("Reusing ps socket %d\n", page_server_sk); + pr_info("Re-using ps socket %d\n", page_server_sk); goto out; } @@ -1523,9 +1490,9 @@ int disconnect_from_page_server(void) * the parent process) so we must order the * page-server to terminate itself. 
*/ - pi.cmd = PS_IOV_FORCE_CLOSE; + pi.cmd = PS_IOV_FLUSH_N_CLOSE; else - pi.cmd = PS_IOV_CLOSE; + pi.cmd = PS_IOV_FLUSH; if (send_psi(page_server_sk, &pi)) goto out; @@ -1537,7 +1504,7 @@ int disconnect_from_page_server(void) ret = 0; out: - tls_terminate_session(ret != 0); + tls_terminate_session(); close_safe(&page_server_sk); return ret ?: status; @@ -1559,13 +1526,13 @@ struct ps_async_read { static LIST_HEAD(async_reads); -static inline void async_read_set_goal(struct ps_async_read *ar, unsigned long nr_pages) +static inline void async_read_set_goal(struct ps_async_read *ar, int nr_pages) { ar->goal = sizeof(ar->pi) + nr_pages * PAGE_SIZE; ar->nr_pages = nr_pages; } -static void init_ps_async_read(struct ps_async_read *ar, void *buf, unsigned long nr_pages, ps_async_read_complete complete, +static void init_ps_async_read(struct ps_async_read *ar, void *buf, int nr_pages, ps_async_read_complete complete, void *priv) { ar->pages = buf; @@ -1575,7 +1542,7 @@ static void init_ps_async_read(struct ps_async_read *ar, void *buf, unsigned lon async_read_set_goal(ar, nr_pages); } -static int page_server_start_async_read(void *buf, unsigned long nr_pages, ps_async_read_complete complete, void *priv) +static int page_server_start_async_read(void *buf, int nr_pages, ps_async_read_complete complete, void *priv) { struct ps_async_read *ar; @@ -1675,7 +1642,7 @@ int connect_to_page_server_to_recv(int epfd) return epoll_add_rfd(epfd, &ps_rfd); } -int request_remote_pages(unsigned long img_id, unsigned long addr, unsigned long nr_pages) +int request_remote_pages(unsigned long img_id, unsigned long addr, int nr_pages) { struct page_server_iov pi = { .cmd = PS_IOV_GET, @@ -1692,7 +1659,7 @@ int request_remote_pages(unsigned long img_id, unsigned long addr, unsigned long return 0; } -static int page_server_start_sync_read(void *buf, unsigned long nr, ps_async_read_complete complete, void *priv) +static int page_server_start_sync_read(void *buf, int nr, ps_async_read_complete 
complete, void *priv) { struct ps_async_read ar; int ret = 1; @@ -1703,7 +1670,7 @@ static int page_server_start_sync_read(void *buf, unsigned long nr, ps_async_rea return ret; } -int page_server_start_read(void *buf, unsigned long nr, ps_async_read_complete complete, void *priv, unsigned flags) +int page_server_start_read(void *buf, int nr, ps_async_read_complete complete, void *priv, unsigned flags) { if (flags & PR_ASYNC) return page_server_start_async_read(buf, nr, complete, priv); diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c index 457c0d649..00f088ff3 100644 --- a/criu/pagemap-cache.c +++ b/criu/pagemap-cache.c @@ -1,6 +1,5 @@ #include #include -#include #include "page.h" #include "pagemap-cache.h" @@ -11,7 +10,6 @@ #include "vma.h" #include "mem.h" #include "kerndat.h" -#include "fault-injection.h" #undef LOG_PREFIX #define LOG_PREFIX "pagemap-cache: " @@ -24,8 +22,6 @@ #define PAGEMAP_LEN(addr) (PAGE_PFN(addr) * sizeof(u64)) -#define PAGE_REGIONS_MAX_NR 32768 - /* * It's a workaround for a kernel bug. In the 3.19 kernel when pagemap are read * for a few vma-s for one read call, it returns incorrect data. 
@@ -54,23 +50,10 @@ int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t siz pmc->pid = pid; pmc->map_len = PAGEMAP_LEN(map_size); pmc->vma_head = vma_head; - pmc->regs_max_len = PAGE_PFN(map_size); - if (pmc->regs_max_len > PAGE_REGIONS_MAX_NR) - pmc->regs_max_len = PAGE_REGIONS_MAX_NR; - pmc->regs_len = 0; - pmc->regs_idx = 0; - pmc->regs = NULL; - pmc->map = NULL; - if (kdat.has_pagemap_scan && !fault_injected(FI_DONT_USE_PAGEMAP_SCAN)) { - pmc->regs = xmalloc(pmc->regs_max_len * sizeof(struct page_region)); - if (!pmc->regs) - goto err; - } else { - pmc->map = xmalloc(pmc->map_len); - if (!pmc->map) - goto err; - } + pmc->map = xmalloc(pmc->map_len); + if (!pmc->map) + goto err; if (pagemap_cache_disabled) pr_warn_once("The pagemap cache is disabled\n"); @@ -104,11 +87,17 @@ err: return -1; } +static inline u64 *__pmc_get_map(pmc_t *pmc, unsigned long addr) +{ + return &pmc->map[PAGE_PFN(addr - pmc->start)]; +} + static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) { unsigned long low = vma->e->start & PMC_MASK; unsigned long high = low + PMC_SIZE; size_t len = vma_area_len(vma); + size_t size_map; if (high > kdat.task_size) high = kdat.task_size; @@ -126,7 +115,7 @@ static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) * fit in solid manner, iow -- either the whole vma fits * the cache window, either plain read is used. * - * The benefit (apart reducing the number of read() calls) + * The benefit (apart redusing the number of read() calls) * is to walk page tables less. 
*/ if (!pagemap_cache_disabled && len < PMC_SIZE && (vma->e->start - low) < PMC_SIZE_GAP) { @@ -160,89 +149,39 @@ static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma) pr_debug("\t%d: simple mode [l:%lx h:%lx]\n", pmc->pid, pmc->start, pmc->end); } - return pmc_fill(pmc, pmc->start, pmc->end); -} - -int pmc_fill(pmc_t *pmc, u64 start, u64 end) -{ - size_t size_map, off; - - pmc->start = start; - pmc->end = end; - size_map = PAGEMAP_LEN(pmc->end - pmc->start); BUG_ON(pmc->map_len < size_map); BUG_ON(pmc->fd < 0); - if (pmc->regs) { - struct pm_scan_arg args = { - .size = sizeof(struct pm_scan_arg), - .flags = 0, - .start = pmc->start, - .end = pmc->end, - .vec = (long)pmc->regs, - .vec_len = pmc->regs_max_len, - .max_pages = 0, - /* - * Request pages that are in RAM or swap, excluding - * zero-filled and file-backed pages. - */ - .category_inverted = PAGE_IS_PFNZERO | PAGE_IS_FILE, - .category_mask = PAGE_IS_PFNZERO | PAGE_IS_FILE, - .category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED, - .return_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_SOFT_DIRTY, - }; - long ret; - - if (kdat.has_pagemap_scan_guard_pages) - args.return_mask |= PAGE_IS_GUARD; - - ret = ioctl(pmc->fd, PAGEMAP_SCAN, &args); - if (ret == -1) { - pr_perror("PAGEMAP_SCAN"); - pmc_zap(pmc); - return -1; - } - pmc->regs_len = ret; - pmc->regs_idx = 0; - pmc->end = args.walk_end; - } else { - for (off = 0; off != size_map;) { - ssize_t ret; - char *ptr = (char *)pmc->map; - - ret = pread(pmc->fd, ptr + off, size_map - off, PAGEMAP_PFN_OFF(pmc->start) + off); - if (ret == -1) { - pmc_zap(pmc); - pr_perror("Can't read %d's pagemap file", pmc->pid); - return -1; - } - off += ret; - } + if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) { + pmc_zap(pmc); + pr_perror("Can't read %d's pagemap file", pmc->pid); + return -1; } return 0; } -int pmc_get_map(pmc_t *pmc, const struct vma_area *vma) +u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma) { /* 
Hit */ if (likely(pmc->start <= vma->e->start && pmc->end >= vma->e->end)) - return 0; + return __pmc_get_map(pmc, vma->e->start); /* Miss, refill the cache */ if (pmc_fill_cache(pmc, vma)) { pr_err("Failed to fill cache for %d (%lx-%lx)\n", pmc->pid, (long)vma->e->start, (long)vma->e->end); - return -1; + return NULL; } - return 0; + + /* Hit for sure */ + return __pmc_get_map(pmc, vma->e->start); } void pmc_fini(pmc_t *pmc) { close_safe(&pmc->fd); xfree(pmc->map); - xfree(pmc->regs); pmc_reset(pmc); } diff --git a/criu/pagemap.c b/criu/pagemap.c index 6c9c4f7fe..77e519dd1 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -30,10 +30,10 @@ * One "job" for the preadv() syscall in pagemap.c */ struct page_read_iov { - off_t from; /* offset in pi file where to start reading from */ - off_t end; /* the end of the read == sum to.iov_len -s */ + off_t from; /* offset in pi file where to start reading from */ + off_t end; /* the end of the read == sum to.iov_len -s */ struct iovec *to; /* destination iovs */ - unsigned int nr; /* their number */ + unsigned int nr; /* their number */ struct list_head l; }; @@ -168,15 +168,15 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) return 0; } -static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, unsigned long int nr) +static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, int nr) { if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) { - pr_err("Page read err %" PRIx64 ":%" PRIx64 " vs %lx:%lx\n", pe->vaddr, pe->nr_pages, vaddr, nr); + pr_err("Page read err %" PRIx64 ":%u vs %lx:%u\n", pe->vaddr, pe->nr_pages, vaddr, nr); BUG(); } } -static int read_parent_page(struct page_read *pr, unsigned long vaddr, unsigned long int nr, void *buf, unsigned flags) +static int read_parent_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) { struct page_read *ppr = pr->parent; int ret; @@ -195,7 +195,7 @@ static int 
read_parent_page(struct page_read *pr, unsigned long vaddr, unsigned */ do { - unsigned long int p_nr; + int p_nr; pr_debug("\tpr%lu-%u Read from parent\n", pr->img_id, pr->id); ret = ppr->seek_pagemap(ppr, vaddr); @@ -210,7 +210,7 @@ static int read_parent_page(struct page_read *pr, unsigned long vaddr, unsigned * read as much as we can. */ p_nr = ppr->pe->nr_pages - (vaddr - ppr->pe->vaddr) / PAGE_SIZE; - pr_info("\tparent has %lu pages in\n", p_nr); + pr_info("\tparent has %u pages in\n", p_nr); if (p_nr > nr) p_nr = nr; @@ -261,7 +261,7 @@ static int read_local_page(struct page_read *pr, unsigned long vaddr, unsigned l break; } - if (opts.auto_dedup && !pr->disable_dedup) { + if (opts.auto_dedup) { ret = punch_hole(pr, pr->pi_off, len, false); if (ret == -1) return -1; @@ -374,7 +374,7 @@ int pagemap_enqueue_iovec(struct page_read *pr, void *buf, unsigned long len, st return 0; } -static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) +static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) { int ret; unsigned long len = nr * PAGE_SIZE; @@ -402,7 +402,7 @@ static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, unsi * We cannot use maybe_read_page_local() for streaming images as it uses * pread(), seeking in the file. Instead, we use this custom page reader. 
*/ -static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) +static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) { unsigned long len = nr * PAGE_SIZE; int fd; @@ -445,7 +445,7 @@ static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vadd return ret; } -static int read_page_complete(unsigned long img_id, unsigned long vaddr, unsigned long int nr_pages, void *priv) +static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_pages, void *priv) { int ret = 0; struct page_read *pr = priv; @@ -463,7 +463,7 @@ static int read_page_complete(unsigned long img_id, unsigned long vaddr, unsigne return ret; } -static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) +static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) { int ret; @@ -474,9 +474,9 @@ static int maybe_read_page_remote(struct page_read *pr, unsigned long vaddr, uns return ret; } -static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, unsigned long nr, void *buf, unsigned flags) +static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf, unsigned flags) { - pr_info("pr%lu-%u Read %lx %lu pages\n", pr->img_id, pr->id, vaddr, nr); + pr_info("pr%lu-%u Read %lx %u pages\n", pr->img_id, pr->id, vaddr, nr); pagemap_bound_check(pr->pe, vaddr, nr); if (pagemap_in_parent(pr->pe)) { @@ -535,6 +535,7 @@ static int process_async_reads(struct page_read *pr) fd = img_raw_fd(pr->pi); list_for_each_entry_safe(piov, n, &pr->async, l) { ssize_t ret; + off_t start = piov->from; struct iovec *iovs = piov->to; pr_debug("Read piov iovs %d, from %ju, len %ju, first %p:%zu\n", piov->nr, piov->from, @@ -553,16 +554,13 @@ static int process_async_reads(struct page_read *pr) } } - if (ret < 0) 
{ - pr_err("Can't read async pr bytes (%zd / %ju read, %ju off, %d iovs)\n", ret, - piov->end - piov->from, piov->from, piov->nr); - return -1; - } - - if (opts.auto_dedup && punch_hole(pr, piov->from, ret, false)) - return -1; - if (ret != piov->end - piov->from) { + if (ret < 0) { + pr_err("Can't read async pr bytes (%zd / %ju read, %ju off, %d iovs)\n", ret, + piov->end - piov->from, piov->from, piov->nr); + return -1; + } + /* * The preadv() can return less than requested. It's * valid and doesn't mean error or EOF. We should advance @@ -576,6 +574,9 @@ static int process_async_reads(struct page_read *pr) goto more; } + if (opts.auto_dedup && punch_hole(pr, start, ret, false)) + return -1; + BUG_ON(pr->io_complete); /* FIXME -- implement once needed */ list_del(&piov->l); @@ -682,9 +683,6 @@ static void init_compat_pagemap_entry(PagemapEntry *pe) pe->flags |= PE_PARENT; else if (!pe->has_flags) pe->flags = PE_PRESENT; - - if (!pe->has_nr_pages) - pe->nr_pages = pe->compat_nr_pages; } /* @@ -795,7 +793,6 @@ int open_page_read_at(int dfd, unsigned long img_id, struct page_read *pr, int p pr->bunch.iov_base = NULL; pr->pmes = NULL; pr->pieok = false; - pr->disable_dedup = false; pr->pmi = open_image_at(dfd, i_typ, O_RSTR, img_id); if (!pr->pmi) @@ -856,14 +853,6 @@ int open_page_read(unsigned long img_id, struct page_read *pr, int pr_flags) #define DUP_IDS_BASE 1000 -void page_read_disable_dedup(struct page_read *pr) -{ - pr_debug("disable dedup, id: %d\n", pr->id); - pr->disable_dedup = true; - if (pr->parent) - page_read_disable_dedup(pr->parent); -} - void dup_page_read(struct page_read *src, struct page_read *dst) { static int dup_ids = 1; diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index e19847b37..7175adee1 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -9,6 +9,7 @@ #include "common/compiler.h" #include "types.h" #include "protobuf.h" +#include "images/sa.pb-c.h" #include "images/timer.pb-c.h" #include 
"images/creds.pb-c.h" #include "images/core.pb-c.h" @@ -103,24 +104,17 @@ static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c BUILD_BUG_ON(sizeof(ce->cap_prm[0]) != sizeof(c->cap_prm[0])); BUILD_BUG_ON(sizeof(ce->cap_eff[0]) != sizeof(c->cap_eff[0])); BUILD_BUG_ON(sizeof(ce->cap_bnd[0]) != sizeof(c->cap_bnd[0])); - BUILD_BUG_ON(sizeof(ce->cap_amb[0]) != sizeof(c->cap_amb[0])); BUG_ON(ce->n_cap_inh != CR_CAP_SIZE); BUG_ON(ce->n_cap_prm != CR_CAP_SIZE); BUG_ON(ce->n_cap_eff != CR_CAP_SIZE); BUG_ON(ce->n_cap_bnd != CR_CAP_SIZE); - BUG_ON(ce->n_cap_amb != CR_CAP_SIZE); memcpy(ce->cap_inh, c->cap_inh, sizeof(c->cap_inh[0]) * CR_CAP_SIZE); memcpy(ce->cap_prm, c->cap_prm, sizeof(c->cap_prm[0]) * CR_CAP_SIZE); memcpy(ce->cap_eff, c->cap_eff, sizeof(c->cap_eff[0]) * CR_CAP_SIZE); memcpy(ce->cap_bnd, c->cap_bnd, sizeof(c->cap_bnd[0]) * CR_CAP_SIZE); - memcpy(ce->cap_amb, c->cap_amb, sizeof(c->cap_amb[0]) * CR_CAP_SIZE); - if (c->no_new_privs > 0) { - ce->no_new_privs = c->no_new_privs; - ce->has_no_new_privs = true; - } ce->secbits = c->secbits; ce->n_groups = c->ngroups; @@ -138,13 +132,6 @@ static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c return ce->groups ? 
0 : -ENOMEM; } -static void init_parasite_rseq_arg(struct parasite_check_rseq *rseq) -{ - rseq->has_rseq = kdat.has_rseq; - rseq->has_ptrace_get_rseq_conf = kdat.has_ptrace_get_rseq_conf; - rseq->rseq_inited = false; -} - int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core) { ThreadCoreEntry *tc = core->thread_core; @@ -157,8 +144,6 @@ int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEn pc = args->creds; pc->cap_last_cap = kdat.last_cap; - init_parasite_rseq_arg(&args->rseq); - ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_THREAD, ctl); if (ret < 0) return ret; @@ -201,33 +186,237 @@ int parasite_dump_thread_seized(struct parasite_thread_ctl *tctl, struct parasit ret = compel_get_thread_regs(tctl, save_task_regs, core); if (ret) { pr_err("Can't obtain regs for thread %d\n", pid); - return -1; + goto err_rth; } ret = compel_arch_fetch_thread_area(tctl); if (ret) { pr_err("Can't obtain thread area of %d\n", pid); - return -1; + goto err_rth; } compel_arch_get_tls_thread(tctl, &args->tls); - init_parasite_rseq_arg(&args->rseq); - ret = compel_run_in_thread(tctl, PARASITE_CMD_DUMP_THREAD); if (ret) { pr_err("Can't init thread in parasite %d\n", pid); - return -1; + goto err_rth; } ret = alloc_groups_copy_creds(creds, pc); if (ret) { pr_err("Can't copy creds for thread %d\n", pid); - return -1; + goto err_rth; } + compel_release_thread(tctl); + tid->ns[0].virt = args->tid; return dump_thread_core(pid, core, args); + +err_rth: + compel_release_thread(tctl); + return -1; +} + +int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *item) +{ + TaskCoreEntry *tc = item->core[0]->tc; + struct parasite_dump_sa_args *args; + int ret, sig; + SaEntry *sa, **psa; + + args = compel_parasite_args(ctl, struct parasite_dump_sa_args); + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_SIGACTS, ctl); + if (ret < 0) + return ret; + + psa = xmalloc((SIGMAX - 2) * (sizeof(SaEntry *) + sizeof(SaEntry))); 
+ if (!psa) + return -1; + + sa = (SaEntry *)(psa + SIGMAX - 2); + + tc->n_sigactions = SIGMAX - 2; + tc->sigactions = psa; + + for (sig = 1; sig <= SIGMAX; sig++) { + int i = sig - 1; + + if (sig == SIGSTOP || sig == SIGKILL) + continue; + + sa_entry__init(sa); + ASSIGN_TYPED(sa->sigaction, encode_pointer(args->sas[i].rt_sa_handler)); + ASSIGN_TYPED(sa->flags, args->sas[i].rt_sa_flags); + ASSIGN_TYPED(sa->restorer, encode_pointer(args->sas[i].rt_sa_restorer)); +#ifdef CONFIG_MIPS + sa->has_mask_extended = 1; + BUILD_BUG_ON(sizeof(sa->mask) * 2 != sizeof(args->sas[0].rt_sa_mask.sig)); + memcpy(&sa->mask, &(args->sas[i].rt_sa_mask.sig[0]), sizeof(sa->mask)); + memcpy(&sa->mask_extended, &(args->sas[i].rt_sa_mask.sig[1]), sizeof(sa->mask)); +#else + BUILD_BUG_ON(sizeof(sa->mask) != sizeof(args->sas[0].rt_sa_mask.sig)); + memcpy(&sa->mask, args->sas[i].rt_sa_mask.sig, sizeof(sa->mask)); +#endif + sa->has_compat_sigaction = true; + sa->compat_sigaction = !compel_mode_native(ctl); + + *(psa++) = sa++; + } + + return 0; +} + +static void encode_itimer(struct itimerval *v, ItimerEntry *ie) +{ + ie->isec = v->it_interval.tv_sec; + ie->iusec = v->it_interval.tv_usec; + ie->vsec = v->it_value.tv_sec; + ie->vusec = v->it_value.tv_usec; +} + +int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item) +{ + CoreEntry *core = item->core[0]; + struct parasite_dump_itimers_args *args; + int ret; + + args = compel_parasite_args(ctl, struct parasite_dump_itimers_args); + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_ITIMERS, ctl); + if (ret < 0) + return ret; + + encode_itimer((&args->real), (core->tc->timers->real)); + encode_itimer((&args->virt), (core->tc->timers->virt)); + encode_itimer((&args->prof), (core->tc->timers->prof)); + + return 0; +} + +static int core_alloc_posix_timers(TaskTimersEntry *tte, int n, PosixTimerEntry **pte) +{ + int sz; + + /* + * Will be free()-ed in core_entry_free() + */ + + sz = n * (sizeof(PosixTimerEntry *) + 
sizeof(PosixTimerEntry)); + tte->posix = xmalloc(sz); + if (!tte->posix) + return -1; + + tte->n_posix = n; + *pte = (PosixTimerEntry *)(tte->posix + n); + return 0; +} + +static int encode_notify_thread_id(pid_t rtid, struct pstree_item *item, PosixTimerEntry *pte) +{ + pid_t vtid = 0; + int i; + + if (rtid == 0) + return 0; + + if (!(root_ns_mask & CLONE_NEWPID)) { + /* Non-pid-namespace case */ + pte->notify_thread_id = rtid; + pte->has_notify_thread_id = true; + return 0; + } + + /* Pid-namespace case */ + if (!kdat.has_nspid) { + pr_err("Have no NSpid support to dump notify thread id in pid namespace\n"); + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + if (item->threads[i].real != rtid) + continue; + + vtid = item->threads[i].ns[0].virt; + break; + } + + if (vtid == 0) { + pr_err("Unable to convert the notify thread id %d\n", rtid); + return -1; + } + + pte->notify_thread_id = vtid; + pte->has_notify_thread_id = true; + return 0; +} + +static int encode_posix_timer(struct pstree_item *item, struct posix_timer *v, struct proc_posix_timer *vp, + PosixTimerEntry *pte) +{ + pte->it_id = vp->spt.it_id; + pte->clock_id = vp->spt.clock_id; + pte->si_signo = vp->spt.si_signo; + pte->it_sigev_notify = vp->spt.it_sigev_notify; + pte->sival_ptr = encode_pointer(vp->spt.sival_ptr); + + pte->overrun = v->overrun; + + pte->isec = v->val.it_interval.tv_sec; + pte->insec = v->val.it_interval.tv_nsec; + pte->vsec = v->val.it_value.tv_sec; + pte->vnsec = v->val.it_value.tv_nsec; + + if (encode_notify_thread_id(vp->spt.notify_thread_id, item, pte)) + return -1; + + return 0; +} + +int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, + struct pstree_item *item) +{ + CoreEntry *core = item->core[0]; + TaskTimersEntry *tte = core->tc->timers; + PosixTimerEntry *pte; + struct proc_posix_timer *temp; + struct parasite_dump_posix_timers_args *args; + int ret, exit_code = -1; + int args_size; + int i; + + if 
(core_alloc_posix_timers(tte, proc_args->timer_n, &pte)) + return -1; + + args_size = posix_timers_dump_size(proc_args->timer_n); + args = compel_parasite_args_s(ctl, args_size); + args->timer_n = proc_args->timer_n; + + i = 0; + list_for_each_entry(temp, &proc_args->timers, list) { + args->timer[i].it_id = temp->spt.it_id; + i++; + } + + ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_POSIX_TIMERS, ctl); + if (ret < 0) + goto end_posix; + + i = 0; + list_for_each_entry(temp, &proc_args->timers, list) { + posix_timer_entry__init(&pte[i]); + if (encode_posix_timer(item, &args->timer[i], temp, &pte[i])) + goto end_posix; + tte->posix[i] = &pte[i]; + i++; + } + + exit_code = 0; +end_posix: + free_posix_timers(proc_args); + return exit_code; } int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc) @@ -235,7 +424,6 @@ int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_mis struct parasite_dump_misc *ma; ma = compel_parasite_args(ctl, struct parasite_dump_misc); - ma->has_membarrier_get_registrations = kdat.has_membarrier_get_registrations; if (compel_rpc_call_sync(PARASITE_CMD_DUMP_MISC, ctl) < 0) return -1; @@ -314,7 +502,6 @@ int parasite_dump_cgroup(struct parasite_ctl *ctl, struct parasite_dump_cgroup_a struct parasite_dump_cgroup_args *ca; ca = compel_parasite_args(ctl, struct parasite_dump_cgroup_args); - memcpy(ca->thread_cgrp, cgroup->thread_cgrp, sizeof(ca->thread_cgrp)); ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_CGROUP, ctl); if (ret) { pr_err("Parasite failed to dump /proc/self/cgroup\n"); @@ -421,7 +608,7 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, ictx->flags |= INFECT_NO_MEMFD; if (fault_injected(FI_PARASITE_CONNECT)) ictx->flags |= INFECT_FAIL_CONNECT; - if (fault_injected(FI_NO_BREAKPOINTS) || !kdat.has_breakpoints) + if (fault_injected(FI_NO_BREAKPOINTS)) ictx->flags |= INFECT_NO_BREAKPOINTS; if (kdat.compat_cr) ictx->flags |= INFECT_COMPATIBLE; diff --git 
a/criu/path.c b/criu/path.c index f9ac6f5ae..1b71c4cb1 100644 --- a/criu/path.c +++ b/criu/path.c @@ -6,7 +6,6 @@ #include "mount.h" #include "path.h" #include "log.h" -#include "util.h" #include "common/bug.h" char *cut_root_for_bind(char *target_root, char *source_root) @@ -42,30 +41,64 @@ out: char *mnt_get_sibling_path(struct mount_info *m, struct mount_info *p, char *buf, int len) { struct mount_info *pa = m->parent; - char *rpath, fsrpath[PATH_MAX]; + char *rpath, *cut_root, *path = buf; + int off = 0; if (pa == NULL) return NULL; - rpath = get_relative_path(m->ns_mountpoint, pa->ns_mountpoint); - if (!rpath) { - pr_warn("child - parent mountpoint mismatch %s - %s\n", m->ns_mountpoint, pa->ns_mountpoint); - return NULL; - } + rpath = m->mountpoint + strlen(pa->mountpoint); + if (rpath[0] == '/') + rpath++; - if (snprintf(fsrpath, sizeof(fsrpath), "%s/%s", pa->root, rpath) >= sizeof(fsrpath)) { - pr_warn("snrptintf truncation \"%s / %s\"\n", pa->root, rpath); - return NULL; - } + /* + * Get a path to a sibling of "m" with parent "p", + * return NULL is p can't have a sibling of m. + * + * Here are two cases: + * When a parent of "m" has longer root than "p": + * / pm->root / rpath + * | cut_root | + * / p->root / + * In this case, a sibling path is a sum of p->mountpoint, + * cut_root and rpath. + * + * When a parent of m has shorter root than "p": + * / pm->root / rpath + * | cut_root | + * / p->root / rpath +strlen(cut_root) + * In this case, a sibling path is a sum of p->mountpoint and + * rpath - strlen(cut_root). 
+ */ - rpath = get_relative_path(fsrpath, p->root); - if (!rpath) - return NULL; - - if (snprintf(buf, len, "%s/%s", p->ns_mountpoint, rpath) >= sizeof(fsrpath)) { - pr_warn("snrptintf truncation \"%s / %s\"\n", p->ns_mountpoint, rpath); + cut_root = cut_root_for_bind(pa->root, p->root); + if (cut_root == NULL) return NULL; + if (p->mountpoint[1] != 0) /* not "/" */ { + off = snprintf(path, len, "%s", p->mountpoint); + if (path[off - 1] == '/') /* p->mountpoint = "./" */ + off--; } + len -= off; + path += off; + + if (strlen(pa->root) > strlen(p->root)) { + off = snprintf(path, len, "/%s", cut_root); + len -= off; + path += off; + } else { + int length = strlen(cut_root); + if (strncmp(rpath, cut_root, length)) + return NULL; + rpath += strlen(cut_root); + if (length > 0 && (rpath[0] && rpath[0] != '/')) + return NULL; + } + if (rpath[0] == '/') + rpath++; + + if (rpath[0] != '\0') + snprintf(path, len, "/%s", rpath); return buf; } diff --git a/criu/pidfd-store.c b/criu/pidfd-store.c index 110f7802a..98b478b30 100644 --- a/criu/pidfd-store.c +++ b/criu/pidfd-store.c @@ -13,7 +13,6 @@ #include "log.h" #include "util.h" #include "pidfd-store.h" -#include "sockets.h" struct pidfd_entry { pid_t pid; @@ -95,12 +94,13 @@ int init_pidfd_store_sk(pid_t pid, int sk) * This is similar to how fdstore_init() works. 
*/ if (addrlen == sizeof(sa_family_t)) { - if (sk_setbufs(pidfd_store_sk, buf)) { + if (setsockopt(pidfd_store_sk, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0], sizeof(buf[0])) < 0 || + setsockopt(pidfd_store_sk, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1], sizeof(buf[1])) < 0) { + pr_perror("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE"); goto err; } - addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-pidfd-store-%d-%d-%s", pid, sk, - criu_run_id); + addrlen = snprintf(addr.sun_path, sizeof(addr.sun_path), "X/criu-pidfd-store-%d-%d", pid, sk); addrlen += sizeof(addr.sun_family); addr.sun_path[0] = 0; diff --git a/criu/pidfd.c b/criu/pidfd.c deleted file mode 100644 index ae32025b0..000000000 --- a/criu/pidfd.c +++ /dev/null @@ -1,305 +0,0 @@ -#include "common/lock.h" -#include "imgset.h" -#include "pidfd.h" -#include "fdinfo.h" -#include "pidfd.pb-c.h" -#include "protobuf.h" -#include "pstree.h" -#include -#include -#include -#include "common/bug.h" -#include "rst-malloc.h" - -#include "compel/plugins/std/syscall-codes.h" - -#undef LOG_PREFIX -#define LOG_PREFIX "pidfd: " - -#ifndef PIDFD_THREAD -#define PIDFD_THREAD O_EXCL -#endif - -struct pidfd_info { - PidfdEntry *pidfe; - struct file_desc d; - - struct dead_pidfd *dead; - struct pidfd_info *next; -}; - -struct dead_pidfd { - unsigned int ino; - int creator_id; - - struct hlist_node hash; - struct pidfd_info *list; -}; - -#define DEAD_PIDFD_HASH_SIZE 32 -static struct hlist_head dead_pidfd_hash[DEAD_PIDFD_HASH_SIZE]; - -void init_dead_pidfd_hash(void) -{ - for (int i = 0; i < DEAD_PIDFD_HASH_SIZE; i++) - INIT_HLIST_HEAD(&dead_pidfd_hash[i]); -} - -static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino) -{ - struct dead_pidfd *dead; - struct hlist_head *chain; - - chain = &dead_pidfd_hash[ino % DEAD_PIDFD_HASH_SIZE]; - hlist_for_each_entry(dead, chain, hash) { - if (dead->ino == ino) { - return dead; - } - } - - return NULL; -} - -int is_pidfd_link(char *link) -{ - /* - * pidfs was introduced in Linux 6.9 - 
* before which anonymous-inodes were used - */ - return is_anon_link_type(link, "[pidfd]"); -} - -static void pr_info_pidfd(char *action, PidfdEntry *pidfe) -{ - pr_info("%s: id %#08x flags %u NSpid %d ino %u\n", - action, pidfe->id, pidfe->flags, pidfe->nspid, pidfe->ino - ); -} - -static int dump_one_pidfd(int pidfd, u32 id, const struct fd_parms *p) -{ - struct pidfd_dump_info pidfd_info = {.pidfe = PIDFD_ENTRY__INIT}; - FileEntry fe = FILE_ENTRY__INIT; - - if (parse_fdinfo(pidfd, FD_TYPES__PIDFD, &pidfd_info)) - return -1; - - if (p->flags & PIDFD_THREAD) { - pr_err("PIDFD_THREAD flag is currently not supported\n"); - return -1; - } - - /* - * Check if the pid pidfd refers to is part of process tree - * This ensures the process will exist on restore. - */ - if (pidfd_info.pid != -1 && !pstree_item_by_real(pidfd_info.pid)) { - pr_err("pidfd pid %d is not a part of process tree..\n", - pidfd_info.pid); - return -1; - } - - pidfd_info.pidfe.id = id; - pidfd_info.pidfe.flags = (p->flags & ~O_RDWR); - pidfd_info.pidfe.fown = (FownEntry *)&p->fown; - - fe.type = FD_TYPES__PIDFD; - fe.id = pidfd_info.pidfe.id; - fe.pidfd = &pidfd_info.pidfe; - - pr_info_pidfd("Dumping", &pidfd_info.pidfe); - return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); -} - -const struct fdtype_ops pidfd_dump_ops = { - .type = FD_TYPES__PIDFD, - .dump = dump_one_pidfd, -}; - -static int pidfd_open(pid_t pid, int flags) -{ - return syscall(__NR_pidfd_open, pid, flags); -} - -static int create_tmp_process(void) -{ - int tmp_process; - tmp_process = fork(); - if (tmp_process < 0) { - pr_perror("Could not fork"); - return -1; - } else if (tmp_process == 0) { - while(1) - sleep(1); - } - return tmp_process; -} - -static int kill_helper(pid_t pid) -{ - int status; - sigset_t blockmask, oldmask; - - /* - * Block SIGCHLD to prevent interfering from sigchld_handler() - * and to properly handle the tmp process termination without - * a race condition. 
A similar approach is used in cr_system(). - */ - sigemptyset(&oldmask); - sigemptyset(&blockmask); - sigaddset(&blockmask, SIGCHLD); - if (sigprocmask(SIG_BLOCK, &blockmask, &oldmask) == -1) { - pr_perror("Cannot set mask of blocked signals"); - goto err; - } - - if (kill(pid, SIGKILL) < 0) { - pr_perror("Could not kill temporary process with pid: %d", pid); - goto err; - } - - if (waitpid(pid, &status, 0) != pid) { - pr_perror("Could not wait on temporary process with pid: %d", pid); - goto err; - } - - /* Restore the original signal mask after tmp process has terminated */ - if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) { - pr_perror("Cannot clear blocked signals"); - goto err; - } - - if (!WIFSIGNALED(status)) { - pr_err("Expected temporary process to be terminated by a signal\n"); - goto err; - } - - if (WTERMSIG(status) != SIGKILL) { - pr_err("Expected temporary process to be terminated by SIGKILL\n"); - goto err; - } - - return 0; -err: - return -1; -} - -static int open_one_pidfd(struct file_desc *d, int *new_fd) -{ - struct pidfd_info *info, *child; - struct dead_pidfd *dead = NULL; - pid_t pid; - int pidfd; - - info = container_of(d, struct pidfd_info, d); - if (info->pidfe->nspid != -1) { - pidfd = pidfd_open(info->pidfe->nspid, info->pidfe->flags); - if (pidfd < 0) { - pr_perror("Could not open pidfd for %d", info->pidfe->nspid); - goto err_close; - } - goto out; - } - - dead = lookup_dead_pidfd(info->pidfe->ino); - BUG_ON(!dead); - - if (info->dead && info->dead->creator_id != info->pidfe->id) { - int ret = recv_desc_from_peer(&info->d, &pidfd); - if (ret != 0) { - if (ret != 1) - pr_err("Can't get fd\n"); - return ret; - } - goto out; - } - - pid = create_tmp_process(); - if (pid < 0) - goto err_close; - - for (child = dead->list; child; child = child->next) { - if (child == info) - continue; - pidfd = pidfd_open(pid, child->pidfe->flags); - if (pidfd < 0) { - pr_perror("Could not open pidfd for %d", child->pidfe->nspid); - goto err_close; - } - 
- if (send_desc_to_peer(pidfd, &child->d)) { - pr_perror("Can't send file descriptor"); - close(pidfd); - return -1; - } - close(pidfd); - } - - pidfd = pidfd_open(pid, info->pidfe->flags); - if (pidfd < 0) { - pr_perror("Could not open pidfd for %d", info->pidfe->nspid); - goto err_close; - } - if (kill_helper(pid)) - goto err_close; -out: - if (rst_file_params(pidfd, info->pidfe->fown, info->pidfe->flags)) { - goto err_close; - } - - *new_fd = pidfd; - return 0; -err_close: - pr_err("Can't create pidfd %#08x NSpid: %d flags: %u\n", - info->pidfe->id, info->pidfe->nspid, info->pidfe->flags); - return -1; -} - -static struct file_desc_ops pidfd_desc_ops = { - .type = FD_TYPES__PIDFD, - .open = open_one_pidfd -}; - -static int collect_one_pidfd(void *obj, ProtobufCMessage *msg, struct cr_img *i) -{ - struct dead_pidfd *dead; - struct pidfd_info *info = obj; - - info->pidfe = pb_msg(msg, PidfdEntry); - pr_info_pidfd("Collected ", info->pidfe); - - info->dead = NULL; - if (info->pidfe->nspid != -1) - goto out; - - dead = lookup_dead_pidfd(info->pidfe->ino); - if (!dead) { - dead = xmalloc(sizeof(*dead)); - if (!dead) { - pr_err("Could not allocate memory..\n"); - return -1; - } - - INIT_HLIST_NODE(&dead->hash); - dead->list = NULL; - dead->ino = info->pidfe->ino; - dead->creator_id = info->pidfe->id; - hlist_add_head(&dead->hash, &dead_pidfd_hash[dead->ino % DEAD_PIDFD_HASH_SIZE]); - } - - info->dead = dead; - info->next = dead->list; - dead->list = info; - if (dead->creator_id > info->pidfe->id) - dead->creator_id = info->pidfe->id; - -out: - return file_desc_add(&info->d, info->pidfe->id, &pidfd_desc_ops); -} - -struct collect_image_info pidfd_cinfo = { - .fd_type = CR_FD_PIDFD, - .pb_type = PB_PIDFD, - .priv_size = sizeof(struct pidfd_info), - .collect = collect_one_pidfd, -}; diff --git a/criu/pie/Makefile b/criu/pie/Makefile index 60c7f1e94..265dcf82b 100644 --- a/criu/pie/Makefile +++ b/criu/pie/Makefile @@ -18,15 +18,6 @@ ifeq ($(ARCH),mips) ccflags-y += 
-mno-abicalls -fno-pic endif -# -mshstk required for CET instructions -ifeq ($(ARCH),x86) - ccflags-y += -mshstk -endif - -ifeq ($(ARCH),riscv64) - ccflags-y += -fno-stack-protector -endif - LDS := compel/arch/$(ARCH)/scripts/compel-pack.lds.S restorer-obj-y += parasite-vdso.o ./$(ARCH_DIR)/vdso-pie.o @@ -47,10 +38,6 @@ ifeq ($(ARCH),ppc64) restorer-obj-y += ./$(ARCH_DIR)/vdso-trampoline.o endif -ifeq ($(ARCH),riscv64) - restorer-obj-y += ./$(ARCH_DIR)/vdso-lookup.o -endif - define gen-pie-rules $(1)-obj-y += $(1).o $(1)-obj-e += pie.lib.a diff --git a/criu/pie/Makefile.library b/criu/pie/Makefile.library index d96a7ac32..da2a2fab3 100644 --- a/criu/pie/Makefile.library +++ b/criu/pie/Makefile.library @@ -27,7 +27,3 @@ CFLAGS += $(CFLAGS_PIE) ifeq ($(ARCH),mips) CFLAGS += -fno-stack-protector -DCR_NOGLIBC -mno-abicalls -fno-pic endif - -ifeq ($(ARCH),riscv64) - ccflags-y += -fno-stack-protector -endif \ No newline at end of file diff --git a/criu/pie/parasite-vdso.c b/criu/pie/parasite-vdso.c index f3ad3107f..355007fa9 100644 --- a/criu/pie/parasite-vdso.c +++ b/criu/pie/parasite-vdso.c @@ -45,7 +45,6 @@ static int remap_one(char *who, unsigned long *from, unsigned long to, size_t si static int park_at(struct vdso_maps *rt, unsigned long vdso, unsigned long vvar) { unsigned long vvar_size = rt->sym.vvar_size; - unsigned long vvar_vclock_size = rt->sym.vvar_vclock_size; unsigned long vdso_size = rt->sym.vdso_size; int ret; @@ -55,24 +54,8 @@ static int park_at(struct vdso_maps *rt, unsigned long vdso, unsigned long vvar) std_log_set_gettimeofday(NULL); /* stop using vdso for timings */ - if (vvar) { - /* - * v6.13-rc1~172^2~9 splits the vvar vma in two parts vvar and - * vvar_clock. The last one is mapped right after the first - * one. 
- */ - if (vvar_vclock_size) { - unsigned long from; - - vvar_size -= vvar_vclock_size; - from = rt->vvar_start + vvar_size; - - ret = remap_one("rt-vvar", &from, vvar + vvar_size, vvar_vclock_size); - if (ret) - return ret; - } + if (vvar) ret = remap_one("rt-vvar", &rt->vvar_start, vvar, vvar_size); - } if (!ret) vdso_update_gtod_addr(rt); diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index c966e9e62..bc0a33cd4 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -3,17 +3,15 @@ #include #include #include +#include #include #include #include -#include "linux/rseq.h" - #include "common/config.h" #include "int.h" #include "types.h" #include -#include "linux/mount.h" #include "parasite.h" #include "fcntl.h" #include "prctl.h" @@ -101,7 +99,7 @@ static int dump_pages(struct parasite_dump_pages_args *args) } if (spliced_bytes != args->nr_pages * PAGE_SIZE) { sys_close(p); - pr_err("Can't splice all pages to pipe (%ld/%ld)\n", spliced_bytes, args->nr_pages); + pr_err("Can't splice all pages to pipe (%ld/%d)\n", spliced_bytes, args->nr_pages); return -1; } @@ -169,7 +167,6 @@ static int dump_posix_timers(struct parasite_dump_posix_timers_args *args) } static int dump_creds(struct parasite_dump_creds *args); -static int check_rseq(struct parasite_check_rseq *rseq); static int dump_thread_common(struct parasite_dump_thread *ti) { @@ -200,74 +197,11 @@ static int dump_thread_common(struct parasite_dump_thread *ti) goto out; } - ret = check_rseq(&ti->rseq); - if (ret) { - pr_err("Unable to check if rseq() is initialized: %d\n", ret); - goto out; - } - ret = dump_creds(ti->creds); out: return ret; } -/* - * Returns a membarrier() registration command (it is a bitmask) if the process - * was registered for specified (as a bit index) membarrier()-issuing command; - * returns zero otherwise. 
- */ -static int get_membarrier_registration_mask(int cmd_bit) -{ - unsigned cmd = 1 << cmd_bit; - int ret; - - /* - * Issuing a barrier will be successful only if the process was registered - * for this type of membarrier. All errors are a sign that the type issued - * was not registered (EPERM) or not supported by kernel (EINVAL or ENOSYS). - */ - ret = sys_membarrier(cmd, 0, 0); - if (ret && ret != -EPERM && ret != -EINVAL && ret != -ENOSYS) { - pr_err("membarrier(1 << %d) returned %d\n", cmd_bit, ret); - return -1; - } - pr_debug("membarrier(1 << %d) returned %d\n", cmd_bit, ret); - /* - * For supported registrations, MEMBARRIER_CMD_REGISTER_xxx = MEMBARRIER_CMD_xxx << 1. - * See: enum membarrier_cmd in include/uapi/linux/membarrier.h in kernel sources. - */ - return ret ? 0 : cmd << 1; -} - -/* - * It would be better to check the following with BUILD_BUG_ON, but we might - * have an old linux/membarrier.h header without necessary enum values. - */ -#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED 3 -#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE 5 -#define MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ 7 -#define MEMBARRIER_CMDBIT_GET_REGISTRATIONS 9 - -static int dump_membarrier_compat(int *membarrier_registration_mask) -{ - int ret; - - *membarrier_registration_mask = 0; - ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED); - if (ret < 0) - return -1; - *membarrier_registration_mask |= ret; - ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_SYNC_CORE); - if (ret < 0) - return -1; - *membarrier_registration_mask |= ret; - ret = get_membarrier_registration_mask(MEMBARRIER_CMDBIT_PRIVATE_EXPEDITED_RSEQ); - if (ret < 0) - return -1; - *membarrier_registration_mask |= ret; - return 0; -} - static int dump_misc(struct parasite_dump_misc *args) { int ret; @@ -282,19 +216,6 @@ static int dump_misc(struct parasite_dump_misc *args) args->dumpable = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0); args->thp_disabled = 
sys_prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); - if (args->has_membarrier_get_registrations) { - ret = sys_membarrier(1 << MEMBARRIER_CMDBIT_GET_REGISTRATIONS, 0, 0); - if (ret < 0) { - pr_err("membarrier(1 << %d) returned %d\n", MEMBARRIER_CMDBIT_GET_REGISTRATIONS, ret); - return -1; - } - args->membarrier_registration_mask = ret; - } else { - ret = dump_membarrier_compat(&args->membarrier_registration_mask); - if (ret) - return ret; - } - ret = sys_prctl(PR_GET_CHILD_SUBREAPER, (unsigned long)&args->child_subreaper, 0, 0, 0); if (ret) pr_err("PR_GET_CHILD_SUBREAPER failed (%d)\n", ret); @@ -324,7 +245,6 @@ static int dump_creds(struct parasite_dump_creds *args) args->cap_prm[i] = data[i].prm; args->cap_inh[i] = data[i].inh; args->cap_bnd[i] = 0; - args->cap_amb[i] = 0; for (j = 0; j < 32; j++) { if (j + i * 32 > args->cap_last_cap) @@ -337,21 +257,8 @@ static int dump_creds(struct parasite_dump_creds *args) if (ret) args->cap_bnd[i] |= (1 << j); } - - for (j = 0; j < 32; j++) { - if (j + i * 32 > args->cap_last_cap) - break; - ret = sys_prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, j + i * 32, 0, 0); - if (ret < 0) { - pr_err("Unable to read ambient capability %d: %d\n", j + i * 32, ret); - return -1; - } - if (ret) - args->cap_amb[i] |= (1 << j); - } } - args->no_new_privs = sys_prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); args->secbits = sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0); ret = sys_getgroups(0, NULL); @@ -406,97 +313,6 @@ grps_err: return -1; } -static int check_rseq(struct parasite_check_rseq *rseq) -{ - int ret; - unsigned long rseq_abi_pointer; - unsigned long rseq_abi_size; - uint32_t rseq_signature; - void *addr; - - /* no need to do hacky check if we can get all info from ptrace() */ - if (!rseq->has_rseq || rseq->has_ptrace_get_rseq_conf) - return 0; - - /* - * We need to determine if victim process has rseq() - * initialized, but we have no *any* proper kernel interface - * supported at this point. - * Our plan: - * 1. 
We know that if we call rseq() syscall and process already - * has current->rseq filled, then we get: - * -EINVAL if current->rseq != rseq || rseq_len != sizeof(*rseq), - * -EPERM if current->rseq_sig != sig), - * -EBUSY if current->rseq == rseq && rseq_len == sizeof(*rseq) && - * current->rseq_sig != sig - * if current->rseq == NULL (rseq() wasn't used) then we go to: - * IS_ALIGNED(rseq ...) check, if we fail it we get -EINVAL and it - * will be hard to distinguish case when rseq() was initialized or not. - * Let's construct arguments payload - * with: - * 1. correct rseq_abi_size - * 2. aligned and correct rseq_abi_pointer - * And see what rseq() return to us. - * If ret value is: - * 0: it means that rseq *wasn't* used and we successfully registered it, - * -EINVAL or : it means that rseq is already initialized, - * so we *have* to dump it. But as we have has_ptrace_get_rseq_conf = false, - * we should just fail dump as it's unsafe to skip rseq() dump for processes - * with rseq() initialized. 
- * -EPERM or -EBUSY: should not happen as we take a fresh memory area for rseq - */ - addr = (void *)sys_mmap(NULL, sizeof(struct criu_rseq), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, - 0); - if (addr == MAP_FAILED) { - pr_err("mmap() failed for struct rseq ret = %lx\n", (unsigned long)addr); - return -1; - } - - memset(addr, 0, sizeof(struct criu_rseq)); - - /* sys_mmap returns page aligned addresses */ - rseq_abi_pointer = (unsigned long)addr; - rseq_abi_size = (unsigned long)sizeof(struct criu_rseq); - /* it's not so important to have unique signature for us, - * because rseq_abi_pointer is guaranteed to be unique - */ - rseq_signature = 0x12345612; - - pr_info("\ttrying sys_rseq(%lx, %lx, %x, %x)\n", rseq_abi_pointer, rseq_abi_size, 0, rseq_signature); - ret = sys_rseq((void *)rseq_abi_pointer, rseq_abi_size, 0, rseq_signature); - if (ret) { - if (ret == -EINVAL) { - pr_info("\trseq is initialized in the victim\n"); - rseq->rseq_inited = true; - - ret = 0; - } else { - pr_err("\tunexpected failure of sys_rseq(%lx, %lx, %x, %x) = %d\n", rseq_abi_pointer, - rseq_abi_size, 0, rseq_signature, ret); - - ret = -1; - } - } else { - ret = sys_rseq((void *)rseq_abi_pointer, sizeof(struct criu_rseq), RSEQ_FLAG_UNREGISTER, - rseq_signature); - if (ret) { - pr_err("\tfailed to unregister sys_rseq(%lx, %lx, %x, %x) = %d\n", rseq_abi_pointer, - rseq_abi_size, RSEQ_FLAG_UNREGISTER, rseq_signature, ret); - - ret = -1; - /* we can't do munmap() because rseq is registered and we failed to unregister it */ - goto out_nounmap; - } - - rseq->rseq_inited = false; - ret = 0; - } - - sys_munmap(addr, sizeof(struct criu_rseq)); -out_nounmap: - return ret; -} - static int fill_fds_fown(int fd, struct fd_opts *p) { int flags, ret; @@ -638,7 +454,7 @@ static int get_proc_fd(void) ret = sys_mount("proc", proc_mountpoint, "proc", MS_MGC_VAL, NULL); if (ret) { if (ret == -EPERM) - pr_err("can't dump unprivileged task whose /proc doesn't belong to it\n"); + pr_err("can't dump 
unpriviliged task whose /proc doesn't belong to it\n"); else pr_err("mount failed (%d)\n", ret); sys_rmdir(proc_mountpoint); @@ -829,7 +645,7 @@ static int parasite_dump_cgroup(struct parasite_dump_cgroup_args *args) return -1; } - cgroup = sys_openat(proc, args->thread_cgrp, O_RDONLY, 0); + cgroup = sys_openat(proc, "self/cgroup", O_RDONLY, 0); sys_close(proc); if (cgroup < 0) { pr_err("can't get /proc/self/cgroup fd\n"); diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 0a8aba41b..4304691bb 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -17,7 +17,6 @@ #include #include #include -#include #include "linux/userfaultfd.h" @@ -28,7 +27,6 @@ #include #include #include -#include "mman.h" #include "signal.h" #include "prctl.h" #include "criu-log.h" @@ -50,17 +48,7 @@ #include "images/inventory.pb-c.h" #include "shmem.h" - -/* - * sys_getgroups() buffer size. Not too much, to avoid stack overflow. - */ -#define MAX_GETGROUPS_CHECKED (512 / sizeof(unsigned int)) - -/* - * Memory overhead limit for reading VMA when auto_dedup is enabled. - * An arbitrarily chosen trade-off point between speed and memory usage. - */ -#define AUTO_DEDUP_OVERHEAD_BYTES (128 << 20) +#include "restorer.h" #ifndef PR_SET_PDEATHSIG #define PR_SET_PDEATHSIG 1 @@ -78,10 +66,6 @@ #define FALLOC_FL_PUNCH_HOLE 0x02 #endif -#ifndef ARCH_RT_SIGRETURN_RST -#define ARCH_RT_SIGRETURN_RST ARCH_RT_SIGRETURN -#endif - #define sys_prctl_safe(opcode, val1, val2, val3) \ ({ \ long __ret = sys_prctl(opcode, val1, val2, val3, 0); \ @@ -108,7 +92,7 @@ bool fault_injected(enum faults f) * Hint: compel on aarch64 shall learn relocs for that. 
*/ static unsigned __page_size; -unsigned long page_size(void) +unsigned page_size(void) { return __page_size; } @@ -200,58 +184,37 @@ static int lsm_set_label(char *label, char *type, int procfd) return 0; } -static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_type, uid_t uid) +static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_type) { CredsEntry *ce = &args->creds; int b, i, ret; struct cap_header hdr; struct cap_data data[_LINUX_CAPABILITY_U32S_3]; - int ruid, euid, suid, fsuid; - int rgid, egid, sgid, fsgid; + + /* + * We're still root here and thus can do it without failures. + */ /* * Setup supplementary group IDs early. */ if (args->groups) { - /* - * We may be in an unprivileged user namespace where setgroups - * is disabled. If the current list of groups is already what - * we want, skip the call to setgroups. - */ - unsigned int gids[MAX_GETGROUPS_CHECKED]; - int n = sys_getgroups(MAX_GETGROUPS_CHECKED, gids); - if (n != ce->n_groups || memcmp(gids, args->groups, n * sizeof(*gids))) { - ret = sys_setgroups(ce->n_groups, args->groups); - if (ret) { - pr_err("Can't setgroups([%zu gids]): %d\n", ce->n_groups, ret); - return -1; - } + ret = sys_setgroups(ce->n_groups, args->groups); + if (ret) { + pr_err("Can't setup supplementary group IDs: %d\n", ret); + return -1; } } - /* - * Compare xids with current values. If all match then we can skip - * setting them (which requires extra capabilities). - */ - fsuid = sys_setfsuid(-1); - fsgid = sys_setfsgid(-1); - if (sys_getresuid(&ruid, &euid, &suid) == 0 && sys_getresgid(&rgid, &egid, &sgid) == 0 && ruid == ce->uid && - euid == ce->euid && suid == ce->suid && rgid == ce->gid && egid == ce->egid && sgid == ce->sgid && - fsuid == ce->fsuid && fsgid == ce->fsgid) { - goto skip_xids; - } - /* * First -- set the SECURE_NO_SETUID_FIXUP bit not to * lose caps bits when changing xids. 
*/ - if (!uid) { - ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0); - if (ret) { - pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret); - return -1; - } + ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0); + if (ret) { + pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret); + return -1; } /* @@ -284,18 +247,15 @@ static int restore_creds(struct thread_creds_args *args, int procfd, int lsm_typ return -1; } -skip_xids: /* * Third -- restore securebits. We don't need them in any * special state any longer. */ - if (sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0) != ce->secbits) { - ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); - if (ret) { - pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); - return -1; - } + ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0); + if (ret) { + pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret); + return -1; } /* @@ -311,18 +271,10 @@ skip_xids: /* already set */ continue; ret = sys_prctl(PR_CAPBSET_DROP, i + b * 32, 0, 0, 0); - if (!ret) - continue; - if (!ce->has_no_new_privs || !ce->no_new_privs || args->cap_prm[b] & (1 << i)) { + if (ret) { pr_err("Unable to drop capability %d: %d\n", i + b * 32, ret); return -1; } - /* - * If prctl(NO_NEW_PRIVS) is going to be set then it - * will prevent inheriting the capabilities not in - * the permitted set. 
- */ - pr_warn("Unable to drop capability %d from bset: %d (but NO_NEW_PRIVS will drop it)\n", i + b * 32, ret); } } @@ -348,22 +300,6 @@ skip_xids: return -1; } - for (b = 0; b < CR_CAP_SIZE; b++) { - for (i = 0; i < 32; i++) { - if (b * 32 + i > args->cap_last_cap) - break; - if ((args->cap_amb[b] & (1 << i)) == 0) - /* don't set */ - continue; - ret = sys_prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i + b * 32, 0, 0); - if (!ret) - continue; - pr_err("Unable to raise ambient capability %d: %d\n", i + b * 32, ret); - return -1; - } - } - - if (lsm_type != LSMTYPE__SELINUX) { /* * SELinux does not support setting the process context for @@ -379,14 +315,6 @@ skip_xids: if (lsm_set_label(args->lsm_sockcreate, "sockcreate", procfd) < 0) return -1; - if (ce->has_no_new_privs && ce->no_new_privs) { - ret = sys_prctl(PR_SET_NO_NEW_PRIVS, ce->no_new_privs, 0, 0, 0); - if (ret) { - pr_err("Unable to set no_new_privs=%d: %d\n", ce->no_new_privs, ret); - return -1; - } - } - return 0; } @@ -454,13 +382,13 @@ static int restore_dumpable_flag(MmEntry *mme) static void restore_sched_info(struct rst_sched_param *p) { - struct sched_param param; + struct sched_param parm; pr_info("Restoring scheduler params %d.%d.%d\n", p->policy, p->nice, p->prio); sys_setpriority(PRIO_PROCESS, 0, p->nice); - param.sched_priority = p->prio; - sys_sched_setscheduler(0, p->policy, ¶m); + parm.sched_priority = p->prio; + sys_sched_setscheduler(0, p->policy, &parm); } static void restore_rlims(struct task_restore_args *ta) @@ -497,28 +425,6 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group) return 0; } -static int restore_rseq(struct rst_rseq_param *rseq) -{ - int ret; - - if (!rseq->rseq_abi_pointer) { - pr_debug("rseq: nothing to restore\n"); - return 0; - } - - pr_debug("rseq: rseq_abi_pointer = %lx signature = %x\n", (unsigned long)rseq->rseq_abi_pointer, - rseq->signature); - - ret = sys_rseq(decode_pointer(rseq->rseq_abi_pointer), rseq->rseq_abi_size, 0, rseq->signature); - if 
(ret) { - pr_err("failed sys_rseq(%lx, %lx, %x, %x) = %d\n", (unsigned long)rseq->rseq_abi_pointer, - (unsigned long)rseq->rseq_abi_size, 0, rseq->signature, ret); - return -1; - } - - return 0; -} - static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args) { unsigned int flags = args->seccomp_force_tsync ? SECCOMP_FILTER_FLAG_TSYNC : 0; @@ -643,107 +549,19 @@ static int restore_thread_common(struct thread_restore_args *args) restore_tls(&args->tls); - if (restore_rseq(&args->rseq)) - return -1; - return 0; } static void noinline rst_sigreturn(unsigned long new_sp, struct rt_sigframe *sigframe) { - ARCH_RT_SIGRETURN_RST(new_sp, sigframe); -} - -static int send_cg_set(int sk, int cg_set) -{ - struct cmsghdr *ch; - struct msghdr h; - /* - * 0th is the dummy call address for compatibility with userns helper - * 1st is the cg_set - */ - struct iovec iov[2]; - char cmsg[CMSG_SPACE(sizeof(struct ucred))] = {}; - int ret, *dummy = NULL; - struct ucred *ucred; - - iov[0].iov_base = &dummy; - iov[0].iov_len = sizeof(dummy); - iov[1].iov_base = &cg_set; - iov[1].iov_len = sizeof(cg_set); - - h.msg_iov = iov; - h.msg_iovlen = sizeof(iov) / sizeof(struct iovec); - h.msg_name = NULL; - h.msg_namelen = 0; - h.msg_flags = 0; - - h.msg_control = cmsg; - h.msg_controllen = sizeof(cmsg); - ch = CMSG_FIRSTHDR(&h); - ch->cmsg_len = CMSG_LEN(sizeof(struct ucred)); - ch->cmsg_level = SOL_SOCKET; - ch->cmsg_type = SCM_CREDENTIALS; - - ucred = (struct ucred *)CMSG_DATA(ch); - /* - * We still have privilege in this namespace so we can send - * thread id instead of pid of main thread, uid, gid as 0 - * since these 2 are ignored in cgroupd - */ - ucred->pid = sys_gettid(); - ucred->uid = 0; - ucred->gid = 0; - - ret = sys_sendmsg(sk, &h, 0); - if (ret < 0) { - pr_err("Unable to send packet to cgroupd %d\n", ret); - return -1; - } - - return 0; -} - -/* - * As the cgroupd socket is shared among threads and processes, this - * should be called with 
task_entries->cgroupd_sync_lock held. - */ -static int recv_cg_set_restore_ack(int sk) -{ - struct cmsghdr *ch; - struct msghdr h = {}; - char cmsg[CMSG_SPACE(sizeof(struct ucred))]; - struct ucred *cred; - int ret; - - h.msg_control = cmsg; - h.msg_controllen = sizeof(cmsg); - - ret = sys_recvmsg(sk, &h, 0); - if (ret < 0) { - pr_err("Unable to receive from cgroupd %d\n", ret); - return -1; - } - - if (h.msg_controllen != sizeof(cmsg)) { - pr_err("The message from cgroupd is truncated\n"); - return -1; - } - - ch = CMSG_FIRSTHDR(&h); - cred = (struct ucred *)CMSG_DATA(ch); - if (cred->pid != sys_gettid()) { - pr_err("cred pid %d != gettid\n", cred->pid); - return -1; - } - return 0; + ARCH_RT_SIGRETURN(new_sp, sigframe); } /* * Threads restoration via sigreturn. Note it's locked * routine and calls for unlock at the end. */ -__visible long __export_restore_thread(struct thread_restore_args *args) +long __export_restore_thread(struct thread_restore_args *args) { struct rt_sigframe *rt_sigframe; k_rtsigset_t to_block; @@ -756,10 +574,6 @@ __visible long __export_restore_thread(struct thread_restore_args *args) goto core_restore_end; } - /* restore original shadow stack */ - if (arch_shstk_restore(&args->shstk)) - goto core_restore_end; - /* All signals must be handled by thread leader */ ksigfillset(&to_block); ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t)); @@ -770,24 +584,6 @@ __visible long __export_restore_thread(struct thread_restore_args *args) rt_sigframe = (void *)&args->mz->rt_sigframe; - if (args->cg_set != -1) { - int err = 0; - - mutex_lock(&task_entries_local->cgroupd_sync_lock); - - pr_info("Restore cg_set in thread cg_set: %d\n", args->cg_set); - - err = send_cg_set(args->cgroupd_sk, args->cg_set); - if (!err) - err = recv_cg_set_restore_ack(args->cgroupd_sk); - - mutex_unlock(&task_entries_local->cgroupd_sync_lock); - sys_close(args->cgroupd_sk); - - if (err) - goto core_restore_end; - } - if (restore_thread_common(args)) 
goto core_restore_end; @@ -813,7 +609,7 @@ __visible long __export_restore_thread(struct thread_restore_args *args) if (restore_seccomp(args)) BUG(); - ret = restore_creds(args->creds_args, args->ta->proc_fd, args->ta->lsm_type, args->ta->uid); + ret = restore_creds(args->creds_args, args->ta->proc_fd, args->ta->lsm_type); ret = ret || restore_dumpable_flag(&args->ta->mm); ret = ret || restore_pdeath_sig(args); if (ret) @@ -924,7 +720,7 @@ static unsigned long restore_mapping(VmaEntry *vma_entry) /* * This restores aio ring header, content, head and in-kernel position * of tail. To set tail, we write to /dev/null and use the fact this - * operation is synchronous for the device. Also, we unmap temporary + * operation is synchronious for the device. Also, we unmap temporary * anonymous area, used to store content of ring buffer during restore * and mapped in premap_private_vma(). */ @@ -1112,23 +908,6 @@ static int vma_remap(VmaEntry *vma_entry, int uffd) pr_info("Remap %lx->%lx len %lx\n", src, dst, len); - /* - * SHSTK VMAs are a bit special, in fact we create shstk vma right in the - * shstk_vma_restore() and populate it with contents from a premapped VMA - * (which in turns is just a normal anonymous VMA!). Then, we munmap() this - * premapped VMA. After, we need to adjust vma_premmaped_start(vma_entry) - * to point to a created shstk vma and treat it as a premmaped one in vma_remap(). - */ - if (vma_entry_is(vma_entry, VMA_AREA_SHSTK)) { - if (shstk_vma_restore(vma_entry)) { - pr_err("Unable to prepare shadow stack vma for remap %lx -> %lx\n", src, dst); - return -1; - } - - /* shstk_vma_restore() modifies vma premapped address */ - src = vma_premmaped_start(vma_entry); - } - if (src - dst < len) guard = dst; else if (dst - src < len) @@ -1153,7 +932,7 @@ static int vma_remap(VmaEntry *vma_entry, int uffd) * |G|----tgt----| | * * 3. remap src to any other place. 
- * G prevents src from being remapped on tgt again + * G prevents src from being remaped on tgt again * | |-------------| -> |+++++src+++++| * |G|---tgt-----| | * @@ -1173,7 +952,7 @@ static int vma_remap(VmaEntry *vma_entry, int uffd) /* Move src to non-overlapping place (step 3) */ addr = sys_mmap(NULL, len, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); - if (IS_ERR((void *)addr)) { + if (addr == (unsigned long)MAP_FAILED) { pr_err("Unable to reserve memory (%lx)\n", addr); return -1; } @@ -1234,7 +1013,7 @@ static int timerfd_arm(struct task_restore_args *args) t->val.it_value.tv_sec += (time_t)ts.tv_sec; - pr_debug("Adjust id %x it_value(%llu, %llu) -> it_value(%llu, %llu)\n", t->id, + pr_debug("Adjust id %#x it_value(%llu, %llu) -> it_value(%llu, %llu)\n", t->id, (unsigned long long)ts.tv_sec, (unsigned long long)ts.tv_nsec, (unsigned long long)t->val.it_value.tv_sec, (unsigned long long)t->val.it_value.tv_nsec); @@ -1253,23 +1032,9 @@ static int timerfd_arm(struct task_restore_args *args) static int create_posix_timers(struct task_restore_args *args) { - int ret, i, exit_code = -1; - kernel_timer_t next_id = 0, timer_id; + int ret, i; + kernel_timer_t next_id; struct sigevent sev; - bool create_restore_ids = false; - - if (!args->posix_timers_n) - return 0; - - /* prctl returns EINVAL if PR_TIMER_CREATE_RESTORE_IDS isn't supported. 
*/ - ret = sys_prctl(PR_TIMER_CREATE_RESTORE_IDS, - PR_TIMER_CREATE_RESTORE_IDS_ON, 0, 0, 0); - if (ret == 0) { - create_restore_ids = true; - } else if (ret != -EINVAL) { - pr_err("Can't enabled PR_TIMER_CREATE_RESTORE_IDS: %d\n", ret); - return -1; - } for (i = 0; i < args->posix_timers_n; i++) { sev.sigev_notify = args->posix_timers[i].spt.it_sigev_notify; @@ -1281,61 +1046,30 @@ static int create_posix_timers(struct task_restore_args *args) #endif sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr; - if (create_restore_ids) { - /* - * With enabled PR_TIMER_CREATE_RESTORE_IDS, the - * timer_create syscall creates a new timer with the - * specified ID. - */ - timer_id = args->posix_timers[i].spt.it_id; - ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); - if (ret < 0) { - pr_err("Can't create posix timer - %d: %d\n", i, ret); - goto out; - } - if (timer_id != args->posix_timers[i].spt.it_id) { - pr_err("Unexpected timer id %u (expected %lu)\n", - timer_id, args->posix_timers[i].spt.it_id); - goto out; - } - continue; - } - while (1) { - ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &timer_id); + ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &next_id); if (ret < 0) { pr_err("Can't create posix timer - %d\n", i); - goto out; + return ret; } - if (timer_id != next_id) { - pr_err("Can't create timers, kernel don't give them consequently\n"); - goto out; - } - next_id++; - - if (timer_id == args->posix_timers[i].spt.it_id) + if (next_id == args->posix_timers[i].spt.it_id) break; - ret = sys_timer_delete(timer_id); + ret = sys_timer_delete(next_id); if (ret < 0) { - pr_err("Can't remove temporaty posix timer 0x%x\n", timer_id); - goto out; + pr_err("Can't remove temporaty posix timer 0x%x\n", next_id); + return ret; + } + + if ((long)next_id > args->posix_timers[i].spt.it_id) { + pr_err("Can't create timers, kernel don't give them consequently\n"); + return -1; } } } - exit_code = 0; -out: - 
if (create_restore_ids) { - ret = sys_prctl(PR_TIMER_CREATE_RESTORE_IDS, - PR_TIMER_CREATE_RESTORE_IDS_OFF, 0, 0, 0); - if (ret != 0) { - pr_err("Can't disable PR_TIMER_CREATE_RESTORE_IDS: %d\n", ret); - exit_code = -1; - } - } - return exit_code; + return 0; } static void restore_posix_timers(struct task_restore_args *args) @@ -1358,26 +1092,11 @@ unsigned long vdso_rt_size = 0; void *bootstrap_start = NULL; unsigned int bootstrap_len = 0; -__visible void __export_unmap(void) +void __export_unmap(void) { sys_munmap(bootstrap_start, bootstrap_len - vdso_rt_size); } -static int unregister_libc_rseq(struct rst_rseq_param *rseq) -{ - long ret; - - if (!rseq->rseq_abi_pointer) - return 0; - - ret = sys_rseq(decode_pointer(rseq->rseq_abi_pointer), rseq->rseq_abi_size, 1, rseq->signature); - if (ret) { - pr_err("Failed to unregister libc rseq %ld\n", ret); - return -1; - } - return 0; -} - /* * This function unmaps all VMAs, which don't belong to * the restored process or the restorer. @@ -1571,40 +1290,6 @@ static int fd_poll(int inotify_fd) return sys_ppoll(&pfd, 1, &tmo, NULL, sizeof(sigset_t)); } -/* - * Call preadv() but limit size of the read. Zero `max_to_read` skips the limit. 
- */ -static ssize_t preadv_limited(int fd, struct iovec *iovs, int nr, off_t offs, size_t max_to_read) -{ - size_t saved_last_iov_len = 0; - ssize_t ret; - - if (max_to_read) { - for (int i = 0; i < nr; ++i) { - if (iovs[i].iov_len <= max_to_read) { - max_to_read -= iovs[i].iov_len; - continue; - } - - if (!max_to_read) { - nr = i; - break; - } - - saved_last_iov_len = iovs[i].iov_len; - iovs[i].iov_len = max_to_read; - nr = i + 1; - break; - } - } - - ret = sys_preadv(fd, iovs, nr, offs); - if (saved_last_iov_len) - iovs[nr - 1].iov_len = saved_last_iov_len; - - return ret; -} - /* * In the worst case buf size should be: * sizeof(struct inotify_event) * 2 + PATH_MAX @@ -1644,7 +1329,7 @@ static int cleanup_inotify_events(int inotify_fd) /* * When we restore inotifies we can open and close files we create a watch - * for. So we need to cleanup these auxiliary events which we've generated. + * for. So wee need to cleanup these auxiliary events which we've generated. * * note: For now we don't have a way to c/r events in queue but we need to * at least leave the queue clean from events generated by our own. @@ -1665,54 +1350,6 @@ int cleanup_current_inotify_events(struct task_restore_args *task_args) return 0; } -/* - * Restore membarrier() registrations. 
- */ -static int restore_membarrier_registrations(int mask) -{ - unsigned long bitmap[1] = { mask }; - int i, err, ret = 0; - - if (!mask) - return 0; - - pr_info("Restoring membarrier() registrations %x\n", mask); - - for_each_bit(i, bitmap) { - err = sys_membarrier(1 << i, 0, 0); - if (!err) - continue; - pr_err("Can't restore membarrier(1 << %d) registration: %d\n", i, err); - ret = -1; - } - - return ret; -} - -static int restore_madv_guard_regions(struct task_restore_args *args) -{ - int i, ret; - - for (i = 0; i < args->vmas_n; i++) { - VmaEntry *vma_entry = args->vmas + i; - size_t len; - - if (!vma_entry_is(vma_entry, VMA_AREA_GUARD)) - continue; - - len = vma_entry->end - vma_entry->start; - ret = sys_madvise(vma_entry->start, len, MADV_GUARD_INSTALL); - if (ret) { - pr_err("madvise(%" PRIx64 ", %zu, MADV_GUARD_INSTALL) " - "failed with %d\n", - vma_entry->start, len, ret); - return -1; - } - } - - return 0; -} - /* * The main routine to restore task via sigreturn. * This one is very special, we never return there @@ -1720,7 +1357,7 @@ static int restore_madv_guard_regions(struct task_restore_args *args) * and jump execution to some predefined ip read from * core file. */ -__visible long __export_restore_task(struct task_restore_args *args) +long __export_restore_task(struct task_restore_args *args) { long ret = -1; int i; @@ -1780,15 +1417,12 @@ __visible long __export_restore_task(struct task_restore_args *args) pr_debug("lazy-pages: uffd %d\n", args->uffd); } - if (arch_shstk_switch_to_restorer(&args->shstk)) - goto core_restore_end; - /* * Park vdso/vvar in a safe place if architecture doesn't support * mapping them with arch_prctl(). * Always preserve/map rt-vdso pair if it's possible, regardless * it's presence in original task: vdso will be used for fast - * gettimeofday() in restorer's log timings. + * getttimeofday() in restorer's log timings. 
*/ if (!args->can_map_vdso && vdso_is_present(&args->vdso_maps_rt)) { /* It's already checked in kdat, but let's check again */ @@ -1802,16 +1436,6 @@ __visible long __export_restore_task(struct task_restore_args *args) goto core_restore_end; } - /* - * We may have rseq registered already if CRIU compiled against - * a fresh Glibc with rseq support. Anyway, we need to unregister it - * before doing unmap_old_vmas or we will get SIGSEGV from the kernel, - * for instance once the kernel will want to update (struct rseq).cpu_id field: - * https://github.com/torvalds/linux/blob/ce522ba9ef7e/kernel/rseq.c#L89 - */ - if (unregister_libc_rseq(&args->libc_rseq)) - goto core_restore_end; - if (unmap_old_vmas((void *)args->premmapped_addr, args->premmapped_len, bootstrap_start, bootstrap_len, args->task_size)) goto core_restore_end; @@ -1856,13 +1480,17 @@ __visible long __export_restore_task(struct task_restore_args *args) goto core_restore_end; } - ret = sys_prctl(PR_SET_THP_DISABLE, args->thp_disabled, 0, 0, 0); - if (ret) { - pr_err("Cannot restore THP_DISABLE=%d flag: %ld\n", args->thp_disabled, ret); - goto core_restore_end; - } - if (args->uffd > -1) { + /* re-enable THP if we disabled it previously */ + if (args->has_thp_enabled) { + int ret; + ret = sys_prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0); + if (ret) { + pr_err("Cannot re-enable THP: %d\n", ret); + goto core_restore_end; + } + } + pr_debug("lazy-pages: closing uffd %d\n", args->uffd); /* * All userfaultfd configuration has finished at this point. @@ -1904,12 +1532,7 @@ __visible long __export_restore_task(struct task_restore_args *args) while (nr) { pr_debug("Preadv %lx:%d... (%d iovs)\n", (unsigned long)iovs->iov_base, (int)iovs->iov_len, nr); - /* - * If we're requested to punch holes in the file after reading we do - * it to save memory. Limit the reads then to an arbitrary block size. - */ - r = preadv_limited(args->vma_ios_fd, iovs, nr, rio->off, - args->auto_dedup ? 
AUTO_DEDUP_OVERHEAD_BYTES : 0); + r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off); if (r < 0) { pr_err("Can't read pages data (%d)\n", (int)r); goto core_restore_end; @@ -1996,9 +1619,6 @@ __visible long __export_restore_task(struct task_restore_args *args) for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) { if (vma_entry->madv & (1ul << m)) { - if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR))) - continue; - ret = sys_madvise(vma_entry->start, vma_entry_len(vma_entry), m); if (ret) { pr_err("madvise(%" PRIx64 ", %" PRIu64 ", %ld) " @@ -2010,13 +1630,6 @@ __visible long __export_restore_task(struct task_restore_args *args) } } - /* - * Restore madvise(MADV_GUARD_INSTALL) - */ - ret = restore_madv_guard_regions(args); - if (ret) - goto core_restore_end; - /* * Tune up the task fields. */ @@ -2046,24 +1659,6 @@ __visible long __export_restore_task(struct task_restore_args *args) .exe_fd = args->fd_exe_link, }; ret = sys_prctl(PR_SET_MM, PR_SET_MM_MAP, (long)&prctl_map, sizeof(prctl_map), 0); - if (ret) { - pr_debug("prctl PR_SET_MM_MAP failed with %d\n", (int)ret); - pr_debug(" .start_code = %" PRIx64 "\n", prctl_map.start_code); - pr_debug(" .end_code = %" PRIx64 "\n", prctl_map.end_code); - pr_debug(" .start_data = %" PRIx64 "\n", prctl_map.start_data); - pr_debug(" .end_data = %" PRIx64 "\n", prctl_map.end_data); - pr_debug(" .start_stack = %" PRIx64 "\n", prctl_map.start_stack); - pr_debug(" .start_brk = %" PRIx64 "\n", prctl_map.start_brk); - pr_debug(" .brk = %" PRIx64 "\n", prctl_map.brk); - pr_debug(" .arg_start = %" PRIx64 "\n", prctl_map.arg_start); - pr_debug(" .arg_end = %" PRIx64 "\n", prctl_map.arg_end); - pr_debug(" .env_start = %" PRIx64 "\n", prctl_map.env_start); - pr_debug(" .env_end = %" PRIx64 "\n", prctl_map.env_end); - pr_debug(" .auxv_size = %" PRIu32 "\n", prctl_map.auxv_size); - for (i = 0; i < prctl_map.auxv_size / sizeof(uint64_t); i++) - pr_debug(" .auxv[%d] = %" PRIx64 "\n", i, prctl_map.auxv[i]); - pr_debug(" .exe_fd = %" PRIu32 
"\n", prctl_map.exe_fd); - } if (ret == -EINVAL) { ret = sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE, (long)args->mm.mm_start_code, 0); ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE, (long)args->mm.mm_end_code, 0); @@ -2193,7 +1788,6 @@ __visible long __export_restore_task(struct task_restore_args *args) } if (ret != thread_args[i].pid) { pr_err("Unable to create a thread: %ld\n", ret); - sys_close(fd); mutex_unlock(&task_entries_local->last_pid_mutex); goto core_restore_end; } @@ -2218,9 +1812,6 @@ __visible long __export_restore_task(struct task_restore_args *args) goto core_restore_end; } - if (restore_membarrier_registrations(args->membarrier_registration_mask) < 0) - goto core_restore_end; - pr_info("%ld: Restored\n", sys_getpid()); restore_finish_stage(task_entries_local, CR_STATE_RESTORE); @@ -2281,21 +1872,13 @@ __visible long __export_restore_task(struct task_restore_args *args) * turning off TCP repair is CAP_SYS_NED_ADMIN protected, * thus restore* creds _after_ all of the above. */ - ret = restore_creds(args->t->creds_args, args->proc_fd, args->lsm_type, args->uid); + ret = restore_creds(args->t->creds_args, args->proc_fd, args->lsm_type); ret = ret || restore_dumpable_flag(&args->mm); ret = ret || restore_pdeath_sig(args->t); ret = ret || restore_child_subreaper(args->child_subreaper); futex_set_and_wake(&thread_inprogress, args->nr_threads); - /* - * Shadow stack of the leader can be locked only after all other - * threads were cloned, otherwise they may start with read-only - * shadow stack. - */ - if (arch_shstk_restore(&args->shstk)) - goto core_restore_end; - restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS); if (ret) @@ -2312,7 +1895,7 @@ __visible long __export_restore_task(struct task_restore_args *args) * code below doesn't fail due to bad timing values. 
*/ -#define itimer_armed(args, i) (args->itimers[i].it_value.tv_sec || args->itimers[i].it_value.tv_usec) +#define itimer_armed(args, i) (args->itimers[i].it_interval.tv_sec || args->itimers[i].it_interval.tv_usec) if (itimer_armed(args, 0)) sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL); diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index 45fb6a648..f1e3239ff 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include @@ -49,25 +48,10 @@ static bool __ptr_struct_oob(uintptr_t ptr, size_t struct_size, uintptr_t start, return __ptr_oob(ptr, start, size) || __ptr_struct_end_oob(ptr, struct_size, start, size); } -/* Local strlen implementation */ -static size_t __strlen(const char *str) -{ - const char *ptr; - - if (!str) - return 0; - - ptr = str; - while (*ptr != '\0') - ptr++; - - return ptr - str; -} - /* * Elf hash, see format specification. */ -static unsigned long elf_sysv_hash(const unsigned char *name) +static unsigned long elf_hash(const unsigned char *name) { unsigned long h = 0, g; @@ -81,15 +65,6 @@ static unsigned long elf_sysv_hash(const unsigned char *name) return h; } -/* * The GNU hash format. Taken from glibc. */ -static unsigned long elf_gnu_hash(const unsigned char *name) -{ - unsigned long h = 5381; - for (unsigned char c = *name; c != '\0'; c = *++name) - h = h * 33 + c; - return h; -} - #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define BORD ELFDATA2MSB /* 0x02 */ #else @@ -98,51 +73,30 @@ static unsigned long elf_gnu_hash(const unsigned char *name) static int has_elf_identity(Ehdr_t *ehdr) { - /* check ELF magic */ - - if (ehdr->e_ident[EI_MAG0] != ELFMAG0 || - ehdr->e_ident[EI_MAG1] != ELFMAG1 || - ehdr->e_ident[EI_MAG2] != ELFMAG2 || - ehdr->e_ident[EI_MAG3] != ELFMAG3) { - pr_err("Invalid ELF magic\n"); - return false; - }; - - /* check ELF class */ + /* + * See Elf specification for this magic values. 
+ */ #if defined(CONFIG_VDSO_32) - if (ehdr->e_ident[EI_CLASS] != ELFCLASS32) { - pr_err("Unsupported ELF class: %d\n", ehdr->e_ident[EI_CLASS]); - return false; + static const char elf_ident[] = { + 0x7f, 0x45, 0x4c, 0x46, 0x01, BORD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; #else - if (ehdr->e_ident[EI_CLASS] != ELFCLASS64) { - pr_err("Unsupported ELF class: %d\n", ehdr->e_ident[EI_CLASS]); - return false; + static const char elf_ident[] = { + 0x7f, 0x45, 0x4c, 0x46, 0x02, BORD, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; #endif - /* check ELF data encoding */ - if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB) { - pr_err("Unsupported ELF data encoding: %d\n", ehdr->e_ident[EI_DATA]); + BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident)); + + if (memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) { + pr_err("ELF header magic mismatch\n"); return false; - }; - /* check ELF version */ - if (ehdr->e_ident[EI_VERSION] != EV_CURRENT) { - pr_err("Unsupported ELF version: %d\n", ehdr->e_ident[EI_VERSION]); - return false; - }; - /* check ELF OSABI */ - if (ehdr->e_ident[EI_OSABI] != ELFOSABI_NONE && - ehdr->e_ident[EI_OSABI] != ELFOSABI_LINUX) { - pr_err("Unsupported OSABI version: %d\n", ehdr->e_ident[EI_OSABI]); - return false; - }; + } return true; } -static int parse_elf_phdr(uintptr_t mem, size_t size, - Phdr_t **dynamic, Phdr_t **load, bool *is_32bit) +static int parse_elf_phdr(uintptr_t mem, size_t size, Phdr_t **dynamic, Phdr_t **load) { Ehdr_t *ehdr = (void *)mem; uintptr_t addr; @@ -157,8 +111,6 @@ static int parse_elf_phdr(uintptr_t mem, size_t size, if (!has_elf_identity(ehdr)) return -EINVAL; - *is_32bit = ehdr->e_ident[EI_CLASS] != ELFCLASS64; - addr = mem + ehdr->e_phoff; if (__ptr_oob(addr, mem, size)) goto err_oob; @@ -197,14 +149,11 @@ err_oob: * Output parameters are: * @dyn_strtab - address of the symbol table * @dyn_symtab - address of the string table section - * @dyn_hash - address of the symbol hash table 
- * @use_gnu_hash - the format of hash DT_HASH or DT_GNU_HASH + * @dyn_hash - address of the symbol hash table */ -static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, - Dyn_t **dyn_strtab, Dyn_t **dyn_symtab, - Dyn_t **dyn_hash, bool *use_gnu_hash) +static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, Dyn_t **dyn_strtab, Dyn_t **dyn_symtab, + Dyn_t **dyn_hash) { - Dyn_t *dyn_gnu_hash = NULL, *dyn_sysv_hash = NULL; Dyn_t *dyn_syment = NULL; Dyn_t *dyn_strsz = NULL; uintptr_t addr; @@ -235,52 +184,16 @@ static int parse_elf_dynamic(uintptr_t mem, size_t size, Phdr_t *dynamic, dyn_syment = d; pr_debug("DT_SYMENT: %lx\n", (unsigned long)d->d_un.d_val); } else if (d->d_tag == DT_HASH) { - dyn_sysv_hash = d; + *dyn_hash = d; pr_debug("DT_HASH: %lx\n", (unsigned long)d->d_un.d_ptr); - } else if (d->d_tag == DT_GNU_HASH) { - /* - * This is complicated. - * - * Looking at the Linux kernel source, the following can be seen - * regarding which hashing style the VDSO uses on each arch: - * - * aarch64: not specified (depends on linker, can be - * only GNU hash style) - * arm: --hash-style=sysv - * loongarch: --hash-style=sysv - * mips: --hash-style=sysv - * powerpc: --hash-style=both - * riscv: --hash-style=both - * s390: --hash-style=both - * x86: --hash-style=both - * - * Some architectures are using both hash-styles, that - * is the easiest for CRIU. Some architectures are only - * using the old style (sysv), that is what CRIU supports. - * - * Starting with Linux 6.11, aarch64 unfortunately decided - * to switch from '--hash-style=sysv' to ''. Specifying - * nothing unfortunately may mean GNU hash style only and not - * 'both' (depending on the linker). 
- */ - dyn_gnu_hash = d; - pr_debug("DT_GNU_HASH: %lx\n", (unsigned long)d->d_un.d_ptr); } } - if (!*dyn_strtab || !*dyn_symtab || !dyn_strsz || !dyn_syment || - (!dyn_gnu_hash && !dyn_sysv_hash)) { + if (!*dyn_strtab || !*dyn_symtab || !dyn_strsz || !dyn_syment || !*dyn_hash) { pr_err("Not all dynamic entries are present\n"); return -EINVAL; } - /* - * Prefer DT_HASH over DT_GNU_HASH as it's been more tested and - * as a result more stable. - */ - *use_gnu_hash = !dyn_sysv_hash; - *dyn_hash = dyn_sysv_hash ?: dyn_gnu_hash; - return 0; err_oob: @@ -295,156 +208,60 @@ typedef unsigned long Hash_t; typedef Word_t Hash_t; #endif -typedef uint32_t Hash32_t; - -static bool elf_symbol_match(uintptr_t mem, size_t size, - uintptr_t dynsymbol_names, Sym_t *sym, - const char *symbol, const size_t vdso_symbol_length) -{ - uintptr_t addr = (uintptr_t)sym; - char *name; - - if (__ptr_struct_oob(addr, sizeof(Sym_t), mem, size)) - return false; - - if (ELF_ST_TYPE(sym->st_info) != STT_FUNC && ELF_ST_BIND(sym->st_info) != STB_GLOBAL) - return false; - - addr = dynsymbol_names + sym->st_name; - if (__ptr_struct_oob(addr, vdso_symbol_length, mem, size)) - return false; - name = (void *)addr; - - return !std_strncmp(name, symbol, vdso_symbol_length); -} - - -static unsigned long elf_symbol_lookup(uintptr_t mem, size_t size, - const char *symbol, uint32_t symbol_hash, unsigned int sym_off, - uintptr_t dynsymbol_names, Dyn_t *dyn_symtab, Phdr_t *load, - uint32_t nbucket, uint32_t nchain, void *_bucket, Hash_t *chain, - const size_t vdso_symbol_length, bool use_gnu_hash) -{ - unsigned int j; - uintptr_t addr; - - addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr; - - if (use_gnu_hash) { - Hash32_t *h, hash_val, *bucket = _bucket; - - j = bucket[symbol_hash % nbucket]; - if (j == STN_UNDEF) - return 0; - - h = bucket + nbucket + (j - sym_off); - - symbol_hash |= 1; - do { - Sym_t *sym = (void *)addr + sizeof(Sym_t) * j; - - hash_val = *h++; - if ((hash_val | 1) == symbol_hash && - 
elf_symbol_match(mem, size, dynsymbol_names, sym, - symbol, vdso_symbol_length)) - return sym->st_value; - j++; - } while (!(hash_val & 1)); - } else { - Hash_t *bucket = _bucket; - - j = bucket[symbol_hash % nbucket]; - if (j == STN_UNDEF) - return 0; - - for (; j < nchain && j != STN_UNDEF; j = chain[j]) { - Sym_t *sym = (void *)addr + sizeof(Sym_t) * j; - - if (elf_symbol_match(mem, size, dynsymbol_names, sym, - symbol, vdso_symbol_length)) - return sym->st_value; - } - } - return 0; -} - -static int parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, - struct vdso_symtable *t, uintptr_t dynsymbol_names, - Hash_t *hash, Dyn_t *dyn_symtab, bool use_gnu_hash, - bool is_32bit) +static void parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, struct vdso_symtable *t, + uintptr_t dynsymbol_names, Hash_t *hash, Dyn_t *dyn_symtab) { ARCH_VDSO_SYMBOLS_LIST const char *vdso_symbols[VDSO_SYMBOL_MAX] = { ARCH_VDSO_SYMBOLS }; const size_t vdso_symbol_length = sizeof(t->symbols[0].name) - 1; - void *bucket = NULL; - Hash_t *chain = NULL; - uint32_t nbucket, nchain = 0; + Hash_t nbucket, nchain; + Hash_t *bucket, *chain; - unsigned int sym_off = 0; - unsigned int i = 0; + unsigned int i, j, k; + uintptr_t addr; - unsigned long (*elf_hash)(const unsigned char *); - - if (use_gnu_hash) { - uint32_t *gnu_hash = (uint32_t *)hash; - uint32_t bloom_sz; - - nbucket = gnu_hash[0]; - sym_off = gnu_hash[1]; - bloom_sz = gnu_hash[2]; - if (is_32bit) { - uint32_t *bloom; - bloom = (uint32_t *)&gnu_hash[4]; - bucket = (Hash_t *)(&bloom[bloom_sz]); - } else { - uint64_t *bloom; - bloom = (uint64_t *)&gnu_hash[4]; - bucket = (Hash_t *)(&bloom[bloom_sz]); - } - elf_hash = &elf_gnu_hash; - pr_debug("nbucket %lx sym_off %lx bloom_sz %lx bucket %lx\n", - (unsigned long)nbucket, (unsigned long)sym_off, - (unsigned long)bloom_sz, - (unsigned long)bucket); - } else { - nbucket = hash[0]; - nchain = hash[1]; - bucket = &hash[2]; - chain = &hash[nbucket + 2]; - elf_hash = 
&elf_sysv_hash; - pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n", - (unsigned long)nbucket, (unsigned long)nchain, - (unsigned long)bucket, (unsigned long)chain); - } + nbucket = hash[0]; + nchain = hash[1]; + bucket = &hash[2]; + chain = &hash[nbucket + 2]; + pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n", (long)nbucket, (long)nchain, (unsigned long)bucket, + (unsigned long)chain); for (i = 0; i < VDSO_SYMBOL_MAX; i++) { const char *symbol = vdso_symbols[i]; - unsigned long addr, symbol_hash; - const size_t symbol_length = __strlen(symbol); + k = elf_hash((const unsigned char *)symbol); - symbol_hash = elf_hash((const unsigned char *)symbol); - addr = elf_symbol_lookup(mem, size, symbol, symbol_hash, - sym_off, dynsymbol_names, dyn_symtab, load, - nbucket, nchain, bucket, chain, - vdso_symbol_length, use_gnu_hash); - pr_debug("symbol %s at address %lx\n", symbol, addr); - if (!addr) - continue; + for (j = bucket[k % nbucket]; j < nchain && j != STN_UNDEF; j = chain[j]) { + Sym_t *sym; + char *name; - /* XXX: provide strncpy() implementation for PIE */ - if (symbol_length > vdso_symbol_length) { - pr_err("strlen(%s) %zd, only %zd bytes available\n", - symbol, symbol_length, vdso_symbol_length); - return -EINVAL; + addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr; + + addr += sizeof(Sym_t) * j; + if (__ptr_struct_oob(addr, sizeof(Sym_t), mem, size)) + continue; + sym = (void *)addr; + + if (ELF_ST_TYPE(sym->st_info) != STT_FUNC && ELF_ST_BIND(sym->st_info) != STB_GLOBAL) + continue; + + addr = dynsymbol_names + sym->st_name; + if (__ptr_struct_oob(addr, vdso_symbol_length, mem, size)) + continue; + name = (void *)addr; + + if (std_strncmp(name, symbol, vdso_symbol_length)) + continue; + + /* XXX: provide strncpy() implementation for PIE */ + memcpy(t->symbols[i].name, name, vdso_symbol_length); + t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr; + break; } - memcpy(t->symbols[i].name, symbol, symbol_length); - 
t->symbols[i].offset = addr - load->p_vaddr; } - - return 0; } int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) @@ -454,8 +271,6 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) Dyn_t *dyn_symtab = NULL; Dyn_t *dyn_hash = NULL; Hash_t *hash = NULL; - bool use_gnu_hash; - bool is_32bit; uintptr_t dynsymbol_names; uintptr_t addr; @@ -466,7 +281,7 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) /* * We need PT_LOAD and PT_DYNAMIC here. Each once. */ - ret = parse_elf_phdr(mem, size, &dynamic, &load, &is_32bit); + ret = parse_elf_phdr(mem, size, &dynamic, &load); if (ret < 0) return ret; if (!load || !dynamic) { @@ -481,8 +296,7 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) * needed. Note that we're interested in a small set of tags. */ - ret = parse_elf_dynamic(mem, size, dynamic, &dyn_strtab, &dyn_symtab, - &dyn_hash, &use_gnu_hash); + ret = parse_elf_dynamic(mem, size, dynamic, &dyn_strtab, &dyn_symtab, &dyn_hash); if (ret < 0) return ret; @@ -496,11 +310,7 @@ int vdso_fill_symtable(uintptr_t mem, size_t size, struct vdso_symtable *t) goto err_oob; hash = (void *)addr; - ret = parse_elf_symbols(mem, size, load, t, dynsymbol_names, hash, dyn_symtab, - use_gnu_hash, is_32bit); - - if (ret <0) - return ret; + parse_elf_symbols(mem, size, load, t, dynsymbol_names, hash, dyn_symtab); return 0; diff --git a/criu/pipes.c b/criu/pipes.c index daada8830..43ff06e3d 100644 --- a/criu/pipes.c +++ b/criu/pipes.c @@ -434,7 +434,7 @@ int dump_one_pipe_data(struct pipe_data_dump *pd, int lfd, const struct fd_parms /* steal_pipe has to be able to fit all data from a target pipe */ if (fcntl(steal_pipe[1], F_SETPIPE_SZ, pipe_size) < 0) { pr_perror("Unable to set a pipe size"); - goto err_close; + goto err; } bytes = tee(lfd, steal_pipe[1], pipe_size, SPLICE_F_NONBLOCK); diff --git a/criu/plugin.c b/criu/plugin.c index f9322a3c2..3fe03c7cd 100644 --- a/criu/plugin.c 
+++ b/criu/plugin.c @@ -54,14 +54,6 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path) __assign_hook(DUMP_EXT_MOUNT, "cr_plugin_dump_ext_mount"); __assign_hook(RESTORE_EXT_MOUNT, "cr_plugin_restore_ext_mount"); __assign_hook(DUMP_EXT_LINK, "cr_plugin_dump_ext_link"); - __assign_hook(HANDLE_DEVICE_VMA, "cr_plugin_handle_device_vma"); - __assign_hook(UPDATE_VMA_MAP, "cr_plugin_update_vma_map"); - __assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late"); - __assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices"); - __assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices"); - __assign_hook(POST_FORKING, "cr_plugin_post_forking"); - __assign_hook(RESTORE_INIT, "cr_plugin_restore_init"); - __assign_hook(DUMP_DEVICES_LATE, "cr_plugin_dump_devices_late"); #undef __assign_hook @@ -259,17 +251,6 @@ int cr_plugin_init(int stage) goto err; } - if (stage == CR_PLUGIN_STAGE__RESTORE) { - int ret; - - if (check_inventory_plugins()) - goto err; - - ret = run_plugins(RESTORE_INIT); - if (ret < 0 && ret != -ENOTSUP) - goto err; - } - exit_code = 0; err: closedir(d); diff --git a/criu/proc_parse.c b/criu/proc_parse.c index f51f2e801..f3491e781 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -41,14 +41,10 @@ #include "path.h" #include "fault-injection.h" #include "memfd.h" -#include "hugetlb.h" -#include "pidfd.h" #include "protobuf.h" #include "images/fdinfo.pb-c.h" #include "images/mnt.pb-c.h" -#include "pidfd.pb-c.h" -#include "plugin.h" #include @@ -74,8 +70,6 @@ struct buffer { static struct buffer __buf; static char *buf = __buf.buf; -/* only ever goes from false to true, if at all */ -static bool uprobes_vma_exists = false; /* * This is how AIO ring buffers look like in proc @@ -109,21 +103,7 @@ bool is_vma_range_fmt(char *line) return __is_vma_range_fmt(line); } -bool handle_vma_plugin(int *fd, struct stat *stat) -{ - int ret; - - ret = run_plugins(HANDLE_DEVICE_VMA, *fd, stat); - if (ret < 0) { - pr_perror("handle_device_vma plugin 
failed"); - return false; - } - - return true; -} - -static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, - int *shstk) +static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) { char *tok; @@ -146,8 +126,6 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, *flags |= MAP_NORESERVE; else if (_vmflag_match(tok, "ht")) *flags |= MAP_HUGETLB; - else if (_vmflag_match(tok, "dp")) - *flags |= MAP_DROPPABLE; /* madvise() block */ if (_vmflag_match(tok, "sr")) @@ -164,16 +142,11 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, *madv |= (1ul << MADV_HUGEPAGE); else if (_vmflag_match(tok, "nh")) *madv |= (1ul << MADV_NOHUGEPAGE); - else if (_vmflag_match(tok, "wf")) - *madv |= (1ul << MADV_WIPEONFORK); /* vmsplice doesn't work for VM_IO and VM_PFNMAP mappings. */ if (_vmflag_match(tok, "io") || _vmflag_match(tok, "pf")) *io_pf = 1; - if (_vmflag_match(tok, "ss")) - *shstk = 1; - /* * Anything else is just ignored. 
*/ @@ -184,49 +157,25 @@ static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf, void parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf) { - int shstk = 0; - - __parse_vmflags(buf, flags, madv, io_pf, &shstk); + __parse_vmflags(buf, flags, madv, io_pf); } static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) { int io_pf = 0; - int shstk = 0; - __parse_vmflags(buf, &vma_area->e->flags, &vma_area->e->madv, &io_pf, - &shstk); - - if (shstk) - vma_area->e->status |= VMA_AREA_SHSTK; + __parse_vmflags(buf, &vma_area->e->flags, &vma_area->e->madv, &io_pf); /* * vmsplice doesn't work for VM_IO and VM_PFNMAP mappings, the * only exception is VVAR area that mapped by the kernel as * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP - * - * The uprobes vma is also mapped by the kernel with VM_IO, among other flags */ - if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED) - && !vma_area_is(vma_area, VMA_AREA_UPROBES)) + if (io_pf && !vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED)) vma_area->e->status |= VMA_UNSUPP; if (vma_area->e->madv) vma_area->e->has_madv = true; - - /* - * We set MAP_PRIVATE flag on vma_area->e->flags right after parsing - * a first line of VMA entry in /proc//smaps file: - * 7fa84fa70000-7fa84fa95000 rw-p 00000000 00:00 0 - * but it's too early and we can't distinguish between MAP_DROPPABLE - * and MAP_PRIVATE mappings yet, as they both private mappings in nature - * and at this point we haven't yet read "VmFlags:" line in smaps. - * - * Let's detect this situation and drop MAP_PRIVATE flag while keep - * MAP_DROPPABLE, otherwise restorer's restore_mapping() helper will fail. 
- */ - if ((vma_area->e->flags & MAP_PRIVATE) && (vma_area->e->flags & MAP_DROPPABLE)) - vma_area->e->flags &= ~MAP_PRIVATE; } static inline int is_anon_shmem_map(dev_t dev) @@ -310,7 +259,7 @@ static int vma_stat(struct vma_area *vma, int fd) static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct vma_file_info *vfi, int *vm_file_fd, const char *path) { - int fd, hugetlb_flag = 0; + int fd; dev_t vfi_dev; /* @@ -349,20 +298,35 @@ static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct vfi_dev = makedev(vfi->dev_maj, vfi->dev_min); - if (is_hugetlb_dev(vfi_dev, &hugetlb_flag) || is_anon_shmem_map(vfi_dev)) { - vma->e->status |= VMA_AREA_REGULAR; + if (is_memfd(vfi_dev)) { + char tmp[PATH_MAX]; + strlcpy(tmp, fname, PATH_MAX); + strip_deleted(tmp, strlen(tmp)); + + /* + * The error EPERM will be shown in the following pr_perror(). + * It comes from the previous open() call. + */ + pr_perror("Can't open mapped [%s]", tmp); + + /* + * TODO Perhaps we could do better than failing and dump the + * memory like what is being done in shmem.c + */ + return -1; + } + + if (is_anon_shmem_map(vfi_dev)) { if (!(vma->e->flags & MAP_SHARED)) - vma->e->status |= VMA_ANON_PRIVATE; - else - vma->e->status |= VMA_ANON_SHARED; + return -1; vma->e->flags |= MAP_ANONYMOUS; + vma->e->status |= VMA_ANON_SHARED; vma->e->shmid = vfi->ino; - vma->e->flags |= hugetlb_flag; if (!strncmp(fname, "/SYSV", 5)) { vma->e->status |= VMA_AREA_SYSVIPC; - } else if (vma->e->flags & MAP_SHARED) { + } else { if (fault_injected(FI_HUGE_ANON_SHMEM_ID)) vma->e->shmid += FI_HUGE_ANON_SHMEM_ID_BASE; } @@ -378,44 +342,18 @@ static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct } if (vma_stat(vma, fd)) { - goto closefd; + close(fd); + return -1; } - if (vma->vmst->st_ino != vfi->ino) { - goto errmsg; - } - - /* - * If devices don't match it could be because file is on a btrfs subvolume, - * which means that device number returned by stat 
will not match what is - * seen in smaps and other places. To deal with that we need a more involved - * check. - */ - if (vma->vmst->st_dev != vfi_dev) { - int mnt_id; - struct ns_id *ns; - - if (get_fd_mntid(fd, &mnt_id)) - goto errmsg; - - ns = lookup_nsid_by_mnt_id(mnt_id); - if (!ns) - goto errmsg; - - if (!phys_stat_dev_match(vma->vmst->st_dev, vfi_dev, ns, fname)) - goto errmsg; - - vma->mnt_id = mnt_id; + if (vma->vmst->st_dev != vfi_dev || vma->vmst->st_ino != vfi->ino) { + pr_err("Failed to resolve mapping %lx filename\n", (unsigned long)vma->e->start); + close(fd); + return -1; } *vm_file_fd = fd; return 0; - -errmsg: - pr_err("Failed to resolve mapping %lx filename\n", (unsigned long)vma->e->start); -closefd: - close(fd); - return -1; } static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, struct vma_file_info *vfi, @@ -602,20 +540,11 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat } else if (!strcmp(file_path, "[vdso]")) { if (handle_vdso_vma(vma_area)) goto err; - } else if (!strcmp(file_path, "[vvar]") || - !strcmp(file_path, "[vvar_vclock]")) { + } else if (!strcmp(file_path, "[vvar]")) { if (handle_vvar_vma(vma_area)) goto err; } else if (!strcmp(file_path, "[heap]")) { vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP; - } else if (!strcmp(file_path, "[uprobes]")) { - uprobes_vma_exists = true; - if (!opts.allow_uprobes) { - pr_err("PID %d has uprobes vma. 
Consider using --" OPT_ALLOW_UPROBES ".\n", - pid); - goto err; - } - vma_area->e->status |= VMA_AREA_UPROBES; } else { vma_area->e->status = VMA_AREA_REGULAR; } @@ -647,25 +576,17 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat } } else if (*vm_file_fd >= 0) { struct stat *st_buf = vma_area->vmst; - int hugetlb_flag = 0; - if (S_ISREG(st_buf->st_mode)) { + if (S_ISREG(st_buf->st_mode)) /* regular file mapping -- supported */; - pr_debug("Found regular file mapping, OK\n"); - } else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO)) { + else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO)) /* devzero mapping -- also makes sense */; - pr_debug("Found devzero mapping, OK\n"); - } else if (handle_vma_plugin(vm_file_fd, st_buf)) { - pr_info("Found device file mapping, plugin is available\n"); - vma_area->e->status |= VMA_EXT_PLUGIN; - } else { - /* non-regular mapping with no supporting plugin */ + else { pr_err("Can't handle non-regular mapping on %d's map %" PRIx64 "\n", pid, vma_area->e->start); goto err; } - if ((is_anon_shmem_map(st_buf->st_dev) || is_hugetlb_dev(st_buf->st_dev, NULL)) && - !strncmp(file_path, "/SYSV", 5)) { + if (is_anon_shmem_map(st_buf->st_dev) && !strncmp(file_path, "/SYSV", 5)) { vma_area->e->flags |= MAP_ANONYMOUS; vma_area->e->status |= VMA_ANON_SHARED; vma_area->e->shmid = st_buf->st_ino; @@ -674,28 +595,10 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat pr_info("path: %s\n", file_path); vma_area->e->status |= VMA_AREA_SYSVIPC; } else { - /* We dump memfd backed mapping, both normal and hugepage anonymous share - * mapping using memfd approach when possible. 
- */ - if (is_memfd(st_buf->st_dev) || is_anon_shmem_map(st_buf->st_dev) || - can_dump_with_memfd_hugetlb(st_buf->st_dev, &hugetlb_flag, file_path, vma_area)) { + if (is_anon_shmem_map(st_buf->st_dev)) { vma_area->e->status |= VMA_AREA_MEMFD; - vma_area->e->flags |= hugetlb_flag; if (fault_injected(FI_HUGE_ANON_SHMEM_ID)) vma_area->e->shmid += FI_HUGE_ANON_SHMEM_ID_BASE; - } else if (is_hugetlb_dev(st_buf->st_dev, &hugetlb_flag)) { - vma_area->e->flags |= hugetlb_flag; - vma_area->e->flags |= MAP_ANONYMOUS; - - if (vma_area->e->flags & MAP_SHARED) { - vma_area->e->status |= VMA_ANON_SHARED; - vma_area->e->shmid = st_buf->st_ino; - } else { - vma_area->e->status |= VMA_ANON_PRIVATE; - } - - close_safe(vm_file_fd); - return 0; } if (vma_area->e->flags & MAP_PRIVATE) @@ -742,21 +645,7 @@ err_bogus_mapfile: static int vma_list_add(struct vma_area *vma_area, struct vm_area_list *vma_area_list, unsigned long *prev_end, struct vma_file_info *vfi, struct vma_file_info *prev_vfi) { - if (vma_area->e->status & VMA_EXT_PLUGIN) { - /* Unsupported VMAs that provide special plugins for - * backup can be treated as regular VMAs and criu - * should only save their metadata in the dump files. - * There can be several special backup plugins hooks - * that might run at different stages during checkpoint - * and restore. 
- */ - pr_debug("Device file mapping %016" PRIx64 "-%016" PRIx64 " supported via device plugins\n", - vma_area->e->start, vma_area->e->end); - } else if (vma_area->e->status & VMA_AREA_UPROBES) { - pr_debug("Skipping uprobes vma %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, - vma_area->e->end); - return 0; - } else if (vma_area->e->status & VMA_UNSUPP) { + if (vma_area->e->status & VMA_UNSUPP) { pr_err("Unsupported mapping found %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, vma_area->e->end); return -1; @@ -807,7 +696,7 @@ static int task_size_check(pid_t pid, VmaEntry *entry) int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t dump_filemap) { - struct vma_area *vma_area = NULL, *prev_vma_area = NULL; + struct vma_area *vma_area = NULL; unsigned long start, end, pgoff, prev_end = 0; char r, w, x, s; int ret = -1, vm_file_fd = -1; @@ -841,7 +730,17 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t du eof = (str == NULL); if (!eof && !__is_vma_range_fmt(str)) { - if (!strncmp(str, "VmFlags: ", 9)) { + if (!strncmp(str, "Nonlinear", 9)) { + BUG_ON(!vma_area); + pr_err("Nonlinear mapping found %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start, + vma_area->e->end); + /* + * VMA is already on list and will be + * freed later as list get destroyed. 
+ */ + vma_area = NULL; + goto err; + } else if (!strncmp(str, "VmFlags: ", 9)) { BUG_ON(!vma_area); parse_vma_vmflags(&str[9], vma_area); continue; @@ -849,22 +748,8 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t du continue; } - if (vma_area && vma_area_is(vma_area, VMA_AREA_VVAR) && - prev_vma_area && vma_area_is(prev_vma_area, VMA_AREA_VVAR)) { - if (prev_vma_area->e->end != vma_area->e->start) { - pr_err("two nonconsecutive vvar vma-s: " - "%" PRIx64 "-%" PRIx64 " %" PRIx64 "-%" PRIx64 "\n", - prev_vma_area->e->start, prev_vma_area->e->end, - vma_area->e->start, vma_area->e->end); - goto err; - } - /* Merge all vvar vma-s into one. */ - prev_vma_area->e->end = vma_area->e->end; - } else { - if (vma_area && vma_list_add(vma_area, vma_area_list, &prev_end, &vfi, &prev_vfi)) - goto err; - prev_vma_area = vma_area; - } + if (vma_area && vma_list_add(vma_area, vma_area_list, &prev_end, &vfi, &prev_vfi)) + goto err; if (eof) break; @@ -904,7 +789,6 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t du goto err; } - pr_debug("Handling VMA with the following smaps entry: %s\n", str); if (handle_vma(pid, vma_area, str + path_off, map_files_dir, &vfi, &prev_vfi, &vm_file_fd)) goto err; @@ -963,7 +847,7 @@ int parse_pid_stat(pid_t pid, struct proc_pid_stat *s) *tok = '\0'; *p = '\0'; - __strlcpy(s->comm, tok + 1, sizeof(s->comm)); + strlcpy(s->comm, tok + 1, sizeof(s->comm)); n = sscanf(p + 1, " %c %d %d %d %d %d %u %lu %lu %lu %lu " @@ -1100,13 +984,12 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) cr->s.sigpnd = 0; cr->s.shdpnd = 0; - cr->s.sigblk = 0; cr->s.seccomp_mode = SECCOMP_MODE_DISABLED; if (bfdopenr(&f)) return -1; - while (done < 14) { + while (done < 13) { str = breadline(&f); if (str == NULL) break; @@ -1190,13 +1073,6 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) continue; } - if (!strncmp(str, "CapAmb:", 7)) { - if (cap_parse(str + 
8, cr->cap_amb)) - goto err_parse; - done++; - continue; - } - if (!strncmp(str, "Seccomp:", 8)) { if (sscanf(str + 9, "%d", &cr->s.seccomp_mode) != 1) { goto err_parse; @@ -1224,23 +1100,13 @@ int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data) goto err_parse; cr->s.sigpnd |= sigpnd; - done++; - continue; - } - if (!strncmp(str, "SigBlk:", 7)) { - unsigned long long sigblk = 0; - - if (sscanf(str + 7, "%llx", &sigblk) != 1) - goto err_parse; - cr->s.sigblk |= sigblk; - done++; continue; } } /* seccomp and nspids are optional */ - expected_done = (parsed_seccomp ? 13 : 12); + expected_done = (parsed_seccomp ? 11 : 10); if (kdat.has_nspid) expected_done++; if (done == expected_done) @@ -1477,7 +1343,7 @@ static int parse_mountinfo_ent(char *str, struct mount_info *new, char **fsname) goto err; new->mountpoint[0] = '.'; - ret = sscanf(str, "%i %i %u:%u %ms %4094s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, + ret = sscanf(str, "%i %i %u:%u %ms %s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root, new->mountpoint + 1, &opt, &n); if (ret != 7) goto err; @@ -1576,12 +1442,12 @@ bool add_skip_mount(const char *mountpoint) return true; } -static bool should_skip_mount(char *mountpoint) +static bool should_skip_mount(const char *mountpoint) { struct str_node *pos; list_for_each_entry(pos, &skip_mount_list, node) { - if (is_same_path(mountpoint, pos->string)) + if (strcmp(mountpoint, pos->string) == 0) return true; } @@ -1626,59 +1492,6 @@ out: return exit_code; } -static int get_mountinfo_sdev_from_mntid(int mnt_id, unsigned int *sdev) -{ - int exit_code = -1; - FILE *f; - - f = fopen_proc(PROC_SELF, "mountinfo"); - if (!f) - return -1; - - while (fgets(buf, BUF_SIZE, f)) { - unsigned int kmaj, kmin; - int id; - - if (sscanf(buf, "%i %*i %u:%u", &id, &kmaj, &kmin) != 3) { - pr_err("Failed to parse mountinfo line %s\n", buf); - goto err; - } - - if (id == mnt_id) { - *sdev = MKKDEV(kmaj, kmin); - exit_code = 
0; - break; - } - } -err: - fclose(f); - return exit_code; -} - -/* This works even on btrfs where stat does not show right sdev */ -int get_sdev_from_fd(int fd, unsigned int *sdev, bool parse_mountinfo) -{ - struct mount_info *mi; - int ret, mnt_id; - - ret = get_fd_mntid(fd, &mnt_id); - if (ret < 0) - return -1; - - /* Simple case mnt_id is in dumped mntns */ - mi = lookup_mnt_id(mnt_id); - if (mi) { - *sdev = mi->s_dev_rt; - return 0; - } - - if (!parse_mountinfo) - return -1; - - /* Complex case mnt_id is in mntns created by criu */ - return get_mountinfo_sdev_from_mntid(mnt_id, sdev); -} - struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump) { struct mount_info *list = NULL; @@ -1693,7 +1506,7 @@ struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump) int ret = -1; char *fsname = NULL; - new = mnt_entry_alloc(false); + new = mnt_entry_alloc(); if (!new) goto end; @@ -1710,27 +1523,27 @@ struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump) * fail loudly at "dump" stage if an opened file or another mnt * depends on this one. 
*/ - if (for_dump && should_skip_mount(new->ns_mountpoint)) { - pr_info("\tskip %s @ %s\n", fsname, new->ns_mountpoint); + if (for_dump && should_skip_mount(new->mountpoint + 1)) { + pr_info("\tskip %s @ %s\n", fsname, new->mountpoint); mnt_entry_free(new); new = NULL; goto end; } pr_info("\ttype %s source %s mnt_id %d s_dev %#x %s @ %s flags %#x options %s\n", fsname, new->source, - new->mnt_id, new->s_dev, new->root, new->ns_mountpoint, new->flags, new->options); + new->mnt_id, new->s_dev, new->root, new->mountpoint, new->flags, new->options); if (new->fstype->parse) { ret = new->fstype->parse(new); if (ret < 0) { - pr_err("Failed to parse FS specific data on %s\n", service_mountpoint(new)); + pr_err("Failed to parse FS specific data on %s\n", new->mountpoint); mnt_entry_free(new); new = NULL; goto end; } if (ret > 0) { - pr_info("\tskipping fs mounted at %s\n", service_mountpoint(new) + 1); + pr_info("\tskipping fs mounted at %s\n", new->mountpoint + 1); mnt_entry_free(new); new = NULL; ret = 0; @@ -1741,8 +1554,10 @@ struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump) if (fsname) free(fsname); - if (new) - mntinfo_add_list_before(&list, new); + if (new) { + new->next = list; + list = new; + } if (ret) goto err; @@ -1845,12 +1660,6 @@ nodata: typedef struct bpfmap_fmt { char *fmt; void *value; - /* - * If newer kernels are adding additional entries, these entries need - * to be marked as optional in the protobuf definition and the parsing - * must be able to ignore it if running on an older kernel. - */ - protobuf_c_boolean *optional; } bpfmap_fmt; static int parse_bpfmap(struct bfd *f, char *str, BpfmapFileEntry *bpf) @@ -1863,34 +1672,28 @@ static int parse_bpfmap(struct bfd *f, char *str, BpfmapFileEntry *bpf) * uint32_t value_size * uint32_t max_entries * uint32_t map_flags - * uint64_t map_extra * uint64_t memlock * uint32_t map_id * boolean frozen */ - /* This needs to be in the same order as in the fdinfo entry. 
*/ bpfmap_fmt map[] = { - { "map_type: %u", &bpf->map_type, NULL }, - { "key_size: %u", &bpf->key_size, NULL }, - { "value_size: %u", &bpf->value_size, NULL }, - { "max_entries: %u", &bpf->max_entries, NULL }, - { "map_flags: %" PRIx32 "", &bpf->map_flags, NULL }, - { "map_extra: %" PRIx64 "", &bpf->map_extra, &bpf->has_map_extra }, - { "memlock: %" PRIu64 "", &bpf->memlock, NULL }, - { "map_id: %u", &bpf->map_id, NULL }, - { "frozen: %d", &bpf->frozen, NULL }, + { "map_type: %u", &bpf->map_type }, + { "key_size: %u", &bpf->key_size }, + { "value_size: %u", &bpf->value_size }, + { "max_entries: %u", &bpf->max_entries }, + { "map_flags: %" PRIx32 "", &bpf->map_flags }, + { "memlock: %" PRIu64 "", &bpf->memlock }, + { "map_id: %u", &bpf->map_id }, + { "frozen: %d", &bpf->frozen }, }; size_t n = sizeof(map) / sizeof(bpfmap_fmt); int i; for (i = 0; i < n; i++) { - if (sscanf(str, map[i].fmt, map[i].value) != 1) { - if (map[i].optional) - continue; + if (sscanf(str, map[i].fmt, map[i].value) != 1) return -1; - } if (i == n - 1) break; @@ -1902,9 +1705,6 @@ static int parse_bpfmap(struct bfd *f, char *str, BpfmapFileEntry *bpf) } } - if (bpf->has_map_extra && bpf->map_extra) - pr_warn("Non-zero value for fdinfo map_extra entry found. 
This will not be restored.\n"); - return 0; } @@ -2042,7 +1842,10 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) " pos:%lli ino:%lx sdev:%x", &e->tfd, &e->events, (long long *)&e->data, (long long *)&e->pos, (long *)&e->inode, &e->dev); - if (ret == 3) { + if (ret < 3 || ret > 6) { + eventpoll_tfd_entry__free_unpacked(e, NULL); + goto parse_err; + } else if (ret == 3) { e->has_dev = false; e->has_inode = false; e->has_pos = false; @@ -2050,7 +1853,7 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) e->has_dev = true; e->has_inode = true; e->has_pos = true; - } else { + } else if (ret < 6) { eventpoll_tfd_entry__free_unpacked(e, NULL); goto parse_err; } @@ -2224,33 +2027,6 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) if (ret) goto parse_err; - entry_met = true; - continue; - } - if (fdinfo_field(str, "ino") || fdinfo_field(str, "NSpid") || fdinfo_field(str, "Pid")) { - struct pidfd_dump_info *pidfd_info = arg; - - if (type != FD_TYPES__PIDFD) - continue; - - if (fdinfo_field(str, "ino")) { - ret = sscanf(str, "%*s %u", &pidfd_info->pidfe.ino); - if (ret != 1) - goto parse_err; - } else if (fdinfo_field(str, "Pid")) { - ret = sscanf(str, "%*s %d", &pidfd_info->pid); - if (ret != 1) - goto parse_err; - } else if (fdinfo_field(str, "NSpid")) { - char *last; - - last = strrchr(str, '\t'); - if (!last || sscanf(last, "%d", &pidfd_info->pidfe.nspid) != 1) { - pr_err("Unable to parse: %s\n", str); - goto parse_err; - } - } - entry_met = true; continue; } @@ -2302,10 +2078,10 @@ static int parse_file_lock_buf(char *buf, struct file_lock *fl, bool is_blocked) char fl_flag[10], fl_type[15], fl_option[10]; if (is_blocked) { - num = sscanf(buf, "%lld: -> %9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = sscanf(buf, "%lld: -> %s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } else { 
- num = sscanf(buf, "%lld:%9s %14s %9s %d %x:%x:%ld %lld %31s", &fl->fl_id, fl_flag, fl_type, fl_option, + num = sscanf(buf, "%lld:%s %s %s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end); } @@ -2653,8 +2429,7 @@ err: return -1; } -int parse_thread_cgroup(int pid, int tid, struct parasite_dump_cgroup_args *args, struct list_head *retl, - unsigned int *n) +int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *retl, unsigned int *n) { FILE *f; int ret; @@ -2662,7 +2437,7 @@ int parse_thread_cgroup(int pid, int tid, struct parasite_dump_cgroup_args *args unsigned int n_internal = 0; struct cg_ctl *intern, *ext; - f = fopen_proc(pid, "task/%d/cgroup", tid); + f = fopen_proc(pid, "cgroup"); if (!f) return -1; @@ -2849,7 +2624,7 @@ int aufs_parse(struct mount_info *new) { int ret = 0; - if (!strcmp(new->ns_mountpoint, "./")) { + if (!strcmp(new->mountpoint, "./")) { opts.aufs = true; ret = parse_aufs_branches(new); } @@ -2946,8 +2721,3 @@ int parse_uptime(uint64_t *upt) fclose(f); return 0; } - -bool found_uprobes_vma(void) -{ - return uprobes_vma_exists; -} diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c index e0dbfccc2..ff16b9f5b 100644 --- a/criu/protobuf-desc.c +++ b/criu/protobuf-desc.c @@ -68,7 +68,6 @@ #include "images/bpfmap-file.pb-c.h" #include "images/bpfmap-data.pb-c.h" #include "images/apparmor.pb-c.h" -#include "images/pidfd.pb-c.h" struct cr_pb_message_desc cr_pb_descs[PB_MAX]; diff --git a/criu/pstree.c b/criu/pstree.c index cee8b5741..d5080e515 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -63,7 +63,6 @@ CoreEntry *core_entry_alloc(int th, int tsk) sz += CR_CAP_SIZE * sizeof(ce->cap_prm[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_eff[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_bnd[0]); - sz += CR_CAP_SIZE * sizeof(ce->cap_amb[0]); /* * @groups are dynamic and allocated * on demand. 
@@ -123,12 +122,10 @@ CoreEntry *core_entry_alloc(int th, int tsk) ce->n_cap_prm = CR_CAP_SIZE; ce->n_cap_eff = CR_CAP_SIZE; ce->n_cap_bnd = CR_CAP_SIZE; - ce->n_cap_amb = CR_CAP_SIZE; ce->cap_inh = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_inh[0])); ce->cap_prm = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_prm[0])); ce->cap_eff = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_eff[0])); ce->cap_bnd = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_bnd[0])); - ce->cap_amb = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_amb[0])); if (arch_alloc_thread_info(core)) { xfree(core); @@ -182,7 +179,7 @@ void free_pstree(struct pstree_item *root_item) struct pstree_item *item = root_item, *parent; while (item) { - if (has_children(item)) { + if (!list_empty(&item->children)) { item = list_first_entry(&item->children, struct pstree_item, sibling); continue; } @@ -225,7 +222,6 @@ struct pstree_item *__alloc_pstree_item(bool rst) item->pid->ns[0].virt = -1; item->pid->real = -1; item->pid->state = TASK_UNDEF; - item->pid->stop_signo = -1; item->born_sid = -1; item->pid->item = item; futex_init(&item->task_st); @@ -237,21 +233,17 @@ int init_pstree_helper(struct pstree_item *ret) { BUG_ON(!ret->parent); ret->pid->state = TASK_HELPER; - rsti(ret)->clone_flags = 0; - INIT_LIST_HEAD(&rsti(ret)->fds); + rsti(ret)->clone_flags = CLONE_FILES | CLONE_FS; + if (shared_fdt_prepare(ret) < 0) + return -1; task_entries->nr_helpers++; return 0; } -bool has_children(struct pstree_item *item) -{ - return !list_empty(&item->children); -} - /* Deep first search on children */ struct pstree_item *pstree_item_next(struct pstree_item *item) { - if (has_children(item)) + if (!list_empty(&item->children)) return list_first_entry(&item->children, struct pstree_item, sibling); while (item->parent) { @@ -388,26 +380,17 @@ static int prepare_pstree_for_shell_job(pid_t pid) } for_each_pstree_item(pi) { - if (pi->sid == current_sid) { - pr_err("Current sid %d intersects with sid of (%d) in images\n", 
current_sid, vpid(pi)); - return -1; - } if (pi->sid == old_sid) pi->sid = current_sid; - - if (pi->pgid == current_sid) { - pr_err("Current sid %d intersects with pgid of (%d) in images\n", current_sid, - vpid(pi)); - return -1; - } - if (pi->pgid == old_sid) - pi->pgid = current_sid; } + + if (lookup_create_item(current_sid) == NULL) + return -1; } /* root_item is a group leader */ if (root_item->pgid == vpid(root_item)) - goto add_fake_session_leader; + return 0; old_gid = root_item->pgid; if (old_gid != current_gid) { @@ -420,21 +403,14 @@ static int prepare_pstree_for_shell_job(pid_t pid) } for_each_pstree_item(pi) { - if (current_gid != current_sid && pi->pgid == current_gid) { - pr_err("Current gid %d intersects with pgid of (%d) in images\n", current_gid, - vpid(pi)); - return -1; - } if (pi->pgid == old_gid) pi->pgid = current_gid; } + + if (lookup_create_item(current_gid) == NULL) + return -1; } - if (old_gid != current_gid && !lookup_create_item(current_gid)) - return -1; -add_fake_session_leader: - if (old_sid != current_sid && !lookup_create_item(current_sid)) - return -1; return 0; } @@ -888,7 +864,7 @@ static int prepare_pstree_kobj_ids(void) if (!item->ids) { if (item == root_item) { pr_err("No IDS for root task.\n"); - pr_err("Images corrupted or too old criu was used for dump.\n"); + pr_err("Images currupted or too old criu was used for dump.\n"); return -1; } @@ -966,7 +942,7 @@ static int prepare_pstree_kobj_ids(void) * this namespace is either inherited from the * criu or is created for the init task (only) */ - pr_err("Can't restore sub-task in NS (cflags %lx)\n", cflags); + pr_err("Can't restore sub-task in NS\n"); return -1; } } @@ -975,31 +951,6 @@ static int prepare_pstree_kobj_ids(void) return 0; } -static int prepare_pstree_rseqs(void) -{ - struct pstree_item *item; - - for_each_pstree_item(item) { - struct rst_rseq *rseqs; - size_t sz = sizeof(*rseqs) * item->nr_threads; - - if (!task_alive(item)) - continue; - - rseqs = shmalloc(sz); - 
if (!rseqs) { - pr_err("prepare_pstree_rseqs shmalloc(%lu) failed\n", (unsigned long)sz); - return -1; - } - - memset(rseqs, 0, sz); - - rsti(item)->rseqe = rseqs; - } - - return 0; -} - int prepare_pstree(void) { int ret; @@ -1057,17 +1008,6 @@ int prepare_pstree(void) * pstree with properly injected helper tasks. */ ret = prepare_pstree_ids(pid); - if (!ret) - /* - * We need to alloc shared buffers for RseqEntry'es - * arrays (one RseqEntry per pstree item thread). - * - * We need shared memory because we perform - * open_core() on the late stage inside - * restore_one_alive_task(), so that's the only - * way to transfer that data to the main CRIU process. - */ - ret = prepare_pstree_rseqs(); return ret; } diff --git a/criu/seize.c b/criu/seize.c index d0cf7b36c..95bf9ef0c 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -16,7 +16,6 @@ #include "pstree.h" #include "criu-log.h" #include -#include "plugin.h" #include "proc_parse.h" #include "seccomp.h" #include "seize.h" @@ -25,72 +24,13 @@ #include "xmalloc.h" #include "util.h" -static bool compel_interrupt_only_mode; - -/* - * Disables the use of freeze cgroups for process seizing, even if explicitly - * requested via the --freeze-cgroup option or already set in a frozen state. - * This is necessary for plugins (e.g., CUDA) that do not function correctly - * when processes are frozen using cgroups. 
- */ -void __attribute__((used)) set_compel_interrupt_only_mode(void) -{ - compel_interrupt_only_mode = true; -} - -char *task_comm_info(pid_t pid, char *comm, size_t size) -{ - bool is_read = false; - - if (!pr_quelled(LOG_INFO)) { - int saved_errno = errno; - char path[32]; - int fd; - - snprintf(path, sizeof(path), "/proc/%d/comm", pid); - fd = open(path, O_RDONLY); - if (fd >= 0) { - ssize_t n = read(fd, comm, size); - if (n > 0) { - is_read = true; - /* Replace '\n' printed by kernel with '\0' */ - comm[n - 1] = '\0'; - } else { - pr_warn("Failed to read %s: %s\n", path, strerror(errno)); - } - close(fd); - } else { - pr_warn("Failed to open %s: %s\n", path, strerror(errno)); - } - errno = saved_errno; - } - - if (!is_read) - comm[0] = '\0'; - - return comm; -} - -/* - * NOTE: Don't run simultaneously, it uses local static buffer! - */ -char *__task_comm_info(pid_t pid) -{ - static char comm[32]; - - return task_comm_info(pid, comm, sizeof(comm)); -} - #define NR_ATTEMPTS 5 static const char frozen[] = "FROZEN"; static const char freezing[] = "FREEZING"; static const char thawed[] = "THAWED"; -enum freezer_state { FREEZER_ERROR = -1, - THAWED, - FROZEN, - FREEZING }; +enum freezer_state { FREEZER_ERROR = -1, THAWED, FROZEN, FREEZING }; /* Track if we are running on cgroup v2 system. */ static bool cgroup_v2 = false; @@ -191,11 +131,11 @@ static enum freezer_state get_freezer_state(int fd) return get_freezer_v1_state(fd); } -static enum freezer_state origin_freezer_state = FREEZER_ERROR; +static bool freezer_thawed; const char *get_real_freezer_state(void) { - return origin_freezer_state == THAWED ? thawed : frozen; + return freezer_thawed ? 
thawed : frozen; } static int freezer_write_state(int fd, enum freezer_state new_state) @@ -206,12 +146,12 @@ static int freezer_write_state(int fd, enum freezer_state new_state) if (new_state == THAWED) { if (cgroup_v2) state[0] = '0'; - else if (__strlcpy(state, thawed, sizeof(state)) >= sizeof(state)) + else if (strlcpy(state, thawed, sizeof(state)) >= sizeof(state)) return -1; } else if (new_state == FROZEN) { if (cgroup_v2) state[0] = '1'; - else if (__strlcpy(state, frozen, sizeof(state)) >= sizeof(state)) + else if (strlcpy(state, frozen, sizeof(state)) >= sizeof(state)) return -1; } else { return -1; @@ -252,7 +192,7 @@ static int freezer_restore_state(void) int fd; int ret; - if (!opts.freeze_cgroup || origin_freezer_state != FROZEN) + if (!opts.freeze_cgroup || freezer_thawed) return 0; fd = freezer_open(); @@ -309,13 +249,13 @@ static int seize_cgroup_tree(char *root_path, enum freezer_state state) if (ret == 0) continue; if (errno != ESRCH) { - pr_perror("Unexpected error for pid %d (comm %s)", pid, __task_comm_info(pid)); + pr_perror("Unexpected error"); fclose(f); return -1; } if (!compel_interrupt_task(pid)) { - pr_debug("SEIZE %d (comm %s): success\n", pid, __task_comm_info(pid)); + pr_debug("SEIZE %d: success\n", pid); processes_to_wait++; } else if (state == FROZEN) { char buf[] = "/proc/XXXXXXXXXX/exe"; @@ -332,7 +272,7 @@ static int seize_cgroup_tree(char *root_path, enum freezer_state state) * before it compete exit procedure. The caller simply * should wait a bit and try freezing again. 
*/ - pr_err("zombie %d (comm %s) found while seizing\n", pid, __task_comm_info(pid)); + pr_err("zombie found while seizing\n"); fclose(f); return -EAGAIN; } @@ -413,7 +353,7 @@ static int freezer_detach(void) { int i; - if (!opts.freeze_cgroup || compel_interrupt_only_mode) + if (!opts.freeze_cgroup) return 0; for (i = 0; i < processes_to_wait && processes_to_wait_pids; i++) { @@ -508,73 +448,13 @@ static int log_unfrozen_stacks(char *root) return 0; } -static int prepare_freezer_for_interrupt_only_mode(void) -{ - enum freezer_state state = THAWED; - int fd; - int exit_code = -1; - - BUG_ON(!compel_interrupt_only_mode); - - fd = freezer_open(); - if (fd < 0) - return -1; - - state = get_freezer_state(fd); - if (state == FREEZER_ERROR) { - goto err; - } - - origin_freezer_state = state == FREEZING ? FROZEN : state; - - if (state != THAWED) { - pr_warn("unfreezing cgroup for plugin compatibility\n"); - if (freezer_write_state(fd, THAWED)) - goto err; - } - - exit_code = 0; -err: - close(fd); - return exit_code; -} - -static void cgroupv1_freezer_kludges(int fd, int iter, const struct timespec *req) { - /* As per older kernel docs (freezer-subsystem.txt before - * the kernel commit ef9fe980c6fcc1821), if FREEZING is seen, - * userspace should either retry or thaw. While current - * kernel cgroup v1 docs no longer mention a need to retry, - * even recent kernels can't reliably freeze a cgroup v1. - * - * Let's keep asking the kernel to freeze from time to time. - * In addition, do occasional thaw/sleep/freeze. - * - * This is still a game of chances (the real fix belongs to the kernel) - * but these kludges might improve the probability of success. - * - * Cgroup v2 does not have this problem. 
- */ - switch (iter % 32) { - case 9: - case 20: - freezer_write_state(fd, FROZEN); - break; - case 31: - freezer_write_state(fd, THAWED); - nanosleep(req, NULL); - freezer_write_state(fd, FROZEN); - break; - } -} - static int freeze_processes(void) { int fd, exit_code = -1; enum freezer_state state = THAWED; static const unsigned long step_ms = 100; - /* Since opts.timeout is in seconds, multiply it by 1000 to convert to milliseconds. */ - unsigned long nr_attempts = (opts.timeout * 1000) / step_ms; + unsigned long nr_attempts = (opts.timeout * 1000000) / step_ms; unsigned long i = 0; const struct timespec req = { @@ -583,12 +463,14 @@ static int freeze_processes(void) }; if (unlikely(!nr_attempts)) { - /* If the timeout is 0, wait for at least 10 seconds. */ - nr_attempts = (10 * 1000) / step_ms; + /* + * If timeout is turned off, lets + * wait for at least 10 seconds. + */ + nr_attempts = (10 * 1000000) / step_ms; } - pr_debug("freezing cgroup %s: %lu x %lums attempts, timeout: %us\n", - opts.freeze_cgroup, nr_attempts, step_ms, opts.timeout); + pr_debug("freezing processes: %lu attempts with %lu ms steps\n", nr_attempts, step_ms); fd = freezer_open(); if (fd < 0) @@ -599,10 +481,9 @@ static int freeze_processes(void) close(fd); return -1; } - - origin_freezer_state = state == FREEZING ? FROZEN : state; - if (state == THAWED) { + freezer_thawed = true; + if (freezer_write_state(fd, FROZEN)) { close(fd); return -1; @@ -615,25 +496,22 @@ static int freeze_processes(void) * not read @tasks pids while freezer in * transition stage. 
*/ - while (1) { + for (; i <= nr_attempts; i++) { state = get_freezer_state(fd); if (state == FREEZER_ERROR) { close(fd); return -1; } - if (state == FROZEN || i++ == nr_attempts || alarm_timeouted()) + if (state == FROZEN) break; - - if (!cgroup_v2) - cgroupv1_freezer_kludges(fd, i, &req); - + if (alarm_timeouted()) + goto err; nanosleep(&req, NULL); } - if (state != FROZEN) { - pr_err("Unable to freeze cgroup %s (%lu x %lums attempts, timeout: %us)\n", - opts.freeze_cgroup, i, step_ms, opts.timeout); + if (i > nr_attempts) { + pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup); if (!pr_quelled(LOG_DEBUG)) log_unfrozen_stacks(opts.freeze_cgroup); goto err; @@ -656,10 +534,8 @@ static int freeze_processes(void) } err: - if (exit_code == 0 || origin_freezer_state == THAWED) { - if (freezer_write_state(fd, THAWED)) - exit_code = -1; - } + if (exit_code == 0 || freezer_thawed) + exit_code = freezer_write_state(fd, THAWED); if (close(fd)) { pr_perror("Unable to thaw tasks"); @@ -707,18 +583,15 @@ static int collect_children(struct pstree_item *item) goto free; } + pr_info("Seized task %d, state %d\n", pid, ret); + c = alloc_pstree_item(); if (c == NULL) { ret = -1; goto free; } - ret = run_plugins(PAUSE_DEVICES, pid); - if (ret < 0 && ret != -ENOTSUP) { - goto free; - } - - if (!opts.freeze_cgroup || compel_interrupt_only_mode) + if (!opts.freeze_cgroup) /* fails when meets a zombie */ __ignore_value(compel_interrupt_task(pid)); @@ -741,11 +614,6 @@ static int collect_children(struct pstree_item *item) else processes_to_wait--; - if (ret == TASK_STOPPED) - c->pid->stop_signo = compel_parse_stop_signo(pid); - - pr_info("Seized task %d, state %d\n", pid, ret); - c->pid->real = pid; c->parent = item; c->pid->state = ret; @@ -777,7 +645,7 @@ static void unseize_task_and_threads(const struct pstree_item *item, int st) * the item->state is the state task was in when we seized one. 
*/ - compel_resume_task_sig(item->pid->real, item->pid->state, st, item->pid->stop_signo); + compel_resume_task(item->pid->real, item->pid->state, st); if (st == TASK_DEAD) return; @@ -908,8 +776,7 @@ static int collect_threads(struct pstree_item *item) pr_info("\tSeizing %d's %d thread\n", item->pid->real, pid); - if ((!opts.freeze_cgroup || compel_interrupt_only_mode) && - compel_interrupt_task(pid)) + if (!opts.freeze_cgroup && compel_interrupt_task(pid)) continue; ret = compel_wait_task(pid, item_ppid(item), parse_pid_status, NULL, &t_creds.s, NULL); @@ -965,7 +832,7 @@ static int collect_loop(struct pstree_item *item, int (*collect)(struct pstree_i { int attempts = NR_ATTEMPTS, nr_inprogress = 1; - if (opts.freeze_cgroup && !compel_interrupt_only_mode) + if (opts.freeze_cgroup) attempts = 1; /* @@ -1008,7 +875,7 @@ static int collect_task(struct pstree_item *item) if (ret < 0) goto err_close; - if ((item->pid->state == TASK_DEAD) && has_children(item)) { + if ((item->pid->state == TASK_DEAD) && !list_empty(&item->children)) { pr_err("Zombie with children?! O_o Run, run, run!\n"); goto err_close; } @@ -1048,7 +915,7 @@ static int cgroup_version(void) int collect_pstree(void) { pid_t pid = root_item->pid->real; - int ret, exit_code = -1; + int ret = -1; struct proc_status_creds creds; timing_start(TIME_FREEZING); @@ -1065,31 +932,12 @@ int collect_pstree(void) pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 2 : 1); - if (opts.freeze_cgroup && !compel_interrupt_only_mode) { - ret = run_plugins(PAUSE_DEVICES, pid); - if (ret < 0 && ret != -ENOTSUP) { - goto err; - } + if (opts.freeze_cgroup && freeze_processes()) + goto err; - if (freeze_processes()) - goto err; - } else { - if (opts.freeze_cgroup && prepare_freezer_for_interrupt_only_mode()) - goto err; - - /* - * Call PAUSE_DEVICES after prepare_freezer_for_interrupt_only_mode() - * to be able to checkpoint containers in a frozen state. 
- */ - ret = run_plugins(PAUSE_DEVICES, pid); - if (ret < 0 && ret != -ENOTSUP) { - goto err; - } - - if (compel_interrupt_task(pid)) { - set_cr_errno(ESRCH); - goto err; - } + if (!opts.freeze_cgroup && compel_interrupt_task(pid)) { + set_cr_errno(ESRCH); + goto err; } ret = compel_wait_task(pid, -1, parse_pid_status, NULL, &creds.s, NULL); @@ -1101,9 +949,6 @@ int collect_pstree(void) else processes_to_wait--; - if (ret == TASK_STOPPED) - root_item->pid->stop_signo = compel_parse_stop_signo(pid); - pr_info("Seized task %d, state %d\n", pid, ret); root_item->pid->state = ret; @@ -1115,35 +960,17 @@ int collect_pstree(void) if (ret < 0) goto err; - if (opts.freeze_cgroup && !compel_interrupt_only_mode && - freezer_wait_processes()) { + if (opts.freeze_cgroup && freezer_wait_processes()) { + ret = -1; goto err; } - exit_code = 0; + ret = 0; timing_stop(TIME_FREEZING); timing_start(TIME_FROZEN); err: /* Freezing stage finished in time - disable timer. */ alarm(0); - return exit_code; -} - -int checkpoint_devices(void) -{ - struct pstree_item *iter; - int ret, exit_code = -1; - - for_each_pstree_item(iter) { - if (!task_alive(iter)) - continue; - ret = run_plugins(CHECKPOINT_DEVICES, iter->pid->real); - if (ret < 0 && ret != -ENOTSUP) - goto err; - } - - exit_code = 0; -err: - return exit_code; + return ret; } diff --git a/criu/servicefd.c b/criu/servicefd.c index dfb019066..06a8d3eba 100644 --- a/criu/servicefd.c +++ b/criu/servicefd.c @@ -313,4 +313,4 @@ int clone_service_fd(struct pstree_item *me) ret = 0; return ret; -} \ No newline at end of file +} diff --git a/criu/setproctitle.c b/criu/setproctitle.c deleted file mode 100644 index 9e01678d2..000000000 --- a/criu/setproctitle.c +++ /dev/null @@ -1,42 +0,0 @@ -#include -#include -#include -#include - -#ifdef CONFIG_HAS_LIBBSD -#include -#else - -#include "setproctitle.h" - -/* - * setproctitle_init is in the libbsd since v0.6.0. This macro allows to - * compile criu with libbsd<0.6.0. 
- */ -#ifndef CONFIG_HAS_SETPROCTITLE_INIT -#define setproctitle_init(argc, argv, envp) -#endif - -#define setproctitle(fmt, ...) -#endif - -void __setproctitle_init(int argc, char *argv[], char *envp[]) -{ - setproctitle_init(argc, argv, envp); -} - -#ifndef SPT_MAXTITLE -#define SPT_MAXTITLE 255 -#endif - -void __setproctitle(const char *fmt, ...) -{ - char buf[SPT_MAXTITLE + 1]; - va_list args; - - va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - - setproctitle("%s", buf); -} diff --git a/criu/shmem.c b/criu/shmem.c index bc7aa3669..1b83327ef 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -26,7 +26,6 @@ #include "memfd.h" #include "protobuf.h" #include "images/pagemap.pb-c.h" -#include "namespaces.h" #ifndef SEEK_DATA #define SEEK_DATA 3 @@ -82,12 +81,11 @@ struct shmem_info { * an region. Each time when we found a process with a smaller pid, * we reset self_count, so we can't have only one counter. */ - int count; /* the number of regions */ + int count; /* the number of regions */ int self_count; /* the number of regions, which belongs to "pid" */ }; - /* For sysvipc restore */ - struct { + struct { /* For sysvipc restore */ struct list_head att; /* list of shmem_sysv_att-s */ int want_write; }; @@ -206,34 +204,26 @@ static int expand_shmem(struct shmem_info *si, unsigned long new_size) return 0; } -static int update_shmem_pmaps(struct shmem_info *si, pmc_t *pmc, VmaEntry *vma) +static void update_shmem_pmaps(struct shmem_info *si, u64 *map, VmaEntry *vma) { unsigned long shmem_pfn, vma_pfn, vma_pgcnt; - u64 vaddr; if (!is_shmem_tracking_en()) - return 0; + return; vma_pgcnt = DIV_ROUND_UP(si->size - vma->pgoff, PAGE_SIZE); - for (vma_pfn = 0, vaddr = vma->start; vma_pfn < vma_pgcnt; ++vma_pfn, vaddr += PAGE_SIZE) { - struct page_info page_info = {}; - - if (should_dump_page(pmc, vma, vaddr, &page_info)) - return -1; - - if (page_info.next != vaddr) { - vaddr = page_info.next - PAGE_SIZE; + for (vma_pfn = 0; vma_pfn < 
vma_pgcnt; ++vma_pfn) { + if (!should_dump_page(vma, map[vma_pfn])) continue; - } shmem_pfn = vma_pfn + DIV_ROUND_UP(vma->pgoff, PAGE_SIZE); - if (page_info.softdirty) + if (map[vma_pfn] & PME_SOFT_DIRTY) set_pstate(si->pstate_map, shmem_pfn, PST_DIRTY); + else if (page_is_zero(map[vma_pfn])) + set_pstate(si->pstate_map, shmem_pfn, PST_ZERO); else set_pstate(si->pstate_map, shmem_pfn, PST_DUMP); } - - return 0; } int collect_sysv_shmem(unsigned long shmid, unsigned long size) @@ -543,24 +533,13 @@ out: return ret; } -struct open_map_file_args { - unsigned long addr, size; -}; - -static int open_map_file(void *args, int fd, pid_t pid) -{ - struct open_map_file_args *vma = args; - - return open_proc_rw(pid, "map_files/%lx-%lx", vma->addr, vma->addr + vma->size); -} - static int open_shmem(int pid, struct vma_area *vma) { VmaEntry *vi = vma->e; struct shmem_info *si; void *addr = MAP_FAILED; int f = -1; - int flags, is_hugetlb, memfd_flag = 0; + int flags; si = shmem_find(vi->shmid); pr_info("Search for %#016" PRIx64 " shmem 0x%" PRIx64 " %p/%d\n", vi->start, vi->shmid, si, si ? 
si->pid : -1); @@ -584,17 +563,9 @@ static int open_shmem(int pid, struct vma_area *vma) goto out; } - is_hugetlb = vi->flags & MAP_HUGETLB; - flags = MAP_SHARED; - if (is_hugetlb) { - int size_flag = vi->flags & MAP_HUGETLB_SIZE_MASK; - flags |= MAP_HUGETLB | size_flag; - memfd_flag |= MFD_HUGETLB | size_flag; - } - - if (kdat.has_memfd && (!is_hugetlb || kdat.has_memfd_hugetlb)) { - f = memfd_create("", memfd_flag); + if (kdat.has_memfd) { + f = memfd_create("", 0); if (f < 0) { pr_perror("Unable to create memfd"); goto err; @@ -627,11 +598,7 @@ static int open_shmem(int pid, struct vma_area *vma) } if (f == -1) { - struct open_map_file_args args = { - .addr = (unsigned long)addr, - .size = si->size, - }; - f = userns_call(open_map_file, UNS_FDOUT, &args, sizeof(args), -1); + f = open_proc_rw(getpid(), "map_files/%lx-%lx", (unsigned long)addr, (unsigned long)addr + si->size); if (f < 0) goto err; } @@ -656,7 +623,7 @@ err: return -1; } -int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) +int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map) { struct shmem_info *si; unsigned long size = vma->pgoff + (vma->end - vma->start); @@ -670,9 +637,7 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) if (expand_shmem(si, size)) return -1; } - - if (update_shmem_pmaps(si, pmc, vma)) - return -1; + update_shmem_pmaps(si, map, vma); return 0; } @@ -689,9 +654,7 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc) if (expand_shmem(si, size)) return -1; - - if (update_shmem_pmaps(si, pmc, vma)) - return -1; + update_shmem_pmaps(si, map, vma); return 0; } @@ -762,7 +725,7 @@ static int do_dump_one_shmem(int fd, void *addr, struct shmem_info *si) unsigned long pgaddr; int st = -1; - if (fd >= 0 && pfn >= next_hole_pfn && next_data_segment(fd, pfn, &next_data_pnf, &next_hole_pfn)) + if (pfn >= next_hole_pfn && next_data_segment(fd, pfn, &next_data_pnf, &next_hole_pfn)) goto err_xfer; if (si->pstate_map && is_shmem_tracking_en()) { @@ -820,62 +783,24 @@ static 
int dump_one_shmem(struct shmem_info *si) { int fd, ret = -1; void *addr; - unsigned long cur, remaining; pr_info("Dumping shared memory %ld\n", si->shmid); - fd = __open_proc(si->pid, EPERM, O_RDONLY, "map_files/%lx-%lx", si->start, si->end); - if (fd >= 0) { - addr = mmap(NULL, si->size, PROT_READ, MAP_SHARED, fd, 0); - if (addr == MAP_FAILED) { - pr_perror("Can't map shmem 0x%lx (0x%lx-0x%lx)", si->shmid, si->start, si->end); - goto errc; - } - } else { - if (errno != EPERM || !opts.unprivileged) { - goto err; - } + fd = open_proc(si->pid, "map_files/%lx-%lx", si->start, si->end); + if (fd < 0) + goto err; - pr_debug("Could not access map_files/ link, falling back to /proc/$pid/mem\n"); - - fd = open_proc(si->pid, "mem"); - if (fd < 0) { - goto err; - } - - addr = mmap(NULL, si->size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (addr == MAP_FAILED) { - pr_perror("Can't map empty space for shmem 0x%lx (0x%lx-0x%lx)", si->shmid, si->start, si->end); - goto errc; - } - - if (lseek(fd, si->start, SEEK_SET) < 0) { - pr_perror("Can't seek virtual memory"); - goto errc; - } - - cur = 0; - remaining = si->size; - do { - ret = read(fd, addr + cur, remaining); - if (ret <= 0) { - pr_perror("Can't read virtual memory"); - goto errc; - } - remaining -= ret; - cur += ret; - } while (remaining > 0); - - close(fd); - fd = -1; + addr = mmap(NULL, si->size, PROT_READ, MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + pr_err("Can't map shmem 0x%lx (0x%lx-0x%lx)\n", si->shmid, si->start, si->end); + goto errc; } ret = do_dump_one_shmem(fd, addr, si); munmap(addr, si->size); errc: - if (fd >= 0) - close(fd); + close(fd); err: return ret; } diff --git a/criu/sigact.c b/criu/sigact.c deleted file mode 100644 index 5174644d2..000000000 --- a/criu/sigact.c +++ /dev/null @@ -1,319 +0,0 @@ -#include "types.h" -#include "infect.h" -#include "protobuf.h" -#include "pstree.h" -#include "parasite.h" -#include "restorer.h" -#include "sigact.h" - -/* - * If parent's 
sigaction has blocked SIGKILL (which is non-sense), - * this parent action is non-valid and shouldn't be inherited. - * Used to mark parent_act* no more valid. - */ -static rt_sigaction_t parent_act[SIGMAX]; -#ifdef CONFIG_COMPAT -static rt_sigaction_t_compat parent_act_compat[SIGMAX]; -#endif - -static bool sa_inherited(int sig, rt_sigaction_t *sa) -{ - rt_sigaction_t *pa; - int i; - - if (current == root_item) - return false; /* XXX -- inherit from CRIU? */ - - pa = &parent_act[sig]; - - /* Omitting non-valid sigaction */ - if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) - return false; - - for (i = 0; i < _KNSIG_WORDS; i++) - if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) - return false; - - return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && - pa->rt_sa_restorer == sa->rt_sa_restorer; -} - -static void *stack32; -rt_sigaction_t sigchld_act; - -#ifdef CONFIG_COMPAT -static bool sa_compat_inherited(int sig, rt_sigaction_t_compat *sa) -{ - rt_sigaction_t_compat *pa; - int i; - - if (current == root_item) - return false; - - pa = &parent_act_compat[sig]; - - /* Omitting non-valid sigaction */ - if (pa->rt_sa_mask.sig[0] & (1 << SIGKILL)) - return false; - - for (i = 0; i < _KNSIG_WORDS; i++) - if (pa->rt_sa_mask.sig[i] != sa->rt_sa_mask.sig[i]) - return false; - - return pa->rt_sa_handler == sa->rt_sa_handler && pa->rt_sa_flags == sa->rt_sa_flags && - pa->rt_sa_restorer == sa->rt_sa_restorer; -} - -static int restore_compat_sigaction(int sig, SaEntry *e) -{ - rt_sigaction_t_compat act; - int ret; - - ASSIGN_TYPED(act.rt_sa_handler, (u32)e->sigaction); - ASSIGN_TYPED(act.rt_sa_flags, e->flags); - ASSIGN_TYPED(act.rt_sa_restorer, (u32)e->restorer); - BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); - memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); - - if (sig == SIGCHLD) { - memcpy(&sigchld_act, &act, sizeof(rt_sigaction_t_compat)); - return 0; - } - - if (sa_compat_inherited(sig - 1, &act)) - return 
1; - - if (!stack32) { - stack32 = alloc_compat_syscall_stack(); - if (!stack32) - return -1; - } - - ret = arch_compat_rt_sigaction(stack32, sig, &act); - if (ret < 0) { - pr_err("Can't restore compat sigaction: %d\n", ret); - return ret; - } - - parent_act_compat[sig - 1] = act; - /* Mark SIGKILL blocked which makes native sigaction non-valid */ - parent_act[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; - - return 1; -} -#else -static int restore_compat_sigaction(int sig, SaEntry *e) -{ - return -1; -} -#endif - -static int restore_native_sigaction(int sig, SaEntry *e) -{ - rt_sigaction_t act; - int ret; - - ASSIGN_TYPED(act.rt_sa_handler, decode_pointer(e->sigaction)); - ASSIGN_TYPED(act.rt_sa_flags, e->flags); - ASSIGN_TYPED(act.rt_sa_restorer, decode_pointer(e->restorer)); -#ifdef CONFIG_MIPS - e->has_mask_extended = 1; - BUILD_BUG_ON(sizeof(e->mask) * 2 != sizeof(act.rt_sa_mask.sig)); - - memcpy(&(act.rt_sa_mask.sig[0]), &e->mask, sizeof(act.rt_sa_mask.sig[0])); - memcpy(&(act.rt_sa_mask.sig[1]), &e->mask_extended, sizeof(act.rt_sa_mask.sig[1])); -#else - BUILD_BUG_ON(sizeof(e->mask) != sizeof(act.rt_sa_mask.sig)); - memcpy(act.rt_sa_mask.sig, &e->mask, sizeof(act.rt_sa_mask.sig)); -#endif - if (sig == SIGCHLD) { - sigchld_act = act; - return 0; - } - - if (sa_inherited(sig - 1, &act)) - return 1; - - /* - * A pure syscall is used, because glibc - * sigaction overwrites se_restorer. 
- */ - ret = syscall(SYS_rt_sigaction, sig, &act, NULL, sizeof(k_rtsigset_t)); - if (ret < 0) { - pr_perror("Can't restore sigaction"); - return ret; - } - - parent_act[sig - 1] = act; - /* Mark SIGKILL blocked which makes compat sigaction non-valid */ -#ifdef CONFIG_COMPAT - parent_act_compat[sig - 1].rt_sa_mask.sig[0] |= 1 << SIGKILL; -#endif - - return 1; -} - -static int prepare_sigactions_from_core(TaskCoreEntry *tc) -{ - int sig, i; - - if (tc->n_sigactions != SIGMAX - 2) { - pr_err("Bad number of sigactions in the image (%d, want %d)\n", (int)tc->n_sigactions, SIGMAX - 2); - return -1; - } - - pr_info("Restore on-core sigactions for %d\n", vpid(current)); - - for (sig = 1, i = 0; sig <= SIGMAX; sig++) { - int ret; - SaEntry *e; - bool sigaction_is_compat; - - if (sig == SIGKILL || sig == SIGSTOP) - continue; - - e = tc->sigactions[i++]; - sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; - if (sigaction_is_compat) - ret = restore_compat_sigaction(sig, e); - else - ret = restore_native_sigaction(sig, e); - - if (ret < 0) - return ret; - } - - return 0; -} - -/* Returns number of restored signals, -1 or negative errno on fail */ -static int restore_one_sigaction(int sig, struct cr_img *img, int pid) -{ - bool sigaction_is_compat; - SaEntry *e; - int ret = 0; - - BUG_ON(sig == SIGKILL || sig == SIGSTOP); - - ret = pb_read_one_eof(img, &e, PB_SIGACT); - if (ret == 0) { - if (sig != SIGMAX_OLD + 1) { /* backward compatibility */ - pr_err("Unexpected EOF %d\n", sig); - return -1; - } - pr_warn("This format of sigacts-%d.img is deprecated\n", pid); - return -1; - } - if (ret < 0) - return ret; - - sigaction_is_compat = e->has_compat_sigaction && e->compat_sigaction; - if (sigaction_is_compat) - ret = restore_compat_sigaction(sig, e); - else - ret = restore_native_sigaction(sig, e); - - sa_entry__free_unpacked(e, NULL); - - return ret; -} - -static int prepare_sigactions_from_image(void) -{ - int pid = vpid(current); - struct cr_img *img; - int 
sig, rst = 0; - int ret = 0; - - pr_info("Restore sigacts for %d\n", pid); - - img = open_image(CR_FD_SIGACT, O_RSTR, pid); - if (!img) - return -1; - - for (sig = 1; sig <= SIGMAX; sig++) { - if (sig == SIGKILL || sig == SIGSTOP) - continue; - - ret = restore_one_sigaction(sig, img, pid); - if (ret < 0) - break; - if (ret) - rst++; - } - - pr_info("Restored %d/%d sigacts\n", rst, SIGMAX - 3 /* KILL, STOP and CHLD */); - - close_image(img); - return ret; -} - -int prepare_sigactions(CoreEntry *core) -{ - int ret; - - if (!task_alive(current)) - return 0; - - if (core->tc->n_sigactions != 0) - ret = prepare_sigactions_from_core(core->tc); - else - ret = prepare_sigactions_from_image(); - - if (stack32) { - free_compat_syscall_stack(stack32); - stack32 = NULL; - } - - return ret; -} - -int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *item) -{ - TaskCoreEntry *tc = item->core[0]->tc; - struct parasite_dump_sa_args *args; - int ret, sig; - SaEntry *sa, **psa; - - args = compel_parasite_args(ctl, struct parasite_dump_sa_args); - - ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_SIGACTS, ctl); - if (ret < 0) - return ret; - - psa = xmalloc((SIGMAX - 2) * (sizeof(SaEntry *) + sizeof(SaEntry))); - if (!psa) - return -1; - - sa = (SaEntry *)(psa + SIGMAX - 2); - - tc->n_sigactions = SIGMAX - 2; - tc->sigactions = psa; - - for (sig = 1; sig <= SIGMAX; sig++) { - int i = sig - 1; - - if (sig == SIGSTOP || sig == SIGKILL) - continue; - - sa_entry__init(sa); - ASSIGN_TYPED(sa->sigaction, encode_pointer(args->sas[i].rt_sa_handler)); - ASSIGN_TYPED(sa->flags, args->sas[i].rt_sa_flags); - ASSIGN_TYPED(sa->restorer, encode_pointer(args->sas[i].rt_sa_restorer)); -#ifdef CONFIG_MIPS - sa->has_mask_extended = 1; - BUILD_BUG_ON(sizeof(sa->mask) * 2 != sizeof(args->sas[0].rt_sa_mask.sig)); - memcpy(&sa->mask, &(args->sas[i].rt_sa_mask.sig[0]), sizeof(sa->mask)); - memcpy(&sa->mask_extended, &(args->sas[i].rt_sa_mask.sig[1]), sizeof(sa->mask)); -#else - 
BUILD_BUG_ON(sizeof(sa->mask) != sizeof(args->sas[0].rt_sa_mask.sig)); - memcpy(&sa->mask, args->sas[i].rt_sa_mask.sig, sizeof(sa->mask)); -#endif - sa->has_compat_sigaction = true; - sa->compat_sigaction = !compel_mode_native(ctl); - - *(psa++) = sa++; - } - - return 0; -} diff --git a/criu/sk-inet.c b/criu/sk-inet.c index 422edc656..e52b198c3 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -14,8 +14,6 @@ #include #include #include -#include -#include #include "../soccr/soccr.h" @@ -44,11 +42,6 @@ #define PB_ALEN_INET 1 #define PB_ALEN_INET6 4 -/* Definition for older kernels without MPTCP support (e.g. Ubuntu 20.04) */ -#ifndef IPPROTO_MPTCP -#define IPPROTO_MPTCP 262 -#endif - static LIST_HEAD(inet_ports); struct inet_port { @@ -130,13 +123,9 @@ static int can_dump_ipproto(unsigned int ino, int proto, int type) case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: - case IPPROTO_ICMP: - case IPPROTO_ICMPV6: break; default: pr_err("Unsupported proto %d for socket %x\n", proto, ino); - if (proto == IPPROTO_MPTCP) - pr_err("For Go programs, consider using \"GODEBUG=multipathtcp=0\" to disable MPTCP\n"); return 0; } @@ -399,10 +388,6 @@ static int dump_ip_raw_opts(int sk, int family, int proto, IpOptsRawEntry *r) return ret; } -#ifndef IPV6_FREEBIND -#define IPV6_FREEBIND 78 -#endif - static int dump_ip_opts(int sk, int family, int type, int proto, IpOptsEntry *ioe) { int ret = 0; @@ -413,25 +398,11 @@ static int dump_ip_opts(int sk, int family, int type, int proto, IpOptsEntry *io * and fetch additional options. 
*/ ret |= dump_ip_raw_opts(sk, family, proto, ioe->raw); - } - - if (family == AF_INET6) { - if (kdat.has_ipv6_freebind) - ret |= dump_opt(sk, SOL_IPV6, IPV6_FREEBIND, &ioe->freebind); - else if (type != SOCK_RAW) - /* Due to kernel code we can use SOL_IP instead of SOL_IPV6 */ - ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); - ret |= dump_opt(sk, SOL_IPV6, IPV6_RECVPKTINFO, &ioe->pktinfo); } else { + /* Due to kernel code we can use SOL_IP instead of SOL_IPV6 */ ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); - ret |= dump_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); - ret |= dump_opt(sk, SOL_IP, IP_TOS, &ioe->tos); - ret |= dump_opt(sk, SOL_IP, IP_TTL, &ioe->ttl); + ioe->has_freebind = ioe->freebind; } - ioe->has_freebind = ioe->freebind; - ioe->has_pktinfo = !!ioe->pktinfo; - ioe->has_tos = !!ioe->tos; - ioe->has_ttl = !!ioe->ttl; return ret; } @@ -463,7 +434,6 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa IpOptsEntry ipopts = IP_OPTS_ENTRY__INIT; IpOptsRawEntry ipopts_raw = IP_OPTS_RAW_ENTRY__INIT; SkOptsEntry skopts = SK_OPTS_ENTRY__INIT; - TcpOptsEntry tcpopts = TCP_OPTS_ENTRY__INIT; int ret = -1, err = -1, proto, aux, type; ret = do_dump_opt(lfd, SOL_SOCKET, SO_PROTOCOL, &proto, sizeof(proto)); @@ -531,7 +501,6 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa ie.opts = &skopts; ie.ip_opts = &ipopts; ie.ip_opts->raw = &ipopts_raw; - ie.tcp_opts = &tcpopts; ie.n_src_addr = PB_ALEN_INET; ie.n_dst_addr = PB_ALEN_INET; @@ -581,7 +550,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa if (dump_ip_opts(lfd, family, type, proto, &ipopts)) goto err; - if (dump_socket_opts(lfd, family, &skopts)) + if (dump_socket_opts(lfd, &skopts)) goto err; pr_info("Dumping inet socket at %d\n", p->fd); @@ -592,20 +561,9 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa switch (proto) { case IPPROTO_TCP: + err = (type != 
SOCK_RAW) ? dump_one_tcp(lfd, sk, &skopts) : 0; if (sk->shutdown) sk_encode_shutdown(&ie, sk->shutdown); - - if (type == SOCK_RAW) { - err = 0; - } else { - err = dump_tcp_opts(lfd, &tcpopts); - if (err < 0) - goto err; - - err = dump_one_tcp(lfd, sk, &skopts); - if (err < 0) - goto err; - } break; case IPPROTO_UDP: case IPPROTO_UDPLITE: @@ -829,21 +787,8 @@ int restore_ip_opts(int sk, int family, int proto, IpOptsEntry *ioe) { int ret = 0; - if (family == AF_INET6) { - if (ioe->has_freebind) - ret |= restore_opt(sk, SOL_IPV6, IPV6_FREEBIND, &ioe->freebind); - if (ioe->has_pktinfo) - ret |= restore_opt(sk, SOL_IPV6, IPV6_RECVPKTINFO, &ioe->pktinfo); - } else { - if (ioe->has_freebind) - ret |= restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); - if (ioe->has_pktinfo) - ret |= restore_opt(sk, SOL_IP, IP_PKTINFO, &ioe->pktinfo); - if (ioe->has_tos) - ret |= restore_opt(sk, SOL_IP, IP_TOS, &ioe->tos); - if (ioe->has_ttl) - ret |= restore_opt(sk, SOL_IP, IP_TTL, &ioe->ttl); - } + if (ioe->has_freebind) + ret |= restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind); if (ioe->raw) ret |= restore_ip_raw_opts(sk, family, proto, ioe->raw); @@ -924,9 +869,8 @@ static int open_inet_sk(struct file_desc *d, int *new_fd) } if (ie->src_port) { - if (ie->proto != IPPROTO_ICMP && ie->proto != IPPROTO_ICMPV6) - if (inet_bind(sk, ii)) - goto err; + if (inet_bind(sk, ii)) + goto err; } /* @@ -962,9 +906,6 @@ done: if (restore_socket_opts(sk, ie->opts)) goto err; - if (ie->proto == IPPROTO_TCP && restore_tcp_opts(sk, ie->tcp_opts)) - goto err; - if (ie->has_shutdown && (ie->proto == IPPROTO_UDP || ie->proto == IPPROTO_UDPLITE || ie->proto == IPPROTO_TCP)) { if (shutdown(sk, sk_decode_shutdown(ie->shutdown))) { diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c index dc2baa1b8..754eed932 100644 --- a/criu/sk-netlink.c +++ b/criu/sk-netlink.c @@ -161,11 +161,11 @@ static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p) ne.protocol = val; } - ne.flags = p->flags; + 
ne.fown = (FownEntry *)&p->fown; ne.opts = &skopts; - if (dump_socket_opts(lfd, AF_NETLINK, &skopts)) + if (dump_socket_opts(lfd, &skopts)) goto err; fe.type = FD_TYPES__NETLINKSK; diff --git a/criu/sk-packet.c b/criu/sk-packet.c index 6530bff58..1d2e23522 100644 --- a/criu/sk-packet.c +++ b/criu/sk-packet.c @@ -173,7 +173,7 @@ static int dump_one_packet_fd(int lfd, u32 id, const struct fd_parms *p) psk.fown = (FownEntry *)&p->fown; psk.opts = &skopts; - if (dump_socket_opts(lfd, AF_PACKET, &skopts)) + if (dump_socket_opts(lfd, &skopts)) return -1; psk.protocol = sd->proto; diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c index 9c8bad1c3..0afecd2d6 100644 --- a/criu/sk-tcp.c +++ b/criu/sk-tcp.c @@ -39,8 +39,6 @@ static int lock_connection(struct inet_sk_desc *sk) return iptables_lock_connection(sk); else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) return nftables_lock_connection(sk); - else if (opts.network_lock_method == NETWORK_LOCK_SKIP) - return 0; return -1; } @@ -52,8 +50,6 @@ static int unlock_connection(struct inet_sk_desc *sk) else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) /* All connections will be unlocked in network_unlock(void) */ return 0; - else if (opts.network_lock_method == NETWORK_LOCK_SKIP) - return 0; return -1; } @@ -135,8 +131,7 @@ void cpt_unlock_tcp_connections(void) static int dump_tcp_conn_state(struct inet_sk_desc *sk) { struct libsoccr_sk *socr = sk->priv; - int exit_code = -1; - int ret; + int ret, aux; struct cr_img *img; TcpStreamEntry tse = TCP_STREAM_ENTRY__INIT; char *buf; @@ -145,11 +140,11 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) ret = libsoccr_save(socr, &data, sizeof(data)); if (ret < 0) { pr_err("libsoccr_save() failed with %d\n", ret); - goto err; + goto err_r; } if (ret != sizeof(data)) { pr_err("This libsocr is not supported (%d vs %d)\n", ret, (int)sizeof(data)); - goto err; + goto err_r; } sk->state = data.state; @@ -186,22 +181,43 @@ static int dump_tcp_conn_state(struct inet_sk_desc 
*sk) tse.rcv_wup = data.rcv_wup; } + /* + * TCP socket options + */ + + if (dump_opt(sk->rfd, SOL_TCP, TCP_NODELAY, &aux)) + goto err_opt; + + if (aux) { + tse.has_nodelay = true; + tse.nodelay = true; + } + + if (dump_opt(sk->rfd, SOL_TCP, TCP_CORK, &aux)) + goto err_opt; + + if (aux) { + tse.has_cork = true; + tse.cork = true; + } + /* * Push the stuff to image */ + img = open_image(CR_FD_TCP_STREAM, O_DUMP, sk->sd.ino); if (!img) - goto err; + goto err_img; ret = pb_write_one(img, &tse, PB_TCP_STREAM); if (ret < 0) - goto err_close; + goto err_iw; buf = libsoccr_get_queue_bytes(socr, TCP_RECV_QUEUE, SOCCR_MEM_EXCL); if (buf) { ret = write_img_buf(img, buf, tse.inq_len); if (ret < 0) - goto err_close; + goto err_iw; xfree(buf); } @@ -210,40 +226,40 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) if (buf) { ret = write_img_buf(img, buf, tse.outq_len); if (ret < 0) - goto err_close; + goto err_iw; xfree(buf); } pr_info("Done\n"); - exit_code = 0; -err_close: +err_iw: close_image(img); -err: - return exit_code; -} - -int dump_tcp_opts(int fd, TcpOptsEntry *toe) -{ - int ret = 0; - - ret |= dump_opt(fd, SOL_TCP, TCP_NODELAY, &toe->nodelay); - ret |= dump_opt(fd, SOL_TCP, TCP_CORK, &toe->cork); - ret |= dump_opt(fd, SOL_TCP, TCP_KEEPCNT, &toe->keepcnt); - ret |= dump_opt(fd, SOL_TCP, TCP_KEEPIDLE, &toe->keepidle); - ret |= dump_opt(fd, SOL_TCP, TCP_KEEPINTVL, &toe->keepintvl); - - toe->has_nodelay = !!toe->nodelay; - toe->has_cork = !!toe->cork; - toe->has_keepcnt = !!toe->keepcnt; - toe->has_keepidle = !!toe->keepidle; - toe->has_keepintvl = !!toe->keepintvl; - +err_img: +err_opt: +err_r: return ret; } int dump_one_tcp(int fd, struct inet_sk_desc *sk, SkOptsEntry *soe) { + soe->has_tcp_keepcnt = true; + if (dump_opt(fd, SOL_TCP, TCP_KEEPCNT, &soe->tcp_keepcnt)) { + pr_perror("Can't read TCP_KEEPCNT"); + return -1; + } + + soe->has_tcp_keepidle = true; + if (dump_opt(fd, SOL_TCP, TCP_KEEPIDLE, &soe->tcp_keepidle)) { + pr_perror("Can't read TCP_KEEPIDLE"); + 
return -1; + } + + soe->has_tcp_keepintvl = true; + if (dump_opt(fd, SOL_TCP, TCP_KEEPINTVL, &soe->tcp_keepintvl)) { + pr_perror("Can't read TCP_KEEPINTVL"); + return -1; + } + if (sk->dst_port == 0) return 0; @@ -377,11 +393,6 @@ static int restore_tcp_conn_state(int sk, struct libsoccr_sk *socr, struct inet_ if (libsoccr_restore(socr, &data, sizeof(data))) goto err_c; - /* - * Restoring TCP socket options in TcpStreamEntry is - * for backward compatibility only, newer versions - * of CRIU use TcpOptsEntry. - */ if (tse->has_nodelay && tse->nodelay) { aux = 1; if (restore_opt(sk, SOL_TCP, TCP_NODELAY, &aux)) @@ -434,34 +445,13 @@ int prepare_tcp_socks(struct task_restore_args *ta) return 0; } -int restore_tcp_opts(int sk, TcpOptsEntry *toe) -{ - int ret = 0; - - if(!toe) - return ret; - - if (toe->has_nodelay) - ret |= restore_opt(sk, SOL_TCP, TCP_NODELAY, &toe->nodelay); - if (toe->has_cork) - ret |= restore_opt(sk, SOL_TCP, TCP_CORK, &toe->cork); - if (toe->has_keepcnt) - ret |= restore_opt(sk, SOL_TCP, TCP_KEEPCNT, &toe->keepcnt); - if (toe->has_keepidle) - ret |= restore_opt(sk, SOL_TCP, TCP_KEEPIDLE, &toe->keepidle); - if (toe->has_keepintvl) - ret |= restore_opt(sk, SOL_TCP, TCP_KEEPINTVL, &toe->keepintvl); - - return ret; -} - int restore_one_tcp(int fd, struct inet_sk_info *ii) { struct libsoccr_sk *sk; pr_info("Restoring TCP connection\n"); - if (opts.tcp_close) { + if (opts.tcp_close && ii->ie->state != TCP_LISTEN && ii->ie->state != TCP_CLOSE) { if (shutdown(fd, SHUT_RDWR) && errno != ENOTCONN) { pr_perror("Unable to shutdown the socket id %x ino %x", ii->ie->id, ii->ie->ino); } @@ -493,8 +483,6 @@ static int unlock_connection_info(struct inet_sk_info *si) else if (opts.network_lock_method == NETWORK_LOCK_NFTABLES) /* All connections will be unlocked in network_unlock(void) */ return 0; - else if (opts.network_lock_method == NETWORK_LOCK_SKIP) - return 0; return -1; } diff --git a/criu/sk-unix.c b/criu/sk-unix.c index 6145fe734..f3fe60c6e 100644 --- 
a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -221,7 +221,7 @@ int kerndat_socket_unix_file(void) } fd = ioctl(sk, SIOCUNIXFILE); if (fd < 0 && errno != ENOENT) { - pr_warn("Unable to open a socket file: %s\n", strerror(errno)); + pr_warn("Unable to open a socket file: %m\n"); kdat.sk_unix_file = false; close(sk); return 0; @@ -402,12 +402,12 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) sk_encode_shutdown(ue, sk->shutdown); /* - * If a stream/seqpacket listening socket has non-zero rqueue, - * this means there are in-flight connections waiting to get + * If a stream listening socket has non-zero rqueue, this + * means there are in-flight connections waiting to get * accept()-ed. We handle them separately with the "icons" * (i stands for in-flight, cons -- for connections) things. */ - if (sk->rqlen != 0 && sk->state != TCP_LISTEN) { + if (sk->rqlen != 0 && !(sk->type == SOCK_STREAM && sk->state == TCP_LISTEN)) { if (dump_sk_queue(lfd, id)) goto err; } @@ -460,7 +460,7 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) pr_warn("Shutdown mismatch %u:%d -> %u:%d\n", ue->ino, ue->shutdown, peer->sd.ino, peer->shutdown); } - } else if (ue->state == TCP_ESTABLISHED && ue->type != SOCK_DGRAM) { + } else if (ue->state == TCP_ESTABLISHED) { const struct unix_sk_listen_icon *e; e = lookup_unix_listen_icons(ue->ino); @@ -497,37 +497,12 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) goto err; } - if (sk->wqlen != 0) { - /* - * There's no known way to get data out of the write - * queue of an icon socket. The only good solution for - * now is to fail the migration. 
- */ - pr_err("Non-empty write queue on an in-flight socket %#x\n", ue->ino); - goto err; - } - ue->peer = e->sk_desc->sd.ino; pr_debug("\t\tFixed inflight socket %u peer %u)\n", ue->ino, ue->peer); - } else if (ue->state == TCP_LISTEN) { - int i; - - for (i = 0; i < sk->nr_icons; i++) - if (sk->icons[i] == 0) { - /* - * Inode of an icon socket equal to 0 means - * it's already been closed. That means we have - * no simple way to check if it sent any data. - * The only good solution for now is to fail - * the migration. - */ - pr_err("Found a closed in-flight socket to %#x\n", ue->ino); - goto err; - } } dump: - if (dump_socket_opts(lfd, AF_UNIX, skopts)) + if (dump_socket_opts(lfd, skopts)) goto err; pr_info("Dumping unix socket at %d\n", p->fd); @@ -595,14 +570,14 @@ static int unix_resolve_name_old(int lfd, uint32_t id, struct unix_sk_desc *d, U else ns = lookup_ns_by_id(root_item->ids->mnt_ns_id, &mnt_ns_desc); if (!ns) { - pr_err("Failed to lookup ns by mnt id %d\n", ue->mnt_id); - return -1; + ret = -ENOENT; + goto out; } mntns_root = mntns_get_root_fd(ns); if (mntns_root < 0) { - pr_err("Failed to lookup mntns root for ns %d\n", ns->id); - return -1; + ret = -ENOENT; + goto out; } if (name[0] != '/') { @@ -613,15 +588,15 @@ static int unix_resolve_name_old(int lfd, uint32_t id, struct unix_sk_desc *d, U ret = resolve_rel_name(id, d, p, &ue->name_dir); if (ret < 0) - return -1; - return 0; + goto out; + goto postprone; } snprintf(rpath, sizeof(rpath), ".%s", name); if (fstatat(mntns_root, rpath, &st, 0)) { if (errno != ENOENT) { - pr_perror("Can't stat socket %#" PRIx32 "(%s)", id, rpath); - return -1; + pr_warn("Can't stat socket %#x(%s), skipping: %m (err %d)\n", id, rpath, errno); + goto skip; } pr_info("unix: Dropping path %s for unlinked sk %#x\n", name, id); @@ -639,77 +614,92 @@ static int unix_resolve_name_old(int lfd, uint32_t id, struct unix_sk_desc *d, U d->deleted = deleted; +postprone: return 0; + +out: + xfree(name); + return ret; +skip: + ret = 
1; + goto out; } static int unix_resolve_name(int lfd, uint32_t id, struct unix_sk_desc *d, UnixSkEntry *ue, const struct fd_parms *p) { char *name = d->name; - char path[PATH_MAX]; + char path[PATH_MAX], tmp[PATH_MAX]; struct stat st; - int fd, ret; - int exit_code = -1; + int fd, proc_fd, mnt_id, ret; if (d->namelen == 0 || name[0] == '\0') return 0; - if (!kdat.sk_unix_file) { - pr_warn("Trying to resolve unix socket with obsolete method\n"); - if (unix_resolve_name_old(lfd, id, d, ue, p)) { - pr_err("Unable to resolve unix socket name with obsolete method. " - "Try a linux kernel newer than 4.10\n"); + if (kdat.sk_unix_file && (root_ns_mask & CLONE_NEWNS)) { + if (get_mnt_id(lfd, &mnt_id)) return -1; - } - return 0; + ue->mnt_id = mnt_id; + ue->has_mnt_id = true; } fd = ioctl(lfd, SIOCUNIXFILE); if (fd < 0) { - pr_perror("Unable to get a socket file descriptor with SIOCUNIXFILE ioctl"); - return -1; + pr_warn("Unable to get a socket file descriptor with SIOCUNIXFILE ioctl: %m\n"); + goto fallback; } - if (root_ns_mask & CLONE_NEWNS) { - struct fdinfo_common fdinfo = { .mnt_id = -1 }; - - if (parse_fdinfo(fd, FD_TYPES__UND, &fdinfo)) - goto out; - - ue->mnt_id = fdinfo.mnt_id; - ue->has_mnt_id = true; - } - - if (fstat(fd, &st)) { + ret = fstat(fd, &st); + if (ret) { pr_perror("Unable to fstat socket fd"); - goto out; + return -1; } d->mode = st.st_mode; d->uid = st.st_uid; d->gid = st.st_gid; - ret = read_fd_link(fd, path, sizeof(path)); - if (ret < 0) + proc_fd = get_service_fd(PROC_FD_OFF); + if (proc_fd < 0) { + pr_err("Unable to get service fd for proc\n"); + return -1; + } + + snprintf(tmp, sizeof(tmp), "self/fd/%d", fd); + ret = readlinkat(proc_fd, tmp, path, PATH_MAX); + if (ret < 0 && ret >= PATH_MAX) { + pr_perror("Unable to readlink %s", tmp); goto out; + } + path[ret] = 0; d->deleted = strip_deleted(path, ret); if (name[0] != '/') { - if (cut_path_ending(path, name)) { - pr_err("Unable too cut %s from %s\n", name, path); + ret = cut_path_ending(path, 
name); + if (ret) { + pr_err("Unable too resolve %s from %s\n", name, path); goto out; } ue->name_dir = xstrdup(path); - if (!ue->name_dir) + if (!ue->name_dir) { + ret = -ENOMEM; goto out; + } pr_debug("Resolved socket relative name %s to %s/%s\n", name, ue->name_dir, name); } - exit_code = 0; + ret = 0; out: close(fd); - return exit_code; + return ret; + +fallback: + pr_warn("Trying to resolve unix socket with obsolete method\n"); + ret = unix_resolve_name_old(lfd, id, d, ue, p); + if (ret < 0) + pr_err("Unable to resolve unix socket name with obsolete method. Try a linux kernel newer than 4.10\n"); + return ret; } /* @@ -878,8 +868,7 @@ static int __dump_external_socket(struct unix_sk_desc *sk, struct unix_sk_desc * if (peer->type != SOCK_DGRAM) { show_one_unix("Ext stream not supported", peer); - pr_err("Can't dump half of stream unix connection. name: %s; peer name: %s\n", - sk->name, peer->name); + pr_err("Can't dump half of stream unix connection.\n"); return -1; } @@ -969,9 +958,9 @@ struct unix_sk_info { struct unix_sk_info *peer; struct pprep_head peer_resolve; /* XXX : union with the above? 
*/ struct file_desc d; - struct hlist_node hash; /* To lookup socket by ino */ + struct hlist_node hash; /* To lookup socket by ino */ struct list_head connected; /* List of sockets, connected to me */ - struct list_head node; /* To link in peer's connected list */ + struct list_head node; /* To link in peer's connected list */ struct list_head scm_fles; struct list_head ghost_node; size_t ghost_dir_pos; @@ -1032,8 +1021,8 @@ static struct unix_sk_info *find_queuer_for(int id) struct unix_sk_info *ui; list_for_each_entry(ui, &unix_sockets, list) { - if (ui->queuer && ui->ue->id == id) - return ui->queuer; + if (ui->queuer && ui->queuer->ue->id == id) + return ui; } return NULL; @@ -1431,22 +1420,32 @@ err_revert_and_exit: static int restore_file_perms(struct unix_sk_info *ui) { - FilePermsEntry *perms = ui->ue->file_perms; - char fname[PATH_MAX]; + if (ui->ue->file_perms) { + FilePermsEntry *perms = ui->ue->file_perms; + char fname[PATH_MAX]; - if (!perms) - return 0; + if (ui->ue->name.len >= sizeof(fname)) { + pr_err("The file name is too long\n"); + return -E2BIG; + } - if (ui->ue->name.len >= sizeof(fname)) { - pr_err("The file name is too long\n"); - errno = -E2BIG; - return -1; + memcpy(fname, ui->name, ui->ue->name.len); + fname[ui->ue->name.len] = '\0'; + + if (fchownat(AT_FDCWD, fname, perms->uid, perms->gid, 0) < 0) { + int errno_cpy = errno; + pr_perror("Unable to change file owner and group"); + return -errno_cpy; + } + + if (fchmodat(AT_FDCWD, fname, perms->mode, 0) < 0) { + int errno_cpy = errno; + pr_perror("Unable to change file mode bits"); + return -errno_cpy; + } } - memcpy(fname, ui->name, ui->ue->name.len); - fname[ui->ue->name.len] = '\0'; - - return cr_fchpermat(AT_FDCWD, fname, perms->uid, perms->gid, perms->mode, 0); + return 0; } static int keep_deleted(struct unix_sk_info *ui) @@ -1473,7 +1472,7 @@ static int bind_on_deleted(int sk, struct unix_sk_info *ui) char path[PATH_MAX], path_parked[PATH_MAX], *pos; struct sockaddr_un addr; bool 
renamed = false; - int ret, exit_code = -1; + int ret; if (ui->ue->name.len >= UNIX_PATH_MAX) { pr_err("ghost: Too long name for socket id %#x ino %u name %s\n", ui->ue->id, ui->ue->ino, ui->name); @@ -1495,9 +1494,10 @@ static int bind_on_deleted(int sk, struct unix_sk_info *ui) } if (errno != ENOENT) { + ret = -errno; pr_perror("ghost: Can't access %s for socket id %#x ino %u name %s", path, ui->ue->id, ui->ue->ino, ui->name); - return -1; + return ret; } } @@ -1508,8 +1508,9 @@ static int bind_on_deleted(int sk, struct unix_sk_info *ui) pr_debug("ghost: socket id %#x ino %u name %s creating %s\n", ui->ue->id, ui->ue->ino, ui->name, pos); ret = mkdirpat(AT_FDCWD, pos, 0755); if (ret) { + errno = -ret; pr_perror("ghost: Can't create %s", pos); - return -1; + return ret; } memset(&addr, 0, sizeof(addr)); @@ -1528,9 +1529,10 @@ static int bind_on_deleted(int sk, struct unix_sk_info *ui) pr_debug("ghost: Unlinked stale socket id %#x ino %d name %s\n", ui->ue->id, ui->ue->ino, path_parked); if (rename(ui->name, path_parked)) { + ret = -errno; pr_perror("ghost: Can't rename id %#x ino %u addr %s -> %s", ui->ue->id, ui->ue->ino, ui->name, path_parked); - return -1; + return ret; } pr_debug("ghost: id %#x ino %d renamed %s -> %s\n", ui->ue->id, ui->ue->ino, ui->name, path_parked); renamed = true; @@ -1538,6 +1540,7 @@ static int bind_on_deleted(int sk, struct unix_sk_info *ui) ret = bind(sk, (struct sockaddr *)&addr, sizeof(addr.sun_family) + ui->ue->name.len); if (ret < 0) { + ret = -errno; pr_perror("ghost: Can't bind on socket id %#x ino %d addr %s", ui->ue->id, ui->ue->ino, ui->name); goto out_rename; } @@ -1549,10 +1552,9 @@ static int bind_on_deleted(int sk, struct unix_sk_info *ui) ret = keep_deleted(ui); if (ret < 0) { pr_err("ghost: Can't save socket %#x ino %u addr %s into fdstore\n", ui->ue->id, ui->ue->ino, ui->name); - goto out; + ret = -EIO; } - exit_code = 0; out: /* * Once everything is ready, just remove the socket from the @@ -1560,14 +1562,14 @@ out: 
*/ ret = unlinkat(AT_FDCWD, ui->name, 0); if (ret < 0) { - exit_code = -1; + ret = -errno; pr_perror("ghost: Can't unlink socket %#x ino %u addr %s", ui->ue->id, ui->ue->ino, ui->name); } out_rename: if (renamed) { if (rename(path_parked, ui->name)) { - exit_code = -1; + ret = -errno; pr_perror("ghost: Can't rename id %#x ino %u addr %s -> %s", ui->ue->id, ui->ue->ino, path_parked, ui->name); } else { @@ -1596,7 +1598,7 @@ out_rename: } } - return exit_code; + return 0; } static int bind_unix_sk(int sk, struct unix_sk_info *ui) @@ -1608,7 +1610,7 @@ static int bind_unix_sk(int sk, struct unix_sk_info *ui) if (ui->ue->name.len == 0) return 0; - if ((ui->ue->type != SOCK_DGRAM) && (ui->ue->state == TCP_ESTABLISHED)) { + if ((ui->ue->type == SOCK_STREAM) && (ui->ue->state == TCP_ESTABLISHED)) { /* * FIXME this can be done, but for doing this properly we * need to bind socket to its name, then rename one to @@ -1641,6 +1643,8 @@ static int bind_unix_sk(int sk, struct unix_sk_info *ui) if (ui->flags & USK_GHOST_FDSTORE) { pr_debug("ghost: bind id %#x ino %u addr %s\n", ui->ue->id, ui->ue->ino, ui->name); ret = bind_on_deleted(sk, ui); + if (ret) + errno = -ret; } else { pr_debug("bind id %#x ino %u addr %s\n", ui->ue->id, ui->ue->ino, ui->name); ret = bind(sk, (struct sockaddr *)&addr, sizeof(addr.sun_family) + ui->ue->name.len); @@ -1847,10 +1851,14 @@ static int open_unixsk_standalone(struct unix_sk_info *ui, int *new_fd) close(sks[1]); sk = sks[0]; - } else if ((ui->ue->state == TCP_ESTABLISHED && ui->ue->type != SOCK_DGRAM) && queuer && - queuer->ue->ino == FAKE_INO) { + } else if (ui->ue->state == TCP_ESTABLISHED && queuer && queuer->ue->ino == FAKE_INO) { int ret, sks[2]; + if (ui->ue->type != SOCK_STREAM) { + pr_err("Non-stream socket %u in established state\n", ui->ue->ino); + return -1; + } + if (ui->ue->shutdown != SK_SHUTDOWN__BOTH) { pr_err("Wrong shutdown/peer state for %u\n", ui->ue->ino); return -1; @@ -2321,7 +2329,7 @@ static void 
try_resolve_unix_peer(struct unix_sk_info *ui) int unix_sk_id_add(unsigned int ino) { - cleanup_free char *e_str = NULL; + char *e_str; e_str = xmalloc(20); if (!e_str) diff --git a/criu/sockets.c b/criu/sockets.c index e4adae03c..9426b5b94 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -29,7 +29,6 @@ #include "pstree.h" #include "util.h" #include "fdstore.h" -#include "cr_options.h" #undef LOG_PREFIX #define LOG_PREFIX "sockets: " @@ -38,7 +37,7 @@ #define SOCK_DIAG_BY_FAMILY 20 #endif -#define SK_HASH_SIZE (1 << 14) +#define SK_HASH_SIZE 32 #ifndef SO_GET_FILTER #define SO_GET_FILTER SO_ATTACH_FILTER @@ -65,7 +64,7 @@ const char *socket_proto_name(unsigned int proto, char *nm, size_t size) [IPPROTO_IPV6] = __stringify_1(IPPROTO_IPV6), [IPPROTO_RSVP] = __stringify_1(IPPROTO_RSVP), [IPPROTO_GRE] = __stringify_1(IPPROTO_GRE), [IPPROTO_ESP] = __stringify_1(IPPROTO_ESP), [IPPROTO_AH] = __stringify_1(IPPROTO_AH), [IPPROTO_UDPLITE] = __stringify_1(IPPROTO_UDPLITE), - [IPPROTO_RAW] = __stringify_1(IPPROTO_RAW), [IPPROTO_ICMPV6] = __stringify_1(IPPROTO_ICMPV6), + [IPPROTO_RAW] = __stringify_1(IPPROTO_RAW), }; return __socket_const_name(nm, size, protos, ARRAY_SIZE(protos), proto); } @@ -131,12 +130,10 @@ enum socket_cl_bits { INET_UDP_CL_BIT, INET_UDPLITE_CL_BIT, INET_RAW_CL_BIT, - INET_ICMP_CL_BIT, INET6_TCP_CL_BIT, INET6_UDP_CL_BIT, INET6_UDPLITE_CL_BIT, INET6_RAW_CL_BIT, - INET6_ICMP_CL_BIT, UNIX_CL_BIT, PACKET_CL_BIT, _MAX_CL_BIT, @@ -163,8 +160,6 @@ static inline enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsign return INET_UDPLITE_CL_BIT; if (proto == IPPROTO_RAW) return INET_RAW_CL_BIT; - if (proto == IPPROTO_ICMP) - return INET_ICMP_CL_BIT; } if (family == AF_INET6) { if (proto == IPPROTO_TCP) @@ -175,8 +170,6 @@ static inline enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsign return INET6_UDPLITE_CL_BIT; if (proto == IPPROTO_RAW) return INET6_RAW_CL_BIT; - if (proto == IPPROTO_ICMPV6) - return INET6_ICMP_CL_BIT; } 
pr_err("Unknown pair family %d proto %d\n", family, proto); @@ -288,12 +281,6 @@ void preload_socket_modules(void) req.r.i.sdiag_protocol = IPPROTO_RAW; probe_diag(nl, &req, -ENOENT); - req.r.i.sdiag_protocol = IPPROTO_ICMP; - probe_diag(nl, &req, -ENOENT); - - req.r.i.sdiag_protocol = IPPROTO_ICMPV6; - probe_diag(nl, &req, -ENOENT); - close(nl); pr_info("Done probing\n"); } @@ -478,33 +465,18 @@ int do_restore_opt(int sk, int level, int name, void *val, int len) return 0; } -int sk_setbufs(int sk, uint32_t *bufs) +static int sk_setbufs(void *arg, int fd, pid_t pid) { - uint32_t sndbuf = bufs[0], rcvbuf = bufs[1]; + u32 *buf = (u32 *)arg; - if (setsockopt(sk, SOL_SOCKET, SO_SNDBUFFORCE, &sndbuf, sizeof(sndbuf)) || - setsockopt(sk, SOL_SOCKET, SO_RCVBUFFORCE, &rcvbuf, sizeof(rcvbuf))) { - if (opts.unprivileged) { - pr_info("Unable to set SO_SNDBUFFORCE/SO_RCVBUFFORCE, falling back to SO_SNDBUF/SO_RCVBUF\n"); - if (setsockopt(sk, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf)) || - setsockopt(sk, SOL_SOCKET, SO_RCVBUF, &rcvbuf, sizeof(rcvbuf))) { - pr_perror("Unable to set socket SO_SNDBUF/SO_RCVBUF"); - return -1; - } - } else { - pr_perror("Unable to set socket SO_SNDBUFFORCE/SO_RCVBUFFORCE"); - return -1; - } - } + if (restore_opt(fd, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0])) + return -1; + if (restore_opt(fd, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1])) + return -1; return 0; } -static int sk_setbufs_ns(void *arg, int fd, pid_t pid) -{ - return sk_setbufs(fd, (uint32_t *)arg); -} - /* * Set sizes of buffers to maximum and prevent blocking * Caller of this fn should call other socket restoring @@ -517,7 +489,7 @@ int restore_prepare_socket(int sk) /* In kernel a bufsize has type int and a value is doubled. 
*/ u32 maxbuf[2] = { INT_MAX / 2, INT_MAX / 2 }; - if (userns_call(sk_setbufs_ns, 0, maxbuf, sizeof(maxbuf), sk)) + if (userns_call(sk_setbufs, 0, maxbuf, sizeof(maxbuf), sk)) return -1; /* Prevent blocking on restore */ @@ -545,12 +517,8 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) pr_info("%d restore sndbuf %d rcv buf %d\n", sk, soe->so_sndbuf, soe->so_rcvbuf); /* setsockopt() multiplies the input values by 2 */ - ret |= userns_call(sk_setbufs_ns, 0, bufs, sizeof(bufs), sk); + ret |= userns_call(sk_setbufs, UNS_ASYNC, bufs, sizeof(bufs), sk); - if (soe->has_so_buf_lock) { - pr_debug("\trestore buf_lock %d for socket\n", soe->so_buf_lock); - ret |= restore_opt(sk, SOL_SOCKET, SO_BUF_LOCK, &soe->so_buf_lock); - } if (soe->has_so_priority) { pr_debug("\trestore priority %d for socket\n", soe->so_priority); ret |= restore_opt(sk, SOL_SOCKET, SO_PRIORITY, &soe->so_priority); @@ -597,12 +565,6 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) pr_debug("\tset keepalive for socket\n"); ret |= restore_opt(sk, SOL_SOCKET, SO_KEEPALIVE, &val); } - - /* - * Restoring TCP socket options in SkOptsEntry is - * for backward compatibility only, newer versions - * of CRIU use TcpOptsEntry. 
- */ if (soe->has_tcp_keepcnt) { pr_debug("\tset keepcnt for socket\n"); ret |= restore_opt(sk, SOL_TCP, TCP_KEEPCNT, &soe->tcp_keepcnt); @@ -649,7 +611,7 @@ int do_dump_opt(int sk, int level, int name, void *val, int len) return 0; } -int dump_socket_opts(int sk, int family, SkOptsEntry *soe) +int dump_socket_opts(int sk, SkOptsEntry *soe) { int ret = 0, val; struct timeval tv; @@ -657,20 +619,12 @@ int dump_socket_opts(int sk, int family, SkOptsEntry *soe) ret |= dump_opt(sk, SOL_SOCKET, SO_SNDBUF, &soe->so_sndbuf); ret |= dump_opt(sk, SOL_SOCKET, SO_RCVBUF, &soe->so_rcvbuf); - if (kdat.has_sockopt_buf_lock) { - soe->has_so_buf_lock = true; - ret |= dump_opt(sk, SOL_SOCKET, SO_BUF_LOCK, &soe->so_buf_lock); - } soe->has_so_priority = true; ret |= dump_opt(sk, SOL_SOCKET, SO_PRIORITY, &soe->so_priority); soe->has_so_rcvlowat = true; ret |= dump_opt(sk, SOL_SOCKET, SO_RCVLOWAT, &soe->so_rcvlowat); - /* - * Restoring SO_MARK requires root or CAP_NET_ADMIN. Avoid saving it - * in unprivileged mode if still has its default value. - */ + soe->has_so_mark = true; ret |= dump_opt(sk, SOL_SOCKET, SO_MARK, &soe->so_mark); - soe->has_so_mark = !!soe->so_mark; ret |= dump_opt(sk, SOL_SOCKET, SO_SNDTIMEO, &tv); soe->so_snd_tmo_sec = tv.tv_sec; @@ -688,15 +642,13 @@ int dump_socket_opts(int sk, int family, SkOptsEntry *soe) soe->so_reuseport = val ? true : false; soe->has_so_reuseport = true; - if (family == AF_UNIX || family == AF_NETLINK) { - ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); - soe->has_so_passcred = true; - soe->so_passcred = val ? true : false; + ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); + soe->has_so_passcred = true; + soe->so_passcred = val ? true : false; - ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); - soe->has_so_passsec = true; - soe->so_passsec = val ? true : false; - } + ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); + soe->has_so_passsec = true; + soe->so_passsec = val ? 
true : false; ret |= dump_opt(sk, SOL_SOCKET, SO_DONTROUTE, &val); soe->has_so_dontroute = true; @@ -787,10 +739,6 @@ static int inet_receive_one(struct nlmsghdr *h, struct ns_id *ns, void *arg) case IPPROTO_RAW: type = SOCK_RAW; break; - case IPPROTO_ICMP: - case IPPROTO_ICMPV6: - type = SOCK_DGRAM; - break; default: BUG_ON(1); return -1; @@ -815,7 +763,7 @@ static int collect_err(int err, struct ns_id *ns, void *arg) char family[32], proto[32]; char msg[256]; - snprintf(msg, sizeof(msg), "Sockets collect procedure family %s proto %s", + snprintf(msg, sizeof(msg), "Sockects collect procedure family %s proto %s", socket_family_name(gr->family, family, sizeof(family)), socket_proto_name(gr->protocol, proto, sizeof(proto))); @@ -923,13 +871,6 @@ int collect_sockets(struct ns_id *ns) if (tmp) err = tmp; - /* Collect IPv4 ICMP sockets */ - req.r.i.sdiag_family = AF_INET; - req.r.i.sdiag_protocol = IPPROTO_ICMP; - req.r.i.idiag_ext = 0; - req.r.i.idiag_states = -1; /* All */ - set_collect_bit(req.r.n.sdiag_family, req.r.n.sdiag_protocol); - /* Collect IPv6 TCP sockets */ req.r.i.sdiag_family = AF_INET6; req.r.i.sdiag_protocol = IPPROTO_TCP; @@ -969,13 +910,6 @@ int collect_sockets(struct ns_id *ns) if (tmp) err = tmp; - /* Collect IPv6 ICMP sockets */ - req.r.i.sdiag_family = AF_INET6; - req.r.i.sdiag_protocol = IPPROTO_ICMPV6; - req.r.i.idiag_ext = 0; - req.r.i.idiag_states = -1; /* All */ - set_collect_bit(req.r.n.sdiag_family, req.r.n.sdiag_protocol); - req.r.p.sdiag_family = AF_PACKET; req.r.p.sdiag_protocol = 0; req.r.p.pdiag_show = PACKET_SHOW_INFO | PACKET_SHOW_MCLIST | PACKET_SHOW_FANOUT | PACKET_SHOW_RING_CFG; diff --git a/criu/string.c b/criu/string.c index 7edd35363..7df0b3e09 100644 --- a/criu/string.c +++ b/criu/string.c @@ -6,6 +6,7 @@ #include "string.h" +#ifndef CONFIG_HAS_STRLCPY /** * strlcpy - Copy a %NUL terminated string into a sized buffer * @dest: Where to copy the string to @@ -17,7 +18,7 @@ * of course, the buffer size is zero). 
It does not pad * out the result like strncpy() does. */ -size_t __strlcpy(char *dest, const char *src, size_t size) +size_t strlcpy(char *dest, const char *src, size_t size) { size_t ret = strlen(src); @@ -28,14 +29,16 @@ size_t __strlcpy(char *dest, const char *src, size_t size) } return ret; } +#endif +#ifndef CONFIG_HAS_STRLCAT /** * strlcat - Append a length-limited, %NUL-terminated string to another * @dest: The string to be appended to * @src: The string to append to it * @count: The size of the destination buffer. */ -size_t __strlcat(char *dest, const char *src, size_t count) +size_t strlcat(char *dest, const char *src, size_t count) { size_t dsize = strlen(dest); size_t len = strlen(src); @@ -54,3 +57,4 @@ size_t __strlcat(char *dest, const char *src, size_t count) dest[len] = 0; return res; } +#endif diff --git a/criu/sysctl.c b/criu/sysctl.c index 99026acf4..b06688712 100644 --- a/criu/sysctl.c +++ b/criu/sysctl.c @@ -203,17 +203,6 @@ static int __userns_sysctl_op(void *arg, int proc_fd, pid_t pid) * 2. forks a task * 3. setns()es to the UTS/IPC namespace of the caller * 4. write()s to the files and exits - * - * For the IPC namespace, since - * https://github.com/torvalds/linux/commit/5563cabdde, user with - * enough capability can open IPC sysctl files and write to it. Later - * commit https://github.com/torvalds/linux/commit/1f5c135ee5 and - * https://github.com/torvalds/linux/commit/0889f44e28 bind the IPC - * namespace at the open() time so the changed value does not depend - * on the IPC namespace at the write() time. Also, the permission check - * changes a little bit which makes the above approach unusable but we - * can simply use nonuserns version for restoring as IPC sysctl as the - * restored process currently has enough capability. 
*/ dir = open("/proc/sys", O_RDONLY, O_DIRECTORY); if (dir < 0) { @@ -346,12 +335,9 @@ out: return ret; } -/* exit_code = 1 in case nonuserns failed but we want to fallback to userns approach */ -static int __nonuserns_sysctl_op(struct sysctl_req **orig_req, size_t *orig_nr_req, int op) +static int __nonuserns_sysctl_op(struct sysctl_req *req, size_t nr_req, int op) { int ret, exit_code = -1; - struct sysctl_req *req = *orig_req; - size_t nr_req = *orig_nr_req; while (nr_req--) { int fd; @@ -365,14 +351,6 @@ static int __nonuserns_sysctl_op(struct sysctl_req **orig_req, size_t *orig_nr_r req++; continue; } - if (errno == EACCES && (req->flags & CTL_FLAGS_IPC_EACCES_SKIP)) { - /* The remaining requests are restored using userns approach */ - *orig_req = req; - *orig_nr_req = nr_req + 1; - exit_code = 1; - goto out; - } - pr_perror("Can't open sysctl %s", req->name); goto out; } @@ -426,16 +404,7 @@ int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns) * so we can do those in process as well. */ if (!ns || ns & CLONE_NEWNET || op == CTL_READ) - return __nonuserns_sysctl_op(&req, &nr_req, op); - - /* Try to use nonuserns for restoring IPC sysctl and fallback to - * userns approach when the returned code is 1. 
- */ - if (ns & CLONE_NEWIPC && op == CTL_WRITE) { - ret = __nonuserns_sysctl_op(&req, &nr_req, op); - if (ret <= 0) - return ret; - } + return __nonuserns_sysctl_op(req, nr_req, op); /* * In order to avoid lots of opening of /proc/sys for each struct sysctl_req, diff --git a/criu/timens.c b/criu/timens.c index 257782e5a..5803fc359 100644 --- a/criu/timens.c +++ b/criu/timens.c @@ -5,7 +5,6 @@ #include "proc_parse.h" #include "namespaces.h" #include "timens.h" -#include "cr_options.h" #include "protobuf.h" #include "images/timens.pb-c.h" @@ -58,9 +57,6 @@ int prepare_timens(int id) struct timespec ts; struct timespec prev_moff = {}, prev_boff = {}; - if (opts.unprivileged) - return 0; - img = open_image(CR_FD_TIMENS, O_RSTR, id); if (!img) return -1; @@ -96,8 +92,8 @@ int prepare_timens(int id) ts.tv_nsec = te->monotonic->tv_nsec - ts.tv_nsec; normalize_timespec(&ts); - pr_debug("timens: monotonic %" PRId64 " %ld\n", (int64_t)ts.tv_sec, ts.tv_nsec); - if (dprintf(fd, "%d %" PRId64 " %ld\n", CLOCK_MONOTONIC, (int64_t)ts.tv_sec, ts.tv_nsec) < 0) { + pr_debug("timens: monotonic %ld %ld\n", ts.tv_sec, ts.tv_nsec); + if (dprintf(fd, "%d %ld %ld\n", CLOCK_MONOTONIC, ts.tv_sec, ts.tv_nsec) < 0) { pr_perror("Unable to set a monotonic clock offset"); goto err; } @@ -111,8 +107,8 @@ int prepare_timens(int id) ts.tv_nsec = te->boottime->tv_nsec - ts.tv_nsec; normalize_timespec(&ts); - pr_debug("timens: boottime %" PRId64 " %ld\n", (int64_t)ts.tv_sec, ts.tv_nsec); - if (dprintf(fd, "%d %" PRId64 " %ld\n", CLOCK_BOOTTIME, (int64_t)ts.tv_sec, ts.tv_nsec) < 0) { + pr_debug("timens: boottime %ld %ld\n", ts.tv_sec, ts.tv_nsec); + if (dprintf(fd, "%d %ld %ld\n", CLOCK_BOOTTIME, ts.tv_sec, ts.tv_nsec) < 0) { pr_perror("Unable to set a boottime clock offset"); goto err; } diff --git a/criu/timer.c b/criu/timer.c deleted file mode 100644 index 856501be6..000000000 --- a/criu/timer.c +++ /dev/null @@ -1,402 +0,0 @@ -#include "types.h" -#include "crtools.h" -#include "infect.h" -#include 
"protobuf.h" -#include "pstree.h" -#include "posix-timer.h" -#include "parasite.h" -#include "namespaces.h" -#include "rst-malloc.h" -#include "restorer.h" - -static inline int timeval_valid(struct timeval *tv) -{ - return (tv->tv_sec >= 0) && ((unsigned long)tv->tv_usec < USEC_PER_SEC); -} - -static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val) -{ - if (ie->isec == 0 && ie->iusec == 0 && ie->vsec == 0 && ie->vusec == 0) { - memzero_p(val); - return 0; - } - - val->it_interval.tv_sec = ie->isec; - val->it_interval.tv_usec = ie->iusec; - - if (!timeval_valid(&val->it_interval)) { - pr_err("Invalid timer interval\n"); - return -1; - } - - if (ie->vsec == 0 && ie->vusec == 0) { - /* - * Remaining time was too short. Set it to - * interval to make the timer armed and work. - */ - val->it_value.tv_sec = ie->isec; - val->it_value.tv_usec = ie->iusec; - } else { - val->it_value.tv_sec = ie->vsec; - val->it_value.tv_usec = ie->vusec; - } - - if (!timeval_valid(&val->it_value)) { - pr_err("Invalid timer value\n"); - return -1; - } - - pr_info("Restored %s timer to %" PRId64 ".%" PRId64 " -> %" PRId64 ".%" PRId64 "\n", n, - (int64_t)val->it_value.tv_sec, (int64_t)val->it_value.tv_usec, - (int64_t)val->it_interval.tv_sec, (int64_t)val->it_interval.tv_usec); - - return 0; -} - -/* - * Legacy itimers restore from CR_FD_ITIMERS - */ - -int prepare_itimers_from_fd(int pid, struct task_restore_args *args) -{ - int ret = -1; - struct cr_img *img; - ItimerEntry *ie; - - if (!deprecated_ok("Itimers")) - return -1; - - img = open_image(CR_FD_ITIMERS, O_RSTR, pid); - if (!img) - return -1; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("real", ie, &args->itimers[0]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - goto out; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("virt", ie, &args->itimers[1]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - 
goto out; - - ret = pb_read_one(img, &ie, PB_ITIMER); - if (ret < 0) - goto out; - ret = decode_itimer("prof", ie, &args->itimers[2]); - itimer_entry__free_unpacked(ie, NULL); - if (ret < 0) - goto out; -out: - close_image(img); - return ret; -} - -int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core) -{ - int ret = 0; - TaskTimersEntry *tte = core->tc->timers; - - if (!tte) - return prepare_itimers_from_fd(pid, args); - - ret |= decode_itimer("real", tte->real, &args->itimers[0]); - ret |= decode_itimer("virt", tte->virt, &args->itimers[1]); - ret |= decode_itimer("prof", tte->prof, &args->itimers[2]); - - return ret; -} - -static inline int timespec_valid(struct timespec *ts) -{ - return (ts->tv_sec >= 0) && ((unsigned long)ts->tv_nsec < NSEC_PER_SEC); -} - -static inline int decode_posix_timer(PosixTimerEntry *pte, struct restore_posix_timer *pt) -{ - pt->val.it_interval.tv_sec = pte->isec; - pt->val.it_interval.tv_nsec = pte->insec; - - if (!timespec_valid(&pt->val.it_interval)) { - pr_err("Invalid timer interval(posix)\n"); - return -1; - } - - if (pte->vsec == 0 && pte->vnsec == 0) { - /* - * Remaining time was too short. Set it to - * interval to make the timer armed and work. 
- */ - pt->val.it_value.tv_sec = pte->isec; - pt->val.it_value.tv_nsec = pte->insec; - } else { - pt->val.it_value.tv_sec = pte->vsec; - pt->val.it_value.tv_nsec = pte->vnsec; - } - - if (!timespec_valid(&pt->val.it_value)) { - pr_err("Invalid timer value(posix)\n"); - return -1; - } - - pt->spt.it_id = pte->it_id; - pt->spt.clock_id = pte->clock_id; - pt->spt.si_signo = pte->si_signo; - pt->spt.it_sigev_notify = pte->it_sigev_notify; - pt->spt.sival_ptr = decode_pointer(pte->sival_ptr); - pt->spt.notify_thread_id = pte->notify_thread_id; - pt->overrun = pte->overrun; - - return 0; -} - -static int cmp_posix_timer_proc_id(const void *p1, const void *p2) -{ - return ((struct restore_posix_timer *)p1)->spt.it_id - ((struct restore_posix_timer *)p2)->spt.it_id; -} - -static void sort_posix_timers(struct task_restore_args *ta) -{ - void *tmem; - - /* - * This is required for restorer's create_posix_timers(), - * it will probe them one-by-one for the desired ID, since - * kernel doesn't provide another API for timer creation - * with given ID. 
- */ - - if (ta->posix_timers_n > 0) { - tmem = rst_mem_remap_ptr((unsigned long)ta->posix_timers, RM_PRIVATE); - qsort(tmem, ta->posix_timers_n, sizeof(struct restore_posix_timer), cmp_posix_timer_proc_id); - } -} - -/* - * Legacy posix timers restoration from CR_FD_POSIX_TIMERS - */ - -int prepare_posix_timers_from_fd(int pid, struct task_restore_args *ta) -{ - struct cr_img *img; - int ret = -1; - struct restore_posix_timer *t; - - if (!deprecated_ok("Posix timers")) - return -1; - - img = open_image(CR_FD_POSIX_TIMERS, O_RSTR, pid); - if (!img) - return -1; - - ta->posix_timer_cr_ids = kdat.has_timer_cr_ids; - ta->posix_timers_n = 0; - while (1) { - PosixTimerEntry *pte; - - ret = pb_read_one_eof(img, &pte, PB_POSIX_TIMER); - if (ret <= 0) - break; - - t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); - if (!t) - break; - - ret = decode_posix_timer(pte, t); - if (ret < 0) - break; - - posix_timer_entry__free_unpacked(pte, NULL); - ta->posix_timers_n++; - } - - close_image(img); - if (!ret) - sort_posix_timers(ta); - - return ret; -} - -int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core) -{ - int i, ret = -1; - TaskTimersEntry *tte = core->tc->timers; - struct restore_posix_timer *t; - - ta->posix_timers = (struct restore_posix_timer *)rst_mem_align_cpos(RM_PRIVATE); - - if (!tte) - return prepare_posix_timers_from_fd(pid, ta); - - ta->posix_timers_n = tte->n_posix; - ta->posix_timer_cr_ids = kdat.has_timer_cr_ids; - for (i = 0; i < ta->posix_timers_n; i++) { - t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE); - if (!t) - goto out; - - if (decode_posix_timer(tte->posix[i], t)) - goto out; - } - - ret = 0; - sort_posix_timers(ta); -out: - return ret; -} - -static void encode_itimer(struct itimerval *v, ItimerEntry *ie) -{ - ie->isec = v->it_interval.tv_sec; - ie->iusec = v->it_interval.tv_usec; - ie->vsec = v->it_value.tv_sec; - ie->vusec = v->it_value.tv_usec; -} - -int 
parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item) -{ - CoreEntry *core = item->core[0]; - struct parasite_dump_itimers_args *args; - int ret; - - args = compel_parasite_args(ctl, struct parasite_dump_itimers_args); - - ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_ITIMERS, ctl); - if (ret < 0) - return ret; - - encode_itimer((&args->real), (core->tc->timers->real)); - encode_itimer((&args->virt), (core->tc->timers->virt)); - encode_itimer((&args->prof), (core->tc->timers->prof)); - - return 0; -} - -static int core_alloc_posix_timers(TaskTimersEntry *tte, int n, PosixTimerEntry **pte) -{ - int sz; - - /* - * Will be free()-ed in core_entry_free() - */ - - sz = n * (sizeof(PosixTimerEntry *) + sizeof(PosixTimerEntry)); - tte->posix = xmalloc(sz); - if (!tte->posix) - return -1; - - tte->n_posix = n; - *pte = (PosixTimerEntry *)(tte->posix + n); - return 0; -} - -static int encode_notify_thread_id(pid_t rtid, struct pstree_item *item, PosixTimerEntry *pte) -{ - pid_t vtid = 0; - int i; - - if (rtid == 0) - return 0; - - if (!(root_ns_mask & CLONE_NEWPID)) { - /* Non-pid-namespace case */ - pte->notify_thread_id = rtid; - pte->has_notify_thread_id = true; - return 0; - } - - /* Pid-namespace case */ - if (!kdat.has_nspid) { - pr_err("Have no NSpid support to dump notify thread id in pid namespace\n"); - return -1; - } - - for (i = 0; i < item->nr_threads; i++) { - if (item->threads[i].real != rtid) - continue; - - vtid = item->threads[i].ns[0].virt; - break; - } - - if (vtid == 0) { - pr_err("Unable to convert the notify thread id %d\n", rtid); - return -1; - } - - pte->notify_thread_id = vtid; - pte->has_notify_thread_id = true; - return 0; -} - -static int encode_posix_timer(struct pstree_item *item, struct posix_timer *v, struct proc_posix_timer *vp, - PosixTimerEntry *pte) -{ - pte->it_id = vp->spt.it_id; - pte->clock_id = vp->spt.clock_id; - pte->si_signo = vp->spt.si_signo; - pte->it_sigev_notify = vp->spt.it_sigev_notify; - 
pte->sival_ptr = encode_pointer(vp->spt.sival_ptr); - - pte->overrun = v->overrun; - - pte->isec = v->val.it_interval.tv_sec; - pte->insec = v->val.it_interval.tv_nsec; - pte->vsec = v->val.it_value.tv_sec; - pte->vnsec = v->val.it_value.tv_nsec; - - if (encode_notify_thread_id(vp->spt.notify_thread_id, item, pte)) - return -1; - - return 0; -} - -int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args, struct parasite_ctl *ctl, - struct pstree_item *item) -{ - CoreEntry *core = item->core[0]; - TaskTimersEntry *tte = core->tc->timers; - PosixTimerEntry *pte; - struct proc_posix_timer *temp; - struct parasite_dump_posix_timers_args *args; - int ret, exit_code = -1; - int args_size; - int i; - - if (core_alloc_posix_timers(tte, proc_args->timer_n, &pte)) - return -1; - - args_size = posix_timers_dump_size(proc_args->timer_n); - args = compel_parasite_args_s(ctl, args_size); - args->timer_n = proc_args->timer_n; - - i = 0; - list_for_each_entry(temp, &proc_args->timers, list) { - args->timer[i].it_id = temp->spt.it_id; - i++; - } - - ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_POSIX_TIMERS, ctl); - if (ret < 0) - goto end_posix; - - i = 0; - list_for_each_entry(temp, &proc_args->timers, list) { - posix_timer_entry__init(&pte[i]); - if (encode_posix_timer(item, &args->timer[i], temp, &pte[i])) - goto end_posix; - tte->posix[i] = &pte[i]; - i++; - } - - exit_code = 0; -end_posix: - free_posix_timers(proc_args); - return exit_code; -} diff --git a/criu/tls.c b/criu/tls.c index 3d365e21d..60bd105bc 100644 --- a/criu/tls.c +++ b/criu/tls.c @@ -8,7 +8,7 @@ #include "cr_options.h" #include "xmalloc.h" -/* Compatibility with GnuTLS version < 3.5 */ +/* Compatability with GnuTLS verson <3.5 */ #ifndef GNUTLS_E_CERTIFICATE_VERIFICATION_ERROR #define GNUTLS_E_CERTIFICATE_VERIFICATION_ERROR GNUTLS_E_CERTIFICATE_ERROR #endif @@ -31,7 +31,7 @@ static gnutls_certificate_credentials_t x509_cred; static int tls_sk = -1; static int tls_sk_flags = 0; -void 
tls_terminate_session(bool async) +void tls_terminate_session(void) { int ret; @@ -40,26 +40,20 @@ void tls_terminate_session(bool async) if (session) { do { - /* - * Initiate a connection shutdown but don't - * wait for peer to close connection. - */ - ret = gnutls_bye(session, async ? GNUTLS_SHUT_WR : GNUTLS_SHUT_RDWR); + /* don't wait for peer to close connection */ + ret = gnutls_bye(session, GNUTLS_SHUT_WR); } while (ret == GNUTLS_E_AGAIN || ret == GNUTLS_E_INTERRUPTED); - /* Free the session object */ gnutls_deinit(session); } tls_sk = -1; - - /* Free the credentials object */ if (x509_cred) gnutls_certificate_free_credentials(x509_cred); } ssize_t tls_send(const void *buf, size_t len, int flags) { - ssize_t ret; + int ret; tls_sk_flags = flags; ret = gnutls_record_send(session, buf, len); @@ -101,7 +95,7 @@ int tls_send_data_from_fd(int fd, unsigned long len) return -1; while (len > 0) { - ssize_t ret, sent; + int ret, sent; copied = read(fd, buf, min(len, buf_size)); if (copied <= 0) { @@ -125,7 +119,7 @@ err: ssize_t tls_recv(void *buf, size_t len, int flags) { - ssize_t ret; + int ret; tls_sk_flags = flags; ret = gnutls_record_recv(session, buf, len); @@ -169,7 +163,7 @@ int tls_recv_data_to_fd(int fd, unsigned long len) gnutls_packet_t packet; while (len > 0) { - ssize_t ret, w; + int ret, w; gnutls_datum_t pdata; ret = gnutls_record_recv_packet(session, &packet); @@ -235,7 +229,6 @@ static int tls_handshake(void) { int ret = -1; while (ret != GNUTLS_E_SUCCESS) { - /* Establish TLS session */ ret = gnutls_handshake(session); if (gnutls_error_is_fatal(ret)) { tls_perror("TLS handshake failed", ret); @@ -264,7 +257,6 @@ static int tls_x509_setup_creds(void) if (opts.tls_key) key = opts.tls_key; - /* Load the trusted CA certificates */ ret = gnutls_certificate_allocate_credentials(&x509_cred); if (ret != GNUTLS_E_SUCCESS) { tls_perror("Failed to allocate x509 credentials", ret); @@ -306,14 +298,10 @@ static int tls_x509_setup_creds(void) return 0; } -/** - 
* A function used by gnutls to send data. It returns a positive - * number indicating the bytes sent, and -1 on error. - */ static ssize_t _tls_push_cb(void *p, const void *data, size_t sz) { int fd = *(int *)(p); - ssize_t ret = send(fd, data, sz, tls_sk_flags); + int ret = send(fd, data, sz, tls_sk_flags); if (ret < 0 && errno != EAGAIN) { int _errno = errno; pr_perror("Push callback send failed"); @@ -322,15 +310,10 @@ static ssize_t _tls_push_cb(void *p, const void *data, size_t sz) return ret; } -/** - * A callback function used by gnutls to receive data. - * It returns 0 on connection termination, a positive number - * indicating the number of bytes received, and -1 on error. - */ static ssize_t _tls_pull_cb(void *p, void *data, size_t sz) { int fd = *(int *)(p); - ssize_t ret = recv(fd, data, sz, tls_sk_flags); + int ret = recv(fd, data, sz, tls_sk_flags); if (ret < 0 && errno != EAGAIN) { int _errno = errno; pr_perror("Pull callback recv failed"); @@ -343,33 +326,26 @@ static int tls_x509_setup_session(unsigned int flags) { int ret; - /* Create the session object */ ret = gnutls_init(&session, flags); if (ret != GNUTLS_E_SUCCESS) { tls_perror("Failed to initialize session", ret); return -1; } - /* Install the trusted certificates */ ret = gnutls_credentials_set(session, GNUTLS_CRD_CERTIFICATE, x509_cred); if (ret != GNUTLS_E_SUCCESS) { tls_perror("Failed to set session credentials", ret); return -1; } - /* Configure the cipher preferences */ ret = gnutls_set_default_priority(session); if (ret != GNUTLS_E_SUCCESS) { tls_perror("Failed to set priority", ret); return -1; } - /* Associate the socket with the session object */ gnutls_transport_set_ptr(session, &tls_sk); - - /* Set a push function for gnutls to use to send data */ gnutls_transport_set_push_function(session, _tls_push_cb); - /* set a pull function for gnutls to use to receive data */ gnutls_transport_set_pull_function(session, _tls_pull_cb); if (flags == GNUTLS_SERVER) { @@ -399,6 +375,6 @@ int 
tls_x509_init(int sockfd, bool is_server) return 0; err: - tls_terminate_session(true); + tls_terminate_session(); return -1; } diff --git a/criu/tty.c b/criu/tty.c index 9a4520d53..1598ad956 100644 --- a/criu/tty.c +++ b/criu/tty.c @@ -22,7 +22,6 @@ #include "rst-malloc.h" #include "log.h" #include "common/list.h" -#include "util.h" #include "util-pie.h" #include "proc_parse.h" #include "file-ids.h" @@ -259,7 +258,7 @@ static int pts_fd_get_index(int fd, const struct fd_parms *p) { int index; const struct fd_link *link = p->link; - const char *pos = strrchr(link->name, '/'); + char *pos = strrchr(link->name, '/'); if (!pos || pos == (link->name + link->len - 1)) { pr_err("Unexpected format on path %s\n", link->name + 1); @@ -399,7 +398,8 @@ static int tty_verify_active_pairs(void) { unsigned long i, unpaired_slaves = 0; - for_each_bit(i, tty_active_pairs) { + for_each_bit(i, tty_active_pairs) + { if ((i % 2) == 0) { if (test_bit(i + 1, tty_active_pairs)) { i++; @@ -560,7 +560,7 @@ static int do_open_tty_reg(int ns_root_fd, struct reg_file_info *rfi, void *arg) * them. So simply setup mode from image * the regular file engine will check * for this, so if we fail here it - * gonna be caught anyway. + * gonna be catched anyway. */ if (rfi->rfe->has_mode) fchmod(fd, rfi->rfe->mode); @@ -817,26 +817,8 @@ static int do_restore_tty_parms(void *arg, int fd, pid_t pid) * on termios too. Just to be on the safe side. */ - if ((p->has & HAS_TERMIOS_L) && ioctl(fd, TIOCSLCKTRMIOS, &p->tl) < 0) { - struct termios t; - - if (errno != EPERM) - goto err; - - memzero(&t, sizeof(t)); - if (ioctl(fd, TIOCGLCKTRMIOS, &t) < 0) { - pr_perror("Can't get tty locked params on %#x", p->tty_id); - goto err; - } - - /* - * The ioctl(TIOCSLCKTRMIOS) requires a CRIU process to be privileged - * in the init_user_ns, but if the current "termios_locked" value equal - * to the "termios_locked" value from the image, we can safely skip setting it. 
- */ - if (memcmp(&t, &p->tl, sizeof(struct termios)) != 0) - goto err; - } + if ((p->has & HAS_TERMIOS_L) && ioctl(fd, TIOCSLCKTRMIOS, &p->tl) < 0) + goto err; if ((p->has & HAS_TERMIOS) && ioctl(fd, TCSETS, &p->t) < 0) goto err; @@ -886,7 +868,7 @@ static int restore_tty_params(int fd, struct tty_info *info) } if (info->tie->has_uid && info->tie->has_gid) { - if (cr_fchown(fd, info->tie->uid, info->tie->gid)) { + if (fchown(fd, info->tie->uid, info->tie->gid)) { pr_perror("Can't setup uid %d gid %d on %#x", (int)info->tie->uid, (int)info->tie->gid, info->tfe->id); return -1; @@ -1995,12 +1977,6 @@ static int dump_one_tty(int lfd, u32 id, const struct fd_parms *p) pr_info("Dumping tty %d with id %#x\n", lfd, id); driver = get_tty_driver(p->stat.st_rdev, p->stat.st_dev); - if (driver == NULL) { - pr_err("Unable to find a tty driver (rdev %#" PRIx64 " dev %#" PRIx64 ")\n", p->stat.st_rdev, - p->stat.st_dev); - return -1; - } - if (driver->fd_get_index) index = driver->fd_get_index(lfd, p); else @@ -2424,9 +2400,9 @@ int devpts_restore(struct mount_info *pm) struct mount_info *bm; int dfd, exit_code = -1; - dfd = open(service_mountpoint(pm), O_RDONLY); + dfd = open(pm->mountpoint, O_RDONLY); if (dfd < 0) { - pr_perror("Unable to open %s", service_mountpoint(pm)); + pr_perror("Unable to open %s", pm->mountpoint); return -1; } diff --git a/criu/tun.c b/criu/tun.c index 9d66f9929..2a2f950da 100644 --- a/criu/tun.c +++ b/criu/tun.c @@ -121,7 +121,7 @@ static int list_tun_link(NetDeviceEntry *nde, unsigned ns_id) if (!tl) return -1; - __strlcpy(tl->name, nde->name, sizeof(tl->name)); + strlcpy(tl->name, nde->name, sizeof(tl->name)); /* * Keep tun-flags not only for persistency fixup (see * comment below), but also for TUNSETIFF -- we must @@ -153,9 +153,8 @@ static struct tun_link *__dump_tun_link_fd(int fd, char *name, unsigned ns_id, u tl = xmalloc(sizeof(*tl)); if (!tl) goto err; - __strlcpy(tl->name, name, sizeof(tl->name)); + strlcpy(tl->name, name, 
sizeof(tl->name)); tl->ns_id = ns_id; - INIT_LIST_HEAD(&tl->l); if (ioctl(fd, TUNGETVNETHDRSZ, &tl->dmp.vnethdr) < 0) { pr_perror("Can't dump vnethdr size for %s", name); @@ -241,7 +240,7 @@ static int open_tun_dev(char *name, unsigned int idx, unsigned flags) } memset(&ifr, 0, sizeof(ifr)); - __strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); + strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); ifr.ifr_flags = flags; if (ioctl(fd, TUNSETIFF, &ifr)) { @@ -272,7 +271,7 @@ static struct tun_link *get_tun_link_fd(char *name, unsigned ns_id, unsigned fla */ if (!(flags & IFF_PERSIST)) { - pr_err("No fd info for non persistent tun device %s\n", name); + pr_err("No fd infor for non persistent tun device %s\n", name); return NULL; } @@ -393,7 +392,7 @@ static int tunfile_open(struct file_desc *d, int *new_fd) } memset(&ifr, 0, sizeof(ifr)); - __strlcpy(ifr.ifr_name, tl->name, sizeof(ifr.ifr_name)); + strlcpy(ifr.ifr_name, tl->name, sizeof(ifr.ifr_name)); ifr.ifr_flags = tl->rst.flags; if (ioctl(fd, TUNSETIFF, &ifr) < 0) { @@ -455,38 +454,31 @@ int dump_tun_link(NetDeviceEntry *nde, struct cr_imgset *fds, struct nlattr **in TunLinkEntry tle = TUN_LINK_ENTRY__INIT; char spath[64]; char buf[64]; + int ret = 0; struct tun_link *tl; sprintf(spath, "class/net/%s/tun_flags", nde->name); - if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) - return -1; + ret |= read_ns_sys_file(spath, buf, sizeof(buf)); tle.flags = strtol(buf, NULL, 0); sprintf(spath, "class/net/%s/owner", nde->name); - if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) - return -1; + ret |= read_ns_sys_file(spath, buf, sizeof(buf)); tle.owner = strtol(buf, NULL, 10); sprintf(spath, "class/net/%s/group", nde->name); - if (read_ns_sys_file(spath, buf, sizeof(buf)) < 0) - return -1; + ret |= read_ns_sys_file(spath, buf, sizeof(buf)); tle.group = strtol(buf, NULL, 10); + if (ret < 0) + return ret; + tl = get_tun_link_fd(nde->name, nde->peer_nsid, tle.flags); if (!tl) - return -1; + return ret; tle.vnethdr = 
tl->dmp.vnethdr; tle.sndbuf = tl->dmp.sndbuf; - /* - * Function get_tun_link_fd() can return either entry - * from tun_links list or a newly allocated one, need to - * free it only if not in list. - */ - if (list_empty(&tl->l)) - xfree(tl); - nde->tun = &tle; return write_netdev_img(nde, fds, info); } diff --git a/criu/uffd.c b/criu/uffd.c index 8e12dcd63..18bdc040f 100644 --- a/criu/uffd.c +++ b/criu/uffd.c @@ -59,7 +59,7 @@ #define LAZY_PAGES_RESTORE_FINISHED 0x52535446 /* ReSTore Finished */ /* - * Background transfer parameters. + * Backround transfer parameters. * The default xfer length is arbitrary set to 64Kbytes * The limit of 4Mbytes matches the maximal chunk size we can have in * a pipe in the page-server @@ -71,8 +71,8 @@ static mutex_t *lazy_sock_mutex; struct lazy_iov { struct list_head l; - unsigned long start; /* run-time start address, tracks remaps */ - unsigned long end; /* run-time end address, tracks remaps */ + unsigned long start; /* run-time start address, tracks remaps */ + unsigned long end; /* run-time end address, tracks remaps */ unsigned long img_start; /* start address at the dump time */ }; @@ -281,7 +281,7 @@ int uffd_open(int flags, unsigned long *features, int *err) } if (uffdio_api.api != UFFD_API) { - pr_err("Incompatible uffd API: expected %llu, got %llu\n", UFFD_API, uffdio_api.api); + pr_err("Incompatible uffd API: expected %Lu, got %Lu\n", UFFD_API, uffdio_api.api); goto close; } @@ -668,11 +668,12 @@ static int remap_iovs(struct lazy_pages_info *lpi, unsigned long from, unsigned */ static int collect_iovs(struct lazy_pages_info *lpi) { - unsigned long start, end, len, nr_pages = 0; - int n_vma = 0, max_iov_len = 0, ret = -1; struct page_read *pr = &lpi->pr; struct lazy_iov *iov; MmEntry *mm; + int nr_pages = 0, n_vma = 0, max_iov_len = 0; + int ret = -1; + unsigned long start, end, len; mm = init_mm_entry(lpi); if (!mm) @@ -727,7 +728,7 @@ free_mm: return ret; } -static int uffd_io_complete(struct page_read *pr, unsigned 
long vaddr, unsigned long nr); +static int uffd_io_complete(struct page_read *pr, unsigned long vaddr, int nr); static int ud_open(int client, struct lazy_pages_info **_lpi) { @@ -821,7 +822,7 @@ static bool uffd_recoverable_error(int mcopy_rc) return false; } -static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, unsigned long *nr_pages, long mcopy_rc) +static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, int *nr_pages, long mcopy_rc) { if (errno == ENOSPC || errno == ESRCH) { handle_exit(lpi); @@ -843,7 +844,7 @@ static int uffd_check_op_error(struct lazy_pages_info *lpi, const char *op, unsi return 0; } -static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, unsigned long *nr_pages) +static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, int *nr_pages) { struct uffdio_copy uffdio_copy; unsigned long len = *nr_pages * page_size(); @@ -864,12 +865,12 @@ static int uffd_copy(struct lazy_pages_info *lpi, __u64 address, unsigned long * return 0; } -static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, unsigned long nr) +static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, int nr) { struct lazy_pages_info *lpi; - unsigned long addr = 0, req_pages; + unsigned long addr = 0; + int req_pages, ret; struct lazy_iov *req; - int ret; lpi = container_of(pr, struct lazy_pages_info, pr); @@ -919,7 +920,7 @@ static int uffd_io_complete(struct page_read *pr, unsigned long img_addr, unsign return drop_iovs(lpi, addr, nr * PAGE_SIZE); } -static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, unsigned long nr_pages) +static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, int nr_pages) { struct uffdio_zeropage uffdio_zeropage; unsigned long len = page_size() * nr_pages; @@ -945,7 +946,7 @@ static int uffd_zero(struct lazy_pages_info *lpi, __u64 address, unsigned long n * Returns 0 for zero pages, 1 for "real" pages and negative value on * error */ -static int 
uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, unsigned long nr) +static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, int nr) { int ret; @@ -960,7 +961,7 @@ static int uffd_seek_pages(struct lazy_pages_info *lpi, __u64 address, unsigned return 0; } -static int uffd_handle_pages(struct lazy_pages_info *lpi, __u64 address, unsigned long nr, unsigned flags) +static int uffd_handle_pages(struct lazy_pages_info *lpi, __u64 address, int nr, unsigned flags) { int ret; @@ -1002,7 +1003,7 @@ static void update_xfer_len(struct lazy_pages_info *lpi, bool pf) static int xfer_pages(struct lazy_pages_info *lpi) { struct lazy_iov *iov; - unsigned long nr_pages; + unsigned int nr_pages; unsigned long len; int err; @@ -1097,8 +1098,6 @@ static int handle_fork(struct lazy_pages_info *parent_lpi, struct uffd_msg *msg) lpi_get(lpi->parent); - page_read_disable_dedup(&parent_lpi->pr); - page_read_disable_dedup(&lpi->pr); return 1; out: @@ -1469,7 +1468,7 @@ int cr_lazy_pages(bool daemon) ret = handle_requests(epollfd, &events, nr_fds); - disconnect_from_page_server(); + tls_terminate_session(); xfree(events); return ret; diff --git a/criu/unittest/mock.c b/criu/unittest/mock.c index b2d507278..fb9d38494 100644 --- a/criu/unittest/mock.c +++ b/criu/unittest/mock.c @@ -2,11 +2,8 @@ #include #include -#include #include -#include "compel/infect-util.h" - int add_external(char *key) { return 0; @@ -100,47 +97,3 @@ int close_service_fd(int type) void compel_log_init(int log_fn, unsigned int level) { } - -void set_cr_errno(int new_err) -{ -} - -struct ns_desc {}; -struct ns_desc user_ns_desc; -int switch_ns(int pid, struct ns_desc *nd, int *rst) -{ - return -1; -} - -enum script_actions { ACT_FAKE }; -int run_scripts(enum script_actions act) -{ - return -1; -} - -typedef struct VmaEntry VmaEntry; -struct VmaEntry {}; -void vma_entry__init(VmaEntry *message) -{ -} - -int clone_noasan(int (*fn)(void *), int flags, void *arg) -{ - return -1; -} - -struct kerndat_s 
{ - unsigned int sysctl_nr_open; -}; -struct kerndat_s kdat = {}; - -int service_fd_rlim_cur; - -unsigned __page_size; - -int check_mount_v2(void) -{ - return 0; -} - -char compel_run_id[RUN_ID_HASH_LENGTH]; diff --git a/criu/unittest/unit.c b/criu/unittest/unit.c index 54769e6f2..3ee141e25 100644 --- a/criu/unittest/unit.c +++ b/criu/unittest/unit.c @@ -3,7 +3,6 @@ #include #include "log.h" -#include "util.h" #include "criu-log.h" int parse_statement(int i, char *line, char **configuration); @@ -101,48 +100,6 @@ int main(int argc, char *argv[], char *envp[]) i = parse_statement(0, "a b c d e f g h i\n", configuration); assert(i == -1); - /* get_relative_path */ - /* different kinds of representation of "/" */ - assert(!strcmp(get_relative_path("/", "/"), "")); - assert(!strcmp(get_relative_path("/", ""), "")); - assert(!strcmp(get_relative_path("", "/"), "")); - assert(!strcmp(get_relative_path(".", "/"), "")); - assert(!strcmp(get_relative_path("/", "."), "")); - assert(!strcmp(get_relative_path("/", "./"), "")); - assert(!strcmp(get_relative_path("./", "/"), "")); - assert(!strcmp(get_relative_path("/.", "./"), "")); - assert(!strcmp(get_relative_path("./", "/."), "")); - assert(!strcmp(get_relative_path(".//////.", ""), "")); - assert(!strcmp(get_relative_path("/./", ""), "")); - - /* all relative paths given are assumed relative to "/" */ - assert(!strcmp(get_relative_path("/a/b/c", "a/b/c"), "")); - - /* multiple slashes are ignored, only directory names matter */ - assert(!strcmp(get_relative_path("///alfa///beta///gamma///", "//alfa//beta//gamma//"), "")); - - /* returned path is always relative */ - assert(!strcmp(get_relative_path("/a/b/c", "/"), "a/b/c")); - assert(!strcmp(get_relative_path("/a/b/c", "/a/b"), "c")); - - /* single dots supported */ - assert(!strcmp(get_relative_path("./a/b", "a/"), "b")); - - /* double dots are partially supported */ - assert(!strcmp(get_relative_path("a/../b", "a"), "../b")); - assert(!strcmp(get_relative_path("a/../b", 
"a/.."), "b")); - assert(!get_relative_path("a/../b/c", "b")); - - /* if second path is not subpath - NULL returned */ - assert(!get_relative_path("/a/b/c", "/a/b/d")); - assert(!get_relative_path("/a/b", "/a/b/c")); - assert(!get_relative_path("/a/b/c/d", "b/c/d")); - - assert(!strcmp(get_relative_path("./a////.///./b//././c", "///./a/b"), "c")); - - /* leaves punctuation in returned string as is */ - assert(!strcmp(get_relative_path("./a////.///./b//././c", "a"), "b//././c")); - pr_msg("OK\n"); return 0; } diff --git a/criu/util.c b/criu/util.c index 2eaad35bb..06124c220 100644 --- a/criu/util.c +++ b/criu/util.c @@ -1,4 +1,4 @@ -#define _XOPEN_SOURCE 500 +#define _XOPEN_SOURCE #include #include @@ -24,11 +24,8 @@ #include #include #include +#include #include -#include -#include -#include -#include #include "linux/mount.h" @@ -40,7 +37,6 @@ #include "mem.h" #include "namespaces.h" #include "criu-log.h" -#include "util-caps.h" #include "clone-noasan.h" #include "cr_options.h" @@ -48,14 +44,10 @@ #include "files.h" #include "pstree.h" #include "sched.h" -#include "mount-v2.h" #include "cr-errno.h" #include "action-scripts.h" -#include "compel/infect-util.h" -#include - #define VMA_OPT_LEN 128 static int xatol_base(const char *string, long *number, int base) @@ -195,7 +187,6 @@ static void vma_opt_str(const struct vma_area *v, char *opt) opt2s(VMA_ANON_PRIVATE, "ap"); opt2s(VMA_AREA_SYSVIPC, "sysv"); opt2s(VMA_AREA_SOCKET, "sk"); - opt2s(VMA_AREA_UPROBES, "uprobes"); #undef opt2s } @@ -222,9 +213,10 @@ int close_safe(int *fd) if (*fd > -1) { ret = close(*fd); - if (ret) - pr_perror("Failed closing fd %d", *fd); - *fd = -1; + if (!ret) + *fd = -1; + else + pr_perror("Unable to close fd %d", *fd); } return ret; @@ -519,25 +511,12 @@ int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned return cr_system_userns(in, out, err, cmd, argv, flags, -1); } -int cr_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags) -{ - return 
syscall(__NR_close_range, fd, max_fd, flags); -} - -int close_fds(int minfd) +static int close_fds(int minfd) { DIR *dir; struct dirent *de; int fd, ret, dfd; - if (kdat.has_close_range) { - if (cr_close_range(minfd, ~0, 0)) { - pr_perror("close_range failed"); - return -1; - } - return 0; - } - dir = opendir("/proc/self/fd"); if (dir == NULL) { pr_perror("Can't open /proc/self/fd"); @@ -675,79 +654,6 @@ out: return ret; } -struct child_args { - int *sk_pair; - int (*child_setup)(void); -}; - -static int child_func(void *_args) -{ - struct child_args *args = _args; - int sk, *sk_pair = args->sk_pair; - char c = 0; - - sk = sk_pair[1]; - close(sk_pair[0]); - - if (args->child_setup && args->child_setup() != 0) - exit(1); - - if (write(sk, &c, 1) != 1) { - pr_perror("write"); - exit(1); - } - - while (1) - sleep(1000); - exit(1); -} - -pid_t fork_and_ptrace_attach(int (*child_setup)(void)) -{ - pid_t pid; - int sk_pair[2], sk; - char c = 0; - struct child_args cargs = { - .sk_pair = sk_pair, - .child_setup = child_setup, - }; - - if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) { - pr_perror("socketpair"); - return -1; - } - - pid = clone_noasan(child_func, CLONE_UNTRACED | SIGCHLD, &cargs); - if (pid < 0) { - pr_perror("fork"); - return -1; - } - - sk = sk_pair[0]; - close(sk_pair[1]); - - if (read(sk, &c, 1) != 1) { - close(sk); - kill(pid, SIGKILL); - waitpid(pid, NULL, 0); - pr_perror("read"); - return -1; - } - - close(sk); - - if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) { - pr_perror("Unable to ptrace the child"); - kill(pid, SIGKILL); - waitpid(pid, NULL, 0); - return -1; - } - - waitpid(pid, NULL, 0); - - return pid; -} - int status_ready(void) { char c = 0; @@ -979,89 +885,6 @@ FILE *fopenat(int dirfd, char *path, char *cflags) return fdopen(tmp, cflags); } -int cr_fchown(int fd, uid_t new_uid, gid_t new_gid) -{ - struct stat st; - - if (!fchown(fd, new_uid, new_gid)) - return 0; - if (errno != EPERM) - return -1; - - if (fstat(fd, &st) < 0) { - 
pr_perror("fstat() after fchown() for fd %d", fd); - goto out_eperm; - } - pr_debug("fstat(%d): uid %u gid %u\n", fd, st.st_uid, st.st_gid); - - if (new_uid != st.st_uid || new_gid != st.st_gid) - goto out_eperm; - - return 0; -out_eperm: - errno = EPERM; - return -1; -} - -int cr_fchpermat(int dirfd, const char *path, uid_t new_uid, gid_t new_gid, mode_t new_mode, int flags) -{ - struct stat st; - int ret; - - if (fchownat(dirfd, path, new_uid, new_gid, flags) < 0 && errno != EPERM) { - int errno_cpy = errno; - pr_perror("Unable to change [%d]/%s ownership to (%d, %d)", - dirfd, path, new_uid, new_gid); - errno = errno_cpy; - return -1; - } - - if (fstatat(dirfd, path, &st, flags) < 0) { - int errno_cpy = errno; - pr_perror("Unable to stat [%d]/%s", dirfd, path); - errno = errno_cpy; - return -1; - } - - if (new_uid != st.st_uid || new_gid != st.st_gid) { - errno = EPERM; - pr_perror("Unable to change [%d]/%s ownership (%d, %d) to (%d, %d)", - dirfd, path, st.st_uid, st.st_gid, new_uid, new_gid); - errno = EPERM; - return -1; - } - - if (new_mode == st.st_mode) - return 0; - - if (S_ISLNK(st.st_mode)) { - /* - * We have no lchmod() function, and fchmod() will fail on - * O_PATH | O_NOFOLLOW fd. Yes, we have fchmodat() - * function and flag AT_SYMLINK_NOFOLLOW described in - * man 2 fchmodat, but it is not currently implemented. 
%) - */ - return 0; - } - - if (!*path && flags & AT_EMPTY_PATH) - ret = fchmod(dirfd, new_mode); - else - ret = fchmodat(dirfd, path, new_mode, flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)); - if (ret < 0) { - int errno_cpy = errno; - pr_perror("Unable to set perms %o on [%d]/%s", new_mode, dirfd, path); - errno = errno_cpy; - } - - return ret; -} - -int cr_fchperm(int fd, uid_t new_uid, gid_t new_gid, mode_t new_mode) -{ - return cr_fchpermat(fd, "", new_uid, new_gid, new_mode, AT_EMPTY_PATH); -} - void split(char *str, char token, char ***out, int *n) { int i; @@ -1182,6 +1005,20 @@ const char *ns_to_string(unsigned int ns) } } +void tcp_cork(int sk, bool on) +{ + int val = on ? 1 : 0; + if (setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val))) + pr_perror("Unable to restore TCP_CORK (%d)", val); +} + +void tcp_nodelay(int sk, bool on) +{ + int val = on ? 1 : 0; + if (setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val))) + pr_perror("Unable to restore TCP_NODELAY (%d)", val); +} + static int get_sockaddr_in(struct sockaddr_storage *addr, char *host, unsigned short port) { memset(addr, 0, sizeof(*addr)); @@ -1269,7 +1106,7 @@ out: int run_tcp_server(bool daemon_mode, int *ask, int cfd, int sk) { int ret; - struct sockaddr_storage caddr; + struct sockaddr_in caddr; socklen_t clen = sizeof(caddr); if (daemon_mode) { @@ -1297,20 +1134,13 @@ int run_tcp_server(bool daemon_mode, int *ask, int cfd, int sk) return -1; if (sk >= 0) { - char port[6]; - char address[INET6_ADDRSTRLEN]; *ask = accept(sk, (struct sockaddr *)&caddr, &clen); if (*ask < 0) { pr_perror("Can't accept connection to server"); goto err; - } - ret = getnameinfo((struct sockaddr *)&caddr, clen, address, sizeof(address), port, sizeof(port), - NI_NUMERICHOST | NI_NUMERICSERV); - if (ret) { - pr_err("Failed converting address: %s\n", gai_strerror(ret)); - goto err; - } - pr_info("Accepted connection from %s:%s\n", address, port); + } else + pr_info("Accepted connection from %s:%u\n", 
inet_ntoa(caddr.sin_addr), + (int)ntohs(caddr.sin_port)); close(sk); } @@ -1523,9 +1353,6 @@ void rlimit_unlimit_nofile(void) { struct rlimit new; - if (opts.unprivileged && !has_cap_sys_resource(opts.cap_eff)) - return; - new.rlim_cur = kdat.sysctl_nr_open; new.rlim_max = kdat.sysctl_nr_open; @@ -1556,78 +1383,23 @@ void print_stack_trace(pid_t pid) } #endif -int cr_fsopen(const char *fsname, unsigned int flags) -{ - return syscall(__NR_fsopen, fsname, flags); -} - -int cr_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) -{ - int ret = syscall(__NR_fsconfig, fd, cmd, key, value, aux); - if (ret) - fsfd_dump_messages(fd); - return ret; -} - -int cr_fsmount(int fd, unsigned int flags, unsigned int attr_flags) -{ - int ret = syscall(__NR_fsmount, fd, flags, attr_flags); - if (ret) - fsfd_dump_messages(fd); - return ret; -} - -void fsfd_dump_messages(int fd) -{ - char buf[4096]; - int err, n; - - err = errno; - - for (;;) { - n = read(fd, buf, sizeof(buf) - 1); - if (n < 0) { - if (errno != ENODATA) - pr_perror("Unable to read from fs descriptor"); - break; - } - buf[n] = 0; - - switch (buf[0]) { - case 'w': - pr_warn("%s\n", buf); - break; - case 'i': - pr_info("%s\n", buf); - break; - case 'e': - /* fallthrough */ - default: - pr_err("%s\n", buf); - break; - } - } - - errno = err; -} - int mount_detached_fs(const char *fsname) { int fsfd, fd; - fsfd = cr_fsopen(fsname, 0); + fsfd = sys_fsopen(fsname, 0); if (fsfd < 0) { pr_perror("Unable to open the %s file system", fsname); return -1; } - if (cr_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { + if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { pr_perror("Unable to create the %s file system", fsname); close(fsfd); return -1; } - fd = cr_fsmount(fsfd, 0, 0); + fd = sys_fsmount(fsfd, 0, 0); if (fd < 0) pr_perror("Unable to mount the %s file system", fsname); close(fsfd); @@ -1717,7 +1489,7 @@ static int is_iptables_nft(char *bin) goto err; } - ret = 
cr_system(-1, pfd[1], -1, cmd[0], cmd, CRS_CAN_FAIL); + ret = cr_system(-1, pfd[1], -1, cmd[0], cmd, 0); if (ret) { pr_err("%s -V failed\n", cmd[0]); goto err; @@ -1745,46 +1517,44 @@ err: return ret; } -char *get_legacy_iptables_bin(bool ipv6, bool restore) +char *get_legacy_iptables_bin(bool ipv6) { - static char iptables_bin[2][2][32]; + static char iptables_bin[2][32]; /* 0 - means we don't know yet, * -1 - not present, * 1 - present. */ - static int iptables_present[2][2] = { { 0, 0 }, { 0, 0 } }; - char bins[2][2][2][32] = { { { "iptables-save", "iptables-legacy-save" }, - { "iptables-restore", "iptables-legacy-restore" } }, - { { "ip6tables-save", "ip6tables-legacy-save" }, - { "ip6tables-restore", "ip6tables-legacy-restore" } } }; + static int iptables_present[2] = { 0, 0 }; + char bins[2][2][32] = { { "iptables-save", "iptables-legacy-save" }, + { "ip6tables-save", "ip6tables-legacy-save" } }; int ret; - if (iptables_present[ipv6][restore] == -1) + if (iptables_present[ipv6] == -1) return NULL; - if (iptables_present[ipv6][restore] == 1) - return iptables_bin[ipv6][restore]; + if (iptables_present[ipv6] == 1) + return iptables_bin[ipv6]; - memcpy(iptables_bin[ipv6][restore], bins[ipv6][restore][0], strlen(bins[ipv6][restore][0]) + 1); - ret = is_iptables_nft(iptables_bin[ipv6][restore]); + memcpy(iptables_bin[ipv6], bins[ipv6][0], strlen(bins[ipv6][0]) + 1); + ret = is_iptables_nft(iptables_bin[ipv6]); /* * iptables on host uses nft backend (or not installed), * let's try iptables-legacy */ if (ret < 0 || ret == 1) { - memcpy(iptables_bin[ipv6][restore], bins[ipv6][restore][1], strlen(bins[ipv6][restore][1]) + 1); - ret = is_iptables_nft(iptables_bin[ipv6][restore]); + memcpy(iptables_bin[ipv6], bins[ipv6][1], strlen(bins[ipv6][1]) + 1); + ret = is_iptables_nft(iptables_bin[ipv6]); if (ret < 0 || ret == 1) { - iptables_present[ipv6][restore] = -1; + iptables_present[ipv6] = -1; return NULL; } } /* we can come here with iptables-save or iptables-legacy-save 
*/ - iptables_present[ipv6][restore] = 1; + iptables_present[ipv6] = 1; - return iptables_bin[ipv6][restore]; + return iptables_bin[ipv6]; } /* @@ -1843,25 +1613,44 @@ ssize_t write_all(int fd, const void *buf, size_t size) return n; } -static int remove_one(const char *fpath, const struct stat *sb, int tflag, struct FTW *ftwbuf) +int rm_rf(char *target) { - int ret; + int offset = strlen(target); + DIR *dir = NULL; + struct dirent *de; + int ret = -1; - ret = remove(fpath); - if (ret) { - pr_perror("rmrf: unable to remove %s", fpath); + dir = opendir(target); + if (!dir) { + pr_perror("unable to open %s", target); return -1; } - return 0; -} + while ((de = readdir(dir))) { + int n; -#define NFTW_FD_MAX 64 + if (dir_dots(de)) + continue; -int rmrf(char *path) -{ - pr_debug("rmrf: removing %s\n", path); - return nftw(path, remove_one, NFTW_FD_MAX, FTW_DEPTH | FTW_PHYS); + n = snprintf(target + offset, PATH_MAX - offset, "/%s", de->d_name); + if (n < 0 || n >= PATH_MAX) { + pr_err("snprintf failed\n"); + goto out; + } + + if (de->d_type == DT_DIR && rm_rf(target)) + goto out; + + if (remove(target) < 0) { + pr_perror("unable to remove %s", target); + goto out; + } + } + + ret = 0; +out: + target[offset] = 0; + return ret; } __attribute__((returns_twice)) static pid_t raw_legacy_clone(unsigned long flags, int *pidfd) @@ -1898,8 +1687,8 @@ __attribute__((returns_twice)) static pid_t raw_legacy_clone(unsigned long flags */ "addx %%g0, 0, %%g1" : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */ - : "r"(g1), "r"(o0), "r"(o1), "r"(o2) /* inputs */ - : "%cc"); /* clobbers */ + : "r"(g1), "r"(o0), "r"(o1), "r"(o2) /* inputs */ + : "%cc"); /* clobbers */ is_error = g1; retval = o0; @@ -2026,217 +1815,3 @@ int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args) return fret; } - -char criu_run_id[RUN_ID_HASH_LENGTH]; - -void util_init(void) -{ - uuid_t uuid; - - uuid_generate(uuid); - uuid_unparse(uuid, criu_run_id); - pr_info("CRIU run id = %s\n", 
criu_run_id); - memcpy(compel_run_id, criu_run_id, sizeof(criu_run_id)); -} - -/* - * This function cuts sub_path from the path. - * 1) It assumes all relative paths given are relative to "/": - * /a/b/c is the same as a/b/c - * 2) It can handle paths with multiple consequent slashes: - * ///a///b///c is the same as /a/b/c - * 3) It always returns relative path, with no leading slash: - * get_relative_path("/a/b/c", "/") would be "a/b/c" - * get_relative_path("/a/b/c", "/a/b") would be "c" - * get_relative_path("/", "/") would be "" - * 4) It can handle paths with single dots: - * get_relative_path("./a/b", "a/") would be "b" - * 5) Note ".." in paths are not supported and handled as normal directory name - */ -char *get_relative_path(char *path, char *sub_path) -{ - bool skip_slashes = true; - - while (1) { - if ((*path == '/' || *path == '\0') && (*sub_path == '/' || *sub_path == '\0')) - skip_slashes = true; - - if (skip_slashes) { - while (*path == '/' || (path[0] == '.' && (path[1] == '/' || path[1] == '\0'))) - path++; - while (*sub_path == '/' || (sub_path[0] == '.' 
&& (sub_path[1] == '/' || sub_path[1] == '\0'))) - sub_path++; - } - - if (*sub_path == '\0') { - if (skip_slashes) - return path; - return NULL; - } - skip_slashes = false; - - if (*path == '\0') - return NULL; - - if (*path != *sub_path) - return NULL; - - path++; - sub_path++; - } - - /* will never get here */ - return NULL; -} - -bool is_sub_path(char *path, char *sub_path) -{ - char *rel_path; - - rel_path = get_relative_path(path, sub_path); - if (!rel_path) - return false; - - return true; -} - -bool is_same_path(char *path1, char *path2) -{ - char *rel_path; - - rel_path = get_relative_path(path1, path2); - if (!rel_path || *rel_path != '\0') - return false; - - return true; -} - -/* - * Checks if path is a mountpoint - * (path should be visible - no overmounts) - */ -static int path_is_mountpoint(char *path, bool *is_mountpoint) -{ - char *dname, *bname, *free_name; - struct open_how how = { - .flags = O_PATH, - .resolve = RESOLVE_NO_XDEV, - }; - int exit_code = -1; - int dfd, fd; - - dname = free_name = xstrdup(path); - if (!dname) - return -1; - dname = dirname(dname); - - bname = get_relative_path(path, dname); - if (!bname || *bname == '\0') { - pr_err("Failed to get bname for %s\n", path); - goto err_free; - } - - dfd = open(dname, O_PATH); - if (dfd < 0) { - pr_perror("Failed to open dir %s", dname); - goto err_free; - } - - fd = sys_openat2(dfd, bname, &how, sizeof(how)); - if (fd < 0) { - if (errno != EXDEV) { - pr_perror("Failed to open %s at %s", bname, dname); - goto err_close; - } - - /* - * EXDEV means that dfd and bname are from different - * mounts, meaning that bname is a mountpoint - */ - *is_mountpoint = true; - } else { - /* - * No error means that dfd and bname are from same mount, - * meaning that bname is not a mountpoint - */ - *is_mountpoint = false; - close(fd); - } - - exit_code = 0; -err_close: - close(dfd); -err_free: - xfree(free_name); - return exit_code; -} - -/* - * Resolves real mountpoint path by any path on it - * (path 
should be visible - no overmountes) - */ -char *resolve_mountpoint(char *path) -{ - char *mp_path, *free_path; - bool is_mountpoint; - - /* - * The dirname() function may modify the contents of given path, - * so we need a strdup here to preserve path. - */ - mp_path = free_path = xstrdup(path); - if (!mp_path) - return NULL; - - while (1) { - /* - * If we see "/" or "." we can't check if they are mountpoints - * by openat2 RESOLVE_NO_XDEV, let's just assume they are. - */ - if (is_same_path(mp_path, "/")) - goto out; - - if (path_is_mountpoint(mp_path, &is_mountpoint) == -1) { - xfree(free_path); - return NULL; - } - - if (is_mountpoint) - goto out; - - /* Try parent directory */ - mp_path = dirname(mp_path); - } - - /* never get here */ - xfree(free_path); - return NULL; -out: - /* - * The dirname() function may or may not return statically allocated - * strings, so here mp_path can be either dynamically allocated or - * statically allocated. Let's strdup to make the return pointer - * always freeable. - */ - mp_path = xstrdup(mp_path); - xfree(free_path); - return mp_path; -} - -int set_opts_cap_eff(void) -{ - struct __user_cap_header_struct cap_header; - struct __user_cap_data_struct cap_data[_LINUX_CAPABILITY_U32S_3]; - int i; - - cap_header.version = _LINUX_CAPABILITY_VERSION_3; - cap_header.pid = getpid(); - - if (capget(&cap_header, &cap_data[0])) - return -1; - - for (i = 0; i < _LINUX_CAPABILITY_U32S_3; i++) - memcpy(&opts.cap_eff[i], &cap_data[i].effective, sizeof(u32)); - - return 0; -} diff --git a/criu/vdso.c b/criu/vdso.c index 2d9e57c4d..dc70513d8 100644 --- a/criu/vdso.c +++ b/criu/vdso.c @@ -145,9 +145,6 @@ static void drop_rt_vdso(struct vm_area_list *vma_area_list, struct vdso_quarter * Also BTW search for rt-vvar to remove it later. 
*/ list_for_each_entry(vma, &vma_area_list->h, list) { - if (vma_area_is(vma, VMA_AREA_GUARD)) - continue; - if (vma->e->start == addr->orig_vdso) { vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VDSO; pr_debug("vdso: Restore orig vDSO status at %lx\n", (long)vma->e->start); @@ -279,9 +276,6 @@ int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid, struct vm_area_list } list_for_each_entry(vma, &vma_area_list->h, list) { - if (vma_area_is(vma, VMA_AREA_GUARD)) - continue; - /* * Defer handling marked vdso until we walked over * all vmas and restore potentially remapped vDSO @@ -316,7 +310,7 @@ static int vdso_parse_maps(pid_t pid, struct vdso_maps *s) while (1) { unsigned long start, end; - char *has_vdso, *has_vvar, *has_vvar_vclock; + char *has_vdso, *has_vvar; buf = breadline(&f); if (buf == NULL) @@ -324,19 +318,13 @@ static int vdso_parse_maps(pid_t pid, struct vdso_maps *s) if (IS_ERR(buf)) goto err; - has_vvar = NULL; - has_vvar_vclock = NULL; - do { - has_vdso = strstr(buf, "[vdso]"); - if (has_vdso) - break; + has_vdso = strstr(buf, "[vdso]"); + if (!has_vdso) has_vvar = strstr(buf, "[vvar]"); - if (has_vvar) - break; - has_vvar_vclock = strstr(buf, "[vvar_vclock]"); - } while (0); + else + has_vvar = NULL; - if (!has_vdso && !has_vvar && !has_vvar_vclock) + if (!has_vdso && !has_vvar) continue; if (sscanf(buf, "%lx-%lx", &start, &end) != 2) { @@ -351,21 +339,13 @@ static int vdso_parse_maps(pid_t pid, struct vdso_maps *s) } s->vdso_start = start; s->sym.vdso_size = end - start; - } else if (has_vvar) { + } else { if (s->vvar_start != VVAR_BAD_ADDR) { pr_err("Got second VVAR entry\n"); goto err; } s->vvar_start = start; s->sym.vvar_size = end - start; - } else { - if (s->vvar_start == VDSO_BAD_ADDR || - s->vvar_start + s->sym.vvar_size != start) { - pr_err("VVAR and VVAR_VCLOCK entries are not subsequent\n"); - goto err; - } - s->sym.vvar_vclock_size = end - start; - s->sym.vvar_size += s->sym.vvar_vclock_size; } } @@ -499,7 +479,7 @@ out_close: 
return ret; } -#define COMPAT_VDSO_BUF_SZ (PAGE_SIZE * 4) +#define COMPAT_VDSO_BUF_SZ (PAGE_SIZE * 2) static int vdso_fill_compat_symtable(struct vdso_maps *native, struct vdso_maps *compat) { void *vdso_mmap; @@ -520,7 +500,7 @@ static int vdso_fill_compat_symtable(struct vdso_maps *native, struct vdso_maps } if (vdso_fill_symtable_compat((uintptr_t)vdso_mmap, compat->sym.vdso_size, &compat->sym)) { - pr_err("Failed to parse mmapped compatible vdso blob\n"); + pr_err("Failed to parse mmaped compatible vdso blob\n"); goto out_unmap; } diff --git a/flake.lock b/flake.lock deleted file mode 100644 index 90c914452..000000000 --- a/flake.lock +++ /dev/null @@ -1,61 +0,0 @@ -{ - "nodes": { - "flake-utils": { - "inputs": { - "systems": "systems" - }, - "locked": { - "lastModified": 1731533236, - "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=", - "owner": "numtide", - "repo": "flake-utils", - "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b", - "type": "github" - }, - "original": { - "owner": "numtide", - "repo": "flake-utils", - "type": "github" - } - }, - "nixpkgs": { - "locked": { - "lastModified": 1744463964, - "narHash": "sha256-LWqduOgLHCFxiTNYi3Uj5Lgz0SR+Xhw3kr/3Xd0GPTM=", - "owner": "NixOS", - "repo": "nixpkgs", - "rev": "2631b0b7abcea6e640ce31cd78ea58910d31e650", - "type": "github" - }, - "original": { - "owner": "NixOS", - "ref": "nixos-unstable", - "repo": "nixpkgs", - "type": "github" - } - }, - "root": { - "inputs": { - "flake-utils": "flake-utils", - "nixpkgs": "nixpkgs" - } - }, - "systems": { - "locked": { - "lastModified": 1681028828, - "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", - "owner": "nix-systems", - "repo": "default", - "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", - "type": "github" - }, - "original": { - "owner": "nix-systems", - "repo": "default", - "type": "github" - } - } - }, - "root": "root", - "version": 7 -} diff --git a/flake.nix b/flake.nix deleted file mode 100644 index dc2429ffc..000000000 
--- a/flake.nix +++ /dev/null @@ -1,77 +0,0 @@ -{ - description = "CRIU development environment"; - - inputs = { - nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; - flake-utils.url = "github:numtide/flake-utils"; - }; - - outputs = { self, nixpkgs, flake-utils }: - flake-utils.lib.eachDefaultSystem (system: - let - pkgs = nixpkgs.legacyPackages.${system}; - - # Dependencies for CRIU - criuDeps = with pkgs; [ - # Compiler and build essentials - gcc - gnumake - pkg-config - - # Protocol Buffers - protobuf - protobufc - python3Packages.protobuf - - # Other required libraries - libuuid - libbsd - iproute2 - nftables - libcap - libnet - libnl - libaio - gnutls - libdrm - - # ZDTM - python3Packages.pyyaml - ]; - - # Multilib support for 32-bit compatibility - # criuDeps32bit = with pkgs; [ - # glibc.dev - # glibc - # gcc-unwrapped - # ]; - - devShell = pkgs.mkShell { - buildInputs = criuDeps; # ++ (if pkgs.stdenv.isx86_64 then criuDeps32bit else []); - - shellHook = '' - echo "CRIU development environment" - echo "==============================" - echo "" - echo "Useful commands:" - echo " make - Build CRIU" - echo " make test - Run tests (requires ZDTM dependencies)" - echo "" - ''; - - # Add proper flags for multilib support - # NIX_CFLAGS_COMPILE = pkgs.lib.optional pkgs.stdenv.isx86_64 "-m32"; - - # Make sure the shell can find headers for multilib - # PKG_CONFIG_PATH = pkgs.lib.makeSearchPath "lib/pkgconfig" criuDeps; - }; - in - { - # Export the development shell - devShells.default = devShell; - - # Build CRIU package as well - packages.default = pkgs.criu; - } - ); -} diff --git a/images/Makefile b/images/Makefile index 2c33152e9..2eaeb7cad 100644 --- a/images/Makefile +++ b/images/Makefile @@ -2,12 +2,10 @@ proto-obj-y += stats.o proto-obj-y += core.o proto-obj-y += core-x86.o proto-obj-y += core-mips.o -proto-obj-y += core-loongarch64.o proto-obj-y += core-arm.o proto-obj-y += core-aarch64.o proto-obj-y += core-ppc64.o proto-obj-y += core-s390.o 
-proto-obj-y += core-riscv64.o proto-obj-y += cpuinfo.o proto-obj-y += inventory.o proto-obj-y += fdinfo.o @@ -58,6 +56,7 @@ proto-obj-y += ext-file.o proto-obj-y += cgroup.o proto-obj-y += userns.o proto-obj-y += pidns.o +proto-obj-y += google/protobuf/descriptor.o # To make protoc-c happy and compile opts.proto proto-obj-y += opts.o proto-obj-y += seccomp.o proto-obj-y += binfmt-misc.o @@ -72,8 +71,6 @@ proto-obj-y += img-streamer.o proto-obj-y += bpfmap-file.o proto-obj-y += bpfmap-data.o proto-obj-y += apparmor.o -proto-obj-y += rseq.o -proto-obj-y += pidfd.o CFLAGS += -iquote $(obj)/ @@ -90,27 +87,12 @@ endef makefile-deps := Makefile $(obj)/Makefile -# -# Generate descriptor.pb-c.c and descriptor.pb-c.h to compile opts.proto. -DESCRIPTOR_DIR := images/google/protobuf -GOOGLE_INCLUDE=$(shell pkg-config protobuf --variable=includedir)/google/protobuf -$(DESCRIPTOR_DIR)/descriptor.pb-c.c: $(GOOGLE_INCLUDE)/descriptor.proto - $(call msg-gen, $@) - $(Q) protoc --proto_path=/usr/include --proto_path=$(obj)/ --c_out=$(obj)/ $< - -cleanup-y += $(DESCRIPTOR_DIR)/descriptor.pb-c.d - -submrproper: - $(Q) rm -f $(DESCRIPTOR_DIR)/* -.PHONY: submrproper -mrproper: submrproper - # # Generates rules needed to compile protobuf files. 
define gen-proto-rules $(obj)/$(1).pb-c.c $(obj)/$(1).pb-c.h: $(obj)/$(1).proto $(addsuffix .pb-c.c,$(addprefix $(obj)/,$(2))) $(makefile-deps) $$(E) " PBCC " $$@ - $$(Q) protoc --proto_path=$(obj)/ --c_out=$(obj)/ $$< + $$(Q) protoc-c --proto_path=$(obj)/ --c_out=$(obj)/ $$< ifeq ($(PROTOUFIX),y) $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$@ $$(Q) sed -i -e 's/4294967295/0xFFFFFFFF/g' $$(patsubst %.c,%.h,$$@) diff --git a/images/bpfmap-file.proto b/images/bpfmap-file.proto index 895321e13..34a6c1dd2 100644 --- a/images/bpfmap-file.proto +++ b/images/bpfmap-file.proto @@ -21,5 +21,4 @@ message bpfmap_file_entry { required string map_name = 13; required uint32 ifindex = 14 [default = 0]; optional sint32 mnt_id = 15 [default = -1]; - optional uint64 map_extra = 16; } diff --git a/images/cgroup.proto b/images/cgroup.proto index 02f226835..ee0354124 100644 --- a/images/cgroup.proto +++ b/images/cgroup.proto @@ -24,7 +24,6 @@ message cgroup_dir_entry { message cg_controller_entry { repeated string cnames = 1; repeated cgroup_dir_entry dirs = 2; - optional bool is_threaded = 3; } message cg_member_entry { diff --git a/images/core-aarch64.proto b/images/core-aarch64.proto index a94911c0b..3356e6b75 100644 --- a/images/core-aarch64.proto +++ b/images/core-aarch64.proto @@ -17,38 +17,9 @@ message user_aarch64_fpsimd_context_entry { required uint32 fpcr = 3; } -message user_aarch64_gcs_entry { - required uint64 gcspr_el0 = 1 [(criu).hex = true]; - required uint64 features_enabled = 2 [(criu).hex = true]; -} - -message pac_address_keys { - required uint64 apiakey_lo = 1; - required uint64 apiakey_hi = 2; - required uint64 apibkey_lo = 3; - required uint64 apibkey_hi = 4; - required uint64 apdakey_lo = 5; - required uint64 apdakey_hi = 6; - required uint64 apdbkey_lo = 7; - required uint64 apdbkey_hi = 8; - required uint64 pac_enabled_key = 9; -} - -message pac_generic_keys { - required uint64 apgakey_lo = 1; - required uint64 apgakey_hi = 2; -} - -message pac_keys { - 
optional pac_address_keys pac_address_keys = 6; - optional pac_generic_keys pac_generic_keys = 7; -} - message thread_info_aarch64 { required uint64 clear_tid_addr = 1[(criu).hex = true]; required uint64 tls = 2; required user_aarch64_regs_entry gpregs = 3[(criu).hex = true]; required user_aarch64_fpsimd_context_entry fpsimd = 4; - optional pac_keys pac_keys = 5; - optional user_aarch64_gcs_entry gcs = 6; } diff --git a/images/core-loongarch64.proto b/images/core-loongarch64.proto deleted file mode 100755 index 8258f006e..000000000 --- a/images/core-loongarch64.proto +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-License-Identifier: MIT - -syntax = "proto2"; - -import "opts.proto"; - -message user_loongarch64_gpregs_entry { - repeated uint64 regs = 1; - required uint64 pc = 2; -} - -message user_loongarch64_fpregs_entry { - repeated uint64 regs = 1; - required uint64 fcc = 2; - required uint32 fcsr = 3; -} - -message thread_info_loongarch64 { - required uint64 clear_tid_addr = 1[(criu).hex = true]; - required uint64 tls = 2; - required user_loongarch64_gpregs_entry gpregs = 3[(criu).hex = true]; - required user_loongarch64_fpregs_entry fpregs = 4[(criu).hex = true]; -} diff --git a/images/core-mips.proto b/images/core-mips.proto old mode 100644 new mode 100755 diff --git a/images/core-ppc64.proto b/images/core-ppc64.proto index bb07e09e0..6a27f9012 100644 --- a/images/core-ppc64.proto +++ b/images/core-ppc64.proto @@ -5,7 +5,7 @@ syntax = "proto2"; import "opts.proto"; message user_ppc64_regs_entry { - /* Following is the list of registers starting at r0. */ + /* Following is the list of regiters starting at r0. 
*/ repeated uint64 gpr = 1; required uint64 nip = 2; required uint64 msr = 3; @@ -22,7 +22,7 @@ message user_ppc64_regs_entry { } message user_ppc64_fpstate_entry { - /* Following is the list of registers starting at fpr0 */ + /* Following is the list of regiters starting at fpr0 */ repeated uint64 fpregs = 1; } diff --git a/images/core-riscv64.proto b/images/core-riscv64.proto deleted file mode 100644 index 1ddfdd8bd..000000000 --- a/images/core-riscv64.proto +++ /dev/null @@ -1,53 +0,0 @@ -// SPDX-License-Identifier: MIT - -syntax = "proto2"; - -import "opts.proto"; - -// Refer to riscv-gnu-toolchain/linux-headers/include/asm/ptrace.h -message user_riscv64_regs_entry { - required uint64 pc = 1; - required uint64 ra = 2; - required uint64 sp = 3; - required uint64 gp = 4; - required uint64 tp = 5; - required uint64 t0 = 6; - required uint64 t1 = 7; - required uint64 t2 = 8; - required uint64 s0 = 9; - required uint64 s1 = 10; - required uint64 a0 = 11; - required uint64 a1 = 12; - required uint64 a2 = 13; - required uint64 a3 = 14; - required uint64 a4 = 15; - required uint64 a5 = 16; - required uint64 a6 = 17; - required uint64 a7 = 18; - required uint64 s2 = 19; - required uint64 s3 = 20; - required uint64 s4 = 21; - required uint64 s5 = 22; - required uint64 s6 = 23; - required uint64 s7 = 24; - required uint64 s8 = 25; - required uint64 s9 = 26; - required uint64 s10 = 27; - required uint64 s11 = 28; - required uint64 t3 = 29; - required uint64 t4 = 30; - required uint64 t5 = 31; - required uint64 t6 = 32; -} - -message user_riscv64_d_ext_entry { - repeated uint64 f = 1; - required uint32 fcsr = 2; -} - -message thread_info_riscv64 { - required uint64 clear_tid_addr = 1[(criu).hex = true]; - required uint64 tls = 2; - required user_riscv64_regs_entry gpregs = 3[(criu).hex = true]; - required user_riscv64_d_ext_entry fpsimd = 4; -} diff --git a/images/core-x86.proto b/images/core-x86.proto index 762418d73..ee7be8ff1 100644 --- a/images/core-x86.proto +++ 
b/images/core-x86.proto @@ -41,13 +41,8 @@ message user_x86_regs_entry { optional user_x86_regs_mode mode = 28 [default = NATIVE]; } -message user_x86_cet_entry { - required uint64 cet = 1[(criu).hex = true]; - required uint64 ssp = 2[(criu).hex = true]; -} - message user_x86_xsave_entry { - /* standard xsave features */ + /* standart xsave features */ required uint64 xstate_bv = 1; /* AVX components: 16x 256-bit ymm registers, hi 128 bits */ @@ -65,9 +60,6 @@ message user_x86_xsave_entry { /* Protected keys */ repeated uint32 pkru = 8; - /* CET */ - optional user_x86_cet_entry cet = 9; - /* * Processor trace (PT) and hardware duty cycling (HDC) * are supervisor state components and only managed by @@ -107,7 +99,7 @@ message user_desc_t { required bool read_exec_only = 7 [default = true]; required bool limit_in_pages = 8; required bool seg_not_present = 9 [default = true]; - required bool usable = 10; + required bool useable = 10; } message thread_info_x86 { diff --git a/images/core.proto b/images/core.proto index 1fa23868b..b713119f2 100644 --- a/images/core.proto +++ b/images/core.proto @@ -8,15 +8,12 @@ import "core-aarch64.proto"; import "core-ppc64.proto"; import "core-s390.proto"; import "core-mips.proto"; -import "core-loongarch64.proto"; -import "core-riscv64.proto"; import "rlimit.proto"; import "timer.proto"; import "creds.proto"; import "sa.proto"; import "siginfo.proto"; -import "rseq.proto"; import "opts.proto"; @@ -42,7 +39,6 @@ message task_core_entry { optional task_timers_entry timers = 7; optional task_rlimits_entry rlimits = 8; - /* This is deprecated, should be per-thread */ optional uint32 cg_set = 9; optional signal_queue_entry signals_s = 10; @@ -63,10 +59,6 @@ message task_core_entry { // Reserved for container relative start time //optional uint64 start_time = 19; optional uint64 blk_sigset_extended = 20[(criu).hex = true]; - - optional uint32 stop_signo = 21; - - optional uint32 membarrier_registration_mask = 22 [(criu).hex = true]; } 
message task_kobj_ids_entry { @@ -109,8 +101,6 @@ message thread_core_entry { optional string comm = 13; optional uint64 blk_sigset_extended = 14; - optional rseq_entry rseq_entry = 15; - optional uint32 cg_set = 16; } message task_rlimits_entry { @@ -126,8 +116,6 @@ message core_entry { PPC64 = 4; S390 = 5; MIPS = 6; - LOONGARCH64 = 7; - RISCV64 = 8; } required march mtype = 1; @@ -137,8 +125,6 @@ message core_entry { optional thread_info_ppc64 ti_ppc64 = 9; optional thread_info_s390 ti_s390 = 10; optional thread_info_mips ti_mips = 11; - optional thread_info_loongarch64 ti_loongarch64 = 12; - optional thread_info_riscv64 ti_riscv64 = 13; optional task_core_entry tc = 3; optional task_kobj_ids_entry ids = 4; diff --git a/images/creds.proto b/images/creds.proto index 932a40ccf..6228f7fcb 100644 --- a/images/creds.proto +++ b/images/creds.proto @@ -24,7 +24,4 @@ message creds_entry { optional string lsm_profile = 15; optional string lsm_sockcreate = 16; optional bytes apparmor_data = 17; - optional uint32 no_new_privs = 18; - - repeated uint32 cap_amb = 19; } diff --git a/images/fdinfo.proto b/images/fdinfo.proto index 32ec13cf4..88f1c1186 100644 --- a/images/fdinfo.proto +++ b/images/fdinfo.proto @@ -17,7 +17,6 @@ import "ext-file.proto"; import "sk-unix.proto"; import "fifo.proto"; import "pipe.proto"; -import "pidfd.proto"; import "tty.proto"; import "memfd.proto"; import "bpfmap-file.proto"; @@ -43,7 +42,6 @@ enum fd_types { TIMERFD = 17; MEMFD = 18; BPFMAP = 19; - PIDFD = 20; /* Any number above the real used. 
Not stored to image */ CTL_TTY = 65534; @@ -80,5 +78,4 @@ message file_entry { optional tty_file_entry tty = 19; optional memfd_file_entry memfd = 20; optional bpfmap_file_entry bpf = 21; - optional pidfd_entry pidfd = 22; } diff --git a/images/google/protobuf/.gitignore b/images/google/protobuf/.gitignore deleted file mode 100644 index 68359a786..000000000 --- a/images/google/protobuf/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*.c -*.h diff --git a/images/google/protobuf/descriptor.proto b/images/google/protobuf/descriptor.proto new file mode 120000 index 000000000..07a4c9add --- /dev/null +++ b/images/google/protobuf/descriptor.proto @@ -0,0 +1 @@ +/usr/include/google/protobuf/descriptor.proto \ No newline at end of file diff --git a/images/inventory.proto b/images/inventory.proto index feed5b850..a735bad1d 100644 --- a/images/inventory.proto +++ b/images/inventory.proto @@ -10,13 +10,6 @@ enum lsmtype { APPARMOR = 2; } -// It is not possible to distinguish between an empty repeated field -// and unset repeated field. To solve this problem and provide backwards -// compabibility, we use the 'plugins_entry' message. -message plugins_entry { - repeated string plugins = 12; -}; - message inventory_entry { required uint32 img_version = 1; optional bool fdinfo_per_id = 2; @@ -28,10 +21,4 @@ message inventory_entry { optional uint32 pre_dump_mode = 9; optional bool tcp_close = 10; optional uint32 network_lock_method = 11; - optional plugins_entry plugins_entry = 12; - // Remember the criu_run_id when CRIU dumped the process. - // This is currently used to delete the correct nftables - // network locking rule. 
- optional string dump_criu_run_id = 13; - optional bool allow_uprobes = 14; } diff --git a/images/ipc-shm.proto b/images/ipc-shm.proto index c5feebac0..7865dad8d 100644 --- a/images/ipc-shm.proto +++ b/images/ipc-shm.proto @@ -8,5 +8,4 @@ message ipc_shm_entry { required ipc_desc_entry desc = 1; required uint64 size = 2; optional bool in_pagemaps = 3; - optional uint32 hugetlb_flag = 4; } diff --git a/images/memfd.proto b/images/memfd.proto index bb0be4a6f..a944f145d 100644 --- a/images/memfd.proto +++ b/images/memfd.proto @@ -21,6 +21,4 @@ message memfd_inode_entry { required uint32 shmid = 5; required uint32 seals = 6 [(criu).flags = "seals.flags"]; required uint64 inode_id = 7; - optional uint32 hugetlb_flag = 8; - optional uint32 mode = 9; }; diff --git a/images/netdev.proto b/images/netdev.proto index 42e2bc7d7..748fd0200 100644 --- a/images/netdev.proto +++ b/images/netdev.proto @@ -74,5 +74,4 @@ message netns_entry { repeated netns_id nsids = 7; optional string ext_key = 8; repeated sysctl_entry unix_conf = 9; - repeated sysctl_entry ipv4_sysctl = 10; } diff --git a/images/opts.proto b/images/opts.proto index d730673a2..95304a8c6 100644 --- a/images/opts.proto +++ b/images/opts.proto @@ -5,7 +5,7 @@ syntax = "proto2"; import "google/protobuf/descriptor.proto"; message CRIU_Opts { - optional bool hex = 1; // Indicate that CRIT should treat this field as hex. + optional bool hex = 1; // Idicate that CRIT should treat this field as hex. 
optional bool ipadd = 2; // The field is IPv4/v6 address optional string flags = 3; optional bool dev = 4; // Device major:minor packed diff --git a/images/pagemap.proto b/images/pagemap.proto index f2436a51a..e6d341b0f 100644 --- a/images/pagemap.proto +++ b/images/pagemap.proto @@ -10,8 +10,7 @@ message pagemap_head { message pagemap_entry { required uint64 vaddr = 1 [(criu).hex = true]; - required uint32 compat_nr_pages = 2; + required uint32 nr_pages = 2; optional bool in_parent = 3; optional uint32 flags = 4 [(criu).flags = "pmap.flags" ]; - optional uint64 nr_pages = 5; } diff --git a/images/pidfd.proto b/images/pidfd.proto deleted file mode 100644 index a9da3e454..000000000 --- a/images/pidfd.proto +++ /dev/null @@ -1,13 +0,0 @@ -// SPDX-License-Identifier: MIT - -syntax = "proto2"; - -import "fown.proto"; - -message pidfd_entry { - required uint32 id = 1; - required uint32 ino = 2; - required uint32 flags = 3; - required int32 nspid = 4; - required fown_entry fown = 5; -} diff --git a/images/rpc.proto b/images/rpc.proto index 1a4722a9c..a9f51ac4b 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -52,7 +52,6 @@ enum criu_cg_mode { enum criu_network_lock_method { IPTABLES = 1; NFTABLES = 2; - SKIP = 3; }; enum criu_pre_dump_mode { @@ -61,8 +60,7 @@ enum criu_pre_dump_mode { }; message criu_opts { - required int32 images_dir_fd = 1 [default = -1]; - optional string images_dir = 68; /* used only if images_dir_fd == -1 */ + required int32 images_dir_fd = 1; optional int32 pid = 2; /* if not set on dump, will dump requesting process */ optional bool leave_running = 3; @@ -139,12 +137,6 @@ message criu_opts { optional int32 pidfd_store_sk = 62; optional string lsm_mount_context = 63; optional criu_network_lock_method network_lock = 64 [default = IPTABLES]; - optional bool mntns_compat_mode = 65; - optional bool skip_file_rwx_check = 66; - optional bool unprivileged = 67; - optional bool leave_stopped = 69; - optional bool display_stats = 70; - optional bool 
log_to_stderr = 71; /* optional bool check_mounts = 128; */ } @@ -180,8 +172,6 @@ enum criu_req_type { WAIT_PID = 11; PAGE_SERVER_CHLD = 12; - - SINGLE_PRE_DUMP = 13; } /* diff --git a/images/rseq.proto b/images/rseq.proto deleted file mode 100644 index 45cb8476d..000000000 --- a/images/rseq.proto +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-License-Identifier: MIT - -syntax = "proto2"; - -message rseq_entry { - required uint64 rseq_abi_pointer = 1; - required uint32 rseq_abi_size = 2; - required uint32 signature = 3; - optional uint64 rseq_cs_pointer = 4; -} diff --git a/images/sk-inet.proto b/images/sk-inet.proto index 2c709e018..594e29c66 100644 --- a/images/sk-inet.proto +++ b/images/sk-inet.proto @@ -5,7 +5,6 @@ syntax = "proto2"; import "opts.proto"; import "fown.proto"; import "sk-opts.proto"; -import "tcp-stream.proto"; message ip_opts_raw_entry { optional bool hdrincl = 1; @@ -18,10 +17,6 @@ message ip_opts_entry { optional bool freebind = 1; // Fields 2 and 3 are reserved for vz7 use optional ip_opts_raw_entry raw = 4; - - optional bool pktinfo = 5; - optional uint32 tos = 6; - optional uint32 ttl = 7; } message inet_sk_entry { @@ -57,5 +52,4 @@ message inet_sk_entry { optional string ifname = 17; optional uint32 ns_id = 18; optional sk_shutdown shutdown = 19; - optional tcp_opts_entry tcp_opts = 20; } diff --git a/images/sk-opts.proto b/images/sk-opts.proto index 2f9d4e5c3..2377f6b62 100644 --- a/images/sk-opts.proto +++ b/images/sk-opts.proto @@ -26,16 +26,11 @@ message sk_opts_entry { optional bool so_reuseport = 17; optional bool so_broadcast = 18; optional bool so_keepalive = 19; - - /* These three are deprecated, use tcp_opts_entry instead */ optional uint32 tcp_keepcnt = 20; optional uint32 tcp_keepidle = 21; optional uint32 tcp_keepintvl = 22; - optional uint32 so_oobinline = 23; optional uint32 so_linger = 24; - - optional uint32 so_buf_lock = 25; } enum sk_shutdown { diff --git a/images/tcp-stream.proto b/images/tcp-stream.proto index 
3d834159f..c2244ba3b 100644 --- a/images/tcp-stream.proto +++ b/images/tcp-stream.proto @@ -4,14 +4,6 @@ syntax = "proto2"; import "opts.proto"; -message tcp_opts_entry { - optional bool cork = 1; - optional bool nodelay = 2; - optional uint32 keepcnt = 3; - optional uint32 keepidle = 4; - optional uint32 keepintvl = 5; -} - message tcp_stream_entry { required uint32 inq_len = 1; required uint32 inq_seq = 2; @@ -24,7 +16,6 @@ message tcp_stream_entry { optional uint32 rcv_wscale = 8; optional uint32 timestamp = 9; - /* These two are deprecated, use tcp_opts_entry instead */ optional bool cork = 10; optional bool nodelay = 11; diff --git a/include/common/arch/aarch64/asm/page.h b/include/common/arch/aarch64/asm/page.h index 4555debbd..90670d126 100644 --- a/include/common/arch/aarch64/asm/page.h +++ b/include/common/arch/aarch64/asm/page.h @@ -10,7 +10,7 @@ extern unsigned __page_size; extern unsigned __page_shift; -static inline unsigned long page_size(void) +static inline unsigned page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -37,7 +37,7 @@ static inline unsigned page_shift(void) #else /* CR_NOGLIBC */ -extern unsigned long page_size(void); +extern unsigned page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/arch/loongarch64/asm/atomic.h b/include/common/arch/loongarch64/asm/atomic.h deleted file mode 100644 index 901725439..000000000 --- a/include/common/arch/loongarch64/asm/atomic.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef __CR_ATOMIC_H__ -#define __CR_ATOMIC_H__ - -#include -#include "common/compiler.h" - -typedef struct { - int counter; -} atomic_t; - -static inline int atomic_read(const atomic_t *v) -{ - return (*(volatile int *)&(v)->counter); -} - -static inline void atomic_set(atomic_t *v, int i) -{ - v->counter = i; -} - -static inline int __atomic_add(int i, atomic_t *v) -{ - int result; - asm volatile("amadd_db.w %1, %2, %0" : "+ZB"(v->counter), "=&r"(result) : "r"(i) : 
"memory"); - return result + i; -} - -static inline void atomic_add(int i, atomic_t *v) -{ - __atomic_add(i, v); -} - -static inline int atomic_add_return(int i, atomic_t *v) -{ - return __atomic_add(i, v); -} - -#define atomic_sub(i, v) atomic_add(-(int)i, v) -#define atomic_sub_return(i, v) atomic_add_return(-(int)i, v) -#define atomic_inc(v) atomic_add(1, v) -#define atomic_inc_return(v) atomic_add_return(1, v) -#define atomic_dec(v) atomic_sub(1, v) -#define atomic_dec_return(v) atomic_sub_return(1, v) - -static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) -{ - int ret; - asm volatile("1: \n" - " ll.w %0, %1 \n" - " bne %0, %2, 2f \n" - " or $t0, %3, $zero \n" - " sc.w $t0, %1 \n" - " beqz $t0, 1b \n" - "2: \n" - " dbar 0 \n" - : "=&r"(ret), "+ZB"(ptr->counter) - : "r"(old), "r"(new) - : "t0", "memory"); - return ret; -} - -#endif /* __CR_ATOMIC_H__ */ diff --git a/include/common/arch/loongarch64/asm/bitops.h b/include/common/arch/loongarch64/asm/bitops.h deleted file mode 100644 index 170e4f736..000000000 --- a/include/common/arch/loongarch64/asm/bitops.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _LINUX_BITOPS_H -#define _LINUX_BITOPS_H -#include "common/asm-generic/bitops.h" - -/** - * test_and_set_bit - Set a bit and return its old value - * @nr: Bit to set - * @addr: Address to count from - * - * This operation is atomic and cannot be reordered. - * It also implies a memory barrier. 
- */ - -#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) -#define BIT_WORD(nr) ((1UL << ((nr) / BITS_PER_LONG)) - 1) -static inline int test_and_set_bit(unsigned long nr, volatile unsigned long *addr) -{ - unsigned long res, mask; - mask = BIT_MASK(nr); - asm volatile("amor_db.d %0, %2, %1" : "=&r"(res), "+ZB"(addr[BIT_WORD(nr)]) : "r"(mask) : "memory"); - return (res & mask) != 0; -} - -#endif diff --git a/include/common/arch/loongarch64/asm/bitsperlong.h b/include/common/arch/loongarch64/asm/bitsperlong.h deleted file mode 100644 index 13d06a384..000000000 --- a/include/common/arch/loongarch64/asm/bitsperlong.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __CR_BITSPERLONG_H__ -#define __CR_BITSPERLONG_H__ - -#define BITS_PER_LONG _LOONGARCH_SZLONG - -#endif /* __CR_BITSPERLONG_H__ */ diff --git a/include/common/arch/loongarch64/asm/linkage.h b/include/common/arch/loongarch64/asm/linkage.h deleted file mode 100644 index 448acc29f..000000000 --- a/include/common/arch/loongarch64/asm/linkage.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef __CR_LINKAGE_H__ -#define __CR_LINKAGE_H__ - -#define __ALIGN .align 2 -#define __ALIGN_STR ".align 2" - -#define GLOBAL(name) \ - .globl name; \ -name: - -#define ENTRY(name) \ - .globl name; \ - __ALIGN; \ - .type name, @function; \ -name: - -#define END(sym) .size sym, .- sym - -#endif /* __CR_LINKAGE_H__ */ diff --git a/include/common/arch/loongarch64/asm/page.h b/include/common/arch/loongarch64/asm/page.h deleted file mode 100644 index 4fcdb64dc..000000000 --- a/include/common/arch/loongarch64/asm/page.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef __CR_ASM_PAGE_H__ -#define __CR_ASM_PAGE_H__ - -#define ARCH_HAS_LONG_PAGES - -#ifndef CR_NOGLIBC -#include /* ffsl() */ -#include /* _SC_PAGESIZE */ - -static unsigned __page_size; -static unsigned __page_shift; - -static inline unsigned long page_size(void) -{ - if (!__page_size) - __page_size = sysconf(_SC_PAGESIZE); - return __page_size; -} - -static inline unsigned page_shift(void) -{ - if 
(!__page_shift) - __page_shift = (ffsl(page_size()) - 1); - return __page_shift; -} - -#define PAGE_SIZE page_size() -#define PAGE_SHIFT page_shift() -#define PAGE_MASK (~(PAGE_SIZE - 1)) - -#define PAGE_PFN(addr) ((addr) / PAGE_SIZE) -#else /* CR_NOGLIBC */ - -extern unsigned long page_size(void); -#define PAGE_SIZE page_size() - -#endif /* CR_NOGLIBC */ - -#endif /* __CR_ASM_PAGE_H__ */ diff --git a/include/common/arch/mips/asm/page.h b/include/common/arch/mips/asm/page.h index 4fcdb64dc..25bdbc141 100644 --- a/include/common/arch/mips/asm/page.h +++ b/include/common/arch/mips/asm/page.h @@ -10,7 +10,7 @@ static unsigned __page_size; static unsigned __page_shift; -static inline unsigned long page_size(void) +static inline unsigned page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -31,7 +31,7 @@ static inline unsigned page_shift(void) #define PAGE_PFN(addr) ((addr) / PAGE_SIZE) #else /* CR_NOGLIBC */ -extern unsigned long page_size(void); +extern unsigned page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/arch/ppc64/asm/bitops.h b/include/common/arch/ppc64/asm/bitops.h index 54d55da16..704668263 100644 --- a/include/common/arch/ppc64/asm/bitops.h +++ b/include/common/arch/ppc64/asm/bitops.h @@ -46,7 +46,6 @@ #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) #define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] -#define BITMAP_SIZE(name) (sizeof(name) * CHAR_BIT) #define __stringify_in_c(...) #__VA_ARGS__ #define stringify_in_c(...) __stringify_in_c(__VA_ARGS__) " " @@ -197,14 +196,14 @@ static inline unsigned long find_next_bit(const unsigned long *addr, unsigned lo found_first: tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ + if (tmp == 0UL) /* Are any bits set? */ return result + size; /* Nope. 
*/ found_middle: return result + __ffs(tmp); } -#define for_each_bit(i, bitmask) \ - for (i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), 0); i < BITMAP_SIZE(bitmask); \ - i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), i + 1)) +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, sizeof(bitmask), 0); i < sizeof(bitmask); \ + i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) #endif /* __CR_BITOPS_H__ */ diff --git a/include/common/arch/ppc64/asm/linkage.h b/include/common/arch/ppc64/asm/linkage.h index 983f10071..8e388c105 100644 --- a/include/common/arch/ppc64/asm/linkage.h +++ b/include/common/arch/ppc64/asm/linkage.h @@ -43,7 +43,7 @@ * * The lower case r0-r31 should be used in preference to the upper * case R0-R31 as they provide more error checking in the assembler. - * Use R0-31 only when really necessary. + * Use R0-31 only when really nessesary. */ /* clang-format off */ diff --git a/include/common/arch/ppc64/asm/page.h b/include/common/arch/ppc64/asm/page.h index 2b0c0e504..a1ff6718a 100644 --- a/include/common/arch/ppc64/asm/page.h +++ b/include/common/arch/ppc64/asm/page.h @@ -10,7 +10,7 @@ extern unsigned __page_size; extern unsigned __page_shift; -static inline unsigned long page_size(void) +static inline unsigned page_size(void) { if (!__page_size) __page_size = sysconf(_SC_PAGESIZE); @@ -37,7 +37,7 @@ static inline unsigned page_shift(void) #else /* CR_NOGLIBC */ -extern unsigned long page_size(void); +extern unsigned page_size(void); #define PAGE_SIZE page_size() #endif /* CR_NOGLIBC */ diff --git a/include/common/arch/riscv64/asm/atomic.h b/include/common/arch/riscv64/asm/atomic.h deleted file mode 100644 index 4b08bd9fd..000000000 --- a/include/common/arch/riscv64/asm/atomic.h +++ /dev/null @@ -1,109 +0,0 @@ -#ifndef __CR_ATOMIC_H__ -#define __CR_ATOMIC_H__ - -typedef struct { - int counter; -} atomic_t; - -/* Copied from the Linux header arch/riscv/include/asm/barrier.h */ - -#define nop() __asm__ __volatile__("nop") - 
-#define RISCV_FENCE(p, s) __asm__ __volatile__("fence " #p "," #s : : : "memory") - -/* These barriers need to enforce ordering on both devices or memory. */ -#define mb() RISCV_FENCE(iorw, iorw) -#define rmb() RISCV_FENCE(ir, ir) -#define wmb() RISCV_FENCE(ow, ow) - -/* These barriers do not need to enforce ordering on devices, just memory. */ -#define __smp_mb() RISCV_FENCE(rw, rw) -#define __smp_rmb() RISCV_FENCE(r, r) -#define __smp_wmb() RISCV_FENCE(w, w) - -#define __smp_store_release(p, v) \ - do { \ - compiletime_assert_atomic_type(*p); \ - RISCV_FENCE(rw, w); \ - WRITE_ONCE(*p, v); \ - } while (0) - -#define __smp_load_acquire(p) \ - ({ \ - typeof(*p) ___p1 = READ_ONCE(*p); \ - compiletime_assert_atomic_type(*p); \ - RISCV_FENCE(r, rw); \ - ___p1; \ - }) - -/* Copied from the Linux kernel header arch/riscv/include/asm/atomic.h */ - -static inline int atomic_read(const atomic_t *v) -{ - return (*(volatile int *)&(v)->counter); -} - -static inline void atomic_set(atomic_t *v, int i) -{ - v->counter = i; -} - -#define atomic_get atomic_read - -static inline int atomic_add_return(int i, atomic_t *v) -{ - int result; - - asm volatile("amoadd.w.aqrl %1, %2, %0" : "+A"(v->counter), "=r"(result) : "r"(i) : "memory"); - __smp_mb(); - return result + i; -} - -static inline int atomic_sub_return(int i, atomic_t *v) -{ - return atomic_add_return(-i, v); -} - -static inline int atomic_inc(atomic_t *v) -{ - return atomic_add_return(1, v) - 1; -} - -static inline int atomic_add(int val, atomic_t *v) -{ - return atomic_add_return(val, v) - val; -} - -static inline int atomic_dec(atomic_t *v) -{ - return atomic_sub_return(1, v) + 1; -} - -/* true if the result is 0, or false for all other cases. 
*/ -#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) -#define atomic_dec_return(v) (atomic_sub_return(1, v)) - -#define atomic_inc_return(v) (atomic_add_return(1, v)) - -static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) -{ - unsigned long tmp; - int oldval; - - __smp_mb(); - - asm volatile("1:\n" - " lr.w %1, %2\n" - " bne %1, %3, 2f\n" - " sc.w %0, %4, %2\n" - " bnez %0, 1b\n" - "2:" - : "=&r"(tmp), "=&r"(oldval), "+A"(ptr->counter) - : "r"(old), "r"(new) - : "memory"); - - __smp_mb(); - return oldval; -} - -#endif /* __CR_ATOMIC_H__ */ diff --git a/include/common/arch/riscv64/asm/bitops.h b/include/common/arch/riscv64/asm/bitops.h deleted file mode 100644 index eabab27c7..000000000 --- a/include/common/arch/riscv64/asm/bitops.h +++ /dev/null @@ -1,159 +0,0 @@ -#ifndef __CR_ASM_BITOPS_H__ -#define __CR_ASM_BITOPS_H__ - -#include "common/compiler.h" -#include "common/asm/bitsperlong.h" - -#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) -#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) - -#define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] -#define BITMAP_SIZE(name) (sizeof(name) * CHAR_BIT) - -#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) -/* Technically wrong, but this avoids compilation errors on some gcc - versions. */ -#define BITOP_ADDR(x) "=m"(*(volatile long *)(x)) -#else -#define BITOP_ADDR(x) "+m"(*(volatile long *)(x)) -#endif - -#define ADDR BITOP_ADDR(addr) - -static inline void set_bit(int nr, volatile unsigned long *addr) -{ - addr += nr / BITS_PER_LONG; - *addr |= (1UL << (nr % BITS_PER_LONG)); -} - -static inline void change_bit(int nr, volatile unsigned long *addr) -{ - addr += nr / BITS_PER_LONG; - *addr ^= (1UL << (nr % BITS_PER_LONG)); -} - -static inline int test_bit(int nr, volatile const unsigned long *addr) -{ - addr += nr / BITS_PER_LONG; - return (*addr & (1UL << (nr % BITS_PER_LONG))) ? 
-1 : 0; -} - -static inline void clear_bit(int nr, volatile unsigned long *addr) -{ - addr += nr / BITS_PER_LONG; - *addr &= ~(1UL << (nr % BITS_PER_LONG)); -} - -/** - * __ffs - find first set bit in word - * @word: The word to search - * - * Undefined if no bit exists, so code should check against 0 first. - */ -static inline unsigned long __ffs(unsigned long word) -{ - int p = 0; - - for (; p < 8*sizeof(word); ++p) { - if (word & 1) { - break; - } - - word >>= 1; - } - - return p; -} - -#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) - -/* - * Find the next set bit in a memory region. - */ -static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) -{ - const unsigned long *p = addr + BITOP_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); - unsigned long tmp; - - if (offset >= size) - return size; - size -= result; - offset %= BITS_PER_LONG; - if (offset) { - tmp = *(p++); - tmp &= (~0UL << offset); - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - } - while (size & ~(BITS_PER_LONG - 1)) { - if ((tmp = *(p++))) - goto found_middle; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; - } - if (!size) - return result; - tmp = *p; - -found_first: - tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ - return result + size; /* Nope. 
*/ -found_middle: - return result + __ffs(tmp); -} - -#define for_each_bit(i, bitmask) \ - for (i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), 0); i < BITMAP_SIZE(bitmask); \ - i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), i + 1)) - - -#define BITS_PER_LONG 64 - -#define BIT_MASK(nr) ((1##UL) << ((nr) % BITS_PER_LONG)) -#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) - -#define __AMO(op) "amo" #op ".d" - -#define __test_and_op_bit_ord(op, mod, nr, addr, ord) \ - ({ \ - unsigned long __res, __mask; \ - __mask = BIT_MASK(nr); \ - __asm__ __volatile__(__AMO(op) #ord " %0, %2, %1" \ - : "=r"(__res), "+A"(addr[BIT_WORD(nr)]) \ - : "r"(mod(__mask)) \ - : "memory"); \ - ((__res & __mask) != 0); \ - }) - -#define __op_bit_ord(op, mod, nr, addr, ord) \ - __asm__ __volatile__(__AMO(op) #ord " zero, %1, %0" \ - : "+A"(addr[BIT_WORD(nr)]) \ - : "r"(mod(BIT_MASK(nr))) \ - : "memory"); - -#define __test_and_op_bit(op, mod, nr, addr) __test_and_op_bit_ord(op, mod, nr, addr, .aqrl) -#define __op_bit(op, mod, nr, addr) __op_bit_ord(op, mod, nr, addr, ) - -/* Bitmask modifiers */ -#define __NOP(x) (x) -#define __NOT(x) (~(x)) - -/** - * test_and_set_bit - Set a bit and return its old value - * @nr: Bit to set - * @addr: Address to count from - * - * This operation may be reordered on other architectures than x86. 
- */ -static inline int test_and_set_bit(int nr, volatile unsigned long *addr) -{ - return __test_and_op_bit(or, __NOP, nr, addr); -} - -#endif /* __CR_ASM_BITOPS_H__ */ diff --git a/include/common/arch/riscv64/asm/bitsperlong.h b/include/common/arch/riscv64/asm/bitsperlong.h deleted file mode 100644 index d95727d19..000000000 --- a/include/common/arch/riscv64/asm/bitsperlong.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __CR_BITSPERLONG_H__ -#define __CR_BITSPERLONG_H__ - -#define BITS_PER_LONG 64 - -#endif /* __CR_BITSPERLONG_H__ */ diff --git a/include/common/arch/riscv64/asm/linkage.h b/include/common/arch/riscv64/asm/linkage.h deleted file mode 100644 index c6d40f750..000000000 --- a/include/common/arch/riscv64/asm/linkage.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef __CR_LINKAGE_H__ -#define __CR_LINKAGE_H__ - -#ifdef __ASSEMBLY__ - -#define __ALIGN .align 4, 0x00 -#define __ALIGN_STR ".align 4, 0x00" - -#define GLOBAL(name) \ - .globl name; \ -name: - -#define ENTRY(name) \ - .globl name; \ - .type name, @function; \ - __ALIGN; \ -name: - -#define END(sym) .size sym, .- sym - -#endif /* __ASSEMBLY__ */ - -#endif /* __CR_LINKAGE_H__ */ diff --git a/include/common/arch/riscv64/asm/page.h b/include/common/arch/riscv64/asm/page.h deleted file mode 100644 index 5113cb6db..000000000 --- a/include/common/arch/riscv64/asm/page.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef __CR_ASM_PAGE_H__ -#define __CR_ASM_PAGE_H__ - -#define ARCH_HAS_LONG_PAGES - -#ifndef CR_NOGLIBC -#include /* ffsl() */ -#include /* _SC_PAGESIZE */ - -extern unsigned __page_size; -extern unsigned __page_shift; - -static inline unsigned page_size(void) -{ - if (!__page_size) - __page_size = sysconf(_SC_PAGESIZE); - return __page_size; -} - -static inline unsigned page_shift(void) -{ - if (!__page_shift) - __page_shift = (ffsl(page_size()) - 1); - return __page_shift; -} - -/* - * Don't add ifdefs for PAGE_SIZE: if any header defines it as a constant - * on aarch64, then we need refrain using PAGE_SIZE in 
criu and use - * page_size() across sources (as it may differ on aarch64). - */ -#define PAGE_SIZE page_size() -#define PAGE_MASK (~(PAGE_SIZE - 1)) -#define PAGE_SHIFT page_shift() - -#define PAGE_PFN(addr) ((addr) / PAGE_SIZE) - -#else /* CR_NOGLIBC */ - -extern unsigned long page_size(void); -#define PAGE_SIZE page_size() - -#endif /* CR_NOGLIBC */ -#endif /* __CR_ASM_PAGE_H__ */ diff --git a/include/common/arch/s390/asm/bitops.h b/include/common/arch/s390/asm/bitops.h index 22547c544..f396721e9 100644 --- a/include/common/arch/s390/asm/bitops.h +++ b/include/common/arch/s390/asm/bitops.h @@ -10,7 +10,6 @@ #define __BITOPS_WORDS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG) #define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] -#define BITMAP_SIZE(name) (sizeof(name) * CHAR_BIT) static inline unsigned long *__bitops_word(unsigned long nr, volatile unsigned long *ptr) { @@ -144,8 +143,8 @@ static inline unsigned long find_next_bit(const unsigned long *addr, unsigned lo return _find_next_bit(addr, size, offset, 0UL); } -#define for_each_bit(i, bitmask) \ - for (i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), 0); i < BITMAP_SIZE(bitmask); \ - i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), i + 1)) +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, sizeof(bitmask), 0); i < sizeof(bitmask); \ + i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) #endif /* _S390_BITOPS_H */ diff --git a/include/common/arch/x86/asm/bitops.h b/include/common/arch/x86/asm/bitops.h index f3c7dbbdf..d7a60589b 100644 --- a/include/common/arch/x86/asm/bitops.h +++ b/include/common/arch/x86/asm/bitops.h @@ -10,7 +10,6 @@ #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) #define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] -#define BITMAP_SIZE(name) (sizeof(name) * CHAR_BIT) #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) /* Technically wrong, but this avoids compilation errors on some gcc @@ -114,14 
+113,14 @@ static inline unsigned long find_next_bit(const unsigned long *addr, unsigned lo found_first: tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ + if (tmp == 0UL) /* Are any bits set? */ return result + size; /* Nope. */ found_middle: return result + __ffs(tmp); } -#define for_each_bit(i, bitmask) \ - for (i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), 0); i < BITMAP_SIZE(bitmask); \ - i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), i + 1)) +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, sizeof(bitmask), 0); i < sizeof(bitmask); \ + i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) #endif /* __CR_BITOPS_H__ */ diff --git a/include/common/asm-generic/bitops.h b/include/common/asm-generic/bitops.h index d8f38091d..064ba4cc4 100644 --- a/include/common/asm-generic/bitops.h +++ b/include/common/asm-generic/bitops.h @@ -14,7 +14,6 @@ #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG) #define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)] -#define BITMAP_SIZE(name) (sizeof(name) * CHAR_BIT) #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) /* Technically wrong, but this avoids compilation errors on some gcc @@ -98,14 +97,14 @@ static inline unsigned long find_next_bit(const unsigned long *addr, unsigned lo found_first: tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ + if (tmp == 0UL) /* Are any bits set? */ return result + size; /* Nope. 
*/ found_middle: return result + __ffs(tmp); } -#define for_each_bit(i, bitmask) \ - for (i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), 0); i < BITMAP_SIZE(bitmask); \ - i = find_next_bit(bitmask, BITMAP_SIZE(bitmask), i + 1)) +#define for_each_bit(i, bitmask) \ + for (i = find_next_bit(bitmask, sizeof(bitmask), 0); i < sizeof(bitmask); \ + i = find_next_bit(bitmask, sizeof(bitmask), i + 1)) #endif /* __CR_GENERIC_BITOPS_H__ */ diff --git a/include/common/compiler.h b/include/common/compiler.h index 3e66709f9..bd3de01df 100644 --- a/include/common/compiler.h +++ b/include/common/compiler.h @@ -30,17 +30,6 @@ #define __always_unused __attribute__((unused)) #define __must_check __attribute__((__warn_unused_result__)) -#ifndef __has_attribute -#define __has_attribute(x) 0 -#endif - -/* Not supported by clang */ -#if __has_attribute(__externally_visible__) -#define __visible __attribute__((__externally_visible__)) -#else -#define __visible -#endif - #define __section(S) __attribute__((__section__(#S))) #ifndef __always_inline @@ -58,9 +47,7 @@ #define noinline __attribute__((noinline)) #endif -#ifndef __aligned #define __aligned(x) __attribute__((aligned(x))) -#endif /* * Macro to define stack alignment. @@ -89,7 +76,6 @@ #define round_down(x, y) ((x) & ~__round_mask(x, y)) #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) #define ALIGN(x, a) (((x) + (a)-1) & ~((a)-1)) -#define ALIGN_DOWN(x, a) ALIGN((x) - ((a) - 1), (a)) #define min(x, y) \ ({ \ diff --git a/include/common/lock.h b/include/common/lock.h index 4733d7287..4fbe42c63 100644 --- a/include/common/lock.h +++ b/include/common/lock.h @@ -2,7 +2,6 @@ #define __CR_COMMON_LOCK_H__ #include -#include #include #include #include @@ -10,7 +9,7 @@ #include "common/asm/atomic.h" #include "common/compiler.h" -/* scan-build complains about dereferencing a NULL pointer here. */ +/* scan-build complains about derefencing a NULL pointer here. 
*/ #ifndef __clang_analyzer__ #define LOCK_BUG_ON(condition) \ if ((condition)) \ @@ -163,11 +162,6 @@ static inline void mutex_lock(mutex_t *m) } } -static inline bool mutex_trylock(mutex_t *m) -{ - return atomic_inc_return(&m->raw) == 1; -} - static inline void mutex_unlock(mutex_t *m) { uint32_t c = 0; diff --git a/include/common/scm-code.c b/include/common/scm-code.c index de581846b..d7d732587 100644 --- a/include/common/scm-code.c +++ b/include/common/scm-code.c @@ -96,7 +96,7 @@ int __recv_fds(int sock, int *fds, int nr_fds, void *data, unsigned ch_size, int min_fd = (cmsg->cmsg_len - sizeof(struct cmsghdr)) / sizeof(int); /* * In case if kernel screwed the recipient, most probably - * the caller stack frame will be overwritten, just scream + * the caller stack frame will be overwriten, just scream * and exit. * * FIXME Need to sanitize util.h to be able to include it diff --git a/include/common/scm.h b/include/common/scm.h index 5b6f78a8b..bcb198882 100644 --- a/include/common/scm.h +++ b/include/common/scm.h @@ -11,7 +11,7 @@ * Because of kernel doing kmalloc for user data passed * in SCM messages, and there is kernel's SCM_MAX_FD as a limit * for descriptors passed at once we're trying to reduce - * the pressure on kernel memory manager and use predefined + * the pressue on kernel memory manager and use predefined * known to work well size of the message buffer. 
*/ #define CR_SCM_MSG_SIZE (1024) diff --git a/lib/.gitignore b/lib/.gitignore deleted file mode 100644 index a10181b80..000000000 --- a/lib/.gitignore +++ /dev/null @@ -1 +0,0 @@ -pycriu.egg-info/ diff --git a/lib/Makefile b/lib/Makefile index 4b8a6cbb8..575a7bad3 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -2,6 +2,10 @@ CRIU_SO := libcriu.so CRIU_A := libcriu.a UAPI_HEADERS := lib/c/criu.h images/rpc.proto images/rpc.pb-c.h criu/include/version.h +# +# File to keep track of files installed by setup.py +CRIT_SETUP_FILES := lib/.crit-setup.files + all-y += lib-c lib-a lib-py # @@ -25,23 +29,23 @@ lib-a: lib/c/$(CRIU_A) # # Python bindings. -lib/pycriu/Makefile: ; -lib/pycriu/%: .FORCE +lib/py/Makefile: ; +lib/py/%: .FORCE $(call msg-gen, $@) - $(Q) $(MAKE) $(build)=lib/pycriu $@ + $(Q) $(MAKE) $(build)=lib/py $@ lib-py: - $(Q) $(MAKE) $(build)=lib/pycriu all + $(Q) $(MAKE) $(build)=lib/py all .PHONY: lib-py clean-lib: $(Q) $(MAKE) $(build)=lib/c clean - $(Q) $(MAKE) $(build)=lib/pycriu clean + $(Q) $(MAKE) $(build)=lib/py clean .PHONY: clean-lib clean: clean-lib cleanup-y += lib/c/$(CRIU_SO) lib/c/$(CRIU_A) lib/c/criu.pc mrproper: clean -install: lib-c lib-a lib-py lib/c/criu.pc.in +install: lib-c lib-a lib-py crit/crit lib/c/criu.pc.in $(E) " INSTALL " lib $(Q) mkdir -p $(DESTDIR)$(LIBDIR) $(Q) install -m 755 lib/c/$(CRIU_SO) $(DESTDIR)$(LIBDIR)/$(CRIU_SO).$(CRIU_SO_VERSION_MAJOR).$(CRIU_SO_VERSION_MINOR) @@ -54,12 +58,8 @@ install: lib-c lib-a lib-py lib/c/criu.pc.in $(Q) mkdir -p $(DESTDIR)$(LIBDIR)/pkgconfig $(Q) sed -e 's,@version@,$(CRIU_VERSION),' -e 's,@libdir@,$(LIBDIR),' -e 's,@includedir@,$(dir $(INCLUDEDIR)/criu/),' lib/c/criu.pc.in > lib/c/criu.pc $(Q) install -m 644 lib/c/criu.pc $(DESTDIR)$(LIBDIR)/pkgconfig -ifeq ($(SKIP_PIP_INSTALL),0) - $(E) " INSTALL " pycriu - $(Q) $(PYTHON) -m pip install $(PIPFLAGS) --prefix=$(DESTDIR)$(PREFIX) ./lib -else - $(E) " SKIP INSTALL pycriu" -endif + $(E) " INSTALL " crit + $(Q) $(PYTHON) scripts/crit-setup.py 
install --prefix=$(DESTDIR)$(PREFIX) --record $(CRIT_SETUP_FILES) .PHONY: install uninstall: @@ -71,10 +71,6 @@ uninstall: $(Q) $(RM) $(addprefix $(DESTDIR)$(INCLUDEDIR)/criu/,$(notdir $(UAPI_HEADERS))) $(E) " UNINSTALL" pkgconfig/criu.pc $(Q) $(RM) $(addprefix $(DESTDIR)$(LIBDIR)/pkgconfig/,criu.pc) -ifeq ($(SKIP_PIP_INSTALL),0) - $(E) " UNINSTALL" pycriu - $(Q) $(PYTHON) ./scripts/uninstall_module.py --prefix=$(DESTDIR)$(PREFIX) pycriu -else - $(E) " SKIP UNINSTALL pycriu" -endif + $(E) " UNINSTALL" crit + $(Q) while read -r file; do $(RM) "$$file"; done < $(CRIT_SETUP_FILES) .PHONY: uninstall diff --git a/lib/c/criu.c b/lib/c/criu.c index 485c8b178..75f168799 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -214,18 +214,6 @@ void criu_local_free_opts(criu_opts *opts) } opts->rpc->n_external = 0; - if (opts->rpc->join_ns) { - for (i = 0; i < opts->rpc->n_join_ns; i++) { - free(opts->rpc->join_ns[i]->ns); - free(opts->rpc->join_ns[i]->ns_file); - if (opts->rpc->join_ns[i]->extra_opt) { - free(opts->rpc->join_ns[i]->extra_opt); - } - free(opts->rpc->join_ns[i]); - } - } - opts->rpc->n_join_ns = 0; - if (opts->rpc->ps) { free(opts->rpc->ps->address); free(opts->rpc->ps); @@ -238,7 +226,6 @@ void criu_local_free_opts(criu_opts *opts) free(opts->rpc->freeze_cgroup); free(opts->rpc->log_file); free(opts->rpc->lsm_profile); - free(opts->rpc->lsm_mount_context); free(opts->rpc); criu_free_service(opts); free(opts); @@ -352,8 +339,8 @@ int criu_set_parent_images(const char *path) int criu_local_set_pre_dump_mode(criu_opts *opts, enum criu_pre_dump_mode mode) { + opts->rpc->has_pre_dump_mode = true; if (mode == CRIU_PRE_DUMP_SPLICE || mode == CRIU_PRE_DUMP_READ) { - opts->rpc->has_pre_dump_mode = true; opts->rpc->pre_dump_mode = (CriuPreDumpMode)mode; return 0; } @@ -555,28 +542,6 @@ void criu_set_shell_job(bool shell_job) criu_local_set_shell_job(global_opts, shell_job); } -void criu_local_set_skip_file_rwx_check(criu_opts *opts, bool skip_file_rwx_check) -{ - 
opts->rpc->has_skip_file_rwx_check = true; - opts->rpc->skip_file_rwx_check = skip_file_rwx_check; -} - -void criu_set_skip_file_rwx_check(bool skip_file_rwx_check) -{ - criu_local_set_skip_file_rwx_check(global_opts, skip_file_rwx_check); -} - -void criu_local_set_unprivileged(criu_opts *opts, bool unprivileged) -{ - opts->rpc->has_unprivileged = true; - opts->rpc->unprivileged = unprivileged; -} - -void criu_set_unprivileged(bool unprivileged) -{ - criu_local_set_unprivileged(global_opts, unprivileged); -} - void criu_local_set_orphan_pts_master(criu_opts *opts, bool orphan_pts_master) { opts->rpc->has_orphan_pts_master = true; @@ -674,20 +639,6 @@ int criu_set_lsm_profile(const char *name) return criu_local_set_lsm_profile(global_opts, name); } -int criu_local_set_lsm_mount_context(criu_opts *opts, const char *name) -{ - opts->rpc->lsm_mount_context = strdup(name); - if (opts->rpc->lsm_mount_context == NULL) { - return -ENOMEM; - } - return 0; -} - -int criu_set_lsm_mount_context(const char *name) -{ - return criu_local_set_lsm_mount_context(global_opts, name); -} - void criu_local_set_timeout(criu_opts *opts, unsigned int timeout) { opts->rpc->timeout = timeout; @@ -1192,17 +1143,6 @@ int criu_set_page_server_address_port(const char *address, int port) return criu_local_set_page_server_address_port(global_opts, address, port); } -void criu_local_set_mntns_compat_mode(criu_opts *opts, bool val) -{ - opts->rpc->has_mntns_compat_mode = true; - opts->rpc->mntns_compat_mode = val; -} - -void criu_set_mntns_compat_mode(bool val) -{ - criu_local_set_mntns_compat_mode(global_opts, val); -} - static CriuResp *recv_resp(int socket_fd) { struct msghdr msg_hdr = { 0 }; @@ -1560,7 +1500,7 @@ int criu_check(void) return criu_local_check(global_opts); } -static int dump(bool pre_dump, criu_opts *opts) +int criu_local_dump(criu_opts *opts) { int ret = -1; CriuReq req = CRIU_REQ__INIT; @@ -1568,7 +1508,7 @@ static int dump(bool pre_dump, criu_opts *opts) saved_errno = 0; - 
req.type = pre_dump ? CRIU_REQ_TYPE__SINGLE_PRE_DUMP : CRIU_REQ_TYPE__DUMP; + req.type = CRIU_REQ_TYPE__DUMP; req.opts = opts->rpc; ret = send_req_and_recv_resp(opts, &req, &resp); @@ -1576,7 +1516,7 @@ static int dump(bool pre_dump, criu_opts *opts) goto exit; if (resp->success) { - if (!pre_dump && resp->dump->has_restored && resp->dump->restored) + if (resp->dump->has_restored && resp->dump->restored) ret = 1; else ret = 0; @@ -1594,26 +1534,11 @@ exit: return ret; } -int criu_local_dump(criu_opts *opts) -{ - return dump(false, opts); -} - int criu_dump(void) { return criu_local_dump(global_opts); } -int criu_local_pre_dump(criu_opts *opts) -{ - return dump(true, opts); -} - -int criu_pre_dump(void) -{ - return criu_local_pre_dump(global_opts); -} - int criu_local_dump_iters(criu_opts *opts, int (*more)(criu_predump_info pi)) { int ret = -1, fd = -1, uret; @@ -1867,8 +1792,8 @@ void criu_set_pidfd_store_sk(int sk) int criu_local_set_network_lock(criu_opts *opts, enum criu_network_lock_method method) { - if (method == CRIU_NETWORK_LOCK_IPTABLES || method == CRIU_NETWORK_LOCK_NFTABLES || method == CRIU_NETWORK_LOCK_SKIP) { - opts->rpc->has_network_lock = true; + opts->rpc->has_network_lock = true; + if (method == CRIU_NETWORK_LOCK_IPTABLES || method == CRIU_NETWORK_LOCK_NFTABLES) { opts->rpc->network_lock = (CriuNetworkLockMethod)method; return 0; } @@ -1879,184 +1804,3 @@ int criu_set_network_lock(enum criu_network_lock_method method) { return criu_local_set_network_lock(global_opts, method); } - -int criu_local_join_ns_add(criu_opts *opts, const char *ns, const char *ns_file, const char *extra_opt) -{ - int n_join_ns; - char *_ns = NULL, *_ns_file = NULL, *_extra_opt = NULL; - JoinNamespace **join_ns_arr, *join_ns = NULL; - - if (!ns) { - fprintf(stderr, "ns parameter for join_ns is not specified"); - goto err; - } - - _ns = strdup(ns); - if (!_ns) { - perror("Can't allocate memory for ns"); - goto err; - } - - if (!ns_file) { - fprintf(stderr, "ns parameter for 
join_ns is not specified"); - goto err; - } - - _ns_file = strdup(ns_file); - if (!_ns_file) { - perror("Can't allocate memory for ns_file"); - goto err; - } - - if (extra_opt) { - _extra_opt = strdup(extra_opt); - if (!_extra_opt) { - perror("Can't allocate memory for extra_opt"); - goto err; - } - } - - join_ns = malloc(sizeof(JoinNamespace)); - if (!join_ns) { - perror("Can't allocate memory for join_ns"); - goto err; - } - - n_join_ns = opts->rpc->n_join_ns + 1; - join_ns_arr = realloc(opts->rpc->join_ns, n_join_ns * sizeof(join_ns)); - if (!join_ns_arr) { - perror("Can't allocate memory for join_ns_arr"); - goto err; - } - - join_namespace__init(join_ns); - join_ns->ns = _ns; - join_ns->ns_file = _ns_file; - if (_extra_opt) { - join_ns->extra_opt = _extra_opt; - } - - join_ns_arr[n_join_ns - 1] = join_ns; - opts->rpc->join_ns = join_ns_arr; - opts->rpc->n_join_ns = n_join_ns; - - return 0; - -err: - if (_ns) - free(_ns); - if (_ns_file) - free(_ns_file); - if (_extra_opt) - free(_extra_opt); - if (join_ns) - free(join_ns); - return -1; -} - -int criu_join_ns_add(const char *ns, const char *ns_file, const char *extra_opt) -{ - return criu_local_join_ns_add(global_opts, ns, ns_file, extra_opt); -} - -int criu_local_feature_check(criu_opts *opts, struct criu_feature_check *features, size_t size) -{ - CriuFeatures criu_features = CRIU_FEATURES__INIT; - struct criu_feature_check features_copy = { 0 }; - CriuReq req = CRIU_REQ__INIT; - CriuResp *resp = NULL; - int ret = -1; - - saved_errno = 0; - - if (!features) - goto exit; - - if (size > sizeof(struct criu_feature_check)) - goto exit; - - memcpy(&features_copy, features, size); - - req.type = CRIU_REQ_TYPE__FEATURE_CHECK; - req.opts = opts->rpc; - - if (features_copy.mem_track) { - criu_features.has_mem_track = true; - criu_features.mem_track = true; - } - if (features_copy.lazy_pages) { - criu_features.has_lazy_pages = true; - criu_features.lazy_pages = true; - } - if (features_copy.pidfd_store) { - 
criu_features.has_pidfd_store = true; - criu_features.pidfd_store = true; - } - req.features = &criu_features; - - ret = send_req_and_recv_resp(opts, &req, &resp); - if (ret) - goto exit; - - memset(&features_copy, 0, sizeof(struct criu_feature_check)); - - if (resp->success) { - if (resp->features->has_mem_track) { - features_copy.mem_track = resp->features->mem_track; - } - if (resp->features->has_lazy_pages) { - features_copy.lazy_pages = resp->features->lazy_pages; - } - if (resp->features->has_pidfd_store) { - features_copy.pidfd_store = resp->features->pidfd_store; - } - memcpy(features, &features_copy, size); - } else { - ret = -EBADE; - } - -exit: - if (resp) - criu_resp__free_unpacked(resp, NULL); - - swrk_wait(opts); - - errno = saved_errno; - - return ret; -} - -int criu_feature_check(struct criu_feature_check *features, size_t size) -{ - return criu_local_feature_check(global_opts, features, size); -} - -void criu_local_set_empty_ns(criu_opts *opts, int namespaces) -{ - opts->rpc->has_empty_ns = true; - opts->rpc->empty_ns = namespaces; -} - -void criu_set_empty_ns(int namespaces) -{ - criu_local_set_empty_ns(global_opts, namespaces); -} - -int criu_local_set_config_file(criu_opts *opts, const char *path) -{ - char *new; - - new = strdup(path); - if (!new) - return -ENOMEM; - - free(opts->rpc->config_file); - opts->rpc->config_file = new; - - return 0; -} - -int criu_set_config_file(const char *path) -{ - return criu_local_set_config_file(global_opts, path); -} diff --git a/lib/c/criu.h b/lib/c/criu.h index 44446f664..258e33a19 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -28,13 +28,6 @@ extern "C" { #endif -#define CRIU_LOG_UNSET (-1) -#define CRIU_LOG_MSG (0) /* Print message regardless of log level */ -#define CRIU_LOG_ERROR (1) /* Errors only */ -#define CRIU_LOG_WARN (2) /* Warnings */ -#define CRIU_LOG_INFO (3) /* Informative */ -#define CRIU_LOG_DEBUG (4) /* Debug only */ - enum criu_service_comm { CRIU_COMM_SK, CRIU_COMM_FD, CRIU_COMM_BIN }; 
enum criu_cg_mode { @@ -50,7 +43,6 @@ enum criu_cg_mode { enum criu_network_lock_method { CRIU_NETWORK_LOCK_IPTABLES = 1, CRIU_NETWORK_LOCK_NFTABLES = 2, - CRIU_NETWORK_LOCK_SKIP = 3, }; enum criu_pre_dump_mode { CRIU_PRE_DUMP_SPLICE = 1, CRIU_PRE_DUMP_READ = 2 }; @@ -79,8 +71,6 @@ void criu_set_tcp_close(bool tcp_close); void criu_set_weak_sysctls(bool val); void criu_set_evasive_devices(bool evasive_devices); void criu_set_shell_job(bool shell_job); -void criu_set_skip_file_rwx_check(bool skip_file_rwx_check); -void criu_set_unprivileged(bool unprivileged); void criu_set_orphan_pts_master(bool orphan_pts_master); void criu_set_file_locks(bool file_locks); void criu_set_track_mem(bool track_mem); @@ -95,7 +85,6 @@ void criu_set_manage_cgroups(bool manage); void criu_set_manage_cgroups_mode(enum criu_cg_mode mode); int criu_set_freeze_cgroup(const char *name); int criu_set_lsm_profile(const char *name); -int criu_set_lsm_mount_context(const char *name); void criu_set_timeout(unsigned int timeout); void criu_set_auto_ext_mnt(bool val); void criu_set_ext_sharing(bool val); @@ -114,9 +103,6 @@ int criu_set_page_server_address_port(const char *address, int port); int criu_set_pre_dump_mode(enum criu_pre_dump_mode mode); void criu_set_pidfd_store_sk(int sk); int criu_set_network_lock(enum criu_network_lock_method method); -int criu_join_ns_add(const char *ns, const char *ns_file, const char *extra_opt); -void criu_set_mntns_compat_mode(bool val); -int criu_set_config_file(const char *path); /* * The criu_notify_arg_t na argument is an opaque @@ -166,7 +152,6 @@ int criu_get_orphan_pts_master_fd(void); */ int criu_check(void); int criu_dump(void); -int criu_pre_dump(void); int criu_restore(void); int criu_restore_child(void); @@ -191,7 +176,7 @@ int criu_dump_iters(int (*more)(criu_predump_info pi)); * As this library is just forwarding all tasks to an * independent (of this library) CRIU binary, the actual * version of the CRIU binary can be different then the - * 
hardcoded values in the library (version.h). + * hardcoded values in the libary (version.h). * To be able to easily check the version of the CRIU binary * the function criu_get_version() returns the version * in the following format: @@ -242,7 +227,6 @@ void criu_local_set_tcp_close(criu_opts *opts, bool tcp_close); void criu_local_set_weak_sysctls(criu_opts *opts, bool val); void criu_local_set_evasive_devices(criu_opts *opts, bool evasive_devices); void criu_local_set_shell_job(criu_opts *opts, bool shell_job); -void criu_local_set_skip_file_rwx_check(criu_opts *opts, bool skip_file_rwx_check); void criu_local_set_orphan_pts_master(criu_opts *opts, bool orphan_pts_master); void criu_local_set_file_locks(criu_opts *opts, bool file_locks); void criu_local_set_track_mem(criu_opts *opts, bool track_mem); @@ -257,7 +241,6 @@ void criu_local_set_manage_cgroups(criu_opts *opts, bool manage); void criu_local_set_manage_cgroups_mode(criu_opts *opts, enum criu_cg_mode mode); int criu_local_set_freeze_cgroup(criu_opts *opts, const char *name); int criu_local_set_lsm_profile(criu_opts *opts, const char *name); -int criu_local_set_lsm_mount_context(criu_opts *opts, const char *name); void criu_local_set_timeout(criu_opts *opts, unsigned int timeout); void criu_local_set_auto_ext_mnt(criu_opts *opts, bool val); void criu_local_set_ext_sharing(criu_opts *opts, bool val); @@ -280,15 +263,11 @@ int criu_local_set_page_server_address_port(criu_opts *opts, const char *address int criu_local_set_pre_dump_mode(criu_opts *opts, enum criu_pre_dump_mode mode); void criu_local_set_pidfd_store_sk(criu_opts *opts, int sk); int criu_local_set_network_lock(criu_opts *opts, enum criu_network_lock_method method); -int criu_local_join_ns_add(criu_opts *opts, const char *ns, const char *ns_file, const char *extra_opt); -void criu_local_set_mntns_compat_mode(criu_opts *opts, bool val); -int criu_local_set_config_file(criu_opts *opts, const char *path); void criu_local_set_notify_cb(criu_opts 
*opts, int (*cb)(char *action, criu_notify_arg_t na)); int criu_local_check(criu_opts *opts); int criu_local_dump(criu_opts *opts); -int criu_local_pre_dump(criu_opts *opts); int criu_local_restore(criu_opts *opts); int criu_local_restore_child(criu_opts *opts); int criu_local_dump_iters(criu_opts *opts, int (*more)(criu_predump_info pi)); @@ -296,38 +275,6 @@ int criu_local_dump_iters(criu_opts *opts, int (*more)(criu_predump_info pi)); int criu_local_get_version(criu_opts *opts); int criu_local_check_version(criu_opts *opts, int minimum); -/* - * Feature checking allows the user to check if CRIU supports - * certain features. There are CRIU features which do not depend - * on the version of CRIU but on kernel features or architecture. - * - * One example is memory tracking. Memory tracking can be disabled - * in the kernel or there are architectures which do not support - * it (aarch64 for example). By using the feature check a libcriu - * user can easily query CRIU if a certain feature is available. - * - * The features which should be checked can be marked in the - * structure 'struct criu_feature_check'. Each structure member - * that is set to true will result in CRIU checking for the - * availability of that feature in the current combination of - * CRIU/kernel/architecture. - * - * Available features will be set to true when the function - * returns successfully. Missing features will be set to false. 
- */ - -struct criu_feature_check { - bool mem_track; - bool lazy_pages; - bool pidfd_store; -}; - -int criu_feature_check(struct criu_feature_check *features, size_t size); -int criu_local_feature_check(criu_opts *opts, struct criu_feature_check *features, size_t size); - -void criu_local_set_empty_ns(criu_opts *opts, int namespaces); -void criu_set_empty_ns(int namespaces); - #ifdef __GNUG__ } #endif diff --git a/lib/py/.gitignore b/lib/py/.gitignore new file mode 100644 index 000000000..d3090fca3 --- /dev/null +++ b/lib/py/.gitignore @@ -0,0 +1,2 @@ +*_pb2.py +*.pyc diff --git a/lib/pycriu/Makefile b/lib/py/Makefile similarity index 66% rename from lib/pycriu/Makefile rename to lib/py/Makefile index 5ce9bc8f7..691b6bdd3 100644 --- a/lib/pycriu/Makefile +++ b/lib/py/Makefile @@ -1,4 +1,4 @@ -all-y += libpy-images rpc_pb2.py version.py +all-y += libpy-images rpc_pb2.py $(obj)/images/Makefile: ; $(obj)/images/%: .FORCE @@ -11,10 +11,7 @@ libpy-images: rpc_pb2.py: $(Q) protoc -I=images/ --python_out=$(obj) images/$(@:_pb2.py=.proto) -version.py: - $(Q) echo "__version__ = '${CRIU_VERSION}'" > $(obj)/$@ - -cleanup-y += $(addprefix $(obj)/,rpc_pb2.py *.pyc version.py) +cleanup-y += $(addprefix $(obj)/,rpc_pb2.py *.pyc) clean-lib-py: $(Q) $(MAKE) $(build)=$(obj)/images clean diff --git a/lib/py/__init__.py b/lib/py/__init__.py new file mode 100644 index 000000000..96b3e9526 --- /dev/null +++ b/lib/py/__init__.py @@ -0,0 +1,3 @@ +from . import rpc_pb2 as rpc +from . import images +from .criu import * diff --git a/crit/crit/__main__.py b/lib/py/cli.py similarity index 94% rename from crit/crit/__main__.py rename to lib/py/cli.py index bce523445..5419384c3 100755 --- a/crit/crit/__main__.py +++ b/lib/py/cli.py @@ -1,17 +1,18 @@ -#!/usr/bin/env python3 +from __future__ import print_function import argparse import sys import json import os import pycriu -from . 
import __version__ def inf(opts): if opts['in']: return open(opts['in'], 'rb') else: + if (sys.version_info < (3, 0)): + return sys.stdin if sys.stdin.isatty(): # If we are reading from a terminal (not a pipe) we want text input and not binary return sys.stdin @@ -27,6 +28,8 @@ def outf(opts, decode): mode = 'w+' return open(opts['out'], mode) else: + if (sys.version_info < (3, 0)): + return sys.stdout if decode: return sys.stdout return sys.stdout.buffer @@ -42,9 +45,9 @@ def decode(opts): try: img = pycriu.images.load(inf(opts), opts['pretty'], opts['nopl']) except pycriu.images.MagicException as exc: - print("Unknown magic %#x.\n" - "Maybe you are feeding me an image with " - "raw data(i.e. pages.img)?" % exc.magic, file=sys.stderr) + print("Unknown magic %#x.\n"\ + "Maybe you are feeding me an image with "\ + "raw data(i.e. pages.img)?" % exc.magic, file=sys.stderr) sys.exit(1) if opts['pretty']: @@ -60,9 +63,9 @@ def encode(opts): try: img = json.load(inf(opts)) except UnicodeDecodeError: - print("Cannot read JSON.\n" - "Maybe you are feeding me an image with protobuf data? " - "Encode expects JSON input.", file=sys.stderr) + print("Cannot read JSON.\n"\ + "Maybe you are feeding me an image with protobuf data? 
"\ + "Encode expects JSON input.", file=sys.stderr) sys.exit(1) pycriu.images.dump(img, outf(opts, False)) @@ -132,7 +135,7 @@ def ftype_find_in_files(opts, ft, fid): if files_img is None: try: files_img = pycriu.images.load(dinf(opts, "files.img"))['entries'] - except Exception: + except: files_img = [] if len(files_img) == 0: @@ -323,12 +326,12 @@ def explore_rss(opts): pvmi = -1 for pm in pms[1:]: pstr = '\t%lx / %-8d' % (pm['vaddr'], pm['nr_pages']) - while vmi < len(vmas) and vmas[vmi]['end'] <= pm['vaddr']: + while vmas[vmi]['end'] <= pm['vaddr']: vmi += 1 pme = pm['vaddr'] + (pm['nr_pages'] << 12) vstr = '' - while vmi < len(vmas) and vmas[vmi]['start'] < pme: + while vmas[vmi]['start'] < pme: vma = vmas[vmi] if vmi == pvmi: vstr += ' ~' @@ -365,7 +368,6 @@ def main(): desc = 'CRiu Image Tool' parser = argparse.ArgumentParser( description=desc, formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument('--version', action='version', version=__version__) subparsers = parser.add_subparsers( help='Use crit CMD --help for command-specific help') @@ -375,7 +377,8 @@ def main(): 'decode', help='convert criu image from binary type to json') decode_parser.add_argument( '--pretty', - help='Multiline with indents and some numerical fields in field-specific format', + help= + 'Multiline with indents and some numerical fields in field-specific format', action='store_true') decode_parser.add_argument( '-i', diff --git a/lib/pycriu/criu.py b/lib/py/criu.py similarity index 89% rename from lib/pycriu/criu.py rename to lib/py/criu.py index 51a5c2902..f3e018095 100644 --- a/lib/pycriu/criu.py +++ b/lib/py/criu.py @@ -8,7 +8,6 @@ import struct import pycriu.rpc_pb2 as rpc -CR_DEFAULT_SERVICE_ADDRESS = "./criu_service.socket" class _criu_comm: """ @@ -46,14 +45,7 @@ class _criu_comm_sk(_criu_comm): def connect(self, daemon): self.sk = socket.socket(socket.AF_UNIX, socket.SOCK_SEQPACKET) - try: - self.sk.connect(self.comm) - - except FileNotFoundError: - raise 
FileNotFoundError("Socket file not found.") - - except ConnectionRefusedError: - raise ConnectionRefusedError("Service not running.") + self.sk.connect(self.comm) return self.sk @@ -111,7 +103,7 @@ class _criu_comm_bin(_criu_comm): os.close(2) css[0].send(struct.pack('i', os.getpid())) - os.execvp(self.comm, + os.execv(self.comm, [self.comm, 'swrk', "%d" % css[0].fileno()]) os._exit(1) @@ -189,14 +181,15 @@ class CRIUExceptionExternal(CRIUException): if self.errno == errno.EBADRQC: s += "Bad options" - elif self.typ == rpc.DUMP and self.errno == errno.ESRCH: - s += "No process with such pid" + if self.typ == rpc.DUMP: + if self.errno == errno.ESRCH: + s += "No process with such pid" - elif self.typ == rpc.RESTORE and self.errno == errno.EEXIST: - s += "Process with requested pid already exists" + if self.typ == rpc.RESTORE: + if self.errno == errno.EEXIST: + s += "Process with requested pid already exists" - else: - s += "Unknown" + s += "Unknown" return s @@ -211,11 +204,10 @@ class criu: def __init__(self): self.use_binary('criu') - # images_dir_fd is required field with default value of -1 - self.opts = rpc.criu_opts(images_dir_fd=-1) + self.opts = rpc.criu_opts() self.sk = None - def use_sk(self, sk_name=CR_DEFAULT_SERVICE_ADDRESS): + def use_sk(self, sk_name): """ Access criu using unix socket which that belongs to criu service daemon. """ @@ -242,7 +234,7 @@ class criu: # process resources from its own if criu is located in a same # process tree it is trying to dump. 
daemon = False - if req.type == rpc.DUMP and (not req.opts.HasField('pid') or req.opts.pid == os.getpid()): + if req.type == rpc.DUMP and not req.opts.HasField('pid'): daemon = True try: @@ -274,7 +266,6 @@ class criu: """ req = rpc.criu_req() req.type = rpc.CHECK - req.opts.MergeFrom(self.opts) resp = self._send_req_and_recv_resp(req) diff --git a/lib/pycriu/images/.gitignore b/lib/py/images/.gitignore similarity index 100% rename from lib/pycriu/images/.gitignore rename to lib/py/images/.gitignore diff --git a/lib/pycriu/images/Makefile b/lib/py/images/Makefile similarity index 100% rename from lib/pycriu/images/Makefile rename to lib/py/images/Makefile diff --git a/lib/pycriu/images/__init__.py b/lib/py/images/__init__.py similarity index 100% rename from lib/pycriu/images/__init__.py rename to lib/py/images/__init__.py diff --git a/lib/pycriu/images/images.py b/lib/py/images/images.py similarity index 89% rename from lib/pycriu/images/images.py rename to lib/py/images/images.py index 9db506e1e..300b1cc69 100644 --- a/lib/pycriu/images/images.py +++ b/lib/py/images/images.py @@ -42,6 +42,7 @@ import base64 import struct import os import array +import sys from . import magic from . import pb @@ -68,16 +69,6 @@ class MagicException(Exception): self.magic = magic -def decode_base64_data(data): - """A helper function to decode base64 data.""" - return base64.decodebytes(str.encode(data)) - - -def write_base64_data(f, data): - """A helper function to write base64 encoded data to a file.""" - f.write(base64.decodebytes(str.encode(data))) - - # Generic class to handle loading/dumping criu images entries from/to bin # format to/from dict(json). class entry_handler: @@ -96,7 +87,7 @@ class entry_handler: def load(self, f, pretty=False, no_payload=False): """ Convert criu image entries from binary format to dict(json). - Takes a file-like object and returns a list with entries in + Takes a file-like object and returnes a list with entries in dict(json) format. 
""" entries = [] @@ -107,7 +98,7 @@ class entry_handler: # Read payload pbuff = self.payload() buf = f.read(4) - if len(buf) == 0: + if buf == b'': break size, = struct.unpack('i', buf) pbuff.ParseFromString(f.read(size)) @@ -181,7 +172,7 @@ class entry_handler: while True: buf = f.read(4) - if len(buf) == 0: + if buf == '': break size, = struct.unpack('i', buf) f.seek(size, 1) @@ -204,7 +195,7 @@ class pagemap_handler: pbuff = pb.pagemap_head() while True: buf = f.read(4) - if len(buf) == 0: + if buf == b'': break size, = struct.unpack('i', buf) pbuff.ParseFromString(f.read(size)) @@ -294,9 +285,15 @@ class ghost_file_handler: size = len(pb_str) f.write(struct.pack('i', size)) f.write(pb_str) - write_base64_data(f, item['extra']) + if (sys.version_info > (3, 0)): + f.write(base64.decodebytes(str.encode(item['extra']))) + else: + f.write(base64.decodebytes(item['extra'])) else: - write_base64_data(f, item['extra']) + if (sys.version_info > (3, 0)): + f.write(base64.decodebytes(str.encode(item['extra']))) + else: + f.write(base64.decodebytes(item['extra'])) def dumps(self, entries): f = io.BytesIO('') @@ -317,7 +314,10 @@ class pipes_data_extra_handler: return base64.encodebytes(data).decode('utf-8') def dump(self, extra, f, pload): - data = decode_base64_data(extra) + if (sys.version_info > (3, 0)): + data = base64.decodebytes(str.encode(extra)) + else: + data = base64.decodebytes(extra) f.write(data) def skip(self, f, pload): @@ -332,7 +332,10 @@ class sk_queues_extra_handler: return base64.encodebytes(data).decode('utf-8') def dump(self, extra, f, _unused): - data = decode_base64_data(extra) + if (sys.version_info > (3, 0)): + data = base64.decodebytes(str.encode(extra)) + else: + data = base64.decodebytes(extra) f.write(data) def skip(self, f, pload): @@ -353,8 +356,12 @@ class tcp_stream_extra_handler: return d def dump(self, extra, f, _unused): - inq = decode_base64_data(extra['inq']) - outq = decode_base64_data(extra['outq']) + if (sys.version_info > (3, 0)): 
+ inq = base64.decodebytes(str.encode(extra['inq'])) + outq = base64.decodebytes(str.encode(extra['outq'])) + else: + inq = base64.decodebytes(extra['inq']) + outq = base64.decodebytes(extra['outq']) f.write(inq) f.write(outq) @@ -363,7 +370,6 @@ class tcp_stream_extra_handler: f.seek(0, os.SEEK_END) return pbuff.inq_len + pbuff.outq_len - class bpfmap_data_extra_handler: def load(self, f, pload): size = pload.keys_bytes + pload.values_bytes @@ -378,13 +384,14 @@ class bpfmap_data_extra_handler: f.seek(pload.bytes, os.SEEK_CUR) return pload.bytes - class ipc_sem_set_handler: def load(self, f, pbuff): entry = pb2dict.pb2dict(pbuff) size = sizeof_u16 * entry['nsems'] rounded = round_up(size, sizeof_u64) - s = self._get_sem_array() + s = array.array('H') + if s.itemsize != sizeof_u16: + raise Exception("Array size mismatch") s.frombytes(f.read(size)) f.seek(rounded - size, 1) return s.tolist() @@ -393,7 +400,9 @@ class ipc_sem_set_handler: entry = pb2dict.pb2dict(pbuff) size = sizeof_u16 * entry['nsems'] rounded = round_up(size, sizeof_u64) - s = self._get_sem_array() + s = array.array('H') + if s.itemsize != sizeof_u16: + raise Exception("Array size mismatch") s.fromlist(extra) if len(s) != entry['nsems']: raise Exception("Number of semaphores mismatch") @@ -406,16 +415,23 @@ class ipc_sem_set_handler: f.seek(round_up(size, sizeof_u64), os.SEEK_CUR) return size - def _get_sem_array(self): - s = array.array('H') - if s.itemsize != sizeof_u16: - raise Exception("Array size mismatch") - return s - class ipc_msg_queue_handler: def load(self, f, pbuff): - messages, _ = self._read_messages(f, pbuff) + entry = pb2dict.pb2dict(pbuff) + messages = [] + for x in range(0, entry['qnum']): + buf = f.read(4) + if buf == '': + break + size, = struct.unpack('i', buf) + msg = pb.ipc_msg() + msg.ParseFromString(f.read(size)) + rounded = round_up(msg.msize, sizeof_u64) + data = f.read(msg.msize) + f.seek(rounded - msg.msize, 1) + messages.append(pb2dict.pb2dict(msg)) + 
messages.append(base64.encodebytes(data).decode('utf-8')) return messages def dump(self, extra, f, pbuff): @@ -427,37 +443,28 @@ class ipc_msg_queue_handler: f.write(struct.pack('i', size)) f.write(msg_str) rounded = round_up(msg.msize, sizeof_u64) - data = decode_base64_data(extra[i + 1]) + if (sys.version_info > (3, 0)): + data = base64.decodebytes(str.encode(extra[i + 1])) + else: + data = base64.decodebytes(extra[i + 1]) f.write(data[:msg.msize]) f.write(b'\0' * (rounded - msg.msize)) def skip(self, f, pbuff): - _, pl_len = self._read_messages(f, pbuff, skip_data=True) - return pl_len - - def _read_messages(self, f, pbuff, skip_data=False): entry = pb2dict.pb2dict(pbuff) - messages = [] pl_len = 0 for x in range(0, entry['qnum']): buf = f.read(4) - if len(buf) == 0: + if buf == '': break size, = struct.unpack('i', buf) msg = pb.ipc_msg() msg.ParseFromString(f.read(size)) rounded = round_up(msg.msize, sizeof_u64) + f.seek(rounded, os.SEEK_CUR) pl_len += size + msg.msize - if skip_data: - f.seek(rounded, os.SEEK_CUR) - else: - data = f.read(msg.msize) - f.seek(rounded - msg.msize, 1) - messages.append(pb2dict.pb2dict(msg)) - messages.append(base64.encodebytes(data).decode('utf-8')) - - return messages, pl_len + return pl_len class ipc_shm_handler: @@ -553,7 +560,7 @@ handlers = { 'MEMFD_INODE': entry_handler(pb.memfd_inode_entry), 'BPFMAP_FILE': entry_handler(pb.bpfmap_file_entry), 'BPFMAP_DATA': entry_handler(pb.bpfmap_data_entry, - bpfmap_data_extra_handler()), + bpfmap_data_extra_handler()), 'APPARMOR': entry_handler(pb.apparmor_entry), } @@ -567,12 +574,12 @@ def __rhandler(f): try: m = magic.by_val[img_magic] - except Exception: + except: raise MagicException(img_magic) try: handler = handlers[m] - except Exception: + except: raise Exception("No handler found for image with magic " + m) return m, handler @@ -634,7 +641,7 @@ def dump(img, f): try: handler = handlers[m] - except Exception: + except: raise Exception("No handler found for image with such magic") 
handler.dump(img['entries'], f) diff --git a/lib/pycriu/images/pb2dict.py b/lib/py/images/pb2dict.py similarity index 95% rename from lib/pycriu/images/pb2dict.py rename to lib/py/images/pb2dict.py index f22887a52..cfaff6c7d 100644 --- a/lib/pycriu/images/pb2dict.py +++ b/lib/py/images/pb2dict.py @@ -3,6 +3,7 @@ import collections import os import quopri import socket +import sys from ipaddress import IPv4Address, IPv6Address, ip_address from google.protobuf.descriptor import FieldDescriptor as FD @@ -22,7 +23,7 @@ if "encodebytes" not in dir(base64): # here are some of them: # - both have a common bug in treating optional field with empty # repeated inside. -# - protobuf-to-json is not available in pip or in any other python +# - protobuf-to-json is not avalible in pip or in any other python # repo, so it is hard to distribute and we can't rely on it. # - both do not treat enums in a way we would like to. They convert # protobuf enum to int, but we need a string here, because it is @@ -83,7 +84,6 @@ mmap_prot_map = [ mmap_flags_map = [ ('MAP_SHARED', 0x1), ('MAP_PRIVATE', 0x2), - ('MAP_DROPPABLE', 0x08), ('MAP_ANON', 0x20), ('MAP_GROWSDOWN', 0x0100), ] @@ -103,9 +103,6 @@ mmap_status_map = [ ('VMA_AREA_SOCKET', 1 << 11), ('VMA_AREA_VVAR', 1 << 12), ('VMA_AREA_AIORING', 1 << 13), - ('VMA_AREA_MEMFD', 1 << 14), - ('VMA_AREA_SHSTK', 1 << 15), - ('VMA_AREA_UPROBES', 1 << 17), ('VMA_UNSUPP', 1 << 31), ] @@ -154,9 +151,8 @@ flags_maps = { gen_maps = { 'task_state': { 1: 'Alive', - 2: 'Dead', - 3: 'Stopped', - 6: 'Zombie', + 3: 'Zombie', + 6: 'Stopped' }, } @@ -251,11 +247,17 @@ def encode_dev(field, value): def encode_base64(value): - return base64.encodebytes(value).decode() + if (sys.version_info > (3, 0)): + return base64.encodebytes(value).decode() + else: + return base64.encodebytes(value) def decode_base64(value): - return base64.decodebytes(str.encode(value)) + if (sys.version_info > (3, 0)): + return base64.decodebytes(str.encode(value)) + else: + return 
base64.decodebytes(value) def encode_unix(value): @@ -307,7 +309,7 @@ def _pb2dict_cast(field, value, pretty=False, is_hex=False): return field.enum_type.values_by_number.get(value, None).name elif field.type in _basic_cast: cast = _basic_cast[field.type] - if pretty and cast is int: + if pretty and (cast == int): if is_hex: # Fields that have (criu).hex = true option set # should be stored in hex string format. @@ -362,24 +364,21 @@ def pb2dict(pb, pretty=False, is_hex=False): else: d_val = _pb2dict_cast(field, value, pretty, is_hex) - try: - d[field.name] = d_val.decode() - except (UnicodeDecodeError, AttributeError): - d[field.name] = d_val + d[field.name] = d_val.decode() if type(d_val) == bytes else d_val return d def _dict2pb_cast(field, value): # Not considering TYPE_MESSAGE here, as repeated # and non-repeated messages need special treatment - # in this case, and are handled separately. + # in this case, and are hadled separately. if field.type == FD.TYPE_BYTES: return get_bytes_dec(field)(value) elif field.type == FD.TYPE_ENUM: return field.enum_type.values_by_name.get(value, None).number elif field.type in _basic_cast: cast = _basic_cast[field.type] - if cast is int and is_string(value): + if (cast == int) and is_string(value): if _marked_as_dev(field): return encode_dev(field, value) diff --git a/lib/pycriu/.gitignore b/lib/pycriu/.gitignore deleted file mode 100644 index 111642787..000000000 --- a/lib/pycriu/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -__pycache__ -*_pb2.py -*.pyc -version.py diff --git a/lib/pycriu/__init__.py b/lib/pycriu/__init__.py deleted file mode 100644 index 28f1e9424..000000000 --- a/lib/pycriu/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from . import rpc_pb2 as rpc -from . 
import images -from .criu import criu, CRIUExceptionExternal, CRIUException -from .criu import CR_DEFAULT_SERVICE_ADDRESS -from .version import __version__ - -__all__ = ( - "rpc", - "images", - "criu", - "CRIUExceptionExternal", - "CRIUException", - "CR_DEFAULT_SERVICE_ADDRESS", - "__version__", -) \ No newline at end of file diff --git a/lib/pyproject.toml b/lib/pyproject.toml deleted file mode 100644 index ea9f88dcc..000000000 --- a/lib/pyproject.toml +++ /dev/null @@ -1,20 +0,0 @@ -[build-system] -requires = ["setuptools", "protobuf<4.0"] -build-backend = "setuptools.build_meta" - -[project] -name = "pycriu" -description = "Python bindings for CRIU" -authors = [ - {name = "CRIU team", email = "criu@lists.linux.dev"}, -] -license = {text = "LGPLv2.1"} -dynamic = ["version"] -requires-python = ">=3.6" -dependencies = ["protobuf"] - -[tool.setuptools] -packages = ["pycriu", "pycriu.images"] - -[tool.setuptools.dynamic] -version = {attr = "pycriu.__version__"} diff --git a/lib/setup.cfg b/lib/setup.cfg deleted file mode 100644 index 28c9e49c3..000000000 --- a/lib/setup.cfg +++ /dev/null @@ -1,18 +0,0 @@ -# Configuring setuptools using pyproject.toml files was introduced in setuptools 61.0.0 -# https://setuptools.pypa.io/en/latest/history.html#v61-0-0 -# For older versions of setuptools, we need to use the setup.cfg file -# https://setuptools.pypa.io/en/latest/userguide/declarative_config.html#declarative-config - -[metadata] -name = pycriu -description = Python bindings for CRIU -author = CRIU team -author_email = criu@lists.linux.dev -license = LGPLv2.1 -version = attr: pycriu.__version__ - -[options] -packages = find: -python_requires = >=3.6 -install_requires = - protobuf diff --git a/plugins/amdgpu/.gitignore b/plugins/amdgpu/.gitignore deleted file mode 100644 index 4e5c8f58e..000000000 --- a/plugins/amdgpu/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -*.pb-c.c -*.pb-c.h -test_topology_remap diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile deleted 
file mode 100644 index 250e7b0e7..000000000 --- a/plugins/amdgpu/Makefile +++ /dev/null @@ -1,65 +0,0 @@ -PLUGIN_NAME := amdgpu_plugin -PLUGIN_SOBJ := amdgpu_plugin.so - - -PLUGIN_INCLUDE := -iquote../../include -PLUGIN_INCLUDE += -iquote../../criu/include -PLUGIN_INCLUDE += -iquote../../criu/arch/$(ARCH)/include/ -PLUGIN_INCLUDE += -iquote../../ - -COMPEL := ../../compel/compel-host -LIBDRM_INC := -I/usr/include/libdrm -DEPS_OK := amdgpu_plugin.so amdgpu_plugin_test -DEPS_NOK := ; - -__nmk_dir ?= ../../scripts/nmk/scripts/ -include $(__nmk_dir)msg.mk - -PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC -PLUGIN_LDFLAGS := -lpthread -lrt -ldrm -ldrm_amdgpu - -ifeq ($(CONFIG_AMDGPU),y) - all: $(DEPS_OK) -else - all: $(DEPS_NOK) -endif - -criu-amdgpu.pb-c.c: criu-amdgpu.proto - protoc --proto_path=. --c_out=. criu-amdgpu.proto - -amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_dmabuf.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c - $(CC) $(PLUGIN_CFLAGS) $(DEFINES) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC) - -amdgpu_plugin_clean: - $(call msg-clean, $@) - $(Q) $(RM) amdgpu_plugin.so criu-amdgpu.pb-c* -.PHONY: amdgpu_plugin_clean - -test_topology_remap: amdgpu_plugin_topology.c tests/test_topology_remap.c - $(CC) $^ -o $@ -DCOMPILE_TESTS $(PLUGIN_INCLUDE) -I . 
- -amdgpu_plugin_test: test_topology_remap -.PHONY: amdgpu_plugin_test - -amdgpu_plugin_test_clean: - $(Q) $(RM) test_topology_remap -.PHONY: amdgpu_plugin_test_clean - -clean: amdgpu_plugin_clean amdgpu_plugin_test_clean - -mrproper: clean - -install: -ifeq ($(CONFIG_AMDGPU),y) - $(Q) mkdir -p $(DESTDIR)$(PLUGINDIR) - $(E) " INSTALL " $(PLUGIN_NAME) - $(Q) install -m 755 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) -endif -.PHONY: install - -uninstall: -ifeq ($(CONFIG_AMDGPU),y) - $(E) " UNINSTALL" $(PLUGIN_NAME) - $(Q) $(RM) $(DESTDIR)$(PLUGINDIR)/$(PLUGIN_SOBJ) -endif -.PHONY: uninstall diff --git a/plugins/amdgpu/README.md b/plugins/amdgpu/README.md deleted file mode 100644 index b808fbc4f..000000000 --- a/plugins/amdgpu/README.md +++ /dev/null @@ -1,295 +0,0 @@ -Supporting ROCm with CRIU -========================= - -_Felix Kuehling _
-_Rajneesh Bardwaj _
-_David Yat Sin _
-_Yanning Yang _ - -# Introduction - -ROCm is the Radeon Open Compute Platform developed by AMD to support -high-performance computing and machine learning on AMD GPUs. It is a nearly -fully open-source software stack starting from the kernel mode GPU driver, -including compilers and language runtimes, all the way up to optimized -mathematics libraries, machine learning frameworks and communication libraries. - -Documentation for the ROCm platform can be found here: -https://rocmdocs.amd.com/en/latest/ - -CRIU is a tool for freezing and checkpointing running applications or -containers and later restoring them on the same or a different system. The -process is transparent to the application being checkpointed. It is mostly -implemented in user mode and relies heavily on Linux kernel features, e.g. -cgroups, ptrace, vmsplice, and more. It can checkpoint and restore most -applications relying on standard libraries. However, it is not able to -checkpoint and restore applications using device drivers, with their own -per-application kernel mode state, out of the box. This includes ROCm -applications using the KFD device driver to access GPU hardware resources. CRIU -includes some plugin hooks to allow extending it to add such support in the -future. - -A common environment for ROCm applications is in data centers and compute -clusters. In this environment, migrating applications using CRIU would be -beneficial and desirable. This paper outlines AMDs plans for adding ROCm -support to CRIU. - -# State associated with ROCm applications - -ROCm applications communicate with the kernel mode driver “amdgpu.ko” through -the Thunk library “libhsakmt.so” to enumerate available GPUs, manage -GPU-accessible memory, user mode queues for submitting work to the GPUs, and -events for synchronizing with GPUs. Many of those APIs create and manipulate -state maintained in the kernel mode driver that would need to be saved and -restored by CRIU. 
- -## Memory - -ROCm manages memory in the form of buffer objects (BOs). We are also working on -a new memory management API that will be based on virtual address ranges. For -now, we are focusing on the buffer-object based memory management. - -There are different types of buffer objects supported: - -* VRAM (device memory managed by the kernel mode driver) -* GTT (system memory managed by the kernel mode driver) -* Userptr (normal system memory managed by user mode driver or application) -* Doorbell (special aperture for sending signals to the GPU for user mode command submissions) -* MMIO (special aperture for accessing GPU control registers, used for certain cache flushing operations) - -All these BOs are typically mapped into the GPU page tables for access by GPUs. -Most of them are also mapped for CPU access. The following BO properties need -to be saved and restored for CRIU to work with ROCm applications: - -* Buffer type -* Buffer handle -* Buffer size (page aligned) -* Virtual address for GPU mapping (page aligned) -* Device file offset for CPU mapping (for VRAM and GTT BOs) -* Memory contents (for VRAM and GTT BOs) - -## Queues - -ROCm uses user mode queues to submit work to the GPUs. There are several memory -buffers associated with queues. At the language runtime or application level, -they expose the ring buffer as well as a signal object to tell the GPU about -new commands added to the queue. The signal is mapped to a doorbell (a 64-bit -entry in the doorbell aperture mapped by the doorbell BO). Internally there are -other buffers needed for dispatch completion tracking, shader state saving -during queue preemption and the queue state itself. Some of these buffers are -managed in user mode, others are managed in kernel mode. 
- -When an application is checkpointed, we need to preempt all user mode queues -belonging to the process, and then save their state, including: - -* Queue type (compute or DMA) -* MQD (memory queue descriptor managed in kernel mode), with state such as - * ring buffer address - * read and write pointers - * doorbell offset - * pointer to AQL queue data structure -* Control stack (kernel-managed piece of state needed for resuming preempted queue) - -The rest of the queue state is contained in user-managed buffer objects that -will be saved by the memory state handling described above: - -* Ring buffer (userptr BO containing commands sent to the GPU) -* AQL queue data structure (userptr BO containing `struct hsa_queue_t`) -* EOP buffer (VRAM BO used for dispatch completion tracking by the command processor) -* Context save area (userptr BO for saving shader state of preempted wavefronts) - -## Events - -Events are used to implement interrupt-based sleeping/waiting for signals sent -from the GPU to the host. Signals are represented by some data structures in -KFD and an entry in a user-allocated, GPU-accessible BO with event slots. We -need to save the allocated set of event IDs and each event’s signaling state. -The contents of the event slots will be saved by the memory state handling -described above. - -## Topology - -When ROCm applications are started, they enumerate the device topology to find -available GPUs, their capabilities and connectivity. An application can be -checkpointed at any time, so it will not be at a safe place to re-enumerate the -topology when it is restored. Therefore, we can only support restoring -applications on systems with a very similar topology: - -* Same number of GPUs -* Same type of GPUs (i.e. instruction set, cache sizes, number of compute units, etc.) 
-* Same or larger memory size -* Same VRAM accessibility by the host -* Same connectivity and P2P memory support between GPUs - -At the KFD ioctl level, GPUs are identified by GPUIDs, which are unique -identifiers created by hashing various GPU properties. That way a GPUID will -not change during the lifetime of a process, even in a future where GPUs may be -added or removed dynamically. When restoring a process on a different system, -the GPUID may have changed. Or it may be desirable to restore a process using a -different subset of GPUs on the same system (using cgroups). Therefore, we will -need a translation of GPUIDs for restored processes that applies to all KFD -ioctl calls after an application was restored. - -# CRIU plugins - -CRIU provides plugin hooks for device files: - - int cr_plugin_dump_file(int fd, int id); - int cr_plugin_restore_file(int id); - -In a ROCm process, it will be invoked for `/dev/kfd` and `/dev/dri/renderD*` -device nodes. `/dev/kfd` is used for KFD ioctl calls to manage memory, queues, -signals and other functionality for all GPUs through a single device file -descriptor. `/dev/dri/renderD*` are per GPU device files, called render nodes, -that are used mostly for CPU mapping of VRAM and GTT BOs. Each BO is given a -unique offset in the render node of the corresponding GPU at allocation time. - -Render nodes are also used for memory management and command submission by the -Mesa user mode driver for video decoding and post processing. These use cases -are relevant even in data centers. Support for this is not an immediate -priority but planned for the future. This will require saving additional state -as well as synchronization with any outstanding jobs. For now, there is no -kernel-mode state associated with `/dev/renderD*`. - -The two existing plugins can be used for saving and restoring most state -associated with ROCm applications. We are planning to add new ioctl calls to -`/dev/kfd` to help with this. 
- -## Dumping - -At the “dump” stage, the ioctl will execute in the context of the CRIU dumper -process. But the file descriptor (fd) is “drained” from the process being saved -by the parasite code that CRIU injects into its target. This allows the plugin -to make an ioctl call with enough context to allow KFD to access all the kernel -mode state associated with the target process. CRIU is ptrace attached to the -target process. KFD can use that fact to authorize access to the target -process' information. - -The contents of GTT and VRAM BOs are not automatically saved by CRIU. CRIU can -only support saving the contents of normal pageable mappings. GTT and VRAM BOs -are special device file IO mappings. Therefore, our dumper plugin will need to -save the contents of these BOs. In the initial implementation they can be -accessed through `/proc//mem`. For better performance we can use a DMA -engine in the GPU to copy the data to system memory. - -## Restoring - -At the “restore” stage we first need to ensure that the topology of visible -devices (in the cgroup) is compatible with the topology that was saved. Once -this is confirmed, we can use a new ioctl to load the saved state back into -KFD. This ioctl will run in the context of the process being restored, so no -special authorization is needed. However, some of the data being copied back -into kernel mode could have been tampered with. MQDs and control stacks provide -access to privileged GPU registers. Therefore, the restore ioctl will only be -allowed to run with root privileges. - -## Remapping render nodes and mmap offsets - -BOs are mapped for CPU access by mmapping the GPU's render node at a specific -offset. The offset within the render node device file identifies the BO. -However, when we recreate the BOs, we cannot guarantee that they will be -restored with the same mmap offset that was saved, because the mmap offset -address space per device is shared system wide. 
- -When a process is restored on a different GPU, it will need to map the BOs from -a different render node device file altogether. - -A new plugin call will be needed to translate device file names and mmap -offsets to the newly allocated ones, before CRIU's PIE code restores the VMA -mappings. Fortunately, ROCm user mode does not remember the file names and mmap -offsets after establishing the mappings, so changing the device files and mmap -offsets under the hood will not be noticed by ROCm user mode. - -*This new plugin is enabled by the new hook `__UPDATE_VMA_MAP` in our RFC patch -series.* - -## Resuming GPU execution - -At the time of running the `cr_plugin_restore_file` plugin, it is too early to -restore userptr GPU page table mappings and their MMU notifiers. These mappings -mirror CPU page tables into GPU page tables using the HMM mirror API in the -kernel. The MMU notifiers notify the driver when the virtual address mapping -changes so that the GPU mapping can be updated. - -This needs to happen after the restorer PIE code has restored all the VMAs at -their correct virtual addresses. Otherwise, the HMM mirroring will simply fail. -Before all the GPU memory mappings are in place, it is also too early to resume -the user mode queue execution on the GPUs. - -Therefore, a new plugin is needed that runs in the context of the master -restore process after the restorer PIE code has restored all the VMAs and -returned control to all the restored processes via sigreturn. It needs to be -called once for each restored target process to finalize userptr mappings and -to resume execution on the GPUs. - -*This new plugin is enabled by the new hook `__RESUME_DEVICES_LATE` in our RFC -patch series.* - -## Restoring BO content in parallel - -Restoring the BO content is an important part in the restore of GPU state and -usually takes a significant amount of time. A possible location for this -procedure is the `cr_plugin_restore_file` hook. 
However, restoring in this hook -blocks the target process from performing other restore operations, which -hinders further optimization of the restore process. - -Therefore, a new plugin hook that runs in the master restore process is -introduced, and it interacts with the `cr_plugin_restore_file` hook to complete -the restore of BO content. Specifically, the target process only needs to send -the relevant BOs to the master restore process, while this new hook handles all -the restore of buffer objects. Through this method, during the restore of the BO -content, the target process can perform other restore operations, thus -accelerating the restore procedure. This is an implementation of the gCROP -method proposed in the ACM SoCC'24 paper: [On-demand and Parallel -Checkpoint/Restore for GPU Applications](https://dl.acm.org/doi/10.1145/3698038.3698510). - -*This optimization technique is enabled by the `__POST_FORKING` hook.* - -## Other CRIU changes - -In addition to the new plugins, we need to make some changes to CRIU itself to -support device file VMAs. Currently CRIU will simply fail to dump a process -that has such PFN or IO memory mappings. While CRIU will not need to save the -contents of those VMAs, we do need CRIU to save and restore the VMAs -themselves, with translated mmap offsets (see “Remapping mmap offsets” above). - -## Security considerations - -The new “dump” ioctl we are adding to `/dev/kfd` will expose information about -remote processes. This is a potential security threat. CRIU will be -ptrace-attached to the target process, which gives it full access to the state -of the process being dumped. KFD can use ptrace attachment to authorize the use -of the new ioctl on a specific target process. - -The new “restore” ioctl will load privileged information from user mode back -into the kernel driver and the hardware. 
This includes MQD contents, which will -eventually be loaded into HQD registers, as well as a control stack, which is a -series of low-level commands that will be executed by the command processor. -Therefore, we are limiting this ioctl to the root user. If CRIU restore must be -possible for non-root users, we need to sanitize the privileged state to ensure -it cannot be used to circumvent system security policies (e.g. arbitrary code -execution in privileged contexts with access to page tables etc.). - -Modified mmap offsets could potentially be used to access BOs belonging to -different processes. This potential threat is not new with CRIU. `amdgpu.ko` -already implements checking of mmap offsets to ensure a context (represented by -a render node file descriptor) is only allowed access to its own BOs. - -# Glossary - -Term | Definition ---- | --- -CRIU | Checkpoint/Restore In Userspace -ROCm | Radeon Open Compute Platform -Thunk | User-mode API interface to interact with amdgpu.ko -KFD | AMD Kernel Fusion Driver -Mesa | Open source OpenGL implementation -GTT | Graphics Translation Table, also used to denote kernel-managed system memory for GPU access -VRAM | Video RAM -BO | Buffer Object -HMM | Heterogeneous Memory Management -AQL | Architected Queueing Language -EOP | End of pipe (event indicating shader dispatch completion) -MQD | Memory Queue Descriptors -HQD | Hardware Queue Descriptors -PIE | Position Independent Executable diff --git a/plugins/amdgpu/amdgpu_drm.h b/plugins/amdgpu/amdgpu_drm.h deleted file mode 100644 index 69227a12b..000000000 --- a/plugins/amdgpu/amdgpu_drm.h +++ /dev/null @@ -1,1801 +0,0 @@ -/* amdgpu_drm.h -- Public header for the amdgpu driver -*- linux-c -*- - * - * Copyright 2000 Precision Insight, Inc., Cedar Park, Texas. - * Copyright 2000 VA Linux Systems, Inc., Fremont, California. - * Copyright 2002 Tungsten Graphics, Inc., Cedar Park, Texas. - * Copyright 2014 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - * Authors: - * Kevin E. 
Martin - * Gareth Hughes - * Keith Whitwell - */ - -#ifndef __AMDGPU_DRM_H__ -#define __AMDGPU_DRM_H__ - -#include "drm.h" - -#if defined(__cplusplus) -extern "C" { -#endif - -#define DRM_AMDGPU_GEM_CREATE 0x00 -#define DRM_AMDGPU_GEM_MMAP 0x01 -#define DRM_AMDGPU_CTX 0x02 -#define DRM_AMDGPU_BO_LIST 0x03 -#define DRM_AMDGPU_CS 0x04 -#define DRM_AMDGPU_INFO 0x05 -#define DRM_AMDGPU_GEM_METADATA 0x06 -#define DRM_AMDGPU_GEM_WAIT_IDLE 0x07 -#define DRM_AMDGPU_GEM_VA 0x08 -#define DRM_AMDGPU_WAIT_CS 0x09 -#define DRM_AMDGPU_GEM_OP 0x10 -#define DRM_AMDGPU_GEM_USERPTR 0x11 -#define DRM_AMDGPU_WAIT_FENCES 0x12 -#define DRM_AMDGPU_VM 0x13 -#define DRM_AMDGPU_FENCE_TO_HANDLE 0x14 -#define DRM_AMDGPU_SCHED 0x15 -#define DRM_AMDGPU_USERQ 0x16 -#define DRM_AMDGPU_USERQ_SIGNAL 0x17 -#define DRM_AMDGPU_USERQ_WAIT 0x18 -#define DRM_AMDGPU_GEM_LIST_HANDLES 0x19 -/* not upstream */ -#define DRM_AMDGPU_GEM_DGMA 0x5c - -/* hybrid specific ioctls */ -#define DRM_AMDGPU_SEM 0x5b - -#define DRM_IOCTL_AMDGPU_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create) -#define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap) -#define DRM_IOCTL_AMDGPU_CTX DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_CTX, union drm_amdgpu_ctx) -#define DRM_IOCTL_AMDGPU_BO_LIST DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_BO_LIST, union drm_amdgpu_bo_list) -#define DRM_IOCTL_AMDGPU_CS DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_CS, union drm_amdgpu_cs) -#define DRM_IOCTL_AMDGPU_INFO DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_INFO, struct drm_amdgpu_info) -#define DRM_IOCTL_AMDGPU_GEM_METADATA DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_METADATA, struct drm_amdgpu_gem_metadata) -#define DRM_IOCTL_AMDGPU_GEM_WAIT_IDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_WAIT_IDLE, union drm_amdgpu_gem_wait_idle) -#define DRM_IOCTL_AMDGPU_GEM_VA DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_VA, struct drm_amdgpu_gem_va) -#define DRM_IOCTL_AMDGPU_WAIT_CS 
DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_WAIT_CS, union drm_amdgpu_wait_cs) -#define DRM_IOCTL_AMDGPU_GEM_OP DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_OP, struct drm_amdgpu_gem_op) -#define DRM_IOCTL_AMDGPU_GEM_USERPTR DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_USERPTR, struct drm_amdgpu_gem_userptr) -#define DRM_IOCTL_AMDGPU_WAIT_FENCES DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_WAIT_FENCES, union drm_amdgpu_wait_fences) -#define DRM_IOCTL_AMDGPU_VM DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_VM, union drm_amdgpu_vm) -#define DRM_IOCTL_AMDGPU_FENCE_TO_HANDLE DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_FENCE_TO_HANDLE, union drm_amdgpu_fence_to_handle) -#define DRM_IOCTL_AMDGPU_SCHED DRM_IOW(DRM_COMMAND_BASE + DRM_AMDGPU_SCHED, union drm_amdgpu_sched) -#define DRM_IOCTL_AMDGPU_USERQ DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ, union drm_amdgpu_userq) -#define DRM_IOCTL_AMDGPU_USERQ_SIGNAL DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal) -#define DRM_IOCTL_AMDGPU_USERQ_WAIT DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait) -#define DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_LIST_HANDLES, struct drm_amdgpu_gem_list_handles) - -#define DRM_IOCTL_AMDGPU_GEM_DGMA DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_DGMA, struct drm_amdgpu_gem_dgma) - -/** - * DOC: memory domains - * - * %AMDGPU_GEM_DOMAIN_CPU System memory that is not GPU accessible. - * Memory in this pool could be swapped out to disk if there is pressure. - * - * %AMDGPU_GEM_DOMAIN_GTT GPU accessible system memory, mapped into the - * GPU's virtual address space via gart. Gart memory linearizes non-contiguous - * pages of system memory, allows GPU access system memory in a linearized - * fashion. - * - * %AMDGPU_GEM_DOMAIN_VRAM Local video memory. For APUs, it is memory - * carved out by the BIOS. - * - * %AMDGPU_GEM_DOMAIN_GDS Global on-chip data storage used to share data - * across shader threads. 
- * - * %AMDGPU_GEM_DOMAIN_GWS Global wave sync, used to synchronize the - * execution of all the waves on a device. - * - * %AMDGPU_GEM_DOMAIN_OA Ordered append, used by 3D or Compute engines - * for appending data. - * - * %AMDGPU_GEM_DOMAIN_DOORBELL Doorbell. It is an MMIO region for - * signalling user mode queues. - * - * %AMDGPU_GEM_DOMAIN_MMIO_REMAP MMIO remap page (special mapping for HDP flushing). - */ -/* hybrid specific ioctls */ -#define DRM_IOCTL_AMDGPU_SEM DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_SEM, union drm_amdgpu_sem) - -#define AMDGPU_GEM_DOMAIN_CPU 0x1 -#define AMDGPU_GEM_DOMAIN_GTT 0x2 -#define AMDGPU_GEM_DOMAIN_VRAM 0x4 -#define AMDGPU_GEM_DOMAIN_GDS 0x8 -#define AMDGPU_GEM_DOMAIN_GWS 0x10 -#define AMDGPU_GEM_DOMAIN_OA 0x20 -#define AMDGPU_GEM_DOMAIN_DOORBELL 0x40 -#define AMDGPU_GEM_DOMAIN_MMIO_REMAP 0x80 -#define AMDGPU_GEM_DOMAIN_DGMA 0x400 -#define AMDGPU_GEM_DOMAIN_DGMA_IMPORT 0x800 - -#define AMDGPU_GEM_DOMAIN_MASK (AMDGPU_GEM_DOMAIN_CPU | \ - AMDGPU_GEM_DOMAIN_GTT | \ - AMDGPU_GEM_DOMAIN_VRAM | \ - AMDGPU_GEM_DOMAIN_GDS | \ - AMDGPU_GEM_DOMAIN_GWS | \ - AMDGPU_GEM_DOMAIN_OA |\ - AMDGPU_GEM_DOMAIN_DOORBELL |\ - AMDGPU_GEM_DOMAIN_MMIO_REMAP |\ - AMDGPU_GEM_DOMAIN_DGMA |\ - AMDGPU_GEM_DOMAIN_DGMA_IMPORT) - -/* Flag that CPU access will be required for the case of VRAM domain */ -#define AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED (1 << 0) -/* Flag that CPU access will not work, this VRAM domain is invisible */ -#define AMDGPU_GEM_CREATE_NO_CPU_ACCESS (1 << 1) -/* Flag that USWC attributes should be used for GTT */ -#define AMDGPU_GEM_CREATE_CPU_GTT_USWC (1 << 2) -/* Flag that the memory should be in VRAM and cleared */ -#define AMDGPU_GEM_CREATE_VRAM_CLEARED (1 << 3) -/* Flag that allocating the BO should use linear VRAM */ -#define AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS (1 << 5) -/* Flag that BO is always valid in this VM */ -#define AMDGPU_GEM_CREATE_VM_ALWAYS_VALID (1 << 6) -/* Flag that BO sharing will be explicitly synchronized */ -#define 
AMDGPU_GEM_CREATE_EXPLICIT_SYNC (1 << 7) -/* Flag that indicates allocating MQD gart on GFX9, where the mtype - * for the second page onward should be set to NC. It should never - * be used by user space applications. - */ -#define AMDGPU_GEM_CREATE_CP_MQD_GFX9 (1 << 8) -/* Flag that BO may contain sensitive data that must be wiped before - * releasing the memory - */ -#define AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE (1 << 9) -/* Flag that BO will be encrypted and that the TMZ bit should be - * set in the PTEs when mapping this buffer via GPUVM or - * accessing it with various hw blocks - */ -#define AMDGPU_GEM_CREATE_ENCRYPTED (1 << 10) -/* Flag that BO will be used only in preemptible context, which does - * not require GTT memory accounting - */ -#define AMDGPU_GEM_CREATE_PREEMPTIBLE (1 << 11) -/* Flag that BO can be discarded under memory pressure without keeping the - * content. - */ -#define AMDGPU_GEM_CREATE_DISCARDABLE (1 << 12) -/* Flag that BO is shared coherently between multiple devices or CPU threads. - * May depend on GPU instructions to flush caches to system scope explicitly. - * - * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and - * may override the MTYPE selected in AMDGPU_VA_OP_MAP. - */ -#define AMDGPU_GEM_CREATE_COHERENT (1 << 13) -/* Flag that BO should not be cached by GPU. Coherent without having to flush - * GPU caches explicitly - * - * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and - * may override the MTYPE selected in AMDGPU_VA_OP_MAP. - */ -#define AMDGPU_GEM_CREATE_UNCACHED (1 << 14) -/* Flag that BO should be coherent across devices when using device-level - * atomics. May depend on GPU instructions to flush caches to device scope - * explicitly, promoting them to system scope automatically. - * - * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and - * may override the MTYPE selected in AMDGPU_VA_OP_MAP. 
- */ -#define AMDGPU_GEM_CREATE_EXT_COHERENT (1 << 15) -/* Set PTE.D and recompress during GTT->VRAM moves according to TILING flags. */ -#define AMDGPU_GEM_CREATE_GFX12_DCC (1 << 16) - -/* hybrid specific */ -/* Flag that the memory should be in SPARSE resource */ -#define AMDGPU_GEM_CREATE_SPARSE (1ULL << 29) -/* Flag that the memory allocation should be from top of domain */ -#define AMDGPU_GEM_CREATE_TOP_DOWN (1ULL << 30) -/* Flag that the memory allocation should be pinned */ -#define AMDGPU_GEM_CREATE_NO_EVICT (1ULL << 31) - -struct drm_amdgpu_gem_create_in { - /** the requested memory size */ - __u64 bo_size; - /** physical start_addr alignment in bytes for some HW requirements */ - __u64 alignment; - /** the requested memory domains */ - __u64 domains; - /** allocation flags */ - __u64 domain_flags; -}; - -struct drm_amdgpu_gem_create_out { - /** returned GEM object handle */ - __u32 handle; - __u32 _pad; -}; - -union drm_amdgpu_gem_create { - struct drm_amdgpu_gem_create_in in; - struct drm_amdgpu_gem_create_out out; -}; - -/** Opcode to create new residency list. 
*/ -#define AMDGPU_BO_LIST_OP_CREATE 0 -/** Opcode to destroy previously created residency list */ -#define AMDGPU_BO_LIST_OP_DESTROY 1 -/** Opcode to update resource information in the list */ -#define AMDGPU_BO_LIST_OP_UPDATE 2 - -struct drm_amdgpu_bo_list_in { - /** Type of operation */ - __u32 operation; - /** Handle of list or 0 if we want to create one */ - __u32 list_handle; - /** Number of BOs in list */ - __u32 bo_number; - /** Size of each element describing BO */ - __u32 bo_info_size; - /** Pointer to array describing BOs */ - __u64 bo_info_ptr; -}; - -struct drm_amdgpu_bo_list_entry { - /** Handle of BO */ - __u32 bo_handle; - /** New (if specified) BO priority to be used during migration */ - __u32 bo_priority; -}; - -struct drm_amdgpu_bo_list_out { - /** Handle of resource list */ - __u32 list_handle; - __u32 _pad; -}; - -union drm_amdgpu_bo_list { - struct drm_amdgpu_bo_list_in in; - struct drm_amdgpu_bo_list_out out; -}; - -/* context related */ -#define AMDGPU_CTX_OP_ALLOC_CTX 1 -#define AMDGPU_CTX_OP_FREE_CTX 2 -#define AMDGPU_CTX_OP_QUERY_STATE 3 -#define AMDGPU_CTX_OP_QUERY_STATE2 4 -#define AMDGPU_CTX_OP_GET_STABLE_PSTATE 5 -#define AMDGPU_CTX_OP_SET_STABLE_PSTATE 6 - -/* GPU reset status */ -#define AMDGPU_CTX_NO_RESET 0 -/* this the context caused it */ -#define AMDGPU_CTX_GUILTY_RESET 1 -/* some other context caused it */ -#define AMDGPU_CTX_INNOCENT_RESET 2 -/* unknown cause */ -#define AMDGPU_CTX_UNKNOWN_RESET 3 - -/* indicate gpu reset occurred after ctx created */ -#define AMDGPU_CTX_QUERY2_FLAGS_RESET (1<<0) -/* indicate vram lost occurred after ctx created */ -#define AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST (1<<1) -/* indicate some job from this context once cause gpu hang */ -#define AMDGPU_CTX_QUERY2_FLAGS_GUILTY (1<<2) -/* indicate some errors are detected by RAS */ -#define AMDGPU_CTX_QUERY2_FLAGS_RAS_CE (1<<3) -#define AMDGPU_CTX_QUERY2_FLAGS_RAS_UE (1<<4) -/* indicate that the reset hasn't completed yet */ -#define 
AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS (1<<5) - -/* Context priority level */ -#define AMDGPU_CTX_PRIORITY_UNSET -2048 -#define AMDGPU_CTX_PRIORITY_VERY_LOW -1023 -#define AMDGPU_CTX_PRIORITY_LOW -512 -#define AMDGPU_CTX_PRIORITY_NORMAL 0 -/* - * When used in struct drm_amdgpu_ctx_in, a priority above NORMAL requires - * CAP_SYS_NICE or DRM_MASTER -*/ -#define AMDGPU_CTX_PRIORITY_HIGH 512 -#define AMDGPU_CTX_PRIORITY_VERY_HIGH 1023 - -/* select a stable profiling pstate for perfmon tools */ -#define AMDGPU_CTX_STABLE_PSTATE_FLAGS_MASK 0xf -#define AMDGPU_CTX_STABLE_PSTATE_NONE 0 -#define AMDGPU_CTX_STABLE_PSTATE_STANDARD 1 -#define AMDGPU_CTX_STABLE_PSTATE_MIN_SCLK 2 -#define AMDGPU_CTX_STABLE_PSTATE_MIN_MCLK 3 -#define AMDGPU_CTX_STABLE_PSTATE_PEAK 4 - -struct drm_amdgpu_ctx_in { - /** AMDGPU_CTX_OP_* */ - __u32 op; - /** Flags */ - __u32 flags; - __u32 ctx_id; - /** AMDGPU_CTX_PRIORITY_* */ - __s32 priority; -}; - -union drm_amdgpu_ctx_out { - struct { - __u32 ctx_id; - __u32 _pad; - } alloc; - - struct { - /** For future use, no flags defined so far */ - __u64 flags; - /** Number of resets caused by this context so far. */ - __u32 hangs; - /** Reset status since the last call of the ioctl. 
*/ - __u32 reset_status; - } state; - - struct { - __u32 flags; - __u32 _pad; - } pstate; -}; - -union drm_amdgpu_ctx { - struct drm_amdgpu_ctx_in in; - union drm_amdgpu_ctx_out out; -}; - -/* user queue IOCTL operations */ -#define AMDGPU_USERQ_OP_CREATE 1 -#define AMDGPU_USERQ_OP_FREE 2 - -/* queue priority levels */ -/* low < normal low < normal high < high */ -#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_MASK 0x3 -#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_SHIFT 0 -#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_NORMAL_LOW 0 -#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_LOW 1 -#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_NORMAL_HIGH 2 -#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_PRIORITY_HIGH 3 /* admin only */ -/* for queues that need access to protected content */ -#define AMDGPU_USERQ_CREATE_FLAGS_QUEUE_SECURE (1 << 2) - -/* - * This structure is a container to pass input configuration - * info for all supported userqueue related operations. - * For operation AMDGPU_USERQ_OP_CREATE: user is expected - * to set all fields, excep the parameter 'queue_id'. - * For operation AMDGPU_USERQ_OP_FREE: the only input parameter expected - * to be set is 'queue_id', eveything else is ignored. - */ -struct drm_amdgpu_userq_in { - /** AMDGPU_USERQ_OP_* */ - __u32 op; - /** Queue id passed for operation USERQ_OP_FREE */ - __u32 queue_id; - /** the target GPU engine to execute workload (AMDGPU_HW_IP_*) */ - __u32 ip_type; - /** - * @doorbell_handle: the handle of doorbell GEM object - * associated with this userqueue client. - */ - __u32 doorbell_handle; - /** - * @doorbell_offset: 32-bit offset of the doorbell in the doorbell bo. - * Kernel will generate absolute doorbell offset using doorbell_handle - * and doorbell_offset in the doorbell bo. - */ - __u32 doorbell_offset; - /** - * @flags: flags used for queue parameters - */ - __u32 flags; - /** - * @queue_va: Virtual address of the GPU memory which holds the queue - * object. 
The queue holds the workload packets. - */ - __u64 queue_va; - /** - * @queue_size: Size of the queue in bytes, this needs to be 256-byte - * aligned. - */ - __u64 queue_size; - /** - * @rptr_va : Virtual address of the GPU memory which holds the ring RPTR. - * This object must be at least 8 byte in size and aligned to 8-byte offset. - */ - __u64 rptr_va; - /** - * @wptr_va : Virtual address of the GPU memory which holds the ring WPTR. - * This object must be at least 8 byte in size and aligned to 8-byte offset. - * - * Queue, RPTR and WPTR can come from the same object, as long as the size - * and alignment related requirements are met. - */ - __u64 wptr_va; - /** - * @mqd: MQD (memory queue descriptor) is a set of parameters which allow - * the GPU to uniquely define and identify a usermode queue. - * - * MQD data can be of different size for different GPU IP/engine and - * their respective versions/revisions, so this points to a __u64 * - * which holds IP specific MQD of this usermode queue. - */ - __u64 mqd; - /** - * @size: size of MQD data in bytes, it must match the MQD structure - * size of the respective engine/revision defined in UAPI for ex, for - * gfx11 workloads, size = sizeof(drm_amdgpu_userq_mqd_gfx11). - */ - __u64 mqd_size; -}; - -/* The structure to carry output of userqueue ops */ -struct drm_amdgpu_userq_out { - /** - * For operation AMDGPU_USERQ_OP_CREATE: This field contains a unique - * queue ID to represent the newly created userqueue in the system, otherwise - * it should be ignored. - */ - __u32 queue_id; - __u32 _pad; -}; - -union drm_amdgpu_userq { - struct drm_amdgpu_userq_in in; - struct drm_amdgpu_userq_out out; -}; - -/* GFX V11 IP specific MQD parameters */ -struct drm_amdgpu_userq_mqd_gfx11 { - /** - * @shadow_va: Virtual address of the GPU memory to hold the shadow buffer. - * Use AMDGPU_INFO_IOCTL to find the exact size of the object. 
- */ - __u64 shadow_va; - /** - * @csa_va: Virtual address of the GPU memory to hold the CSA buffer. - * Use AMDGPU_INFO_IOCTL to find the exact size of the object. - */ - __u64 csa_va; -}; - -/* GFX V11 SDMA IP specific MQD parameters */ -struct drm_amdgpu_userq_mqd_sdma_gfx11 { - /** - * @csa_va: Virtual address of the GPU memory to hold the CSA buffer. - * This must be a from a separate GPU object, and use AMDGPU_INFO IOCTL - * to get the size. - */ - __u64 csa_va; -}; - -/* GFX V11 Compute IP specific MQD parameters */ -struct drm_amdgpu_userq_mqd_compute_gfx11 { - /** - * @eop_va: Virtual address of the GPU memory to hold the EOP buffer. - * This must be a from a separate GPU object, and use AMDGPU_INFO IOCTL - * to get the size. - */ - __u64 eop_va; -}; - -/* userq signal/wait ioctl */ -struct drm_amdgpu_userq_signal { - /** - * @queue_id: Queue handle used by the userq fence creation function - * to retrieve the WPTR. - */ - __u32 queue_id; - __u32 pad; - /** - * @syncobj_handles: The list of syncobj handles submitted by the user queue - * job to be signaled. - */ - __u64 syncobj_handles; - /** - * @num_syncobj_handles: A count that represents the number of syncobj handles in - * @syncobj_handles. - */ - __u64 num_syncobj_handles; - /** - * @bo_read_handles: The list of BO handles that the submitted user queue job - * is using for read only. This will update BO fences in the kernel. - */ - __u64 bo_read_handles; - /** - * @bo_write_handles: The list of BO handles that the submitted user queue job - * is using for write only. This will update BO fences in the kernel. - */ - __u64 bo_write_handles; - /** - * @num_bo_read_handles: A count that represents the number of read BO handles in - * @bo_read_handles. - */ - __u32 num_bo_read_handles; - /** - * @num_bo_write_handles: A count that represents the number of write BO handles in - * @bo_write_handles. 
- */ - __u32 num_bo_write_handles; -}; - -struct drm_amdgpu_userq_fence_info { - /** - * @va: A gpu address allocated for each queue which stores the - * read pointer (RPTR) value. - */ - __u64 va; - /** - * @value: A 64 bit value represents the write pointer (WPTR) of the - * queue commands which compared with the RPTR value to signal the - * fences. - */ - __u64 value; -}; - -struct drm_amdgpu_userq_wait { - /** - * @waitq_id: Queue handle used by the userq wait IOCTL to retrieve the - * wait queue and maintain the fence driver references in it. - */ - __u32 waitq_id; - __u32 pad; - /** - * @syncobj_handles: The list of syncobj handles submitted by the user queue - * job to get the va/value pairs. - */ - __u64 syncobj_handles; - /** - * @syncobj_timeline_handles: The list of timeline syncobj handles submitted by - * the user queue job to get the va/value pairs at given @syncobj_timeline_points. - */ - __u64 syncobj_timeline_handles; - /** - * @syncobj_timeline_points: The list of timeline syncobj points submitted by the - * user queue job for the corresponding @syncobj_timeline_handles. - */ - __u64 syncobj_timeline_points; - /** - * @bo_read_handles: The list of read BO handles submitted by the user queue - * job to get the va/value pairs. - */ - __u64 bo_read_handles; - /** - * @bo_write_handles: The list of write BO handles submitted by the user queue - * job to get the va/value pairs. - */ - __u64 bo_write_handles; - /** - * @num_syncobj_timeline_handles: A count that represents the number of timeline - * syncobj handles in @syncobj_timeline_handles. - */ - __u16 num_syncobj_timeline_handles; - /** - * @num_fences: This field can be used both as input and output. As input it defines - * the maximum number of fences that can be returned and as output it will specify - * how many fences were actually returned from the ioctl. - */ - __u16 num_fences; - /** - * @num_syncobj_handles: A count that represents the number of syncobj handles in - * @syncobj_handles. 
- */ - __u32 num_syncobj_handles; - /** - * @num_bo_read_handles: A count that represents the number of read BO handles in - * @bo_read_handles. - */ - __u32 num_bo_read_handles; - /** - * @num_bo_write_handles: A count that represents the number of write BO handles in - * @bo_write_handles. - */ - __u32 num_bo_write_handles; - /** - * @out_fences: The field is a return value from the ioctl containing the list of - * address/value pairs to wait for. - */ - __u64 out_fences; -}; - -/* sem related */ -#define AMDGPU_SEM_OP_CREATE_SEM 1 -#define AMDGPU_SEM_OP_WAIT_SEM 2 -#define AMDGPU_SEM_OP_SIGNAL_SEM 3 -#define AMDGPU_SEM_OP_DESTROY_SEM 4 -#define AMDGPU_SEM_OP_IMPORT_SEM 5 -#define AMDGPU_SEM_OP_EXPORT_SEM 6 - -struct drm_amdgpu_sem_in { - /** AMDGPU_SEM_OP_* */ - uint32_t op; - uint32_t handle; - uint32_t ctx_id; - uint32_t ip_type; - uint32_t ip_instance; - uint32_t ring; - uint64_t seq; -}; - -union drm_amdgpu_sem_out { - int32_t fd; - uint32_t handle; -}; - -union drm_amdgpu_sem { - struct drm_amdgpu_sem_in in; - union drm_amdgpu_sem_out out; -}; - -/* vm ioctl */ -#define AMDGPU_VM_OP_RESERVE_VMID 1 -#define AMDGPU_VM_OP_UNRESERVE_VMID 2 - -struct drm_amdgpu_vm_in { - /** AMDGPU_VM_OP_* */ - __u32 op; - __u32 flags; -}; - -struct drm_amdgpu_vm_out { - /** For future use, no flags defined so far */ - __u64 flags; -}; - -union drm_amdgpu_vm { - struct drm_amdgpu_vm_in in; - struct drm_amdgpu_vm_out out; -}; - -/* sched ioctl */ -#define AMDGPU_SCHED_OP_PROCESS_PRIORITY_OVERRIDE 1 -#define AMDGPU_SCHED_OP_CONTEXT_PRIORITY_OVERRIDE 2 - -struct drm_amdgpu_sched_in { - /* AMDGPU_SCHED_OP_* */ - __u32 op; - __u32 fd; - /** AMDGPU_CTX_PRIORITY_* */ - __s32 priority; - __u32 ctx_id; -}; - -union drm_amdgpu_sched { - struct drm_amdgpu_sched_in in; -}; - -/* - * This is not a reliable API and you should expect it to fail for any - * number of reasons and have fallback path that do not use userptr to - * perform any operation. 
- */ -#define AMDGPU_GEM_USERPTR_READONLY (1 << 0) -#define AMDGPU_GEM_USERPTR_ANONONLY (1 << 1) -#define AMDGPU_GEM_USERPTR_VALIDATE (1 << 2) -#define AMDGPU_GEM_USERPTR_REGISTER (1 << 3) - -struct drm_amdgpu_gem_userptr { - __u64 addr; - __u64 size; - /* AMDGPU_GEM_USERPTR_* */ - __u32 flags; - /* Resulting GEM handle */ - __u32 handle; -}; - -#define AMDGPU_GEM_DGMA_IMPORT 0 -#define AMDGPU_GEM_DGMA_QUERY_PHYS_ADDR 1 -struct drm_amdgpu_gem_dgma { - __u64 addr; - __u64 size; - __u32 op; - __u32 handle; -}; - -/* SI-CI-VI: */ -/* same meaning as the GB_TILE_MODE and GL_MACRO_TILE_MODE fields */ -#define AMDGPU_TILING_ARRAY_MODE_SHIFT 0 -#define AMDGPU_TILING_ARRAY_MODE_MASK 0xf -#define AMDGPU_TILING_PIPE_CONFIG_SHIFT 4 -#define AMDGPU_TILING_PIPE_CONFIG_MASK 0x1f -#define AMDGPU_TILING_TILE_SPLIT_SHIFT 9 -#define AMDGPU_TILING_TILE_SPLIT_MASK 0x7 -#define AMDGPU_TILING_MICRO_TILE_MODE_SHIFT 12 -#define AMDGPU_TILING_MICRO_TILE_MODE_MASK 0x7 -#define AMDGPU_TILING_BANK_WIDTH_SHIFT 15 -#define AMDGPU_TILING_BANK_WIDTH_MASK 0x3 -#define AMDGPU_TILING_BANK_HEIGHT_SHIFT 17 -#define AMDGPU_TILING_BANK_HEIGHT_MASK 0x3 -#define AMDGPU_TILING_MACRO_TILE_ASPECT_SHIFT 19 -#define AMDGPU_TILING_MACRO_TILE_ASPECT_MASK 0x3 -#define AMDGPU_TILING_NUM_BANKS_SHIFT 21 -#define AMDGPU_TILING_NUM_BANKS_MASK 0x3 - -/* GFX9 - GFX11: */ -#define AMDGPU_TILING_SWIZZLE_MODE_SHIFT 0 -#define AMDGPU_TILING_SWIZZLE_MODE_MASK 0x1f -#define AMDGPU_TILING_DCC_OFFSET_256B_SHIFT 5 -#define AMDGPU_TILING_DCC_OFFSET_256B_MASK 0xFFFFFF -#define AMDGPU_TILING_DCC_PITCH_MAX_SHIFT 29 -#define AMDGPU_TILING_DCC_PITCH_MAX_MASK 0x3FFF -#define AMDGPU_TILING_DCC_INDEPENDENT_64B_SHIFT 43 -#define AMDGPU_TILING_DCC_INDEPENDENT_64B_MASK 0x1 -#define AMDGPU_TILING_DCC_INDEPENDENT_128B_SHIFT 44 -#define AMDGPU_TILING_DCC_INDEPENDENT_128B_MASK 0x1 -#define AMDGPU_TILING_SCANOUT_SHIFT 63 -#define AMDGPU_TILING_SCANOUT_MASK 0x1 - -/* GFX12 and later: */ -#define AMDGPU_TILING_GFX12_SWIZZLE_MODE_SHIFT 0 -#define 
AMDGPU_TILING_GFX12_SWIZZLE_MODE_MASK 0x7 -/* These are DCC recompression settings for memory management: */ -#define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_SHIFT 3 -#define AMDGPU_TILING_GFX12_DCC_MAX_COMPRESSED_BLOCK_MASK 0x3 /* 0:64B, 1:128B, 2:256B */ -#define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_SHIFT 5 -#define AMDGPU_TILING_GFX12_DCC_NUMBER_TYPE_MASK 0x7 /* CB_COLOR0_INFO.NUMBER_TYPE */ -#define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_SHIFT 8 -#define AMDGPU_TILING_GFX12_DCC_DATA_FORMAT_MASK 0x3f /* [0:4]:CB_COLOR0_INFO.FORMAT, [5]:MM */ -/* When clearing the buffer or moving it from VRAM to GTT, don't compress and set DCC metadata - * to uncompressed. Set when parts of an allocation bypass DCC and read raw data. */ -#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_SHIFT 14 -#define AMDGPU_TILING_GFX12_DCC_WRITE_COMPRESS_DISABLE_MASK 0x1 -/* bit gap */ -#define AMDGPU_TILING_GFX12_SCANOUT_SHIFT 63 -#define AMDGPU_TILING_GFX12_SCANOUT_MASK 0x1 - -/* Set/Get helpers for tiling flags. 
*/ -#define AMDGPU_TILING_SET(field, value) \ - (((__u64)(value) & AMDGPU_TILING_##field##_MASK) << AMDGPU_TILING_##field##_SHIFT) -#define AMDGPU_TILING_GET(value, field) \ - (((__u64)(value) >> AMDGPU_TILING_##field##_SHIFT) & AMDGPU_TILING_##field##_MASK) - -#define AMDGPU_GEM_METADATA_OP_SET_METADATA 1 -#define AMDGPU_GEM_METADATA_OP_GET_METADATA 2 - -/** The same structure is shared for input/output */ -struct drm_amdgpu_gem_metadata { - /** GEM Object handle */ - __u32 handle; - /** Do we want get or set metadata */ - __u32 op; - struct { - /** For future use, no flags defined so far */ - __u64 flags; - /** family specific tiling info */ - __u64 tiling_info; - __u32 data_size_bytes; - __u32 data[64]; - } data; -}; - -struct drm_amdgpu_gem_mmap_in { - /** the GEM object handle */ - __u32 handle; - __u32 _pad; -}; - -struct drm_amdgpu_gem_mmap_out { - /** mmap offset from the vma offset manager */ - __u64 addr_ptr; -}; - -union drm_amdgpu_gem_mmap { - struct drm_amdgpu_gem_mmap_in in; - struct drm_amdgpu_gem_mmap_out out; -}; - -struct drm_amdgpu_gem_wait_idle_in { - /** GEM object handle */ - __u32 handle; - /** For future use, no flags defined so far */ - __u32 flags; - /** Absolute timeout to wait */ - __u64 timeout; -}; - -struct drm_amdgpu_gem_wait_idle_out { - /** BO status: 0 - BO is idle, 1 - BO is busy */ - __u32 status; - /** Returned current memory domain */ - __u32 domain; -}; - -union drm_amdgpu_gem_wait_idle { - struct drm_amdgpu_gem_wait_idle_in in; - struct drm_amdgpu_gem_wait_idle_out out; -}; - -struct drm_amdgpu_wait_cs_in { - /* Command submission handle - * handle equals 0 means none to wait for - * handle equals ~0ull means wait for the latest sequence number - */ - __u64 handle; - /** Absolute timeout to wait */ - __u64 timeout; - __u32 ip_type; - __u32 ip_instance; - __u32 ring; - __u32 ctx_id; -}; - -struct drm_amdgpu_wait_cs_out { - /** CS status: 0 - CS completed, 1 - CS still busy */ - __u64 status; -}; - -union drm_amdgpu_wait_cs { 
- struct drm_amdgpu_wait_cs_in in; - struct drm_amdgpu_wait_cs_out out; -}; - -struct drm_amdgpu_fence { - __u32 ctx_id; - __u32 ip_type; - __u32 ip_instance; - __u32 ring; - __u64 seq_no; -}; - -struct drm_amdgpu_wait_fences_in { - /** This points to uint64_t * which points to fences */ - __u64 fences; - __u32 fence_count; - __u32 wait_all; - __u64 timeout_ns; -}; - -struct drm_amdgpu_wait_fences_out { - __u32 status; - __u32 first_signaled; -}; - -union drm_amdgpu_wait_fences { - struct drm_amdgpu_wait_fences_in in; - struct drm_amdgpu_wait_fences_out out; -}; - -#define AMDGPU_GEM_OP_GET_GEM_CREATE_INFO 0 -#define AMDGPU_GEM_OP_SET_PLACEMENT 1 -#define AMDGPU_GEM_OP_GET_MAPPING_INFO 2 - -struct drm_amdgpu_gem_vm_entry { - /* Start of mapping (in bytes) */ - __u64 addr; - - /* Size of mapping (in bytes) */ - __u64 size; - - /* Mapping offset */ - __u64 offset; - - /* flags needed to recreate mapping */ - __u64 flags; -}; - -/* Sets or returns a value associated with a buffer. */ -struct drm_amdgpu_gem_op { - /** GEM object handle */ - __u32 handle; - /** AMDGPU_GEM_OP_* */ - __u32 op; - /** Input or return value. 
For MAPPING_INFO op: pointer to array of struct drm_amdgpu_gem_vm_entry */ - __u64 value; - /** For MAPPING_INFO op: number of mappings (in/out) */ - __u32 num_entries; - - __u32 padding; -}; - -#define AMDGPU_GEM_LIST_HANDLES_FLAG_IS_IMPORT (1 << 0) - -struct drm_amdgpu_gem_list_handles { - /* User pointer to array of drm_amdgpu_gem_bo_info_entry */ - __u64 entries; - - /* Size of entries buffer / Number of handles in process (if larger than size of buffer, must retry) */ - __u32 num_entries; - - __u32 padding; -}; - -struct drm_amdgpu_gem_list_handles_entry { - /* gem handle of buffer object */ - __u32 gem_handle; - - /* Currently just one flag: IS_IMPORT */ - __u32 flags; - - /* Size of bo */ - __u64 size; - - /* Preferred domains for GEM_CREATE */ - __u64 preferred_domains; - - /* GEM_CREATE flags for re-creation of buffer */ - __u64 alloc_flags; - - /* physical start_addr alignment in bytes for some HW requirements */ - __u64 alignment; -}; - -#define AMDGPU_VA_OP_MAP 1 -#define AMDGPU_VA_OP_UNMAP 2 -#define AMDGPU_VA_OP_CLEAR 3 -#define AMDGPU_VA_OP_REPLACE 4 - -/* Delay the page table update till the next CS */ -#define AMDGPU_VM_DELAY_UPDATE (1 << 0) - -/* Mapping flags */ -/* readable mapping */ -#define AMDGPU_VM_PAGE_READABLE (1 << 1) -/* writable mapping */ -#define AMDGPU_VM_PAGE_WRITEABLE (1 << 2) -/* executable mapping, new for VI */ -#define AMDGPU_VM_PAGE_EXECUTABLE (1 << 3) -/* partially resident texture */ -#define AMDGPU_VM_PAGE_PRT (1 << 4) -/* MTYPE flags use bit 5 to 8 */ -#define AMDGPU_VM_MTYPE_MASK (0xf << 5) -/* Default MTYPE. Pre-AI must use this. Recommended for newer ASICs. 
*/ -#define AMDGPU_VM_MTYPE_DEFAULT (0 << 5) -/* Use Non Coherent MTYPE instead of default MTYPE */ -#define AMDGPU_VM_MTYPE_NC (1 << 5) -/* Use Write Combine MTYPE instead of default MTYPE */ -#define AMDGPU_VM_MTYPE_WC (2 << 5) -/* Use Cache Coherent MTYPE instead of default MTYPE */ -#define AMDGPU_VM_MTYPE_CC (3 << 5) -/* Use UnCached MTYPE instead of default MTYPE */ -#define AMDGPU_VM_MTYPE_UC (4 << 5) -/* Use Read Write MTYPE instead of default MTYPE */ -#define AMDGPU_VM_MTYPE_RW (5 << 5) -/* don't allocate MALL */ -#define AMDGPU_VM_PAGE_NOALLOC (1 << 9) - -struct drm_amdgpu_gem_va { - /** GEM object handle */ - __u32 handle; - __u32 _pad; - /** AMDGPU_VA_OP_* */ - __u32 operation; - /** AMDGPU_VM_PAGE_* */ - __u32 flags; - /** va address to assign . Must be correctly aligned.*/ - __u64 va_address; - /** Specify offset inside of BO to assign. Must be correctly aligned.*/ - __u64 offset_in_bo; - /** Specify mapping size. Must be correctly aligned. */ - __u64 map_size; - /** - * vm_timeline_point is a sequence number used to add new timeline point. - */ - __u64 vm_timeline_point; - /** - * The vm page table update fence is installed in given vm_timeline_syncobj_out - * at vm_timeline_point. - */ - __u32 vm_timeline_syncobj_out; - /** the number of syncobj handles in @input_fence_syncobj_handles */ - __u32 num_syncobj_handles; - /** Array of sync object handle to wait for given input fences */ - __u64 input_fence_syncobj_handles; -}; - -#define AMDGPU_HW_IP_GFX 0 -#define AMDGPU_HW_IP_COMPUTE 1 -#define AMDGPU_HW_IP_DMA 2 -#define AMDGPU_HW_IP_UVD 3 -#define AMDGPU_HW_IP_VCE 4 -#define AMDGPU_HW_IP_UVD_ENC 5 -#define AMDGPU_HW_IP_VCN_DEC 6 -/* - * From VCN4, AMDGPU_HW_IP_VCN_ENC is re-used to support - * both encoding and decoding jobs. 
- */ -#define AMDGPU_HW_IP_VCN_ENC 7 -#define AMDGPU_HW_IP_VCN_JPEG 8 -#define AMDGPU_HW_IP_VPE 9 -#define AMDGPU_HW_IP_NUM 10 - -#define AMDGPU_HW_IP_INSTANCE_MAX_COUNT 1 - -#define AMDGPU_CHUNK_ID_IB 0x01 -#define AMDGPU_CHUNK_ID_FENCE 0x02 -#define AMDGPU_CHUNK_ID_DEPENDENCIES 0x03 -#define AMDGPU_CHUNK_ID_SYNCOBJ_IN 0x04 -#define AMDGPU_CHUNK_ID_SYNCOBJ_OUT 0x05 -#define AMDGPU_CHUNK_ID_BO_HANDLES 0x06 -#define AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES 0x07 -#define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT 0x08 -#define AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL 0x09 -#define AMDGPU_CHUNK_ID_CP_GFX_SHADOW 0x0a - -struct drm_amdgpu_cs_chunk { - __u32 chunk_id; - __u32 length_dw; - __u64 chunk_data; -}; - -struct drm_amdgpu_cs_in { - /** Rendering context id */ - __u32 ctx_id; - /** Handle of resource list associated with CS */ - __u32 bo_list_handle; - __u32 num_chunks; - __u32 flags; - /** this points to __u64 * which point to cs chunks */ - __u64 chunks; -}; - -struct drm_amdgpu_cs_out { - __u64 handle; -}; - -union drm_amdgpu_cs { - struct drm_amdgpu_cs_in in; - struct drm_amdgpu_cs_out out; -}; - -/* Specify flags to be used for IB */ - -/* This IB should be submitted to CE */ -#define AMDGPU_IB_FLAG_CE (1<<0) - -/* Preamble flag, which means the IB could be dropped if no context switch */ -#define AMDGPU_IB_FLAG_PREAMBLE (1<<1) - -/* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */ -#define AMDGPU_IB_FLAG_PREEMPT (1<<2) - -/* The IB fence should do the L2 writeback but not invalidate any shader - * caches (L2/vL1/sL1/I$). */ -#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3) - -/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER. - * This will reset wave ID counters for the IB. 
- */ -#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4) - -/* Flag the IB as secure (TMZ) - */ -#define AMDGPU_IB_FLAGS_SECURE (1 << 5) - -/* Tell KMD to flush and invalidate caches - */ -#define AMDGPU_IB_FLAG_EMIT_MEM_SYNC (1 << 6) - -struct drm_amdgpu_cs_chunk_ib { - __u32 _pad; - /** AMDGPU_IB_FLAG_* */ - __u32 flags; - /** Virtual address to begin IB execution */ - __u64 va_start; - /** Size of submission */ - __u32 ib_bytes; - /** HW IP to submit to */ - __u32 ip_type; - /** HW IP index of the same type to submit to */ - __u32 ip_instance; - /** Ring index to submit to */ - __u32 ring; -}; - -struct drm_amdgpu_cs_chunk_dep { - __u32 ip_type; - __u32 ip_instance; - __u32 ring; - __u32 ctx_id; - __u64 handle; -}; - -struct drm_amdgpu_cs_chunk_fence { - __u32 handle; - __u32 offset; -}; - -struct drm_amdgpu_cs_chunk_sem { - __u32 handle; -}; - -struct drm_amdgpu_cs_chunk_syncobj { - __u32 handle; - __u32 flags; - __u64 point; -}; - -#define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ 0 -#define AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD 1 -#define AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD 2 - -union drm_amdgpu_fence_to_handle { - struct { - struct drm_amdgpu_fence fence; - __u32 what; - __u32 pad; - } in; - struct { - __u32 handle; - } out; -}; - -struct drm_amdgpu_cs_chunk_data { - union { - struct drm_amdgpu_cs_chunk_ib ib_data; - struct drm_amdgpu_cs_chunk_fence fence_data; - }; -}; - -#define AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW 0x1 - -struct drm_amdgpu_cs_chunk_cp_gfx_shadow { - __u64 shadow_va; - __u64 csa_va; - __u64 gds_va; - __u64 flags; -}; - -/* - * Query h/w info: Flag that this is integrated (a.h.a. 
fusion) GPU - * - */ -#define AMDGPU_IDS_FLAGS_FUSION 0x01 -#define AMDGPU_IDS_FLAGS_PREEMPTION 0x02 -#define AMDGPU_IDS_FLAGS_TMZ 0x04 -#define AMDGPU_IDS_FLAGS_CONFORMANT_TRUNC_COORD 0x08 -#define AMDGPU_IDS_FLAGS_GANG_SUBMIT 0x10 - -/* - * Query h/w info: Flag identifying VF/PF/PT mode - * - */ -#define AMDGPU_IDS_FLAGS_MODE_MASK 0x300 -#define AMDGPU_IDS_FLAGS_MODE_SHIFT 0x8 -#define AMDGPU_IDS_FLAGS_MODE_PF 0x0 -#define AMDGPU_IDS_FLAGS_MODE_VF 0x1 -#define AMDGPU_IDS_FLAGS_MODE_PT 0x2 - -/* indicate if acceleration can be working */ -#define AMDGPU_INFO_ACCEL_WORKING 0x00 -/* get the crtc_id from the mode object id? */ -#define AMDGPU_INFO_CRTC_FROM_ID 0x01 -/* query hw IP info */ -#define AMDGPU_INFO_HW_IP_INFO 0x02 -/* query hw IP instance count for the specified type */ -#define AMDGPU_INFO_HW_IP_COUNT 0x03 -/* timestamp for GL_ARB_timer_query */ -#define AMDGPU_INFO_TIMESTAMP 0x05 -/* Query the firmware version */ -#define AMDGPU_INFO_FW_VERSION 0x0e - /* Subquery id: Query VCE firmware version */ - #define AMDGPU_INFO_FW_VCE 0x1 - /* Subquery id: Query UVD firmware version */ - #define AMDGPU_INFO_FW_UVD 0x2 - /* Subquery id: Query GMC firmware version */ - #define AMDGPU_INFO_FW_GMC 0x03 - /* Subquery id: Query GFX ME firmware version */ - #define AMDGPU_INFO_FW_GFX_ME 0x04 - /* Subquery id: Query GFX PFP firmware version */ - #define AMDGPU_INFO_FW_GFX_PFP 0x05 - /* Subquery id: Query GFX CE firmware version */ - #define AMDGPU_INFO_FW_GFX_CE 0x06 - /* Subquery id: Query GFX RLC firmware version */ - #define AMDGPU_INFO_FW_GFX_RLC 0x07 - /* Subquery id: Query GFX MEC firmware version */ - #define AMDGPU_INFO_FW_GFX_MEC 0x08 - /* Subquery id: Query SMC firmware version */ - #define AMDGPU_INFO_FW_SMC 0x0a - /* Subquery id: Query SDMA firmware version */ - #define AMDGPU_INFO_FW_SDMA 0x0b - /* Subquery id: Query PSP SOS firmware version */ - #define AMDGPU_INFO_FW_SOS 0x0c - /* Subquery id: Query PSP ASD firmware version */ - #define AMDGPU_INFO_FW_ASD 
0x0d - /* Subquery id: Query VCN firmware version */ - #define AMDGPU_INFO_FW_VCN 0x0e - /* Subquery id: Query GFX RLC SRLC firmware version */ - #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_CNTL 0x0f - /* Subquery id: Query GFX RLC SRLG firmware version */ - #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_GPM_MEM 0x10 - /* Subquery id: Query GFX RLC SRLS firmware version */ - #define AMDGPU_INFO_FW_GFX_RLC_RESTORE_LIST_SRM_MEM 0x11 - /* Subquery id: Query DMCU firmware version */ - #define AMDGPU_INFO_FW_DMCU 0x12 - #define AMDGPU_INFO_FW_TA 0x13 - /* Subquery id: Query DMCUB firmware version */ - #define AMDGPU_INFO_FW_DMCUB 0x14 - /* Subquery id: Query TOC firmware version */ - #define AMDGPU_INFO_FW_TOC 0x15 - /* Subquery id: Query CAP firmware version */ - #define AMDGPU_INFO_FW_CAP 0x16 - /* Subquery id: Query GFX RLCP firmware version */ - #define AMDGPU_INFO_FW_GFX_RLCP 0x17 - /* Subquery id: Query GFX RLCV firmware version */ - #define AMDGPU_INFO_FW_GFX_RLCV 0x18 - /* Subquery id: Query MES_KIQ firmware version */ - #define AMDGPU_INFO_FW_MES_KIQ 0x19 - /* Subquery id: Query MES firmware version */ - #define AMDGPU_INFO_FW_MES 0x1a - /* Subquery id: Query IMU firmware version */ - #define AMDGPU_INFO_FW_IMU 0x1b - /* Subquery id: Query VPE firmware version */ - #define AMDGPU_INFO_FW_VPE 0x1c - -/* number of bytes moved for TTM migration */ -#define AMDGPU_INFO_NUM_BYTES_MOVED 0x0f -/* the used VRAM size */ -#define AMDGPU_INFO_VRAM_USAGE 0x10 -/* the used GTT size */ -#define AMDGPU_INFO_GTT_USAGE 0x11 -/* Information about GDS, etc. resource configuration */ -#define AMDGPU_INFO_GDS_CONFIG 0x13 -/* Query information about VRAM and GTT domains */ -#define AMDGPU_INFO_VRAM_GTT 0x14 -/* Query information about register in MMR address space*/ -#define AMDGPU_INFO_READ_MMR_REG 0x15 -/* Query information about device: rev id, family, etc. 
*/ -#define AMDGPU_INFO_DEV_INFO 0x16 -/* visible vram usage */ -#define AMDGPU_INFO_VIS_VRAM_USAGE 0x17 -/* number of TTM buffer evictions */ -#define AMDGPU_INFO_NUM_EVICTIONS 0x18 -/* Query memory about VRAM and GTT domains */ -#define AMDGPU_INFO_MEMORY 0x19 -/* Query vce clock table */ -#define AMDGPU_INFO_VCE_CLOCK_TABLE 0x1A -/* Query vbios related information */ -#define AMDGPU_INFO_VBIOS 0x1B - /* Subquery id: Query vbios size */ - #define AMDGPU_INFO_VBIOS_SIZE 0x1 - /* Subquery id: Query vbios image */ - #define AMDGPU_INFO_VBIOS_IMAGE 0x2 - /* Subquery id: Query vbios info */ - #define AMDGPU_INFO_VBIOS_INFO 0x3 -/* Query UVD handles */ -#define AMDGPU_INFO_NUM_HANDLES 0x1C -/* Query sensor related information */ -#define AMDGPU_INFO_SENSOR 0x1D - /* Subquery id: Query GPU shader clock */ - #define AMDGPU_INFO_SENSOR_GFX_SCLK 0x1 - /* Subquery id: Query GPU memory clock */ - #define AMDGPU_INFO_SENSOR_GFX_MCLK 0x2 - /* Subquery id: Query GPU temperature */ - #define AMDGPU_INFO_SENSOR_GPU_TEMP 0x3 - /* Subquery id: Query GPU load */ - #define AMDGPU_INFO_SENSOR_GPU_LOAD 0x4 - /* Subquery id: Query average GPU power */ - #define AMDGPU_INFO_SENSOR_GPU_AVG_POWER 0x5 - /* Subquery id: Query northbridge voltage */ - #define AMDGPU_INFO_SENSOR_VDDNB 0x6 - /* Subquery id: Query graphics voltage */ - #define AMDGPU_INFO_SENSOR_VDDGFX 0x7 - /* Subquery id: Query GPU stable pstate shader clock */ - #define AMDGPU_INFO_SENSOR_STABLE_PSTATE_GFX_SCLK 0x8 - /* Subquery id: Query GPU stable pstate memory clock */ - #define AMDGPU_INFO_SENSOR_STABLE_PSTATE_GFX_MCLK 0x9 - /* Subquery id: Query GPU peak pstate shader clock */ - #define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_SCLK 0xa - /* Subquery id: Query GPU peak pstate memory clock */ - #define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_MCLK 0xb - /* Subquery id: Query input GPU power */ - #define AMDGPU_INFO_SENSOR_GPU_INPUT_POWER 0xc -/* Number of VRAM page faults on CPU access. 
*/ -#define AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS 0x1E -#define AMDGPU_INFO_VRAM_LOST_COUNTER 0x1F -/* query ras mask of enabled features*/ -#define AMDGPU_INFO_RAS_ENABLED_FEATURES 0x20 -/* RAS MASK: UMC (VRAM) */ -#define AMDGPU_INFO_RAS_ENABLED_UMC (1 << 0) -/* RAS MASK: SDMA */ -#define AMDGPU_INFO_RAS_ENABLED_SDMA (1 << 1) -/* RAS MASK: GFX */ -#define AMDGPU_INFO_RAS_ENABLED_GFX (1 << 2) -/* RAS MASK: MMHUB */ -#define AMDGPU_INFO_RAS_ENABLED_MMHUB (1 << 3) -/* RAS MASK: ATHUB */ -#define AMDGPU_INFO_RAS_ENABLED_ATHUB (1 << 4) -/* RAS MASK: PCIE */ -#define AMDGPU_INFO_RAS_ENABLED_PCIE (1 << 5) -/* RAS MASK: HDP */ -#define AMDGPU_INFO_RAS_ENABLED_HDP (1 << 6) -/* RAS MASK: XGMI */ -#define AMDGPU_INFO_RAS_ENABLED_XGMI (1 << 7) -/* RAS MASK: DF */ -#define AMDGPU_INFO_RAS_ENABLED_DF (1 << 8) -/* RAS MASK: SMN */ -#define AMDGPU_INFO_RAS_ENABLED_SMN (1 << 9) -/* RAS MASK: SEM */ -#define AMDGPU_INFO_RAS_ENABLED_SEM (1 << 10) -/* RAS MASK: MP0 */ -#define AMDGPU_INFO_RAS_ENABLED_MP0 (1 << 11) -/* RAS MASK: MP1 */ -#define AMDGPU_INFO_RAS_ENABLED_MP1 (1 << 12) -/* RAS MASK: FUSE */ -#define AMDGPU_INFO_RAS_ENABLED_FUSE (1 << 13) -/* query video encode/decode caps */ -#define AMDGPU_INFO_VIDEO_CAPS 0x21 - /* Subquery id: Decode */ - #define AMDGPU_INFO_VIDEO_CAPS_DECODE 0 - /* Subquery id: Encode */ - #define AMDGPU_INFO_VIDEO_CAPS_ENCODE 1 -/* Query the max number of IBs per gang per submission */ -#define AMDGPU_INFO_MAX_IBS 0x22 -/* query last page fault info */ -#define AMDGPU_INFO_GPUVM_FAULT 0x23 -/* query FW object size and alignment */ -#define AMDGPU_INFO_UQ_FW_AREAS 0x24 - -/* Hybrid Stack Specific Defs*/ -/* gpu capability */ -#define AMDGPU_INFO_CAPABILITY 0x50 -/* virtual range */ -#define AMDGPU_INFO_VIRTUAL_RANGE 0x51 -/* query pin memory capability */ -#define AMDGPU_CAPABILITY_PIN_MEM_FLAG (1 << 0) -/* query direct gma capability */ -#define AMDGPU_CAPABILITY_DIRECT_GMA_FLAG (1 << 1) - -#define AMDGPU_INFO_MMR_SE_INDEX_SHIFT 0 -#define 
AMDGPU_INFO_MMR_SE_INDEX_MASK 0xff -#define AMDGPU_INFO_MMR_SH_INDEX_SHIFT 8 -#define AMDGPU_INFO_MMR_SH_INDEX_MASK 0xff - -struct drm_amdgpu_query_fw { - /** AMDGPU_INFO_FW_* */ - __u32 fw_type; - /** - * Index of the IP if there are more IPs of - * the same type. - */ - __u32 ip_instance; - /** - * Index of the engine. Whether this is used depends - * on the firmware type. (e.g. MEC, SDMA) - */ - __u32 index; - __u32 _pad; -}; - -/* Input structure for the INFO ioctl */ -struct drm_amdgpu_info { - /* Where the return value will be stored */ - __u64 return_pointer; - /* The size of the return value. Just like "size" in "snprintf", - * it limits how many bytes the kernel can write. */ - __u32 return_size; - /* The query request id. */ - __u32 query; - - union { - struct { - __u32 id; - __u32 _pad; - } mode_crtc; - - struct { - /** AMDGPU_HW_IP_* */ - __u32 type; - /** - * Index of the IP if there are more IPs of the same - * type. Ignored by AMDGPU_INFO_HW_IP_COUNT. - */ - __u32 ip_instance; - } query_hw_ip; - - struct { - __u32 dword_offset; - /** number of registers to read */ - __u32 count; - __u32 instance; - /** For future use, no flags defined so far */ - __u32 flags; - } read_mmr_reg; - - struct { - uint32_t aperture; - uint32_t _pad; - } virtual_range; - - struct drm_amdgpu_query_fw query_fw; - - struct { - __u32 type; - __u32 offset; - } vbios_info; - - struct { - __u32 type; - } sensor_info; - - struct { - __u32 type; - } video_cap; - }; -}; - -struct drm_amdgpu_info_gds { - /** GDS GFX partition size */ - __u32 gds_gfx_partition_size; - /** GDS compute partition size */ - __u32 compute_partition_size; - /** total GDS memory size */ - __u32 gds_total_size; - /** GWS size per GFX partition */ - __u32 gws_per_gfx_partition; - /** GSW size per compute partition */ - __u32 gws_per_compute_partition; - /** OA size per GFX partition */ - __u32 oa_per_gfx_partition; - /** OA size per compute partition */ - __u32 oa_per_compute_partition; - __u32 _pad; -}; - 
-struct drm_amdgpu_info_vram_gtt { - __u64 vram_size; - __u64 vram_cpu_accessible_size; - __u64 gtt_size; -}; - -struct drm_amdgpu_heap_info { - /** max. physical memory */ - __u64 total_heap_size; - - /** Theoretical max. available memory in the given heap */ - __u64 usable_heap_size; - - /** - * Number of bytes allocated in the heap. This includes all processes - * and private allocations in the kernel. It changes when new buffers - * are allocated, freed, and moved. It cannot be larger than - * heap_size. - */ - __u64 heap_usage; - - /** - * Theoretical possible max. size of buffer which - * could be allocated in the given heap - */ - __u64 max_allocation; -}; - -struct drm_amdgpu_memory_info { - struct drm_amdgpu_heap_info vram; - struct drm_amdgpu_heap_info cpu_accessible_vram; - struct drm_amdgpu_heap_info gtt; -}; - -struct drm_amdgpu_info_firmware { - __u32 ver; - __u32 feature; -}; - -struct drm_amdgpu_info_vbios { - __u8 name[64]; - __u8 vbios_pn[64]; - __u32 version; - __u32 pad; - __u8 vbios_ver_str[32]; - __u8 date[32]; -}; - -#define AMDGPU_VRAM_TYPE_UNKNOWN 0 -#define AMDGPU_VRAM_TYPE_GDDR1 1 -#define AMDGPU_VRAM_TYPE_DDR2 2 -#define AMDGPU_VRAM_TYPE_GDDR3 3 -#define AMDGPU_VRAM_TYPE_GDDR4 4 -#define AMDGPU_VRAM_TYPE_GDDR5 5 -#define AMDGPU_VRAM_TYPE_HBM 6 -#define AMDGPU_VRAM_TYPE_DDR3 7 -#define AMDGPU_VRAM_TYPE_DDR4 8 -#define AMDGPU_VRAM_TYPE_GDDR6 9 -#define AMDGPU_VRAM_TYPE_DDR5 10 -#define AMDGPU_VRAM_TYPE_LPDDR4 11 -#define AMDGPU_VRAM_TYPE_LPDDR5 12 -#define AMDGPU_VRAM_TYPE_HBM3E 13 - -#define AMDGPU_VRAM_TYPE_HBM_WIDTH 4096 - -struct drm_amdgpu_info_device { - /** PCI Device ID */ - __u32 device_id; - /** Internal chip revision: A0, A1, etc.) 
*/ - __u32 chip_rev; - __u32 external_rev; - /** Revision id in PCI Config space */ - __u32 pci_rev; - __u32 family; - __u32 num_shader_engines; - __u32 num_shader_arrays_per_engine; - /* in KHz */ - __u32 gpu_counter_freq; - __u64 max_engine_clock; - __u64 max_memory_clock; - /* cu information */ - __u32 cu_active_number; - /* NOTE: cu_ao_mask is INVALID, DON'T use it */ - __u32 cu_ao_mask; - __u32 cu_bitmap[4][4]; - /** Render backend pipe mask. One render backend is CB+DB. */ - __u32 enabled_rb_pipes_mask; - __u32 num_rb_pipes; - __u32 num_hw_gfx_contexts; - /* PCIe version (the smaller of the GPU and the CPU/motherboard) */ - __u32 pcie_gen; - __u64 ids_flags; - /** Starting virtual address for UMDs. */ - __u64 virtual_address_offset; - /** The maximum virtual address */ - __u64 virtual_address_max; - /** Required alignment of virtual addresses. */ - __u32 virtual_address_alignment; - /** Page table entry - fragment size */ - __u32 pte_fragment_size; - __u32 gart_page_size; - /** constant engine ram size*/ - __u32 ce_ram_size; - /** video memory type info*/ - __u32 vram_type; - /** video memory bit width*/ - __u32 vram_bit_width; - /* vce harvesting instance */ - __u32 vce_harvest_config; - /* gfx double offchip LDS buffers */ - __u32 gc_double_offchip_lds_buf; - /* NGG Primitive Buffer */ - __u64 prim_buf_gpu_addr; - /* NGG Position Buffer */ - __u64 pos_buf_gpu_addr; - /* NGG Control Sideband */ - __u64 cntl_sb_buf_gpu_addr; - /* NGG Parameter Cache */ - __u64 param_buf_gpu_addr; - __u32 prim_buf_size; - __u32 pos_buf_size; - __u32 cntl_sb_buf_size; - __u32 param_buf_size; - /* wavefront size*/ - __u32 wave_front_size; - /* shader visible vgprs*/ - __u32 num_shader_visible_vgprs; - /* CU per shader array*/ - __u32 num_cu_per_sh; - /* number of tcc blocks*/ - __u32 num_tcc_blocks; - /* gs vgt table depth*/ - __u32 gs_vgt_table_depth; - /* gs primitive buffer depth*/ - __u32 gs_prim_buffer_depth; - /* max gs wavefront per vgt*/ - __u32 max_gs_waves_per_vgt; - 
/* PCIe number of lanes (the smaller of the GPU and the CPU/motherboard) */ - __u32 pcie_num_lanes; - /* always on cu bitmap */ - __u32 cu_ao_bitmap[4][4]; - /** Starting high virtual address for UMDs. */ - __u64 high_va_offset; - /** The maximum high virtual address */ - __u64 high_va_max; - /* gfx10 pa_sc_tile_steering_override */ - __u32 pa_sc_tile_steering_override; - /* disabled TCCs */ - __u64 tcc_disabled_mask; - __u64 min_engine_clock; - __u64 min_memory_clock; - /* The following fields are only set on gfx11+, older chips set 0. */ - __u32 tcp_cache_size; /* AKA GL0, VMEM cache */ - __u32 num_sqc_per_wgp; - __u32 sqc_data_cache_size; /* AKA SMEM cache */ - __u32 sqc_inst_cache_size; - __u32 gl1c_cache_size; - __u32 gl2c_cache_size; - __u64 mall_size; /* AKA infinity cache */ - /* high 32 bits of the rb pipes mask */ - __u32 enabled_rb_pipes_mask_hi; - /* shadow area size for gfx11 */ - __u32 shadow_size; - /* shadow area base virtual alignment for gfx11 */ - __u32 shadow_alignment; - /* context save area size for gfx11 */ - __u32 csa_size; - /* context save area base virtual alignment for gfx11 */ - __u32 csa_alignment; - /* Userq IP mask (1 << AMDGPU_HW_IP_*) */ - __u32 userq_ip_mask; - __u32 pad; -}; - -struct drm_amdgpu_info_hw_ip { - /** Version of h/w IP */ - __u32 hw_ip_version_major; - __u32 hw_ip_version_minor; - /** Capabilities */ - __u64 capabilities_flags; - /** command buffer address start alignment*/ - __u32 ib_start_alignment; - /** command buffer size alignment*/ - __u32 ib_size_alignment; - /** Bitmask of available rings. Bit 0 means ring 0, etc. 
*/ - __u32 available_rings; - /** version info: bits 23:16 major, 15:8 minor, 7:0 revision */ - __u32 ip_discovery_version; - /* Userq available slots */ - __u32 userq_num_slots; -}; - -/* GFX metadata BO sizes and alignment info (in bytes) */ -struct drm_amdgpu_info_uq_fw_areas_gfx { - /* shadow area size */ - __u32 shadow_size; - /* shadow area base virtual mem alignment */ - __u32 shadow_alignment; - /* context save area size */ - __u32 csa_size; - /* context save area base virtual mem alignment */ - __u32 csa_alignment; -}; - -/* IP specific fw related information used in the - * subquery AMDGPU_INFO_UQ_FW_AREAS - */ -struct drm_amdgpu_info_uq_fw_areas { - union { - struct drm_amdgpu_info_uq_fw_areas_gfx gfx; - }; -}; - -struct drm_amdgpu_info_num_handles { - /** Max handles as supported by firmware for UVD */ - __u32 uvd_max_handles; - /** Handles currently in use for UVD */ - __u32 uvd_used_handles; -}; - -#define AMDGPU_VCE_CLOCK_TABLE_ENTRIES 6 - -struct drm_amdgpu_info_vce_clock_table_entry { - /** System clock */ - __u32 sclk; - /** Memory clock */ - __u32 mclk; - /** VCE clock */ - __u32 eclk; - __u32 pad; -}; - -struct drm_amdgpu_info_vce_clock_table { - struct drm_amdgpu_info_vce_clock_table_entry entries[AMDGPU_VCE_CLOCK_TABLE_ENTRIES]; - __u32 num_valid_entries; - __u32 pad; -}; - -/* query video encode/decode caps */ -#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG2 0 -#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4 1 -#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1 2 -#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC 3 -#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC 4 -#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG 5 -#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9 6 -#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1 7 -#define AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_COUNT 8 - -struct drm_amdgpu_info_video_codec_info { - __u32 valid; - __u32 max_width; - __u32 max_height; - __u32 max_pixels_per_frame; - __u32 max_level; - __u32 pad; -}; - -struct 
drm_amdgpu_info_video_caps { - struct drm_amdgpu_info_video_codec_info codec_info[AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_COUNT]; -}; - -#define AMDGPU_VMHUB_TYPE_MASK 0xff -#define AMDGPU_VMHUB_TYPE_SHIFT 0 -#define AMDGPU_VMHUB_TYPE_GFX 0 -#define AMDGPU_VMHUB_TYPE_MM0 1 -#define AMDGPU_VMHUB_TYPE_MM1 2 -#define AMDGPU_VMHUB_IDX_MASK 0xff00 -#define AMDGPU_VMHUB_IDX_SHIFT 8 - -struct drm_amdgpu_info_gpuvm_fault { - __u64 addr; - __u32 status; - __u32 vmhub; -}; - -struct drm_amdgpu_info_uq_metadata_gfx { - /* shadow area size for gfx11 */ - __u32 shadow_size; - /* shadow area base virtual alignment for gfx11 */ - __u32 shadow_alignment; - /* context save area size for gfx11 */ - __u32 csa_size; - /* context save area base virtual alignment for gfx11 */ - __u32 csa_alignment; -}; - -struct drm_amdgpu_info_uq_metadata { - union { - struct drm_amdgpu_info_uq_metadata_gfx gfx; - }; -}; - -/* - * Supported GPU families - */ -#define AMDGPU_FAMILY_UNKNOWN 0 -#define AMDGPU_FAMILY_SI 110 /* Hainan, Oland, Verde, Pitcairn, Tahiti */ -#define AMDGPU_FAMILY_CI 120 /* Bonaire, Hawaii */ -#define AMDGPU_FAMILY_KV 125 /* Kaveri, Kabini, Mullins */ -#define AMDGPU_FAMILY_VI 130 /* Iceland, Tonga */ -#define AMDGPU_FAMILY_CZ 135 /* Carrizo, Stoney */ -#define AMDGPU_FAMILY_AI 141 /* Vega10 */ -#define AMDGPU_FAMILY_RV 142 /* Raven */ -#define AMDGPU_FAMILY_NV 143 /* Navi10 */ -#define AMDGPU_FAMILY_VGH 144 /* Van Gogh */ -#define AMDGPU_FAMILY_GC_11_0_0 145 /* GC 11.0.0 */ -#define AMDGPU_FAMILY_YC 146 /* Yellow Carp */ -#define AMDGPU_FAMILY_GC_11_0_1 148 /* GC 11.0.1 */ -#define AMDGPU_FAMILY_GC_10_3_6 149 /* GC 10.3.6 */ -#define AMDGPU_FAMILY_GC_10_3_7 151 /* GC 10.3.7 */ -#define AMDGPU_FAMILY_GC_11_5_0 150 /* GC 11.5.0 */ -#define AMDGPU_FAMILY_GC_12_0_0 152 /* GC 12.0.0 */ - -#ifndef HAVE_DRM_COLOR_CTM_3X4 -/* FIXME wrong namespace! */ -struct drm_color_ctm_3x4 { - /* - * Conversion matrix with 3x4 dimensions in S31.32 sign-magnitude - * (not two's complement!) format. 
- */ - __u64 matrix[12]; -}; -#endif - -/** - * Definition of System Unified Address (SUA) apertures - */ -#define AMDGPU_SUA_APERTURE_PRIVATE 1 -#define AMDGPU_SUA_APERTURE_SHARED 2 -struct drm_amdgpu_virtual_range { - uint64_t start; - uint64_t end; -}; - -struct drm_amdgpu_capability { - __u32 flag; - __u32 direct_gma_size; -}; - -/* - * Definition of free sync enter and exit signals - * We may have more options in the future - */ -#define AMDGPU_FREESYNC_FULLSCREEN_ENTER 1 -#define AMDGPU_FREESYNC_FULLSCREEN_EXIT 2 - -struct drm_amdgpu_freesync { - __u32 op; /* AMDGPU_FREESYNC_FULLSCREEN_ENTER or */ - /* AMDGPU_FREESYNC_FULLSCREEN_ENTER */ - __u32 spare[7]; -}; - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c deleted file mode 100644 index ee55bde0a..000000000 --- a/plugins/amdgpu/amdgpu_plugin.c +++ /dev/null @@ -1,2413 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "criu-plugin.h" -#include "plugin.h" -#include "criu-amdgpu.pb-c.h" -#include "util.h" -#include "util-pie.h" -#include "fdstore.h" - -#include "kfd_ioctl.h" -#include "xmalloc.h" -#include "criu-log.h" -#include "files.h" -#include "pstree.h" -#include "sockets.h" -#include "rst-malloc.h" - -#include "common/list.h" -#include "amdgpu_drm.h" -#include "amdgpu_plugin_dmabuf.h" -#include "amdgpu_plugin_drm.h" -#include "amdgpu_plugin_util.h" -#include "amdgpu_plugin_topology.h" -#include "amdgpu_socket_utils.h" - -#include "img-streamer.h" -#include "image.h" -#include "cr_options.h" -#include "util.h" - -struct vma_metadata { - struct list_head list; - uint64_t old_pgoff; - uint64_t new_pgoff; - uint64_t vma_entry; - uint32_t new_minor; - int fd; -}; - -/************************************ Global Variables ********************************************/ - 
-static LIST_HEAD(update_vma_info_list); - -size_t kfd_max_buffer_size; - -bool plugin_added_to_inventory = false; - -bool plugin_disabled = false; - -struct handle_id { - int handle; - int fdstore_id; -}; -struct shared_handle_ids { - int num_handles; - struct handle_id *handles; -}; -struct shared_handle_ids *shared_memory = NULL; - -static mutex_t *shared_memory_mutex; - -int current_pid; -/* - * In the case of a single process (common case), this optimization can effectively - * reduce the restore latency with parallel restore. In the case of multiple processes, - * states are already restored in parallel within different processes. Therefore, this - * optimization does not introduce further improvement and will be disabled by default - * in this case. The flag, parallel_disabled, is used to control whether the - * optimization is enabled or disabled. - */ -bool parallel_disabled = false; - -pthread_t parallel_thread = 0; -int parallel_thread_result = 0; -/**************************************************************************************************/ - -/* Call ioctl, restarting if it is interrupted */ -int kmtIoctl(int fd, unsigned long request, void *arg) -{ - int ret, max_retries = 200; - - do { - ret = ioctl(fd, request, arg); - } while (ret == -1 && max_retries-- > 0 && (errno == EINTR || errno == EAGAIN)); - - if (ret == -1 && errno == EBADF) - /* In case pthread_atfork didn't catch it, this will - * make any subsequent hsaKmt calls fail in CHECK_KFD_OPEN. 
- */ - pr_perror("KFD file descriptor not valid in this process"); - return ret; -} - -static void free_e(CriuKfd *e) -{ - for (int i = 0; i < e->n_bo_entries; i++) { - if (e->bo_entries[i]) - xfree(e->bo_entries[i]); - } - - for (int i = 0; i < e->n_device_entries; i++) { - if (e->device_entries[i]) { - for (int j = 0; j < e->device_entries[i]->n_iolinks; j++) - xfree(e->device_entries[i]->iolinks[j]); - - xfree(e->device_entries[i]); - } - } - xfree(e); -} - -static int allocate_device_entries(CriuKfd *e, int num_of_devices) -{ - e->device_entries = xmalloc(sizeof(KfdDeviceEntry *) * num_of_devices); - if (!e->device_entries) { - pr_err("Failed to allocate device_entries\n"); - return -ENOMEM; - } - - for (int i = 0; i < num_of_devices; i++) { - KfdDeviceEntry *entry = xzalloc(sizeof(*entry)); - - if (!entry) { - pr_err("Failed to allocate entry\n"); - return -ENOMEM; - } - - kfd_device_entry__init(entry); - - e->device_entries[i] = entry; - e->n_device_entries++; - } - return 0; -} - -static int allocate_bo_entries(CriuKfd *e, int num_bos, struct kfd_criu_bo_bucket *bo_bucket_ptr) -{ - e->bo_entries = xmalloc(sizeof(KfdBoEntry *) * num_bos); - if (!e->bo_entries) { - pr_err("Failed to allocate bo_info\n"); - return -ENOMEM; - } - - for (int i = 0; i < num_bos; i++) { - KfdBoEntry *entry = xzalloc(sizeof(*entry)); - - if (!entry) { - pr_err("Failed to allocate botest\n"); - return -ENOMEM; - } - - kfd_bo_entry__init(entry); - - e->bo_entries[i] = entry; - e->n_bo_entries++; - } - return 0; -} - -int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, KfdDeviceEntry **deviceEntries) -{ - uint32_t devinfo_index = 0; - struct tp_node *node; - - list_for_each_entry(node, &sys->nodes, listm_system) { - KfdDeviceEntry *devinfo = deviceEntries[devinfo_index++]; - - devinfo->node_id = node->id; - - if (NODE_IS_GPU(node)) { - devinfo->gpu_id = maps_get_dest_gpu(maps, node->gpu_id); - if (!devinfo->gpu_id) - return -EINVAL; - - devinfo->simd_count = 
node->simd_count; - devinfo->mem_banks_count = node->mem_banks_count; - devinfo->caches_count = node->caches_count; - devinfo->io_links_count = node->io_links_count; - devinfo->max_waves_per_simd = node->max_waves_per_simd; - devinfo->lds_size_in_kb = node->lds_size_in_kb; - devinfo->num_gws = node->num_gws; - devinfo->wave_front_size = node->wave_front_size; - devinfo->array_count = node->array_count; - devinfo->simd_arrays_per_engine = node->simd_arrays_per_engine; - devinfo->cu_per_simd_array = node->cu_per_simd_array; - devinfo->simd_per_cu = node->simd_per_cu; - devinfo->max_slots_scratch_cu = node->max_slots_scratch_cu; - devinfo->vendor_id = node->vendor_id; - devinfo->device_id = node->device_id; - devinfo->domain = node->domain; - devinfo->drm_render_minor = node->drm_render_minor; - devinfo->hive_id = node->hive_id; - devinfo->num_sdma_engines = node->num_sdma_engines; - devinfo->num_sdma_xgmi_engines = node->num_sdma_xgmi_engines; - devinfo->num_sdma_queues_per_engine = node->num_sdma_queues_per_engine; - devinfo->num_cp_queues = node->num_cp_queues; - devinfo->fw_version = node->fw_version; - devinfo->capability = node->capability; - devinfo->sdma_fw_version = node->sdma_fw_version; - devinfo->vram_public = node->vram_public; - devinfo->vram_size = node->vram_size; - } else { - devinfo->cpu_cores_count = node->cpu_cores_count; - } - - if (node->num_valid_iolinks) { - struct tp_iolink *iolink; - uint32_t iolink_index = 0; - - devinfo->iolinks = xmalloc(sizeof(DevIolink *) * node->num_valid_iolinks); - if (!devinfo->iolinks) - return -ENOMEM; - - list_for_each_entry(iolink, &node->iolinks, listm) { - if (!iolink->valid) - continue; - - devinfo->iolinks[iolink_index] = xmalloc(sizeof(DevIolink)); - if (!devinfo->iolinks[iolink_index]) - return -ENOMEM; - - dev_iolink__init(devinfo->iolinks[iolink_index]); - - devinfo->iolinks[iolink_index]->type = iolink->type; - devinfo->iolinks[iolink_index]->node_to_id = iolink->node_to_id; - iolink_index++; - } - 
devinfo->n_iolinks = iolink_index; - } - } - return 0; -} - -int devinfo_to_topology(KfdDeviceEntry *devinfos[], uint32_t num_devices, struct tp_system *sys) -{ - for (int i = 0; i < num_devices; i++) { - struct tp_node *node; - KfdDeviceEntry *devinfo = devinfos[i]; - - node = sys_add_node(sys, devinfo->node_id, devinfo->gpu_id); - if (!node) - return -ENOMEM; - - if (devinfo->cpu_cores_count) { - node->cpu_cores_count = devinfo->cpu_cores_count; - } else { - node->simd_count = devinfo->simd_count; - node->mem_banks_count = devinfo->mem_banks_count; - node->caches_count = devinfo->caches_count; - node->io_links_count = devinfo->io_links_count; - node->max_waves_per_simd = devinfo->max_waves_per_simd; - node->lds_size_in_kb = devinfo->lds_size_in_kb; - node->num_gws = devinfo->num_gws; - node->wave_front_size = devinfo->wave_front_size; - node->array_count = devinfo->array_count; - node->simd_arrays_per_engine = devinfo->simd_arrays_per_engine; - node->cu_per_simd_array = devinfo->cu_per_simd_array; - node->simd_per_cu = devinfo->simd_per_cu; - node->max_slots_scratch_cu = devinfo->max_slots_scratch_cu; - node->vendor_id = devinfo->vendor_id; - node->device_id = devinfo->device_id; - node->domain = devinfo->domain; - node->drm_render_minor = devinfo->drm_render_minor; - node->hive_id = devinfo->hive_id; - node->num_sdma_engines = devinfo->num_sdma_engines; - node->num_sdma_xgmi_engines = devinfo->num_sdma_xgmi_engines; - node->num_sdma_queues_per_engine = devinfo->num_sdma_queues_per_engine; - node->num_cp_queues = devinfo->num_cp_queues; - node->fw_version = devinfo->fw_version; - node->capability = devinfo->capability; - node->sdma_fw_version = devinfo->sdma_fw_version; - node->vram_public = devinfo->vram_public; - node->vram_size = devinfo->vram_size; - } - - for (int j = 0; j < devinfo->n_iolinks; j++) { - struct tp_iolink *iolink; - DevIolink *devlink = (devinfo->iolinks[j]); - - iolink = node_add_iolink(node, devlink->type, devlink->node_to_id); - if 
(!iolink) - return -ENOMEM; - } - } - return 0; -} - -void getenv_bool(const char *var, bool *value) -{ - char *value_str = getenv(var); - - if (value_str) { - if (!strcmp(value_str, "0") || !strcasecmp(value_str, "NO")) - *value = false; - else if (!strcmp(value_str, "1") || !strcasecmp(value_str, "YES")) - *value = true; - else - pr_err("Ignoring invalid value for %s=%s, expecting (YES/NO)\n", var, value_str); - } - pr_info("param: %s:%s\n", var, *value ? "Y" : "N"); -} - -void getenv_size_t(const char *var, size_t *value) -{ - char *value_str = getenv(var); - char *endp = value_str; - int sh = 0; - size_t size; - - if (value_str) { - size = (size_t)strtoul(value_str, &endp, 0); - if (errno || value_str == endp) { - pr_err("Ignoring invalid value for %s=%s, expecting a positive integer\n", var, value_str); - return; - } - switch (*endp) { - case 'k': - case 'K': - sh = 10; - break; - case 'M': - sh = 20; - break; - case 'G': - sh = 30; - break; - case '\0': - sh = 0; - break; - default: - pr_err("Ignoring invalid size suffix for %s=%s, expecting 'K'/k', 'M', or 'G'\n", var, value_str); - return; - } - if (SIZE_MAX >> sh < size) { - pr_err("Ignoring invalid value for %s=%s, exceeds SIZE_MAX\n", var, value_str); - return; - } - *value = size << sh; - } - pr_info("param: %s:0x%lx\n", var, *value); -} - -int amdgpu_plugin_init(int stage) -{ - if (stage == CR_PLUGIN_STAGE__RESTORE) { - if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) { - plugin_disabled = true; - return 0; - } - } - - pr_info("initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); - - topology_init(&src_topology); - topology_init(&dest_topology); - maps_init(&checkpoint_maps); - maps_init(&restore_maps); - - if (stage == CR_PLUGIN_STAGE__RESTORE) { - if (has_children(root_item)) { - pr_info("Parallel restore disabled\n"); - parallel_disabled = true; - } else { - if (install_parallel_sock() < 0) { - pr_err("Failed to install parallel socket\n"); - return -1; - } 
- } - /* Default Values */ - kfd_fw_version_check = true; - kfd_sdma_fw_version_check = true; - kfd_caches_count_check = true; - kfd_num_gws_check = true; - kfd_vram_size_check = true; - kfd_numa_check = true; - kfd_capability_check = true; - - getenv_bool("KFD_FW_VER_CHECK", &kfd_fw_version_check); - getenv_bool("KFD_SDMA_FW_VER_CHECK", &kfd_sdma_fw_version_check); - getenv_bool("KFD_CACHES_COUNT_CHECK", &kfd_caches_count_check); - getenv_bool("KFD_NUM_GWS_CHECK", &kfd_num_gws_check); - getenv_bool("KFD_VRAM_SIZE_CHECK", &kfd_vram_size_check); - getenv_bool("KFD_NUMA_CHECK", &kfd_numa_check); - getenv_bool("KFD_CAPABILITY_CHECK", &kfd_capability_check); - } - kfd_max_buffer_size = 0; - getenv_size_t("KFD_MAX_BUFFER_SIZE", &kfd_max_buffer_size); - - return 0; -} - -void amdgpu_plugin_fini(int stage, int ret) -{ - if (plugin_disabled) - return; - - pr_info("finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); - - if (stage == CR_PLUGIN_STAGE__RESTORE) - sys_close_drm_render_devices(&dest_topology); - - maps_free(&checkpoint_maps); - maps_free(&restore_maps); - - topology_free(&src_topology); - topology_free(&dest_topology); -} - -CR_PLUGIN_REGISTER("amdgpu_plugin", amdgpu_plugin_init, amdgpu_plugin_fini) - -struct thread_data { - pthread_t thread; - uint64_t num_of_bos; - uint32_t gpu_id; - pid_t pid; - struct kfd_criu_bo_bucket *bo_buckets; - KfdBoEntry **bo_entries; - int drm_fd; - int ret; - int id; /* File ID used by CRIU to identify KFD image for this process */ -}; - -int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf) -{ - struct stat st_kfd; - int ret = 0; - - pr_debug("Enter %s\n", __func__); - ret = stat(AMDGPU_KFD_DEVICE, &st_kfd); - if (ret == -1) { - pr_perror("stat error for /dev/kfd"); - return ret; - } - - /* If input device is KFD return device as supported */ - if (major(st_buf->st_rdev) == major(st_kfd.st_rdev)) { - pr_debug("Known non-regular mapping, kfd-renderD%d -> OK\n", minor(st_buf->st_rdev)); - return 0; - } - - /* 
Determine if input is a DRM device and therefore is supported */ - ret = amdgpu_plugin_drm_handle_device_vma(fd, st_buf); - if (ret) - pr_perror("%s(), Can't handle VMAs of input device", __func__); - - if (!ret && !plugin_added_to_inventory) { - ret = add_inventory_plugin(CR_PLUGIN_DESC.name); - if (ret) - pr_err("Failed to add AMDGPU plugin to inventory image\n"); - else - plugin_added_to_inventory = true; - } - - return ret; -} -CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma) - -int alloc_and_map(amdgpu_device_handle h_dev, uint64_t size, uint32_t domain, amdgpu_bo_handle *ph_bo, - amdgpu_va_handle *ph_va, uint64_t *p_gpu_addr, void **p_cpu_addr) -{ - struct amdgpu_bo_alloc_request alloc_req; - amdgpu_bo_handle h_bo; - amdgpu_va_handle h_va; - uint64_t gpu_addr; - void *cpu_addr; - int err; - - memset(&alloc_req, 0, sizeof(alloc_req)); - alloc_req.alloc_size = size; - alloc_req.phys_alignment = 0x1000; - alloc_req.preferred_heap = domain; - alloc_req.flags = 0; - err = amdgpu_bo_alloc(h_dev, &alloc_req, &h_bo); - if (err) { - pr_perror("failed to alloc BO"); - return err; - } - err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, size, 0x1000, 0, &gpu_addr, &h_va, 0); - if (err) { - pr_perror("failed to alloc VA"); - goto err_va; - } - err = amdgpu_bo_va_op(h_bo, 0, size, gpu_addr, 0, AMDGPU_VA_OP_MAP); - if (err) { - pr_perror("failed to GPU map BO"); - goto err_gpu_map; - } - if (p_cpu_addr) { - err = amdgpu_bo_cpu_map(h_bo, &cpu_addr); - if (err) { - pr_perror("failed to CPU map BO"); - goto err_cpu_map; - } - *p_cpu_addr = cpu_addr; - } - - *ph_bo = h_bo; - *ph_va = h_va; - *p_gpu_addr = gpu_addr; - - return 0; - -err_cpu_map: - amdgpu_bo_va_op(h_bo, 0, size, gpu_addr, 0, AMDGPU_VA_OP_UNMAP); -err_gpu_map: - amdgpu_va_range_free(h_va); -err_va: - amdgpu_bo_free(h_bo); - return err; -} - -void free_and_unmap(uint64_t size, amdgpu_bo_handle h_bo, amdgpu_va_handle h_va, uint64_t gpu_addr, void *cpu_addr) -{ 
- if (cpu_addr) - amdgpu_bo_cpu_unmap(h_bo); - amdgpu_bo_va_op(h_bo, 0, size, gpu_addr, 0, AMDGPU_VA_OP_UNMAP); - amdgpu_va_range_free(h_va); - amdgpu_bo_free(h_bo); -} - -int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp, - void *buffer, size_t buffer_size, amdgpu_device_handle h_dev, - uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free) -{ - uint64_t src_bo_size, dst_bo_size, buffer_bo_size, bytes_remain, buffer_space_remain; - uint64_t gpu_addr_src, gpu_addr_dst, gpu_addr_ib, copy_src, copy_dst, copy_size; - amdgpu_va_handle h_va_src, h_va_dst, h_va_ib; - amdgpu_bo_handle h_bo_src, h_bo_dst, h_bo_ib; - struct amdgpu_bo_import_result res = { 0 }; - struct amdgpu_cs_ib_info ib_info; - amdgpu_bo_list_handle h_bo_list; - struct amdgpu_cs_request cs_req; - amdgpu_bo_handle resources[3]; - struct amdgpu_cs_fence fence; - uint32_t expired; - amdgpu_context_handle h_ctx; - uint32_t *ib = NULL; - int j, err, packets_per_buffer; - - buffer_bo_size = min(size, buffer_size); - packets_per_buffer = ((buffer_bo_size - 1) / max_copy_size) + 1; - src_bo_size = (type == SDMA_OP_VRAM_WRITE) ? buffer_bo_size : size; - dst_bo_size = (type == SDMA_OP_VRAM_READ) ? 
buffer_bo_size : size; - - plugin_log_msg("Enter %s\n", __func__); - - /* prepare src buffer */ - switch (type) { - case SDMA_OP_VRAM_WRITE: - err = amdgpu_create_bo_from_user_mem(h_dev, buffer, src_bo_size, &h_bo_src); - if (err) { - pr_perror("failed to create userptr for sdma"); - return -EFAULT; - } - break; - case SDMA_OP_VRAM_READ: - err = amdgpu_bo_import(h_dev, amdgpu_bo_handle_type_dma_buf_fd, shared_fd, &res); - if (err) { - pr_perror("failed to import dmabuf handle from libdrm"); - return -EFAULT; - } - h_bo_src = res.buf_handle; - break; - default: - pr_perror("Invalid sdma operation"); - return -EINVAL; - } - - err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, src_bo_size, 0x1000, 0, &gpu_addr_src, - &h_va_src, 0); - if (err) { - pr_perror("failed to alloc VA for src bo"); - goto err_src_va; - } - err = amdgpu_bo_va_op(h_bo_src, 0, src_bo_size, gpu_addr_src, 0, AMDGPU_VA_OP_MAP); - if (err) { - pr_perror("failed to GPU map the src BO"); - goto err_src_bo_map; - } - plugin_log_msg("Source BO: GPU VA: %lx, size: %lx\n", gpu_addr_src, src_bo_size); - - /* prepare dest buffer */ - switch (type) { - case SDMA_OP_VRAM_WRITE: - err = amdgpu_bo_import(h_dev, amdgpu_bo_handle_type_dma_buf_fd, shared_fd, &res); - if (err) { - pr_perror("failed to import dmabuf handle from libdrm"); - goto err_dst_bo_prep; - } - h_bo_dst = res.buf_handle; - break; - case SDMA_OP_VRAM_READ: - err = amdgpu_create_bo_from_user_mem(h_dev, buffer, dst_bo_size, &h_bo_dst); - if (err) { - pr_perror("failed to create userptr for sdma"); - goto err_dst_bo_prep; - } - break; - default: - pr_perror("Invalid sdma operation"); - goto err_dst_bo_prep; - } - - err = amdgpu_va_range_alloc(h_dev, amdgpu_gpu_va_range_general, dst_bo_size, 0x1000, 0, &gpu_addr_dst, - &h_va_dst, 0); - if (err) { - pr_perror("failed to alloc VA for dest bo"); - goto err_dst_va; - } - err = amdgpu_bo_va_op(h_bo_dst, 0, dst_bo_size, gpu_addr_dst, 0, AMDGPU_VA_OP_MAP); - if (err) { - pr_perror("failed to 
GPU map the dest BO"); - goto err_dst_bo_map; - } - plugin_log_msg("Dest BO: GPU VA: %lx, size: %lx\n", gpu_addr_dst, dst_bo_size); - - /* prepare ring buffer/indirect buffer for command submission - * each copy packet is 7 dwords so we need to alloc 28x size for ib - */ - err = alloc_and_map(h_dev, packets_per_buffer * 28, AMDGPU_GEM_DOMAIN_GTT, &h_bo_ib, &h_va_ib, &gpu_addr_ib, - (void **)&ib); - if (err) { - pr_perror("failed to allocate and map ib/rb"); - goto err_ib_gpu_alloc; - } - plugin_log_msg("Indirect BO: GPU VA: %lx, size: %lx\n", gpu_addr_ib, packets_per_buffer * 28); - - resources[0] = h_bo_src; - resources[1] = h_bo_dst; - resources[2] = h_bo_ib; - err = amdgpu_bo_list_create(h_dev, 3, resources, NULL, &h_bo_list); - if (err) { - pr_perror("failed to create BO resources list"); - goto err_bo_list; - } - - bytes_remain = size; - if (type == SDMA_OP_VRAM_WRITE) - copy_dst = gpu_addr_dst; - else - copy_src = gpu_addr_src; - - while (bytes_remain > 0) { - memset(&cs_req, 0, sizeof(cs_req)); - memset(&fence, 0, sizeof(fence)); - memset(&ib_info, 0, sizeof(ib_info)); - memset(ib, 0, packets_per_buffer * 28); - - if (type == SDMA_OP_VRAM_WRITE) { - err = read_fp(storage_fp, buffer, min(bytes_remain, buffer_bo_size)); - if (err) { - pr_perror("failed to read from storage"); - goto err_bo_list; - } - } - - buffer_space_remain = buffer_bo_size; - if (type == SDMA_OP_VRAM_WRITE) - copy_src = gpu_addr_src; - else - copy_dst = gpu_addr_dst; - j = 0; - - while (bytes_remain > 0 && buffer_space_remain > 0) { - copy_size = min(min(bytes_remain, max_copy_size), buffer_space_remain); - - ib[j++] = SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0); - ib[j++] = copy_size; - ib[j++] = 0; - ib[j++] = 0xffffffff & copy_src; - ib[j++] = (0xffffffff00000000 & copy_src) >> 32; - ib[j++] = 0xffffffff & copy_dst; - ib[j++] = (0xffffffff00000000 & copy_dst) >> 32; - - copy_src += copy_size; - copy_dst += copy_size; - bytes_remain -= copy_size; - buffer_space_remain 
-= copy_size; - } - /* pad the IB to the required number of dw with SDMA_NOP */ - while (j & 7) - ib[j++] = SDMA_NOP; - - ib_info.ib_mc_address = gpu_addr_ib; - ib_info.size = j; - - cs_req.ip_type = AMDGPU_HW_IP_DMA; - /* possible future optimization: may use other rings, info available in - * amdgpu_query_hw_ip_info() - */ - cs_req.ring = 0; - cs_req.number_of_ibs = 1; - cs_req.ibs = &ib_info; - cs_req.resources = h_bo_list; - cs_req.fence_info.handle = NULL; - - err = amdgpu_cs_ctx_create(h_dev, &h_ctx); - if (err) { - pr_perror("failed to create context for SDMA command submission"); - goto err_ctx; - } - err = amdgpu_cs_submit(h_ctx, 0, &cs_req, 1); - if (err) { - pr_perror("failed to submit command for SDMA IB"); - goto err_cs_submit_ib; - } - - fence.context = h_ctx; - fence.ip_type = AMDGPU_HW_IP_DMA; - fence.ip_instance = 0; - fence.ring = 0; - fence.fence = cs_req.seq_no; - err = amdgpu_cs_query_fence_status(&fence, AMDGPU_TIMEOUT_INFINITE, 0, &expired); - if (err) { - pr_perror("failed to query fence status"); - goto err_cs_submit_ib; - } - if (!expired) { - pr_err("IB execution did not complete\n"); - err = -EBUSY; - goto err_cs_submit_ib; - } - - if (type == SDMA_OP_VRAM_READ) { - err = write_fp(storage_fp, buffer, buffer_bo_size - buffer_space_remain); - if (err) { - pr_perror("failed to write out to storage"); - goto err_cs_submit_ib; - } - } - -err_cs_submit_ib: - amdgpu_cs_ctx_free(h_ctx); - if (err) - break; - } -err_ctx: - amdgpu_bo_list_destroy(h_bo_list); -err_bo_list: - free_and_unmap(packets_per_buffer * 28, h_bo_ib, h_va_ib, gpu_addr_ib, ib); -err_ib_gpu_alloc: - err = amdgpu_bo_va_op(h_bo_dst, 0, size, gpu_addr_dst, 0, AMDGPU_VA_OP_UNMAP); - if (err) - pr_perror("failed to GPU unmap the dest BO %lx, size = %lx", gpu_addr_dst, size); -err_dst_bo_map: - err = amdgpu_va_range_free(h_va_dst); - if (err) - pr_perror("dest range free failed"); -err_dst_va: - if (!do_not_free) - err = amdgpu_bo_free(h_bo_dst); - if (err) - pr_perror("dest bo free 
failed"); -err_dst_bo_prep: - err = amdgpu_bo_va_op(h_bo_src, 0, size, gpu_addr_src, 0, AMDGPU_VA_OP_UNMAP); - if (err) - pr_perror("failed to GPU unmap the src BO %lx, size = %lx", gpu_addr_src, size); -err_src_bo_map: - err = amdgpu_va_range_free(h_va_src); - if (err) - pr_perror("src range free failed"); -err_src_va: - err = amdgpu_bo_free(h_bo_src); - if (err) - pr_perror("src bo free failed"); - plugin_log_msg("Leaving sdma_copy_bo, err = %d\n", err); - return err; -} - -void *dump_bo_contents(void *_thread_data) -{ - struct thread_data *thread_data = (struct thread_data *)_thread_data; - struct kfd_criu_bo_bucket *bo_buckets = thread_data->bo_buckets; - struct amdgpu_gpu_info gpu_info = { 0 }; - amdgpu_device_handle h_dev; - size_t max_bo_size = 0, image_size = 0, buffer_size; - uint64_t max_copy_size; - uint32_t major, minor; - int num_bos = 0; - int i, ret = 0; - FILE *bo_contents_fp = NULL; - void *buffer = NULL; - char img_path[40]; - - pr_info("Thread[0x%x] started\n", thread_data->gpu_id); - - ret = amdgpu_device_initialize(thread_data->drm_fd, &major, &minor, &h_dev); - if (ret) { - pr_perror("failed to initialize device"); - goto exit; - } - plugin_log_msg("libdrm initialized successfully\n"); - - ret = amdgpu_query_gpu_info(h_dev, &gpu_info); - if (ret) { - pr_perror("failed to query gpuinfo via libdrm"); - goto exit; - } - - max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE : - SDMA_LINEAR_COPY_MAX_SIZE - 1; - - for (i = 0; i < thread_data->num_of_bos; i++) { - if (bo_buckets[i].gpu_id == thread_data->gpu_id && - (bo_buckets[i].alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))) { - image_size += bo_buckets[i].size; - if (bo_buckets[i].size > max_bo_size) - max_bo_size = bo_buckets[i].size; - } - } - - buffer_size = kfd_max_buffer_size > 0 ? 
min(kfd_max_buffer_size, max_bo_size) : max_bo_size; - - posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); - if (!buffer) { - pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); - ret = -ENOMEM; - goto exit; - } - - snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, thread_data->id, thread_data->gpu_id); - bo_contents_fp = open_img_file(img_path, true, &image_size); - if (!bo_contents_fp) { - pr_perror("Cannot fopen %s", img_path); - ret = -EIO; - goto exit; - } - - for (i = 0; i < thread_data->num_of_bos; i++) { - if (bo_buckets[i].gpu_id != thread_data->gpu_id) - continue; - - if (!(bo_buckets[i].alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))) - continue; - - num_bos++; - - /* perform sDMA based vram copy */ - ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, - SDMA_OP_VRAM_READ, false); - - if (ret) { - pr_err("Failed to drain the BO using sDMA: bo_buckets[%d]\n", i); - break; - } - } - -exit: - pr_info("Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); - - if (bo_contents_fp) - fclose(bo_contents_fp); - - xfree(buffer); - - amdgpu_device_deinitialize(h_dev); - - thread_data->ret = ret; - return NULL; -}; - -void *restore_bo_contents(void *_thread_data) -{ - struct thread_data *thread_data = (struct thread_data *)_thread_data; - struct kfd_criu_bo_bucket *bo_buckets = thread_data->bo_buckets; - size_t image_size = 0, total_bo_size = 0, max_bo_size = 0, buffer_size; - struct amdgpu_gpu_info gpu_info = { 0 }; - amdgpu_device_handle h_dev; - uint64_t max_copy_size; - uint32_t major, minor; - FILE *bo_contents_fp = NULL; - void *buffer = NULL; - char img_path[40]; - int num_bos = 0; - int i, ret = 0; - - pr_info("Thread[0x%x] started\n", thread_data->gpu_id); - - ret = amdgpu_device_initialize(thread_data->drm_fd, &major, &minor, &h_dev); - if (ret) { - pr_perror("failed to initialize 
device"); - goto exit; - } - plugin_log_msg("libdrm initialized successfully\n"); - - ret = amdgpu_query_gpu_info(h_dev, &gpu_info); - if (ret) { - pr_perror("failed to query gpuinfo via libdrm"); - goto exit; - } - - max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE : - SDMA_LINEAR_COPY_MAX_SIZE - 1; - - snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, thread_data->id, thread_data->gpu_id); - bo_contents_fp = open_img_file(img_path, false, &image_size); - if (!bo_contents_fp) { - pr_perror("Cannot fopen %s", img_path); - ret = -errno; - goto exit; - } - - for (i = 0; i < thread_data->num_of_bos; i++) { - if (bo_buckets[i].gpu_id == thread_data->gpu_id && - (bo_buckets[i].alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))) { - total_bo_size += bo_buckets[i].size; - - if (bo_buckets[i].size > max_bo_size) - max_bo_size = bo_buckets[i].size; - } - } - - if (total_bo_size != image_size) { - pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, total_bo_size); - - ret = -EINVAL; - goto exit; - } - - buffer_size = kfd_max_buffer_size > 0 ? min(kfd_max_buffer_size, max_bo_size) : max_bo_size; - - posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); - if (!buffer) { - pr_perror("Failed to alloc aligned memory. 
Consider setting KFD_MAX_BUFFER_SIZE."); - ret = -ENOMEM; - goto exit; - } - - for (i = 0; i < thread_data->num_of_bos; i++) { - if (bo_buckets[i].gpu_id != thread_data->gpu_id) - continue; - - if (!(bo_buckets[i].alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))) - continue; - - num_bos++; - - ret = sdma_copy_bo(bo_buckets[i].dmabuf_fd, bo_buckets[i].size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, - SDMA_OP_VRAM_WRITE, false); - if (ret) { - pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); - break; - } - plugin_log_msg("** Successfully filled the BO using sDMA: bo_buckets[%d] **\n", i); - } - -exit: - pr_info("Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret); - - if (bo_contents_fp) - fclose(bo_contents_fp); - - xfree(buffer); - - amdgpu_device_deinitialize(h_dev); - thread_data->ret = ret; - return NULL; -}; - -int check_hsakmt_shared_mem(uint64_t *shared_mem_size, uint32_t *shared_mem_magic) -{ - int ret; - struct stat st; - - ret = stat(HSAKMT_SHM_PATH, &st); - if (ret) { - *shared_mem_size = 0; - return 0; - } - - *shared_mem_size = st.st_size; - - /* First 4 bytes of shared file is the magic */ - ret = read_file(HSAKMT_SHM_PATH, shared_mem_magic, sizeof(*shared_mem_magic)); - if (ret) - pr_perror("Failed to read shared mem magic"); - else - plugin_log_msg("Shared mem magic:0x%x\n", *shared_mem_magic); - - return 0; -} - -int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t shared_mem_magic) -{ - int ret, fd; - struct stat st; - sem_t *sem = SEM_FAILED; - - if (!shared_mem_size) - return 0; - - if (!stat(HSAKMT_SHM_PATH, &st)) { - pr_debug("%s already exists\n", HSAKMT_SHM_PATH); - } else { - pr_info("Warning:%s was missing. 
Re-creating new file but we may lose perf counters\n", - HSAKMT_SHM_PATH); - fd = shm_open(HSAKMT_SHM, O_CREAT | O_RDWR, 0666); - - ret = ftruncate(fd, shared_mem_size); - if (ret < 0) { - pr_err("Failed to truncate shared mem %s\n", HSAKMT_SHM); - close(fd); - return -errno; - } - - ret = write(fd, &shared_mem_magic, sizeof(shared_mem_magic)); - if (ret != sizeof(shared_mem_magic)) { - pr_perror("Failed to restore shared mem magic"); - close(fd); - return -errno; - } - - close(fd); - } - - sem = sem_open(HSAKMT_SEM, O_CREAT, 0666, 1); - if (sem == SEM_FAILED) { - pr_perror("Failed to create %s", HSAKMT_SEM); - return -EACCES; - } - sem_close(sem); - return 0; -} - -int amdgpu_unpause_processes(int pid) -{ - int ret = 0; - struct kfd_ioctl_criu_args args = { 0 }; - struct list_head *l = get_dumped_fds(); - struct dumped_fd *st; - - list_for_each_entry(st, l, l) { - if (st->is_drm) { - close(st->fd); - } else { - args.op = KFD_CRIU_OP_UNPAUSE; - - ret = kmtIoctl(st->fd, AMDKFD_IOC_CRIU_OP, &args); - if (ret) { - pr_perror("Failed to unpause process"); - goto exit; - } - } - } - - if (post_dump_dmabuf_check() < 0) - ret = -1; - -exit: - pr_info("Process unpaused %s (ret:%d)\n", ret ? 
"Failed" : "Ok", ret); - clear_dumped_fds(); - - return ret; -} -CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, amdgpu_unpause_processes) - -int store_dmabuf_fd(int handle, int fd) -{ - int id; - - id = fdstore_add(fd); - mutex_lock(shared_memory_mutex); - for (int i = 0; i < shared_memory->num_handles; i++) { - if (shared_memory->handles[i].handle == handle) { - mutex_unlock(shared_memory_mutex); - return 0; - } - if (shared_memory->handles[i].handle == -1) { - shared_memory->handles[i].handle = handle; - shared_memory->handles[i].fdstore_id = id; - mutex_unlock(shared_memory_mutex); - return 0; - } - } - mutex_unlock(shared_memory_mutex); - - return -1; -} - -int amdgpu_id_for_handle(int handle) -{ - mutex_lock(shared_memory_mutex); - for (int i = 0; i < shared_memory->num_handles; i++) { - if (shared_memory->handles[i].handle == handle) { - mutex_unlock(shared_memory_mutex); - return shared_memory->handles[i].fdstore_id; - } - } - mutex_unlock(shared_memory_mutex); - return -1; -} - -int amdgpu_restore_init(void) -{ - if (!shared_memory) { - int protection = PROT_READ | PROT_WRITE; - int visibility = MAP_SHARED | MAP_ANONYMOUS; - size_t img_size; - FILE *img_fp = NULL; - int ret; - unsigned char *buf; - int num_handles = 0; - char img_path[PATH_MAX]; - CriuRenderNode *rd = NULL; - CriuKfd *e = NULL; - - DIR *d; - struct dirent *dir; - d = opendir("."); - if (d) { - while ((dir = readdir(d)) != NULL) { - if (strncmp("amdgpu-kfd-", dir->d_name, strlen("amdgpu-kfd-")) == 0) { - img_fp = open_img_file(dir->d_name, false, &img_size); - buf = xmalloc(img_size); - if (!buf) { - fclose(img_fp); - return -ENOMEM; - } - - ret = read_fp(img_fp, buf, img_size); - if (ret) { - pr_perror("Unable to read from %s", img_path); - fclose(img_fp); - xfree(buf); - return ret; - } - - fclose(img_fp); - e = criu_kfd__unpack(NULL, img_size, buf); - num_handles += e->num_of_bos; - criu_kfd__free_unpacked(e, NULL); - xfree(buf); - } - if (strncmp("amdgpu-renderD-", 
dir->d_name, strlen("amdgpu-renderD-")) == 0) { - img_fp = open_img_file(dir->d_name, false, &img_size); - buf = xmalloc(img_size); - if (!buf) { - fclose(img_fp); - return -ENOMEM; - } - - ret = read_fp(img_fp, buf, img_size); - if (ret) { - pr_perror("Unable to read from %s", img_path); - fclose(img_fp); - xfree(buf); - return ret; - } - - fclose(img_fp); - rd = criu_render_node__unpack(NULL, img_size, buf); - num_handles += rd->num_of_bos; - criu_render_node__free_unpacked(rd, NULL); - xfree(buf); - } - } - closedir(d); - } - - if (num_handles > 0) { - shared_memory = mmap(NULL, sizeof(shared_memory), protection, visibility, -1, 0); - shared_memory->num_handles = num_handles; - shared_memory->handles = mmap(NULL, sizeof(struct handle_id) * num_handles, protection, visibility, -1, 0); - - for (int i = 0; i < num_handles; i++) { - shared_memory->handles[i].handle = -1; - shared_memory->handles[i].fdstore_id = -1; - } - - shared_memory_mutex = shmalloc(sizeof(*shared_memory_mutex)); - if (!shared_memory_mutex) { - pr_err("Can't create amdgpu mutex\n"); - return -1; - } - mutex_init(shared_memory_mutex); - } - } - - return 0; -} -CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESTORE_INIT, amdgpu_restore_init) - -static int save_devices(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_device_bucket *device_buckets, - CriuKfd *e) -{ - int ret = 0; - - pr_debug("Dumping %d devices\n", args->num_devices); - - /* When checkpointing on a node where there was already a checkpoint-restore before, the - * user_gpu_id and actual_gpu_id will be different. - * - * We store the user_gpu_id in the stored image files so that the stored images always have - * the gpu_id's of the node where the application was first launched. 
- */ - for (int i = 0; i < args->num_devices; i++) - maps_add_gpu_entry(&checkpoint_maps, device_buckets[i].actual_gpu_id, device_buckets[i].user_gpu_id); - - e->num_of_gpus = args->num_devices; - e->num_of_cpus = src_topology.num_nodes - args->num_devices; - - /* The ioctl will only return entries for GPUs, but we also store entries for CPUs and the - * information for CPUs is obtained from parsing system topology - */ - ret = allocate_device_entries(e, src_topology.num_nodes); - if (ret) - goto exit; - - pr_debug("Number of CPUs:%d GPUs:%d\n", e->num_of_cpus, e->num_of_gpus); - - /* Store topology information that was obtained from parsing /sys/class/kfd/kfd/topology/ */ - ret = topology_to_devinfo(&src_topology, &checkpoint_maps, e->device_entries); - if (ret) - goto exit; - -exit: - pr_info("Dumped devices %s (ret:%d)\n", ret ? "Failed" : "Ok", ret); - return ret; -} - -static int save_bos(int id, int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e) -{ - struct thread_data *thread_datas; - int ret = 0, i; - amdgpu_device_handle h_dev; - uint32_t major, minor; - - pr_debug("Dumping %d BOs\n", args->num_bos); - - thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); - if (!thread_datas) { - ret = -ENOMEM; - goto exit; - } - - e->num_of_bos = args->num_bos; - ret = allocate_bo_entries(e, e->num_of_bos, bo_buckets); - if (ret) - goto exit; - - for (i = 0; i < e->num_of_bos; i++) { - struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; - KfdBoEntry *boinfo = e->bo_entries[i]; - - boinfo->gpu_id = bo_bucket->gpu_id; - boinfo->addr = bo_bucket->addr; - boinfo->size = bo_bucket->size; - boinfo->offset = bo_bucket->offset; - boinfo->alloc_flags = bo_bucket->alloc_flags; - - ret = amdgpu_device_initialize(node_get_drm_render_device(sys_get_node_by_gpu_id(&src_topology, bo_bucket->gpu_id)), &major, &minor, &h_dev); - - boinfo->handle = get_gem_handle(h_dev, bo_bucket->dmabuf_fd); - - amdgpu_device_deinitialize(h_dev); - 
} - for (i = 0; i < e->num_of_bos; i++) { - KfdBoEntry *boinfo = e->bo_entries[i]; - - ret = record_shared_bo(boinfo->handle, false); - if (ret) - goto exit; - } - - for (int i = 0; i < e->num_of_gpus; i++) { - struct tp_node *dev; - int ret_thread = 0; - - dev = sys_get_node_by_index(&src_topology, i); - if (!dev) { - ret = -ENODEV; - goto exit; - } - - thread_datas[i].id = id; - thread_datas[i].gpu_id = dev->gpu_id; - thread_datas[i].bo_buckets = bo_buckets; - thread_datas[i].bo_entries = e->bo_entries; - thread_datas[i].pid = e->pid; - thread_datas[i].num_of_bos = args->num_bos; - thread_datas[i].drm_fd = node_get_drm_render_device(dev); - if (thread_datas[i].drm_fd < 0) { - ret = thread_datas[i].drm_fd; - goto exit; - } - - ret_thread = pthread_create(&thread_datas[i].thread, NULL, dump_bo_contents, (void *)&thread_datas[i]); - if (ret_thread) { - pr_err("Failed to create thread[%i]\n", i); - ret = -ret_thread; - goto exit; - } - } - - for (int i = 0; i < e->num_of_gpus; i++) { - pthread_join(thread_datas[i].thread, NULL); - pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - - if (thread_datas[i].ret) { - ret = thread_datas[i].ret; - goto exit; - } - } -exit: - for (int i = 0; i < e->num_of_bos; i++) { - if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD) - close(bo_buckets[i].dmabuf_fd); - } - - xfree(thread_datas); - pr_info("Dumped bos %s (ret:%d)\n", ret ? 
"failed" : "ok", ret); - return ret; -} - -bool kernel_supports_criu(int fd) -{ - struct kfd_ioctl_get_version_args args = { 0 }; - bool close_fd = false, ret = true; - - if (fd < 0) { - fd = open(AMDGPU_KFD_DEVICE, O_RDONLY); - if (fd < 0) { - pr_perror("failed to open kfd in plugin"); - return false; - } - close_fd = true; - } - - if (kmtIoctl(fd, AMDKFD_IOC_GET_VERSION, &args) == -1) { - pr_perror("Failed to call get version ioctl"); - ret = false; - goto exit; - } - - pr_debug("Kernel IOCTL version:%d.%02d\n", args.major_version, args.minor_version); - - if (args.major_version != KFD_IOCTL_MAJOR_VERSION || args.minor_version < MIN_KFD_IOCTL_MINOR_VERSION) { - pr_err("CR not supported on current kernel (current:%02d.%02d min:%02d.%02d)\n", args.major_version, - args.minor_version, KFD_IOCTL_MAJOR_VERSION, MIN_KFD_IOCTL_MINOR_VERSION); - ret = false; - goto exit; - } - -exit: - if (close_fd) - close(fd); - - return ret; -} - -int amdgpu_plugin_dump_file(int fd, int id) -{ - struct kfd_ioctl_criu_args args = { 0 }; - char img_path[PATH_MAX]; - struct stat st, st_kfd; - unsigned char *buf; - CriuKfd *e = NULL; - int ret = 0; - size_t len; - - if (fstat(fd, &st) == -1) { - pr_perror("fstat error"); - return -1; - } - - ret = stat(AMDGPU_KFD_DEVICE, &st_kfd); - if (ret == -1) { - pr_perror("fstat error for /dev/kfd"); - return -1; - } - - if (topology_parse(&src_topology, "Checkpoint")) - return -1; - - /* We call topology_determine_iolinks to validate io_links. 
If io_links are not valid - * we do not store them inside the checkpointed images - */ - if (topology_determine_iolinks(&src_topology)) { - pr_err("Failed to determine iolinks from topology\n"); - return -1; - } - - /* Check whether this plugin was called for kfd, dmabuf or render nodes */ - ret = get_dmabuf_info(fd, &st); - if (ret < 0) { - pr_perror("Failed to get dmabuf info"); - return -1; - } - if (ret == 0) { - pr_info("Dumping dmabuf fd = %d\n", fd); - return amdgpu_plugin_dmabuf_dump(fd, id); - } - - if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) { - - /* This is RenderD dumper plugin, for now just save renderD - * minor number to be used during restore. In later phases this - * needs to save more data for video decode etc. - */ - ret = amdgpu_plugin_drm_dump_file(fd, id, &st); - if (ret) - return ret; - - ret = record_dumped_fd(fd, true); - if (ret) - return ret; - - /* Need to return success here so that criu can call plugins for renderD nodes */ - return try_dump_dmabuf_list(); - } - - pr_info("%s() called for fd = %d\n", __func__, major(st.st_rdev)); - - /* KFD only allows ioctl calls from the same process that opened the KFD file descriptor. - * The existing /dev/kfd file descriptor that is passed in is only allowed to do IOCTL calls with - * CAP_CHECKPOINT_RESTORE/CAP_SYS_ADMIN. So kernel_supports_criu() needs to open its own file descriptor to - * perform the AMDKFD_IOC_GET_VERSION ioctl. 
- */ - if (!kernel_supports_criu(-1)) - return -ENOTSUP; - - args.op = KFD_CRIU_OP_PROCESS_INFO; - if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { - pr_perror("Failed to call process info ioctl"); - ret = -1; - goto exit; - } - - pr_info("devices:%" PRIu32 " bos:%" PRIu32 " objects:%" PRIu32 " priv_data:%" PRIu64 "\n", - args.num_devices, args.num_bos, args.num_objects, args.priv_data_size); - - e = xmalloc(sizeof(*e)); - if (!e) { - pr_err("Failed to allocate proto structure\n"); - ret = -ENOMEM; - goto exit; - } - - criu_kfd__init(e); - e->pid = args.pid; - - args.devices = (uintptr_t)xzalloc((args.num_devices * sizeof(struct kfd_criu_device_bucket))); - if (!args.devices) { - ret = -ENOMEM; - goto exit; - } - - args.bos = (uintptr_t)xzalloc((args.num_bos * sizeof(struct kfd_criu_bo_bucket))); - if (!args.bos) { - ret = -ENOMEM; - goto exit; - } - - args.priv_data = (uintptr_t)xzalloc((args.priv_data_size)); - if (!args.priv_data) { - ret = -ENOMEM; - goto exit; - } - - args.op = KFD_CRIU_OP_CHECKPOINT; - ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args); - if (ret) { - pr_perror("Failed to call dumper (process) ioctl"); - goto exit; - } - - ret = save_devices(fd, &args, (struct kfd_criu_device_bucket *)args.devices, e); - if (ret) - goto exit; - - ret = save_bos(id, fd, &args, (struct kfd_criu_bo_bucket *)args.bos, e); - if (ret) - goto exit; - - e->num_of_objects = args.num_objects; - - e->priv_data.data = (void *)args.priv_data; - e->priv_data.len = args.priv_data_size; - - ret = check_hsakmt_shared_mem(&e->shared_mem_size, &e->shared_mem_magic); - if (ret) - goto exit; - - snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id); - pr_info("img_path = %s\n", img_path); - - len = criu_kfd__get_packed_size(e); - - pr_info("Len = %ld\n", len); - - buf = xmalloc(len); - if (!buf) { - pr_perror("Failed to allocate memory to store protobuf"); - ret = -ENOMEM; - goto exit; - } - - criu_kfd__pack(e, buf); - - ret = write_img_file(img_path, buf, len); - - xfree(buf); - 
- ret = record_dumped_fd(fd, false); - if (ret) - goto exit; - -exit: - xfree((void *)args.devices); - xfree((void *)args.bos); - xfree((void *)args.priv_data); - - free_e(e); - - if (ret) - pr_err("Failed to dump (ret:%d)\n", ret); - else - pr_info("Dump successful\n"); - - return ret; -} -CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__DUMP_EXT_FILE, amdgpu_plugin_dump_file) - -/* Restore per-device information */ -static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e) -{ - struct kfd_criu_device_bucket *device_buckets; - int ret = 0, bucket_index = 0; - - pr_debug("Restoring %d devices\n", e->num_of_gpus); - args->num_devices = e->num_of_gpus; - device_buckets = xzalloc(sizeof(*device_buckets) * args->num_devices); - if (!device_buckets) - return -ENOMEM; - - args->devices = (uintptr_t)device_buckets; - - for (int entries_i = 0; entries_i < e->num_of_cpus + e->num_of_gpus; entries_i++) { - struct kfd_criu_device_bucket *device_bucket; - KfdDeviceEntry *devinfo = e->device_entries[entries_i]; - struct tp_node *tp_node; - - if (!devinfo->gpu_id) - continue; - - device_bucket = &device_buckets[bucket_index++]; - - device_bucket->user_gpu_id = devinfo->gpu_id; - device_bucket->actual_gpu_id = maps_get_dest_gpu(&restore_maps, devinfo->gpu_id); - if (!device_bucket->actual_gpu_id) { - ret = -ENODEV; - goto exit; - } - - tp_node = sys_get_node_by_gpu_id(&dest_topology, device_bucket->actual_gpu_id); - if (!tp_node) { - ret = -ENODEV; - goto exit; - } - - device_bucket->drm_fd = node_get_drm_render_device(tp_node); - if (device_bucket->drm_fd < 0) { - pr_perror("Can't pass NULL drm render fd to driver"); - goto exit; - } else { - pr_info("passing drm render fd = %d to driver\n", device_bucket->drm_fd); - } - } - -exit: - pr_info("Restore devices %s (ret:%d)\n", ret ? 
"Failed" : "Ok", ret); - return ret; -} - -static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e) -{ - struct kfd_criu_bo_bucket *bo_buckets; - - pr_debug("Restoring %ld BOs\n", e->num_of_bos); - - args->num_bos = e->num_of_bos; - bo_buckets = xzalloc(sizeof(*bo_buckets) * args->num_bos); - if (!bo_buckets) - return -ENOMEM; - - args->bos = (uintptr_t)bo_buckets; - - for (int i = 0; i < args->num_bos; i++) { - struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; - KfdBoEntry *bo_entry = e->bo_entries[i]; - - bo_bucket->gpu_id = bo_entry->gpu_id; - bo_bucket->addr = bo_entry->addr; - bo_bucket->size = bo_entry->size; - bo_bucket->offset = bo_entry->offset; - bo_bucket->alloc_flags = bo_entry->alloc_flags; - - plugin_log_msg("BO [%d] gpu_id:%x addr:%llx size:%llx offset:%llx\n", i, bo_bucket->gpu_id, - bo_bucket->addr, bo_bucket->size, bo_bucket->offset); - } - - pr_info("Restore BOs Ok\n"); - - return 0; -} - -int save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int fd) -{ - struct vma_metadata *vma_md; - - vma_md = xmalloc(sizeof(*vma_md)); - if (!vma_md) { - return -ENOMEM; - } - - memset(vma_md, 0, sizeof(*vma_md)); - - vma_md->old_pgoff = offset; - vma_md->vma_entry = addr; - - vma_md->new_pgoff = restored_offset; - vma_md->fd = fd; - - list_add_tail(&vma_md->list, &update_vma_info_list); - - return 0; -} - -static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e) -{ - struct thread_data *thread_datas = NULL; - int thread_i, ret = 0; - uint64_t offset = 0; - - for (int i = 0; i < e->num_of_bos; i++) { - struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i]; - struct tp_node *tp_node; - - if (bo_bucket->alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT | - KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP | KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)) { - struct vma_metadata *vma_md; - uint32_t target_gpu_id; /* actual gpu_id where the BO will be restored */ - - vma_md = 
xmalloc(sizeof(*vma_md)); - if (!vma_md) { - ret = -ENOMEM; - goto exit; - } - - memset(vma_md, 0, sizeof(*vma_md)); - - vma_md->old_pgoff = bo_bucket->offset; - vma_md->vma_entry = bo_bucket->addr; - - target_gpu_id = maps_get_dest_gpu(&restore_maps, bo_bucket->gpu_id); - - tp_node = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); - if (!tp_node) { - pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); - ret = -ENODEV; - goto exit; - } - - vma_md->new_minor = tp_node->drm_render_minor; - vma_md->new_pgoff = bo_bucket->restored_offset; - vma_md->fd = node_get_drm_render_device(tp_node); - - plugin_log_msg("adding vma_entry:addr:0x%lx old-off:0x%lx " - "new_off:0x%lx new_minor:%d\n", - vma_md->vma_entry, vma_md->old_pgoff, vma_md->new_pgoff, vma_md->new_minor); - - list_add_tail(&vma_md->list, &update_vma_info_list); - } - } - - if (!parallel_disabled) { - parallel_restore_cmd restore_cmd; - pr_info("Begin to send parallel restore cmd\n"); - ret = init_parallel_restore_cmd(e->num_of_bos, id, e->num_of_gpus, &restore_cmd); - if (ret) - goto exit_parallel; - - for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { - uint32_t target_gpu_id; - struct tp_node *dev; - - if (!e->device_entries[i]->gpu_id) - continue; - - target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); - dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); - if (!dev) { - pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); - ret = -ENODEV; - goto exit_parallel; - } - parallel_restore_gpu_id_add(e->device_entries[i]->gpu_id, dev->drm_render_minor, &restore_cmd); - - for (int j = 0; j < e->num_of_bos; j++) { - if (bo_buckets[j].gpu_id != e->device_entries[i]->gpu_id) - continue; - if (bo_buckets[j].alloc_flags & - (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { - parallel_restore_bo_add(bo_buckets[j].dmabuf_fd, bo_buckets[j].gpu_id, - bo_buckets[j].size, offset, &restore_cmd); - offset += bo_buckets[j].size; - } - 
} - } - ret = send_parallel_restore_cmd(&restore_cmd); -exit_parallel: - free_parallel_restore_cmd(&restore_cmd); - } else { - thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus); - if (!thread_datas) { - ret = -ENOMEM; - goto exit; - } - - thread_i = 0; - for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) { - struct tp_node *dev; - int ret_thread = 0; - uint32_t target_gpu_id; - - if (!e->device_entries[i]->gpu_id) - continue; - - /* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */ - target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id); - - /* We need the fd for actual_gpu_id */ - dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); - if (!dev) { - pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id); - ret = -ENODEV; - goto exit; - } - - thread_datas[thread_i].id = id; - thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id; - thread_datas[thread_i].bo_buckets = bo_buckets; - thread_datas[thread_i].bo_entries = e->bo_entries; - thread_datas[thread_i].pid = e->pid; - thread_datas[thread_i].num_of_bos = e->num_of_bos; - - thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev); - if (thread_datas[thread_i].drm_fd < 0) { - ret = -thread_datas[thread_i].drm_fd; - goto exit; - } - - ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents, - (void *)&thread_datas[thread_i]); - if (ret_thread) { - pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread); - ret = -ret_thread; - goto exit; - } - thread_i++; - } - - for (int i = 0; i < e->num_of_gpus; i++) { - pthread_join(thread_datas[i].thread, NULL); - pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - - if (thread_datas[i].ret) { - ret = thread_datas[i].ret; - goto exit; - } - } - } -exit: - for (int i = 0; i < e->num_of_bos; i++) { - if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD) - close(bo_buckets[i].dmabuf_fd); - } - if 
(thread_datas) - xfree(thread_datas); - return ret; -} - -int amdgpu_plugin_restore_file(int id, bool *retry_needed) -{ - int ret = 0, fd; - char img_path[PATH_MAX]; - unsigned char *buf; - CriuRenderNode *rd; - CriuKfd *e = NULL; - struct kfd_ioctl_criu_args args = { 0 }; - size_t img_size; - FILE *img_fp = NULL; - - *retry_needed = false; - - if (plugin_disabled) - return -ENOTSUP; - - pr_info("Initialized kfd plugin restorer with ID = %d\n", id); - - snprintf(img_path, sizeof(img_path), IMG_KFD_FILE, id); - - img_fp = open_img_file(img_path, false, &img_size); - if (!img_fp) { - struct tp_node *tp_node; - uint32_t target_gpu_id; - - /* This is restorer plugin for renderD nodes. Criu doesn't guarantee that they will - * be called before the plugin is called for kfd file descriptor. - * TODO: Currently, this code will only work if this function is called for /dev/kfd - * first as we assume restore_maps is already filled. Need to fix this later. - */ - snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id); - - img_fp = open_img_file(img_path, false, &img_size); - if (!img_fp) { - ret = amdgpu_plugin_dmabuf_restore(id); - if (ret == 1) { - /* This is a dmabuf fd, but the corresponding buffer object that was - * exported to make it has not yet been restored. Need to try again - * later when the buffer object exists, so it can be re-exported. 
- */ - *retry_needed = true; - return 0; - } - return ret; - } - pr_info("Restoring RenderD %s\n", img_path); - pr_debug("RenderD Image file size:%ld\n", img_size); - buf = xmalloc(img_size); - if (!buf) { - pr_perror("Failed to allocate memory"); - return -ENOMEM; - } - - ret = read_fp(img_fp, buf, img_size); - if (ret) { - pr_perror("Unable to read from %s", img_path); - xfree(buf); - return -1; - } - - rd = criu_render_node__unpack(NULL, img_size, buf); - if (rd == NULL) { - pr_perror("Unable to parse the RenderD message %d", id); - xfree(buf); - fclose(img_fp); - return -1; - } - fclose(img_fp); - - pr_info("render node gpu_id = 0x%04x\n", rd->gpu_id); - - target_gpu_id = maps_get_dest_gpu(&restore_maps, rd->gpu_id); - if (!target_gpu_id) { - fd = -ENODEV; - goto fail; - } - - tp_node = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id); - if (!tp_node) { - fd = -ENODEV; - goto fail; - } - - pr_info("render node destination gpu_id = 0x%04x\n", tp_node->gpu_id); - - fd = node_get_drm_render_device(tp_node); - if (fd < 0) { - pr_err("Failed to open render device (minor:%d)\n", tp_node->drm_render_minor); - return -1; - } - - ret = amdgpu_plugin_drm_restore_file(fd, rd); - if (ret == 1) - *retry_needed = true; - if (ret < 0) { - fd = ret; - goto fail; - } - fail: - criu_render_node__free_unpacked(rd, NULL); - xfree(buf); - /* - * We need to use the file descriptor used to create the BOs for mmap later, otherwise the kernel DRM - * drivers will not allow the mmap. Therefore, we keep a copy of the file descriptor (stored in tp_node) - * so that we can return it in amdgpu_plugin_update_vmamap later. Also, CRIU core will dup and close the - * returned fd after this function returns, and this will make our fd invalid. So we return a dup'ed - * copy of the fd. CRIU core owns the duplicated returned fd, and amdgpu_plugin owns the fd stored in - * tp_node. 
- */ - - if (fd < 0) - return fd; - - if (!(*retry_needed)) { - fd = dup(fd); - if (fd == -1) { - pr_perror("unable to duplicate the render fd"); - return -1; - } - return fd; - } - - return 0; - } - - fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); - if (fd < 0) { - pr_perror("failed to open kfd in plugin"); - return -1; - } - - pr_info("Opened kfd, fd = %d\n", fd); - - if (!kernel_supports_criu(fd)) - return -ENOTSUP; - - pr_info("KFD Image file size:%ld\n", img_size); - buf = xmalloc(img_size); - if (!buf) { - fclose(img_fp); - return -ENOMEM; - } - - ret = read_fp(img_fp, buf, img_size); - if (ret) { - pr_perror("Unable to read from %s", img_path); - fclose(img_fp); - xfree(buf); - return ret; - } - - fclose(img_fp); - e = criu_kfd__unpack(NULL, img_size, buf); - if (e == NULL) { - pr_err("Unable to parse the KFD message %#x\n", id); - xfree(buf); - return -1; - } - - plugin_log_msg("read image file data\n"); - - /* - * Initialize fd_next to be 1 greater than the biggest file descriptor in use by the target restore process. - * This way, we know that the file descriptors we store will not conflict with file descriptors inside core - * CRIU. 
- */ - if (fd_next == -1) { - fd_next = find_unused_fd_pid(e->pid); - if (fd_next <= 0) { - pr_err("Failed to find unused fd (fd:%d)\n", fd_next); - ret = -EINVAL; - goto exit; - } - } - - ret = devinfo_to_topology(e->device_entries, e->num_of_gpus + e->num_of_cpus, &src_topology); - if (ret) { - pr_err("Failed to convert stored device information to topology\n"); - ret = -EINVAL; - goto exit; - } - - ret = topology_parse(&dest_topology, "Local"); - if (ret) { - pr_err("Failed to parse local system topology\n"); - goto exit; - } - - ret = set_restore_gpu_maps(&src_topology, &dest_topology, &restore_maps); - if (ret) { - pr_err("Failed to map GPUs\n"); - goto exit; - } - - ret = restore_devices(&args, e); - if (ret) - goto exit; - - ret = restore_bos(&args, e); - if (ret) - goto exit; - - args.num_objects = e->num_of_objects; - args.priv_data_size = e->priv_data.len; - args.priv_data = (uintptr_t)e->priv_data.data; - args.op = KFD_CRIU_OP_RESTORE; - - if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { - pr_perror("Restore ioctl failed"); - ret = -1; - goto exit; - } - - if (ret < 0) - goto exit; - - for (int i = 0; i < args.num_bos; i++) { - struct kfd_criu_bo_bucket *bo_bucket = &((struct kfd_criu_bo_bucket *)args.bos)[i]; - KfdBoEntry *bo_entry = e->bo_entries[i]; - - if (bo_entry->handle != -1) { - store_dmabuf_fd(bo_entry->handle, bo_bucket->dmabuf_fd); - } - } - - ret = restore_bo_data(id, (struct kfd_criu_bo_bucket *)args.bos, e); - if (ret) - goto exit; - - ret = restore_hsakmt_shared_mem(e->shared_mem_size, e->shared_mem_magic); - -exit: - if (e) - criu_kfd__free_unpacked(e, NULL); - - xfree((void *)args.devices); - xfree((void *)args.bos); - xfree(buf); - - if (ret) { - pr_err("Failed to restore (ret:%d)\n", ret); - fd = ret; - } else { - pr_info("Restore successful (fd:%d)\n", fd); - } - - return fd; -} -CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, amdgpu_plugin_restore_file) - -/* return 0 if no match found - * return -1 for error. 
- * return 1 if vmap map must be adjusted. - */ -int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const uint64_t old_offset, - uint64_t *new_offset, int *updated_fd) -{ - struct vma_metadata *vma_md; - char path[PATH_MAX]; - char *p_begin; - char *p_end; - bool is_kfd = false, is_renderD = false; - - if (plugin_disabled) - return -ENOTSUP; - - plugin_log_msg("Enter %s\n", __func__); - - strncpy(path, in_path, sizeof(path)); - - p_begin = path; - p_end = p_begin + strlen(path); - - /* - * Paths sometimes have double forward slashes (e.g //dev/dri/renderD*) - * replace all '//' with '/'. - */ - while (p_begin < p_end - 1) { - if (*p_begin == '/' && *(p_begin + 1) == '/') - memmove(p_begin, p_begin + 1, p_end - p_begin); - else - p_begin++; - } - - if (!strncmp(path, "/dev/dri/renderD", strlen("/dev/dri/renderD"))) - is_renderD = true; - - if (!strcmp(path, AMDGPU_KFD_DEVICE)) - is_kfd = true; - - if (!is_renderD && !is_kfd) { - pr_info("Skipping unsupported path:%s addr:%lx old_offset:%lx\n", in_path, addr, old_offset); - return 0; - } - - list_for_each_entry(vma_md, &update_vma_info_list, list) { - if (addr == vma_md->vma_entry && old_offset == vma_md->old_pgoff) { - *new_offset = vma_md->new_pgoff; - - *updated_fd = -1; - if (is_renderD) { - int fd = dup(vma_md->fd); - if (fd == -1) { - pr_perror("unable to duplicate the render fd"); - return -1; - } - *updated_fd = fd; - } - - plugin_log_msg("old_pgoff=0x%lx new_pgoff=0x%lx fd=%d\n", vma_md->old_pgoff, vma_md->new_pgoff, - *updated_fd); - - return 1; - } - } - pr_info("No match for addr:0x%lx offset:%lx\n", addr, old_offset); - return 0; -} -CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, amdgpu_plugin_update_vmamap) - -int amdgpu_plugin_resume_devices_late(int target_pid) -{ - struct kfd_ioctl_criu_args args = { 0 }; - int fd, exit_code = 0; - - if (plugin_disabled) - return -ENOTSUP; - - if (!parallel_disabled) { - pr_info("Close parallel restore server\n"); - if 
(close_parallel_restore_server()) { - pr_err("Close parallel restore server fail\n"); - return -1; - } - - exit_code = pthread_join(parallel_thread, NULL); - if (exit_code) { - pr_err("Failed to join parallel thread ret:%d\n", exit_code); - return -1; - } - if (parallel_thread_result) { - pr_err("Parallel restore fail\n"); - return parallel_thread_result; - } - } - - pr_info("Inside %s for target pid = %d\n", __func__, target_pid); - - fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); - if (fd < 0) { - pr_perror("failed to open kfd in plugin"); - return -ENOTSUP; - } - - args.pid = target_pid; - args.op = KFD_CRIU_OP_RESUME; - pr_info("Calling IOCTL to start notifiers and queues\n"); - if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { - if (errno == ESRCH) { - pr_info("Pid %d has no kfd process info\n", target_pid); - exit_code = -ENOTSUP; - } else { - pr_perror("restore late ioctl failed"); - exit_code = -1; - } - } - - clear_restore_state(); - - close(fd); - return exit_code; -} - -CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) - -int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size) -{ - int ret = 0; - int drm_fd = -1; - uint32_t major, minor; - - struct amdgpu_gpu_info gpu_info = { 0 }; - - drm_fd = open_drm_render_device(dev_minor); - if (drm_fd < 0) { - return drm_fd; - } - - ret = amdgpu_device_initialize(drm_fd, &major, &minor, h_dev); - if (ret) { - pr_perror("Failed to initialize device"); - goto err; - } - - ret = amdgpu_query_gpu_info(*h_dev, &gpu_info); - if (ret) { - pr_perror("failed to query gpuinfo via libdrm"); - goto err; - } - *max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? 
SDMA_LINEAR_COPY_MAX_SIZE : - SDMA_LINEAR_COPY_MAX_SIZE - 1; - return 0; -err: - amdgpu_device_deinitialize(*h_dev); - return ret; -} - -FILE *get_bo_contents_fp(int id, int gpu_id, size_t tot_size) -{ - char img_path[PATH_MAX]; - size_t image_size = 0; - FILE *bo_contents_fp = NULL; - - snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, id, gpu_id); - bo_contents_fp = open_img_file(img_path, false, &image_size); - if (!bo_contents_fp) { - pr_perror("Cannot fopen %s", img_path); - return NULL; - } - - if (tot_size != image_size) { - pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, tot_size); - fclose(bo_contents_fp); - return NULL; - } - return bo_contents_fp; -} - -struct parallel_thread_data { - pthread_t thread; - uint32_t gpu_id; - int minor; - parallel_restore_cmd *restore_cmd; - int ret; -}; - -void *parallel_restore_bo_contents(void *_thread_data) -{ - struct parallel_thread_data *thread_data = (struct parallel_thread_data *)_thread_data; - amdgpu_device_handle h_dev; - uint64_t max_copy_size; - size_t total_bo_size = 0, max_bo_size = 0, buffer_size = 0; - FILE *bo_contents_fp = NULL; - parallel_restore_entry *entry; - parallel_restore_cmd *restore_cmd = thread_data->restore_cmd; - int ret = 0; - int offset = 0; - void *buffer = NULL; - - ret = init_dev(thread_data->minor, &h_dev, &max_copy_size); - if (ret) { - goto err; - } - - for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { - if (restore_cmd->entries[i].gpu_id == thread_data->gpu_id) { - total_bo_size += restore_cmd->entries[i].size; - max_bo_size = max(restore_cmd->entries[i].size, max_bo_size); - } - } - - buffer_size = kfd_max_buffer_size > 0 ? 
min(kfd_max_buffer_size, max_bo_size) : max_bo_size; - - bo_contents_fp = get_bo_contents_fp(restore_cmd->cmd_head.id, thread_data->gpu_id, total_bo_size); - if (bo_contents_fp == NULL) { - ret = -1; - goto err_sdma; - } - offset = ftell(bo_contents_fp); - - posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); - if (!buffer) { - pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE."); - ret = -ENOMEM; - goto err_sdma; - } - - for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) { - if (restore_cmd->entries[i].gpu_id != thread_data->gpu_id) - continue; - - entry = &restore_cmd->entries[i]; - fseeko(bo_contents_fp, entry->read_offset + offset, SEEK_SET); - ret = sdma_copy_bo(restore_cmd->fds_write[entry->write_id], entry->size, bo_contents_fp, - buffer, buffer_size, h_dev, - max_copy_size, SDMA_OP_VRAM_WRITE, false); - - if (ret) { - pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); - goto err_sdma; - } - } - -err_sdma: - if (bo_contents_fp) - fclose(bo_contents_fp); - if (buffer) - xfree(buffer); - amdgpu_device_deinitialize(h_dev); -err: - thread_data->ret = ret; - return NULL; -} - -void *restore_device_parallel_worker(void *arg) -{ - while (1) { - parallel_restore_cmd restore_cmd = { 0 }; - struct parallel_thread_data *thread_datas = NULL; - int ret; - int error_occurred = 0, join_ret = 0, created_threads = 0; - - ret = recv_parallel_restore_cmd(&restore_cmd); - if (ret) { - if (ret == 1) { - *(int *)arg = 0; - goto exit; - } - goto err; - } - - thread_datas = xzalloc(sizeof(*thread_datas) * restore_cmd.cmd_head.gpu_num); - if (!thread_datas) { - ret = -ENOMEM; - goto err; - } - - for (; created_threads < restore_cmd.cmd_head.gpu_num; created_threads++) { - thread_datas[created_threads].gpu_id = restore_cmd.gpu_ids[created_threads].gpu_id; - thread_datas[created_threads].minor = restore_cmd.gpu_ids[created_threads].minor; - thread_datas[created_threads].restore_cmd = &restore_cmd; - - ret = 
pthread_create(&thread_datas[created_threads].thread, NULL, parallel_restore_bo_contents, - (void *)&thread_datas[created_threads]); - if (ret) { - pr_err("Failed to create thread[0x%x] ret:%d\n", thread_datas[created_threads].gpu_id, ret); - error_occurred = 1; - break; - } - } - - for (int i = 0; i < created_threads; i++) { - join_ret = pthread_join(thread_datas[i].thread, NULL); - if (join_ret != 0) { - pr_err("pthread_join failed for Thread[0x%x] ret:%d\n", - thread_datas[i].gpu_id, join_ret); - if (!error_occurred) { - ret = join_ret; - error_occurred = 1; - } - } - - pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret); - - /* Check thread return value */ - if (thread_datas[i].ret && !error_occurred) { - ret = thread_datas[i].ret; - error_occurred = 1; - } - } - - if (thread_datas) - xfree(thread_datas); -err: - free_parallel_restore_cmd(&restore_cmd); - - if (ret) { - *(int *)arg = ret; - return NULL; - } - } -exit: - return NULL; -} - -/* - * While the background thread is running, some processing functions (e.g., stop_cgroupd) - * in the main thread need to block SIGCHLD. To prevent interference from this background - * thread, SIGCHLD is blocked in this thread. 
- */ -static int back_thread_create(pthread_t *newthread, void *(*f)(void *), void *arg) -{ - int ret = 0; - sigset_t blockmask, oldmask; - - sigemptyset(&blockmask); - sigaddset(&blockmask, SIGCHLD); - sigprocmask(SIG_BLOCK, &blockmask, &oldmask); - - ret = pthread_create(newthread, NULL, f, arg); - if (ret) { - pr_err("Create worker thread fail: %d\n", ret); - return -1; - } - - sigprocmask(SIG_SETMASK, &oldmask, NULL); - return 0; -} - -int amdgpu_plugin_post_forking(void) -{ - if (plugin_disabled) - return -ENOTSUP; - - if (parallel_disabled) - return 0; - - return back_thread_create(¶llel_thread, restore_device_parallel_worker, ¶llel_thread_result); -} -CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking) diff --git a/plugins/amdgpu/amdgpu_plugin_dmabuf.c b/plugins/amdgpu/amdgpu_plugin_dmabuf.c deleted file mode 100644 index 11c9792e3..000000000 --- a/plugins/amdgpu/amdgpu_plugin_dmabuf.c +++ /dev/null @@ -1,197 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "common/list.h" -#include "criu-amdgpu.pb-c.h" - -#include "xmalloc.h" -#include "criu-log.h" -#include "amdgpu_plugin_drm.h" -#include "amdgpu_plugin_util.h" -#include "amdgpu_plugin_dmabuf.h" -#include "fdstore.h" - -#include "util.h" -#include "common/scm.h" - -struct dmabuf { - int id; - int dmabuf_fd; - struct list_head node; -}; - -static LIST_HEAD(dmabuf_list); - -/* Return < 0 for error, > 0 for "not a dmabuf" and 0 "is a dmabuf" */ -int get_dmabuf_info(int fd, struct stat *st) -{ - char path[PATH_MAX]; - - if (read_fd_link(fd, path, sizeof(path)) < 0) - return -1; - - if (strncmp(path, DMABUF_LINK, strlen(DMABUF_LINK)) != 0) - return 1; - - return 0; -} - -int __amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id) -{ - int ret = 0; - char path[PATH_MAX]; - size_t len = 0; - unsigned char *buf = NULL; - int gem_handle; - - gem_handle = handle_for_shared_bo_fd(dmabuf_fd); - if 
(gem_handle < 0) { - pr_err("Failed to get handle for dmabuf_fd = %d\n", dmabuf_fd); - return -EAGAIN; /* Retry needed */ - } - - CriuDmabufNode *node = xmalloc(sizeof(*node)); - if (!node) { - pr_err("Failed to allocate memory for dmabuf node\n"); - return -ENOMEM; - } - criu_dmabuf_node__init(node); - - node->gem_handle = gem_handle; - - if (node->gem_handle < 0) { - pr_err("Failed to get handle for dmabuf_fd\n"); - xfree(node); - return -EINVAL; - } - - /* Serialize metadata to a file */ - snprintf(path, sizeof(path), IMG_DMABUF_FILE, id); - len = criu_dmabuf_node__get_packed_size(node); - buf = xmalloc(len); - if (!buf) { - pr_err("Failed to allocate buffer for dmabuf metadata\n"); - xfree(node); - return -ENOMEM; - } - criu_dmabuf_node__pack(node, buf); - ret = write_img_file(path, buf, len); - - xfree(buf); - xfree(node); - return ret; -} - -int amdgpu_plugin_dmabuf_restore(int id) -{ - char path[PATH_MAX]; - size_t img_size; - FILE *img_fp = NULL; - int ret = 0; - CriuDmabufNode *rd = NULL; - unsigned char *buf = NULL; - int fd_id; - - snprintf(path, sizeof(path), IMG_DMABUF_FILE, id); - - /* Read serialized metadata */ - img_fp = open_img_file(path, false, &img_size); - if (!img_fp) { - pr_err("Failed to open dmabuf metadata file: %s\n", path); - return -EINVAL; - } - - pr_debug("dmabuf Image file size:%ld\n", img_size); - buf = xmalloc(img_size); - if (!buf) { - pr_perror("Failed to allocate memory"); - return -ENOMEM; - } - - ret = read_fp(img_fp, buf, img_size); - if (ret) { - pr_perror("Unable to read from %s", path); - xfree(buf); - return ret; - } - - rd = criu_dmabuf_node__unpack(NULL, img_size, buf); - if (rd == NULL) { - pr_perror("Unable to parse the dmabuf message %d", id); - xfree(buf); - fclose(img_fp); - return -1; - } - fclose(img_fp); - - /* Match GEM handle with shared_dmabuf list */ - fd_id = amdgpu_id_for_handle(rd->gem_handle); - if (fd_id == -1) { - pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", rd->gem_handle); - return 1; - 
} - - int dmabuf_fd = fdstore_get(fd_id); - if (dmabuf_fd == -1) { - pr_err("Failed to find dmabuf_fd for GEM handle = %d\n", rd->gem_handle); - return 1; /* Retry needed */ - } - - pr_info("Restored dmabuf_fd = %d for GEM handle = %d\n", dmabuf_fd, rd->gem_handle); - ret = dmabuf_fd; - - pr_info("Successfully restored dmabuf_fd %d\n", dmabuf_fd); - criu_dmabuf_node__free_unpacked(rd, NULL); - xfree(buf); - return ret; -} - -int amdgpu_plugin_dmabuf_dump(int dmabuf_fd, int id) -{ - int ret; - - ret = __amdgpu_plugin_dmabuf_dump(dmabuf_fd, id); - if (ret == -EAGAIN) { - struct dmabuf *b = xmalloc(sizeof(*b)); - b->id = id; - b->dmabuf_fd = dmabuf_fd; - list_add(&b->node, &dmabuf_list); - return 0; - } - return ret; -} - -int try_dump_dmabuf_list() -{ - struct dmabuf *b, *t; - list_for_each_entry_safe(b, t, &dmabuf_list, node) { - int ret = __amdgpu_plugin_dmabuf_dump(b->dmabuf_fd, b->id); - if (ret == -EAGAIN) - continue; - if (ret) - return ret; - list_del(&b->node); - xfree(b); - } - return 0; -} - -int post_dump_dmabuf_check() -{ - if (!list_empty(&dmabuf_list)) { - pr_err("Not all dma buffers have been dumped\n"); - return -1; - } - return 0; -} diff --git a/plugins/amdgpu/amdgpu_plugin_dmabuf.h b/plugins/amdgpu/amdgpu_plugin_dmabuf.h deleted file mode 100644 index f07af7ee0..000000000 --- a/plugins/amdgpu/amdgpu_plugin_dmabuf.h +++ /dev/null @@ -1,16 +0,0 @@ - -#ifndef __AMDGPU_PLUGIN_DMABUF_H__ -#define __AMDGPU_PLUGIN_DMABUF_H__ - -#include "amdgpu_plugin_util.h" -#include "criu-amdgpu.pb-c.h" - -int amdgpu_plugin_dmabuf_dump(int fd, int id); -int amdgpu_plugin_dmabuf_restore(int id); - -int try_dump_dmabuf_list(); -int post_dump_dmabuf_check(); - -int get_dmabuf_info(int fd, struct stat *st); - -#endif /* __AMDGPU_PLUGIN_DMABUF_H__ */ \ No newline at end of file diff --git a/plugins/amdgpu/amdgpu_plugin_drm.c b/plugins/amdgpu/amdgpu_plugin_drm.c deleted file mode 100644 index 3520bca7a..000000000 --- a/plugins/amdgpu/amdgpu_plugin_drm.c +++ /dev/null @@ 
-1,569 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include "common/list.h" -#include "files.h" -#include "fdstore.h" - -#include "criu-amdgpu.pb-c.h" - -/* Define __user as empty for kernel headers in user-space */ -#define __user -#include "drm.h" - -#include -#include - -#include "xmalloc.h" -#include "amdgpu_drm.h" -#include "amdgpu_plugin_drm.h" -#include "amdgpu_plugin_util.h" -#include "amdgpu_plugin_topology.h" - -#include "util.h" -#include "common/scm.h" - -int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd) -{ - uint32_t handle; - int fd = amdgpu_device_get_fd(h_dev); - - if (dmabuf_fd == -1) { - return -1; - } - - if (drmPrimeFDToHandle(fd, dmabuf_fd, &handle)) - return -1; - - return handle; -} - -int drmIoctl(int fd, unsigned long request, void *arg) -{ - int ret, max_retries = 200; - - do { - ret = ioctl(fd, request, arg); - } while (ret == -1 && max_retries-- > 0 && (errno == EINTR || errno == EAGAIN)); - - if (ret == -1 && errno == EBADF) - /* In case pthread_atfork didn't catch it, this will - * make any subsequent hsaKmt calls fail in CHECK_KFD_OPEN. 
- */ - pr_perror("KFD file descriptor not valid in this process"); - return ret; -} - -static int allocate_bo_entries(CriuRenderNode *e, int num_bos) -{ - e->bo_entries = xmalloc(sizeof(DrmBoEntry *) * num_bos); - if (!e->bo_entries) { - pr_err("Failed to allocate bo_info\n"); - return -ENOMEM; - } - - for (int i = 0; i < num_bos; i++) { - DrmBoEntry *entry = xzalloc(sizeof(*entry)); - - if (!entry) { - pr_err("Failed to allocate botest\n"); - return -ENOMEM; - } - - drm_bo_entry__init(entry); - - e->bo_entries[i] = entry; - e->n_bo_entries++; - } - return 0; -} - -static int allocate_vm_entries(DrmBoEntry *e, int num_vms) -{ - e->vm_entries = xmalloc(sizeof(DrmVmEntry *) * num_vms); - if (!e->vm_entries) { - pr_err("Failed to allocate bo_info\n"); - return -ENOMEM; - } - - for (int i = 0; i < num_vms; i++) { - DrmVmEntry *entry = xzalloc(sizeof(*entry)); - - if (!entry) { - pr_err("Failed to allocate botest\n"); - return -ENOMEM; - } - - drm_vm_entry__init(entry); - - e->vm_entries[i] = entry; - e->n_vm_entries++; - } - return 0; -} - -static void free_e(CriuRenderNode *e) -{ - for (int i = 0; i < e->n_bo_entries; i++) { - if (e->bo_entries[i]) - xfree(e->bo_entries[i]); - } - - xfree(e); -} - -int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st) -{ - char path[PATH_MAX]; - struct stat drm; - int ret = 0; - - snprintf(path, sizeof(path), AMDGPU_DRM_DEVICE, DRM_FIRST_RENDER_NODE); - ret = stat(path, &drm); - if (ret == -1) { - pr_err("Error in getting stat for: %s\n", path); - return ret; - } - - if ((major(st->st_rdev) != major(drm.st_rdev)) || - (minor(st->st_rdev) < minor(drm.st_rdev)) || - (minor(st->st_rdev) > DRM_LAST_RENDER_NODE)) { - pr_err("Can't handle VMA mapping of input device\n"); - return -ENOTSUP; - } - - pr_debug("AMD DRI(maj,min) = %d:%d VMA Device FD(maj,min) = %d:%d\n", - major(drm.st_rdev), minor(drm.st_rdev), - major(st->st_rdev), minor(st->st_rdev)); - - return 0; -} - -static int restore_bo_contents_drm(int 
drm_render_minor, CriuRenderNode *rd, int drm_fd, int *dmabufs) -{ - size_t image_size = 0, max_bo_size = 0, buffer_size; - struct amdgpu_gpu_info gpu_info = { 0 }; - amdgpu_device_handle h_dev; - uint64_t max_copy_size; - uint32_t major, minor; - FILE *bo_contents_fp = NULL; - void *buffer = NULL; - char img_path[40]; - int i, ret = 0; - - ret = amdgpu_device_initialize(drm_fd, &major, &minor, &h_dev); - if (ret) { - pr_perror("failed to initialize device"); - goto exit; - } - plugin_log_msg("libdrm initialized successfully\n"); - - ret = amdgpu_query_gpu_info(h_dev, &gpu_info); - if (ret) { - pr_perror("failed to query gpuinfo via libdrm"); - goto exit; - } - - max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE : - SDMA_LINEAR_COPY_MAX_SIZE - 1; - - for (i = 0; i < rd->num_of_bos; i++) { - if (rd->bo_entries[i]->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT)) { - if (rd->bo_entries[i]->size > max_bo_size) - max_bo_size = rd->bo_entries[i]->size; - } - } - - buffer_size = max_bo_size; - - posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size); - if (!buffer) { - pr_perror("Failed to alloc aligned memory. 
Consider setting KFD_MAX_BUFFER_SIZE."); - ret = -ENOMEM; - goto exit; - } - - for (i = 0; i < rd->num_of_bos; i++) { - if (!(rd->bo_entries[i]->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT))) - continue; - - if (rd->bo_entries[i]->num_of_vms == 0) - continue; - - snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, drm_render_minor, i); - - bo_contents_fp = open_img_file(img_path, false, &image_size); - - ret = sdma_copy_bo(dmabufs[i], rd->bo_entries[i]->size, bo_contents_fp, buffer, buffer_size, h_dev, max_copy_size, - SDMA_OP_VRAM_WRITE, true); - if (ret) { - pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i); - break; - } - plugin_log_msg("** Successfully filled the BO using sDMA: bo_buckets[%d] **\n", i); - - if (bo_contents_fp) - fclose(bo_contents_fp); - } - -exit: - for (int i = 0; i < rd->num_of_bos; i++) { - if (dmabufs[i] != KFD_INVALID_FD) - close(dmabufs[i]); - } - - xfree(buffer); - - amdgpu_device_deinitialize(h_dev); - return ret; -} - -int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm) -{ - CriuRenderNode *rd = NULL; - char path[PATH_MAX]; - unsigned char *buf; - int minor; - int len; - int ret; - size_t image_size; - struct tp_node *tp_node; - struct drm_amdgpu_gem_list_handles list_handles_args = { 0 }; - struct drm_amdgpu_gem_list_handles_entry *list_handles_entries; - int num_bos; - - rd = xmalloc(sizeof(*rd)); - if (!rd) { - ret = -ENOMEM; - goto exit; - } - criu_render_node__init(rd); - - /* Get the topology node of the DRM device */ - minor = minor(drm->st_rdev); - rd->drm_render_minor = minor; - rd->id = id; - - num_bos = 8; - list_handles_entries = xzalloc(sizeof(struct drm_amdgpu_gem_list_handles_entry) * num_bos); - list_handles_args.num_entries = num_bos; - list_handles_args.entries = (uintptr_t)list_handles_entries; - - ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES, &list_handles_args); - if (ret && errno == EINVAL) { - pr_info("This kernel appears not to have 
AMDGPU_GEM_LIST_HANDLES ioctl. Consider disabling Dmabuf IPC or updating your kernel.\n"); - list_handles_args.num_entries = 0; - } else if (ret) { - pr_perror("Failed to call bo info ioctl"); - goto exit; - } - - if (list_handles_args.num_entries > num_bos) { - num_bos = list_handles_args.num_entries; - xfree(list_handles_entries); - list_handles_entries = xzalloc(sizeof(struct drm_amdgpu_gem_list_handles_entry) * num_bos); - list_handles_args.num_entries = num_bos; - list_handles_args.entries = (uintptr_t)list_handles_entries; - ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_LIST_HANDLES, &list_handles_args); - if (ret) { - pr_perror("Failed to call bo info ioctl"); - goto exit; - } - } else { - num_bos = list_handles_args.num_entries; - } - - rd->num_of_bos = num_bos; - ret = allocate_bo_entries(rd, num_bos); - if (ret) - goto exit; - - for (int i = 0; i < num_bos; i++) { - int num_vm_entries = 8; - struct drm_amdgpu_gem_vm_entry *vm_info_entries; - struct drm_amdgpu_gem_op vm_info_args = { 0 }; - DrmBoEntry *boinfo = rd->bo_entries[i]; - struct drm_amdgpu_gem_list_handles_entry handle_entry = list_handles_entries[i]; - union drm_amdgpu_gem_mmap mmap_args = { 0 }; - int dmabuf_fd; - uint32_t major, minor; - amdgpu_device_handle h_dev; - void *buffer = NULL; - char img_path[40]; - FILE *bo_contents_fp = NULL; - int device_fd; - - boinfo->size = handle_entry.size; - - boinfo->alloc_flags = handle_entry.alloc_flags; - boinfo->preferred_domains = handle_entry.preferred_domains; - boinfo->alignment = handle_entry.alignment; - boinfo->handle = handle_entry.gem_handle; - boinfo->is_import = (handle_entry.flags & AMDGPU_GEM_LIST_HANDLES_FLAG_IS_IMPORT) || shared_bo_has_exporter(boinfo->handle); - - mmap_args.in.handle = boinfo->handle; - - if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &mmap_args) == -1) { - pr_perror("Error Failed to call mmap ioctl"); - ret = -1; - goto exit; - } - - boinfo->offset = mmap_args.out.addr_ptr; - - vm_info_entries = xzalloc(sizeof(struct 
drm_amdgpu_gem_vm_entry) * num_vm_entries); - vm_info_args.handle = handle_entry.gem_handle; - vm_info_args.num_entries = num_vm_entries; - vm_info_args.value = (uintptr_t)vm_info_entries; - vm_info_args.op = AMDGPU_GEM_OP_GET_MAPPING_INFO; - ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_OP, &vm_info_args); - if (ret) { - pr_perror("Failed to call vm info ioctl"); - goto exit; - } - - if (vm_info_args.num_entries > num_vm_entries) { - num_vm_entries = vm_info_args.num_entries; - xfree(vm_info_entries); - vm_info_entries = xzalloc(sizeof(struct drm_amdgpu_gem_vm_entry) * num_vm_entries); - vm_info_args.handle = handle_entry.gem_handle; - vm_info_args.num_entries = num_vm_entries; - vm_info_args.value = (uintptr_t)vm_info_entries; - vm_info_args.op = AMDGPU_GEM_OP_GET_MAPPING_INFO; - ret = drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_OP, &vm_info_args); - if (ret) { - pr_perror("Failed to call vm info ioctl"); - goto exit; - } - } else { - num_vm_entries = vm_info_args.num_entries; - } - - boinfo->num_of_vms = num_vm_entries; - ret = allocate_vm_entries(boinfo, num_vm_entries); - if (ret) - goto exit; - - for (int j = 0; j < num_vm_entries; j++) { - DrmVmEntry *vminfo = boinfo->vm_entries[j]; - - boinfo->addr = vm_info_entries[j].addr; - vminfo->addr = vm_info_entries[j].addr; - vminfo->size = vm_info_entries[j].size; - vminfo->offset = vm_info_entries[j].offset; - vminfo->flags = vm_info_entries[j].flags; - } - - ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev); - - device_fd = amdgpu_device_get_fd(h_dev); - - drmPrimeHandleToFD(device_fd, boinfo->handle, 0, &dmabuf_fd); - - snprintf(img_path, sizeof(img_path), IMG_DRM_PAGES_FILE, rd->id, rd->drm_render_minor, i); - bo_contents_fp = open_img_file(img_path, true, &image_size); - - posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), handle_entry.size); - - ret = sdma_copy_bo(dmabuf_fd, handle_entry.size, bo_contents_fp, buffer, handle_entry.size, h_dev, 0x1000, - SDMA_OP_VRAM_READ, false); - - if (dmabuf_fd != KFD_INVALID_FD) - 
close(dmabuf_fd); - - if (bo_contents_fp) - fclose(bo_contents_fp); - - ret = amdgpu_device_deinitialize(h_dev); - if (ret) - goto exit; - - xfree(vm_info_entries); - } - xfree(list_handles_entries); - - for (int i = 0; i < num_bos; i++) { - DrmBoEntry *boinfo = rd->bo_entries[i]; - - ret = record_shared_bo(boinfo->handle, boinfo->is_import); - if (ret) - goto exit; - } - - tp_node = sys_get_node_by_render_minor(&src_topology, minor); - if (!tp_node) { - pr_err("Failed to find a device with minor number = %d\n", minor); - return -ENODEV; - } - - /* Get the GPU_ID of the DRM device */ - rd->gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id); - if (!rd->gpu_id) { - pr_err("Failed to find valid gpu_id for the device = %d\n", rd->gpu_id); - return -ENODEV; - } - - len = criu_render_node__get_packed_size(rd); - buf = xmalloc(len); - if (!buf) - return -ENOMEM; - - criu_render_node__pack(rd, buf); - - snprintf(path, sizeof(path), IMG_DRM_FILE, id); - ret = write_img_file(path, buf, len); - - xfree(buf); -exit: - free_e(rd); - return ret; -} - -int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd) -{ - int ret = 0; - bool retry_needed = false; - uint32_t major, minor; - amdgpu_device_handle h_dev; - int device_fd; - int *dmabufs = xzalloc(sizeof(int) * rd->num_of_bos); - - ret = amdgpu_device_initialize(fd, &major, &minor, &h_dev); - if (ret) { - pr_info("Error in init amdgpu device\n"); - goto exit; - } - - device_fd = amdgpu_device_get_fd(h_dev); - - for (int i = 0; i < rd->num_of_bos; i++) { - DrmBoEntry *boinfo = rd->bo_entries[i]; - int dmabuf_fd = -1; - uint32_t handle; - struct drm_gem_change_handle change_args = { 0 }; - union drm_amdgpu_gem_mmap mmap_args = { 0 }; - struct drm_amdgpu_gem_va va_args = { 0 }; - int fd_id; - - if (work_already_completed(boinfo->handle, rd->drm_render_minor)) { - continue; - } else if (boinfo->handle != -1) { - if (boinfo->is_import) { - fd_id = amdgpu_id_for_handle(boinfo->handle); - if (fd_id == -1) { - 
retry_needed = true; - continue; - } - dmabuf_fd = fdstore_get(fd_id); - } - } - - if (boinfo->is_import) { - drmPrimeFDToHandle(device_fd, dmabuf_fd, &handle); - } else { - union drm_amdgpu_gem_create create_args = { 0 }; - - create_args.in.bo_size = boinfo->size; - create_args.in.alignment = boinfo->alignment; - create_args.in.domains = boinfo->preferred_domains; - create_args.in.domain_flags = boinfo->alloc_flags; - - if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &create_args) == -1) { - pr_perror("Error Failed to call create ioctl"); - ret = -1; - goto exit; - } - handle = create_args.out.handle; - - drmPrimeHandleToFD(device_fd, handle, 0, &dmabuf_fd); - } - - change_args.handle = handle; - change_args.new_handle = boinfo->handle; - - if (drmIoctl(fd, DRM_IOCTL_GEM_CHANGE_HANDLE, &change_args) == -1) { - pr_perror("Error Failed to call change ioctl; check if the kernel has DRM_IOCTL_GEM_CHANGE_HANDLE support"); - ret = -1; - goto exit; - } - - if (!boinfo->is_import) - store_dmabuf_fd(boinfo->handle, dmabuf_fd); - - dmabufs[i] = dmabuf_fd; - - ret = record_completed_work(boinfo->handle, rd->drm_render_minor); - if (ret) - goto exit; - - mmap_args.in.handle = boinfo->handle; - - if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_MMAP, &mmap_args) == -1) { - pr_perror("Error Failed to call mmap ioctl"); - ret = -1; - goto exit; - } - - for (int j = 0; j < boinfo->num_of_vms; j++) { - DrmVmEntry *vminfo = boinfo->vm_entries[j]; - - va_args.handle = boinfo->handle; - va_args.operation = AMDGPU_VA_OP_MAP; - va_args.flags = vminfo->flags; - va_args.va_address = vminfo->addr; - va_args.offset_in_bo = vminfo->offset; - va_args.map_size = vminfo->size; - - if (drmIoctl(fd, DRM_IOCTL_AMDGPU_GEM_VA, &va_args) == -1) { - pr_perror("Error Failed to call gem va ioctl"); - ret = -1; - goto exit; - } - } - - ret = save_vma_updates(boinfo->offset, boinfo->addr, mmap_args.out.addr_ptr, fd); - if (ret < 0) - goto exit; - } - - if (ret) { - pr_info("Error in deinit amdgpu device\n"); - goto 
exit; - } - - ret = record_completed_work(-1, rd->drm_render_minor); - if (ret) - goto exit; - - ret = amdgpu_device_deinitialize(h_dev); - - if (rd->num_of_bos > 0) { - ret = restore_bo_contents_drm(rd->drm_render_minor, rd, fd, dmabufs); - if (ret) - goto exit; - } - -exit: - if (ret < 0) - return ret; - xfree(dmabufs); - - return retry_needed; -} diff --git a/plugins/amdgpu/amdgpu_plugin_drm.h b/plugins/amdgpu/amdgpu_plugin_drm.h deleted file mode 100644 index c766def56..000000000 --- a/plugins/amdgpu/amdgpu_plugin_drm.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef __AMDGPU_PLUGIN_DRM_H__ -#define __AMDGPU_PLUGIN_DRM_H__ - -#include -#include "common/list.h" - -#include "xmalloc.h" -#include "criu-log.h" -#include "kfd_ioctl.h" -#include "amdgpu_plugin_util.h" -#include "amdgpu_plugin_topology.h" - - -/** - * Determines if VMA's of input file descriptor belong to amdgpu's - * DRM device and are therefore supported - */ -int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm); - -/** - * Serialize meta-data about a particular DRM device, its number of BOs, - * etc into a file. 
The serialized filename has in it the value ID that - * is passed in as a parameter - */ -int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm); - -int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd); - -int amdgpu_plugin_drm_unpause_file(int fd); - -int amdgpu_id_for_handle(int handle); - -int store_dmabuf_fd(int handle, int fd); - -int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd); - -int save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int gpu_id); - -#endif /* __AMDGPU_PLUGIN_DRM_H__ */ - diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c deleted file mode 100644 index 730f2e028..000000000 --- a/plugins/amdgpu/amdgpu_plugin_topology.c +++ /dev/null @@ -1,1457 +0,0 @@ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "common/list.h" - -#include "xmalloc.h" -#include "kfd_ioctl.h" -#include "amdgpu_plugin_util.h" -#include "amdgpu_plugin_topology.h" - -#define TOPOLOGY_PATH "/sys/class/kfd/kfd/topology/nodes/" -#define MAX_PARAMETER_LEN 64 - -/* User override options */ -/* Skip firmware version check */ -bool kfd_fw_version_check = true; -/* Skip SDMA firmware version check */ -bool kfd_sdma_fw_version_check = true; -/* Skip caches count check */ -bool kfd_caches_count_check = true; -/* Skip num gws check */ -bool kfd_num_gws_check = true; -/* Skip vram size check */ -bool kfd_vram_size_check = true; -/* Preserve NUMA regions */ -bool kfd_numa_check = true; -/* Skip capability check */ -bool kfd_capability_check = true; - -/* - * During dump, we can use any fd value so fd_next is always -1. - * During restore, we have to use a fd value that does not conflict with fd values in use by the target restore process. - * fd_next is initialized as 1 greater than the highest-numbered file descriptor used by the target restore process. 
- */ -int fd_next = -1; - -int open_drm_render_device(int minor) -{ - char path[128]; - int fd, ret_fd; - - if (minor < DRM_FIRST_RENDER_NODE || minor > DRM_LAST_RENDER_NODE) { - pr_perror("DRM render minor %d out of range [%d, %d]", minor, DRM_FIRST_RENDER_NODE, - DRM_LAST_RENDER_NODE); - return -EINVAL; - } - - snprintf(path, sizeof(path), "/dev/dri/renderD%d", minor); - fd = open(path, O_RDWR | O_CLOEXEC); - if (fd < 0) { - if (errno != ENOENT && errno != EPERM) { - pr_err("Failed to open %s: %s\n", path, strerror(errno)); - if (errno == EACCES) - pr_err("Check user is in \"video\" group\n"); - } - return -EBADFD; - } - - if (fd_next < 0) - return fd; - - ret_fd = fcntl(fd, F_DUPFD, fd_next++); - close(fd); - - if (ret_fd < 0) - pr_perror("Failed to duplicate fd for minor:%d (fd_next:%d)", minor, fd_next); - - return ret_fd; -} - -static const char *link_type(uint32_t type) -{ - switch (type) { - case TOPO_IOLINK_TYPE_PCIE: - return "PCIe"; - case TOPO_IOLINK_TYPE_XGMI: - return "XGMI"; - } - return "Unsupported"; -} - -static struct tp_node *p2pgroup_get_node_by_gpu_id(const struct tp_p2pgroup *group, const uint32_t gpu_id) -{ - struct tp_node *node; - - list_for_each_entry(node, &group->nodes, listm_p2pgroup) { - if (node->gpu_id == gpu_id) - return node; - } - return NULL; -} - -int node_get_drm_render_device(struct tp_node *node) -{ - if (node->drm_fd < 0) - node->drm_fd = open_drm_render_device(node->drm_render_minor); - - return node->drm_fd; -} - -void sys_close_drm_render_devices(struct tp_system *sys) -{ - struct tp_node *node; - - list_for_each_entry(node, &sys->nodes, listm_system) { - if (node->drm_fd >= 0) { - close(node->drm_fd); - node->drm_fd = -1; - } - } -} - -static struct tp_iolink *node_get_iolink_to_node_id(const struct tp_node *node, const uint32_t type, - const uint32_t node_id) -{ - struct tp_iolink *iolink; - - list_for_each_entry(iolink, &node->iolinks, listm) { - if (iolink->node_to_id == node_id && iolink->type == type) - return 
iolink; - } - return NULL; -} - -struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor) -{ - struct tp_node *node; - - list_for_each_entry(node, &sys->nodes, listm_system) { - if (node->drm_render_minor == drm_render_minor) - return node; - } - return NULL; -} - -struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index) -{ - struct tp_node *node; - - list_for_each_entry(node, &sys->nodes, listm_system) { - if (NODE_IS_GPU(node) && index-- == 0) - return node; - } - return NULL; -} - -struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32_t gpu_id) -{ - struct tp_node *node; - - list_for_each_entry(node, &sys->nodes, listm_system) { - if (node->gpu_id == gpu_id) - return node; - } - return NULL; -} - -static struct tp_node *sys_get_node_by_node_id(const struct tp_system *sys, const uint32_t node_id) -{ - struct tp_node *node; - - list_for_each_entry(node, &sys->nodes, listm_system) { - if (node->id == node_id) - return node; - } - return NULL; -} - -static struct tp_p2pgroup *sys_get_p2pgroup_with_gpu_id(const struct tp_system *sys, const int type, - const uint32_t gpu_id) -{ - struct tp_p2pgroup *p2pgroup; - - list_for_each_entry(p2pgroup, &sys->xgmi_groups, listm_system) { - if (p2pgroup->type != type) - continue; - - if (p2pgroup_get_node_by_gpu_id(p2pgroup, gpu_id)) - return p2pgroup; - } - return NULL; -} - -static struct tp_iolink *get_tp_peer_iolink(const struct tp_node *from_node, const struct tp_node *to_node, - const uint8_t type) -{ - struct tp_iolink *iolink; - - list_for_each_entry(iolink, &from_node->iolinks, listm) { - if (iolink->node_to_id == to_node->id && iolink->type == type) - return iolink; - } - return NULL; -} - -static bool maps_dest_cpu_mapped(const struct device_maps *maps, const uint32_t dest_id) -{ - struct id_map *id_map; - - list_for_each_entry(id_map, &maps->cpu_maps, listm) { - if (id_map->dest == dest_id) - return true; - } - return 
false; -} - -static uint32_t maps_get_dest_cpu(const struct device_maps *maps, const uint32_t src_id) -{ - struct id_map *id_map; - - list_for_each_entry(id_map, &maps->cpu_maps, listm) { - if (id_map->src == src_id) - return id_map->dest; - } - return INVALID_CPU_ID; -} - -bool maps_dest_gpu_mapped(const struct device_maps *maps, const uint32_t dest_id) -{ - struct id_map *id_map; - - list_for_each_entry(id_map, &maps->gpu_maps, listm) { - if (id_map->dest == dest_id) - return true; - } - return false; -} - -uint32_t maps_get_dest_gpu(const struct device_maps *maps, const uint32_t src_id) -{ - struct id_map *id_map; - - list_for_each_entry(id_map, &maps->gpu_maps, listm) { - if (id_map->src == src_id) - return id_map->dest; - } - return 0; -} - -static struct id_map *maps_add_cpu_entry(struct device_maps *maps, const uint32_t src_id, const uint32_t dest_id) -{ - struct id_map *id_map = xzalloc(sizeof(*id_map)); - - if (!id_map) { - pr_err("Failed to allocate memory for id_map\n"); - return NULL; - } - - id_map->src = src_id; - id_map->dest = dest_id; - - list_add_tail(&id_map->listm, &maps->cpu_maps); - - maps->tail_cpu = &id_map->listm; - - pr_debug("Added CPU mapping [%02d -> %02d]\n", src_id, dest_id); - return id_map; -} - -struct id_map *maps_add_gpu_entry(struct device_maps *maps, const uint32_t src_id, const uint32_t dest_id) -{ - struct id_map *id_map = xzalloc(sizeof(*id_map)); - - if (!id_map) { - pr_err("Failed to allocate memory for id_map\n"); - return NULL; - } - - id_map->src = src_id; - id_map->dest = dest_id; - - list_add_tail(&id_map->listm, &maps->gpu_maps); - - maps->tail_gpu = &id_map->listm; - - pr_debug("Added GPU mapping [0x%04X -> 0x%04X]\n", src_id, dest_id); - return id_map; -} - -static void maps_print(struct device_maps *maps) -{ - struct id_map *id_map; - - pr_info("===Maps===============\n"); - list_for_each_entry(id_map, &maps->gpu_maps, listm) - pr_info("GPU: 0x%04X -> 0x%04X\n", id_map->src, id_map->dest); - - 
list_for_each_entry(id_map, &maps->cpu_maps, listm) - pr_info("CPU: %02d -> %02d\n", id_map->src, id_map->dest); - pr_info("======================\n"); -} - -void maps_init(struct device_maps *maps) -{ - INIT_LIST_HEAD(&maps->cpu_maps); - INIT_LIST_HEAD(&maps->gpu_maps); - maps->tail_cpu = 0; - maps->tail_gpu = 0; -} - -void maps_free(struct device_maps *maps) -{ - while (!list_empty(&maps->cpu_maps)) { - struct id_map *map = list_first_entry(&maps->cpu_maps, struct id_map, listm); - - list_del(&map->listm); - xfree(map); - } - while (!list_empty(&maps->gpu_maps)) { - struct id_map *map = list_first_entry(&maps->gpu_maps, struct id_map, listm); - - list_del(&map->listm); - xfree(map); - } -} - -static void maps_pop(struct device_maps *maps, struct device_maps *remove) -{ - if (remove->tail_cpu) - list_cut_position(&remove->cpu_maps, &maps->cpu_maps, remove->tail_cpu); - - if (remove->tail_gpu) - list_cut_position(&remove->gpu_maps, &maps->gpu_maps, remove->tail_gpu); - - maps_free(remove); -} - -static int maps_push(struct device_maps *maps, struct device_maps *new) -{ - struct id_map *src_id_map, *dest_id_map; - - list_for_each_entry(src_id_map, &new->cpu_maps, listm) { - list_for_each_entry(dest_id_map, &maps->cpu_maps, listm) { - if (src_id_map->src == dest_id_map->src || src_id_map->dest == dest_id_map->dest) { - pr_err("CPU mapping already exists src [%02d->%02d] new [%02d->%02d]\n", - src_id_map->src, src_id_map->dest, dest_id_map->src, dest_id_map->dest); - return -EINVAL; - } - } - } - list_for_each_entry(src_id_map, &new->gpu_maps, listm) { - list_for_each_entry(dest_id_map, &maps->gpu_maps, listm) { - if (src_id_map->src == dest_id_map->src || src_id_map->dest == dest_id_map->dest) { - pr_err("GPU mapping already exists src [0x%04X -> 0x%04X] new [0x%04X -> 0x%04X]\n", - src_id_map->src, src_id_map->dest, dest_id_map->src, dest_id_map->dest); - return -EINVAL; - } - } - } - - list_splice(&new->cpu_maps, &maps->cpu_maps); - list_splice(&new->gpu_maps, 
&maps->gpu_maps); - - return 0; -} - -struct tp_iolink *node_add_iolink(struct tp_node *node, uint32_t type, uint32_t node_to_id) -{ - struct tp_iolink *iolink = xzalloc(sizeof(*iolink)); - - if (!iolink) - return NULL; - - iolink->type = type; - /* iolink->node_to will be filled in topology_determine_iolinks */ - iolink->node_to_id = node_to_id; - iolink->node_from = node; - - list_add_tail(&iolink->listm, &node->iolinks); - return iolink; -} - -struct tp_p2pgroup *sys_add_group(struct tp_system *sys, uint32_t type) -{ - struct tp_p2pgroup *group; - - group = xzalloc(sizeof(*group)); - if (!group) - return NULL; - - INIT_LIST_HEAD(&group->nodes); - group->type = type; - list_add_tail(&group->listm_system, &sys->xgmi_groups); - if (type == TOPO_IOLINK_TYPE_XGMI) - sys->num_xgmi_groups++; - - return group; -} - -struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id) -{ - struct tp_node *node = NULL; - - node = xzalloc(sizeof(*node)); - if (!node) - return NULL; - - node->id = id; - node->gpu_id = gpu_id; - node->drm_fd = -1; - INIT_LIST_HEAD(&node->iolinks); - list_add_tail(&node->listm_system, &sys->nodes); - sys->num_nodes++; - - return node; -} - -static bool get_prop(char *line, char *name, uint64_t *value) -{ - char format[16]; - sprintf(format, " %%%ds %%lu", MAX_PARAMETER_LEN); - if (sscanf(line, format, name, value) != 2) - return false; - return true; -} - -/* Parse node properties in /sys/class/kfd/kfd/topology/nodes/N/properties */ -static int parse_topo_node_properties(struct tp_node *dev, const char *dir_path) -{ - FILE *file; - char path[300]; - char line[300]; - - sprintf(path, "%s/properties", dir_path); - file = fopen(path, "r"); - if (!file) { - pr_perror("Failed to access %s", path); - return -EFAULT; - } - - while (fgets(line, sizeof(line), file)) { - char name[MAX_PARAMETER_LEN + 1]; - uint64_t value; - - memset(name, 0, sizeof(name)); - if (!get_prop(line, name, &value)) - goto fail; - - if (!strcmp(name, 
"cpu_cores_count")) - dev->cpu_cores_count = (uint32_t)value; - else if (!strcmp(name, "simd_count")) - dev->simd_count = (uint32_t)value; - else if (!strcmp(name, "mem_banks_count")) - dev->mem_banks_count = (uint32_t)value; - else if (!strcmp(name, "caches_count")) - dev->caches_count = (uint32_t)value; - else if (!strcmp(name, "io_links_count")) - dev->io_links_count = (uint32_t)value; - else if (!strcmp(name, "max_waves_per_simd")) - dev->max_waves_per_simd = (uint32_t)value; - else if (!strcmp(name, "lds_size_in_kb")) - dev->lds_size_in_kb = (uint32_t)value; - else if (!strcmp(name, "num_gws")) - dev->num_gws = (uint32_t)value; - else if (!strcmp(name, "wave_front_size")) - dev->wave_front_size = (uint32_t)value; - else if (!strcmp(name, "array_count")) - dev->array_count = (uint32_t)value; - else if (!strcmp(name, "simd_arrays_per_engine")) - dev->simd_arrays_per_engine = (uint32_t)value; - else if (!strcmp(name, "cu_per_simd_array")) - dev->cu_per_simd_array = (uint32_t)value; - else if (!strcmp(name, "simd_per_cu")) - dev->simd_per_cu = (uint32_t)value; - else if (!strcmp(name, "max_slots_scratch_cu")) - dev->max_slots_scratch_cu = (uint32_t)value; - else if (!strcmp(name, "vendor_id")) - dev->vendor_id = (uint32_t)value; - else if (!strcmp(name, "device_id")) - dev->device_id = (uint32_t)value; - else if (!strcmp(name, "domain")) - dev->domain = (uint32_t)value; - else if (!strcmp(name, "drm_render_minor")) - dev->drm_render_minor = (uint32_t)value; - else if (!strcmp(name, "hive_id")) - dev->hive_id = value; - else if (!strcmp(name, "num_sdma_engines")) - dev->num_sdma_engines = (uint32_t)value; - else if (!strcmp(name, "num_sdma_xgmi_engines")) - dev->num_sdma_xgmi_engines = (uint32_t)value; - else if (!strcmp(name, "num_sdma_queues_per_engine")) - dev->num_sdma_queues_per_engine = (uint32_t)value; - else if (!strcmp(name, "num_cp_queues")) - dev->num_cp_queues = (uint32_t)value; - else if (!strcmp(name, "fw_version")) - dev->fw_version = 
(uint32_t)value; - else if (!strcmp(name, "capability")) - dev->capability = (uint32_t)value; - else if (!strcmp(name, "sdma_fw_version")) - dev->sdma_fw_version = (uint32_t)value; - - if (!dev->gpu_id && dev->cpu_cores_count >= 1) { - /* This is a CPU - we do not need to parse the other information */ - break; - } - } - - fclose(file); - return 0; -fail: - pr_err("Failed to parse line = %s\n", line); - fclose(file); - return -EINVAL; -} - -/* Parse node memory properties in /sys/class/kfd/kfd/topology/nodes/N/mem_banks */ -static int parse_topo_node_mem_banks(struct tp_node *node, const char *dir_path) -{ - struct dirent *dirent_node; - DIR *d_node; - char path[300]; - FILE *file = NULL; - uint32_t heap_type = 0; - uint64_t mem_size = 0; - int ret; - - if (!NODE_IS_GPU(node)) - return 0; - - sprintf(path, "%s/mem_banks", dir_path); - - d_node = opendir(path); - if (!d_node) { - pr_perror("Can't open %s", path); - return -EACCES; - } - - while ((dirent_node = readdir(d_node)) != NULL) { - char line[300]; - char bank_path[1024]; - struct stat st; - int id; - - heap_type = 0; - mem_size = 0; - - /* Only parse numeric directories */ - if (sscanf(dirent_node->d_name, "%d", &id) != 1) - continue; - - snprintf(bank_path, sizeof(bank_path), "%s/%s", path, dirent_node->d_name); - if (stat(bank_path, &st)) { - pr_err("Cannot to access %s\n", path); - ret = -EACCES; - goto fail; - } - if ((st.st_mode & S_IFMT) == S_IFDIR) { - char properties_path[PATH_MAX]; - - snprintf(properties_path, sizeof(properties_path), "%s/properties", bank_path); - - file = fopen(properties_path, "r"); - if (!file) { - pr_perror("Failed to access %s", properties_path); - ret = -EACCES; - goto fail; - } - - while (fgets(line, sizeof(line), file)) { - char name[MAX_PARAMETER_LEN + 1]; - uint64_t value; - - memset(name, 0, sizeof(name)); - if (!get_prop(line, name, &value)) { - ret = -EINVAL; - goto fail; - } - - if (!strcmp(name, "heap_type")) - heap_type = (uint32_t)value; - if (!strcmp(name, 
"size_in_bytes")) - mem_size = value; - } - - fclose(file); - file = NULL; - } - - if (heap_type == TOPO_HEAP_TYPE_PUBLIC || heap_type == TOPO_HEAP_TYPE_PRIVATE) - break; - } - - if ((heap_type != TOPO_HEAP_TYPE_PUBLIC && heap_type != TOPO_HEAP_TYPE_PRIVATE) || !mem_size) { - pr_err("Failed to determine memory type and size for device in %s\n", dir_path); - ret = -EINVAL; - goto fail; - } - - node->vram_public = (heap_type == TOPO_HEAP_TYPE_PUBLIC); - node->vram_size = mem_size; - closedir(d_node); - return 0; -fail: - if (file) - fclose(file); - closedir(d_node); - return ret; -} - -/* Parse node iolinks properties in /sys/class/kfd/kfd/topology/nodes/N/io_links */ -static int parse_topo_node_iolinks(struct tp_node *node, const char *dir_path) -{ - struct dirent *dirent_node; - DIR *d_node; - char path[300]; - FILE *file = NULL; - int ret = 0; - - snprintf(path, sizeof(path), "%s/io_links", dir_path); - - d_node = opendir(path); - if (!d_node) { - pr_perror("Can't open %s", path); - return -EACCES; - } - - while ((dirent_node = readdir(d_node)) != NULL) { - char line[300]; - char iolink_path[1024]; - struct stat st; - int id; - - uint32_t iolink_type = 0; - uint32_t node_to_id = 0; - - /* Only parse numeric directories */ - if (sscanf(dirent_node->d_name, "%d", &id) != 1) - continue; - - snprintf(iolink_path, sizeof(iolink_path), "%s/%s", path, dirent_node->d_name); - if (stat(iolink_path, &st)) { - pr_err("Cannot to access %s\n", path); - ret = -EACCES; - goto fail; - } - if ((st.st_mode & S_IFMT) == S_IFDIR) { - char properties_path[PATH_MAX]; - - snprintf(properties_path, sizeof(properties_path), "%s/properties", iolink_path); - - file = fopen(properties_path, "r"); - if (!file) { - pr_perror("Failed to access %s", properties_path); - ret = -EACCES; - goto fail; - } - - while (fgets(line, sizeof(line), file)) { - char name[MAX_PARAMETER_LEN + 1]; - uint64_t value; - - memset(name, 0, sizeof(name)); - if (!get_prop(line, name, &value)) { - ret = -EINVAL; - goto 
fail; - } - - if (!strcmp(name, "type")) - iolink_type = (uint32_t)value; - if (!strcmp(name, "node_to")) - node_to_id = (uint32_t)value; - } - fclose(file); - file = NULL; - } - - /* We only store the link information for now, then once all topology parsing is - * finished we will confirm iolinks - */ - if (iolink_type == TOPO_IOLINK_TYPE_PCIE || iolink_type == TOPO_IOLINK_TYPE_XGMI) { - if (!node_add_iolink(node, iolink_type, node_to_id)) { - ret = -ENOMEM; - goto fail; - } - } - } - closedir(d_node); - return 0; -fail: - if (file) - fclose(file); - - closedir(d_node); - return ret; -} - -/* Parse a node (CPU or GPU) in /sys/class/kfd/kfd/topology/nodes/N */ -static int parse_topo_node(struct tp_node *node, const char *dir_path) -{ - if (parse_topo_node_properties(node, dir_path)) { - pr_err("Failed to parse node properties\n"); - return -EINVAL; - } - if (parse_topo_node_mem_banks(node, dir_path)) { - pr_err("Failed to parse node mem_banks\n"); - return -EINVAL; - } - if (parse_topo_node_iolinks(node, dir_path)) { - pr_err("Failed to parse node iolinks\n"); - return -EINVAL; - } - return 0; -} - -static const char *p2pgroup_to_str(struct tp_p2pgroup *group) -{ - static char topology_printstr[200]; - struct tp_node *node; - size_t str_len = 0; - - topology_printstr[0] = '\0'; - str_len += sprintf(&topology_printstr[str_len], "type:%s:", link_type(group->type)); - - list_for_each_entry(node, &group->nodes, listm_p2pgroup) { - str_len += sprintf(&topology_printstr[str_len], "0x%04X ", node->gpu_id); - } - return topology_printstr; -} - -static const char *mapping_list_to_str(struct list_head *node_list) -{ - static char topology_printstr[200]; - struct tp_node *node; - size_t str_len = 0; - - topology_printstr[0] = '\0'; - list_for_each_entry(node, node_list, listm_mapping) - str_len += sprintf(&topology_printstr[str_len], "0x%04X ", node->gpu_id); - - return topology_printstr; -} - -void topology_print(const struct tp_system *sys, const char *message) -{ - struct 
tp_node *node; - struct tp_p2pgroup *xgmi_group; - - pr_info("===System Topology=[%12s]==================================\n", message); - list_for_each_entry(node, &sys->nodes, listm_system) { - struct tp_iolink *iolink; - - if (!NODE_IS_GPU(node)) { - pr_info("[%d] CPU\n", node->id); - pr_info(" cpu_cores_count:%u\n", node->cpu_cores_count); - } else { - pr_info("[%d] GPU gpu_id:0x%04X\n", node->id, node->gpu_id); - pr_info(" vendor_id:%u device_id:%u\n", node->vendor_id, node->device_id); - pr_info(" vram_public:%c vram_size:%lu\n", node->vram_public ? 'Y' : 'N', node->vram_size); - pr_info(" io_links_count:%u capability:%u\n", node->io_links_count, node->capability); - pr_info(" mem_banks_count:%u caches_count:%d lds_size_in_kb:%u\n", node->mem_banks_count, - node->caches_count, node->lds_size_in_kb); - pr_info(" simd_count:%u max_waves_per_simd:%u\n", node->simd_count, - node->max_waves_per_simd); - pr_info(" num_gws:%u wave_front_size:%u array_count:%u\n", node->num_gws, - node->wave_front_size, node->array_count); - pr_info(" simd_arrays_per_engine:%u simd_per_cu:%u\n", node->simd_arrays_per_engine, - node->simd_per_cu); - pr_info(" max_slots_scratch_cu:%u cu_per_simd_array:%u\n", node->max_slots_scratch_cu, - node->cu_per_simd_array); - pr_info(" num_sdma_engines:%u\n", node->num_sdma_engines); - pr_info(" num_sdma_xgmi_engines:%u num_sdma_queues_per_engine:%u\n", - node->num_sdma_xgmi_engines, node->num_sdma_queues_per_engine); - pr_info(" num_cp_queues:%u fw_version:%u sdma_fw_version:%u\n", node->num_cp_queues, - node->fw_version, node->sdma_fw_version); - } - list_for_each_entry(iolink, &node->iolinks, listm) { - if (!iolink->valid) - continue; - - pr_info(" iolink type:%s node-to:%d (0x%04X) node-from:%d bi-dir:%s\n", - link_type(iolink->type), iolink->node_to_id, iolink->node_to->gpu_id, - iolink->node_from->id, iolink->peer ? 
"Y" : "N"); - } - } - - pr_info("===Groups==========================================================\n"); - list_for_each_entry(xgmi_group, &sys->xgmi_groups, listm_system) - pr_info("%s\n", p2pgroup_to_str(xgmi_group)); - pr_info("===================================================================\n"); -} - -void topology_init(struct tp_system *sys) -{ - memset(sys, 0, sizeof(*sys)); - INIT_LIST_HEAD(&sys->nodes); - INIT_LIST_HEAD(&sys->xgmi_groups); -} - -void topology_free(struct tp_system *sys) -{ - while (!list_empty(&sys->nodes)) { - struct tp_node *node = list_first_entry(&sys->nodes, struct tp_node, listm_system); - - list_del(&node->listm_system); - - while (!list_empty(&node->iolinks)) { - struct tp_iolink *iolink = list_first_entry(&node->iolinks, struct tp_iolink, listm); - - list_del(&iolink->listm); - xfree(iolink); - } - xfree(node); - } - - while (!list_empty(&sys->xgmi_groups)) { - struct tp_p2pgroup *p2pgroup = list_first_entry(&sys->xgmi_groups, struct tp_p2pgroup, listm_system); - - list_del(&p2pgroup->listm_system); - xfree(p2pgroup); - } - - /* Update Topology as being freed */ - sys->parsed = false; -} - -/** - * @brief Validates iolinks and determine XGMI hives in a system topology - * - * On some systems, some GPUs may not be accessible because they are masked by cgroups, but the - * iolinks to these GPUs are still visible. If the peer GPU is not accessible, we consider that link - * invalid. - * In a XGMI hive, each GPU will have a bi-directional iolink to every other GPU. So we create a - * XGMI group (hive) and add all the GPUs in that hive to the group when iterating over the first - * GPU in that group. - * - * @param sys system topology - * @return 0 if successful, errno if failed. 
- */ -int topology_determine_iolinks(struct tp_system *sys) -{ - int ret = 0; - struct tp_node *node; - - list_for_each_entry(node, &sys->nodes, listm_system) { - struct tp_iolink *iolink; - - list_for_each_entry(iolink, &node->iolinks, listm) { - struct tp_p2pgroup *group = NULL; - struct tp_node *peer_node = NULL; - struct tp_iolink *peer_iolink = NULL; - - peer_node = sys_get_node_by_node_id(sys, iolink->node_to_id); - if (!peer_node) { - /* node not accessible, usually because it is masked by cgroups */ - iolink->valid = false; - continue; - } - iolink->valid = true; - node->num_valid_iolinks++; - - iolink->node_to = peer_node; - peer_iolink = get_tp_peer_iolink(peer_node, node, iolink->type); - if (!peer_iolink) - continue; /* This is a one-dir link */ - - /* We confirmed both sides have same type of iolink */ - iolink->peer = peer_iolink; - peer_iolink->peer = iolink; - - if (iolink->type == TOPO_IOLINK_TYPE_XGMI) { - group = sys_get_p2pgroup_with_gpu_id(sys, iolink->type, node->gpu_id); - if (!group) { - /* This GPU does not already belong to a group so we create - * a new group - */ - group = sys_add_group(sys, iolink->type); - if (!group) { - ret = -ENOMEM; - goto fail; - } - list_add_tail(&node->listm_p2pgroup, &group->nodes); - } - - /* Also add peer GPU to this group */ - if (!p2pgroup_get_node_by_gpu_id(group, peer_node->gpu_id)) - list_add_tail(&peer_node->listm_p2pgroup, &group->nodes); - } - } - } - -fail: - /* In case of failure, caller function will call topology_free which will free groups that - * were successfully allocated - */ - return ret; -} - -/** - * @brief Parse system topology - * - * Parse system topology exposed by the drivers in /sys/class/kfd/kfd/topology and fill in the - * system topology structure. - * - * @param sys system topology structure to be filled by this function - * @param message print this message when printing the topology to logs - * @return 0 if successful, errno if failed. 
- */ -int topology_parse(struct tp_system *sys, const char *message) -{ - struct dirent *dirent_system; - DIR *d_system; - char path[300]; - int ret; - - if (sys->parsed) - return 0; - - sys->parsed = true; - INIT_LIST_HEAD(&sys->nodes); - INIT_LIST_HEAD(&sys->xgmi_groups); - - d_system = opendir(TOPOLOGY_PATH); - if (!d_system) { - pr_perror("Can't open %s", TOPOLOGY_PATH); - return -EACCES; - } - - while ((dirent_system = readdir(d_system)) != NULL) { - struct stat stbuf; - int id, fd; - - /* Only parse numeric directories */ - if (sscanf(dirent_system->d_name, "%d", &id) != 1) - continue; - - sprintf(path, "%s%s", TOPOLOGY_PATH, dirent_system->d_name); - if (stat(path, &stbuf)) { - /* When cgroup is masking some devices, the path exists, but it is not - * accessible, this is not an error - */ - pr_info("Cannot to access %s\n", path); - continue; - } - - if ((stbuf.st_mode & S_IFMT) == S_IFDIR) { - struct tp_node *node; - int len; - char gpu_id_path[300]; - char read_buf[7]; /* Max gpu_id len is 6 chars */ - unsigned int gpu_id; - - sprintf(gpu_id_path, "%s/%s/gpu_id", TOPOLOGY_PATH, dirent_system->d_name); - fd = open(gpu_id_path, O_RDONLY); - if (fd < 0) { - pr_perror("Failed to access %s", gpu_id_path); - continue; - } - - len = read(fd, read_buf, sizeof(read_buf) - 1); - close(fd); - if (len < 0) - continue; - - read_buf[len] = '\0'; - - if (sscanf(read_buf, "%d", &gpu_id) != 1) - continue; - - node = sys_add_node(sys, id, gpu_id); - if (!node) { - ret = -ENOMEM; - goto fail; - } - - if (parse_topo_node(node, path)) { - pr_err("Failed to parse node %s\n", path); - ret = -EINVAL; - goto fail; - } - } - } - closedir(d_system); - return 0; - -fail: - topology_free(sys); - return ret; -} - -static bool device_properties_match(struct tp_node *src, struct tp_node *dest) -{ - if (src->simd_count == dest->simd_count && src->mem_banks_count == dest->mem_banks_count && - src->io_links_count == dest->io_links_count && src->max_waves_per_simd == dest->max_waves_per_simd 
&& - src->lds_size_in_kb == dest->lds_size_in_kb && src->wave_front_size == dest->wave_front_size && - src->array_count == dest->array_count && src->simd_arrays_per_engine == dest->simd_arrays_per_engine && - src->cu_per_simd_array == dest->cu_per_simd_array && src->simd_per_cu == dest->simd_per_cu && - src->max_slots_scratch_cu == dest->max_slots_scratch_cu && src->vendor_id == dest->vendor_id && - src->device_id == dest->device_id && src->num_sdma_engines == dest->num_sdma_engines && - src->num_sdma_xgmi_engines == dest->num_sdma_xgmi_engines && - src->num_sdma_queues_per_engine == dest->num_sdma_queues_per_engine && - src->num_cp_queues == dest->num_cp_queues && src->vram_public == dest->vram_public && - (!kfd_capability_check || (src->capability == dest->capability)) && - (!kfd_vram_size_check || (src->vram_size <= dest->vram_size)) && - (!kfd_num_gws_check || (src->num_gws <= dest->num_gws)) && - (!kfd_caches_count_check || (src->caches_count <= dest->caches_count)) && - (!kfd_fw_version_check || (src->fw_version <= dest->fw_version)) && - (!kfd_sdma_fw_version_check || (src->sdma_fw_version <= dest->sdma_fw_version))) { - return true; - } - return false; -} - -/** - * @brief Determines whether iolink dest can be used to replace src - * - * @param src source iolink - * @param dest destination iolink - * @return true if dest can replace src - */ -static bool iolink_match(struct tp_iolink *src, struct tp_iolink *dest) -{ - if (!src->valid) - return true; - - if (!dest->valid) - return false; - - if (NODE_IS_GPU(src->node_to) != NODE_IS_GPU(dest->node_to)) - return false; - - /* XGMI link can replace PCIE links */ - if (src->type == TOPO_IOLINK_TYPE_XGMI && dest->type == TOPO_IOLINK_TYPE_PCIE) - return false; - - /* bi-directional links can replace uni-directional links */ - if (src->peer != NULL && dest->peer == NULL) - return false; - - return true; -} - -/** - * @brief Determines whether src_node can be mapped to dest_node - * - * Nodes compatibility are 
determined by: - * 1. Comparing the node properties - * 2. Making sure iolink mappings to CPUs would be compatible with existing iolink mappings in maps - * - * If src_node and dest_node are mappable, then map_device will push the new mapping - * for src_node -> dest_node into new_maps. - * @param src_sys system topology information on source system - * @param dest_sys system topology information on destination system - * @param src_node source GPU - * @param dest_node destination GPU - * @param maps list of existing device maps - * @param new_maps if nodes are mappable, then GPU and CPU mappings will be added to this list - * @return true if src_node and dest_node are mappable - */ -static bool map_device(struct tp_system *src_sys, struct tp_system *dest_sys, struct tp_node *src_node, - struct tp_node *dest_node, struct device_maps *maps, struct device_maps *new_maps) -{ - struct tp_iolink *src_iolink; - - pr_debug("Evaluating mapping nodes [0x%04X -> 0x%04X]\n", src_node->gpu_id, dest_node->gpu_id); - - /* Compare GPU properties from /sys/class/kfd/kfd/topology/nodes/N/properties */ - if (!device_properties_match(src_node, dest_node)) { - pr_debug("[0x%04X -> 0x%04X] Device properties do not match\n", src_node->gpu_id, dest_node->gpu_id); - return false; - } - - if (src_node->num_valid_iolinks > dest_node->num_valid_iolinks) { - pr_debug("[0x%04X -> 0x%04X] Mismatch between number of iolinks\n", src_node->gpu_id, - dest_node->gpu_id); - return false; - } - - list_for_each_entry(src_iolink, &src_node->iolinks, listm) { - /* Go through list of iolinks to CPU and compare them */ - - if (!NODE_IS_GPU(src_iolink->node_to)) { - bool matched_iolink = false; - /* This is a iolink to CPU */ - pr_debug("Found link to CPU node:%02d\n", src_iolink->node_to->id); - - if (!kfd_numa_check) { - struct tp_iolink *dest_iolink; - - list_for_each_entry(dest_iolink, &dest_node->iolinks, listm) { - if (iolink_match(src_iolink, dest_iolink)) - matched_iolink = true; - } - } else { - 
uint32_t dest_cpu_node_id; - - dest_cpu_node_id = maps_get_dest_cpu(maps, src_iolink->node_to->id); - if (dest_cpu_node_id == INVALID_CPU_ID) - dest_cpu_node_id = maps_get_dest_cpu(new_maps, src_iolink->node_to->id); - - if (dest_cpu_node_id == INVALID_CPU_ID) { - struct tp_iolink *dest_iolink; - list_for_each_entry(dest_iolink, &dest_node->iolinks, listm) { - if (iolink_match(src_iolink, dest_iolink) && - !maps_dest_cpu_mapped(maps, dest_iolink->node_to->id) && - !maps_dest_cpu_mapped(new_maps, dest_iolink->node_to->id)) { - if (!maps_add_cpu_entry(new_maps, src_iolink->node_to->id, - dest_iolink->node_to->id)) - /* This is a critical error because - * we are out of memory - */ - return false; - - matched_iolink = true; - break; - } - } - } else { - pr_debug("Existing CPU mapping found [%02d-%02d]\n", src_iolink->node_to->id, - dest_cpu_node_id); - /* Confirm that the link to this CPU is same or better */ - - struct tp_iolink *dest_iolink = node_get_iolink_to_node_id( - dest_node, src_iolink->type, dest_cpu_node_id); - - if (dest_iolink && iolink_match(src_iolink, dest_iolink)) - matched_iolink = true; - } - } - if (!matched_iolink) { - pr_debug("[0x%04X -> 0x%04X] Mismatch between iolink to CPU\n", src_node->gpu_id, - dest_node->gpu_id); - - return false; - } - } else { - /* If GPUs have P2P-PCIe iolinks to this GPU, then at least one CPU will - * also have a P2P-PCIe iolink to this GPU, so it seems that we do not need - * to consider P2P-PCIe iolinks from GPU to GPU for now. Once P2P-PCIe - * iolinks are exposed via p2p_links we may have to add additional code here - * to validate P2P-PCIe links between GPUs. 
- */ - } - } - pr_debug("[0x%04X -> 0x%04X] Map is possible\n", src_node->gpu_id, dest_node->gpu_id); - - if (!maps_add_gpu_entry(new_maps, src_node->gpu_id, dest_node->gpu_id)) { - /* This is a critical error because we are out of memory */ - return false; - } - maps_print(new_maps); - return true; -} - -/** - * @brief Determines whether list of GPUs in src_nodes are mappable to dest_nodes - * - * This function will pick the first node from src_nodes and iterate through all the nodes in - * dest_nodes and call map_device to determine whether the node is mappable. - * If a node from dest_nodes is mappable to the first node from src_nodes: - * 1. This function will remove the first node from src_nodes and the node from dest_nodes - * 2. Push sub-mappings (new_maps) generated by map_device into existing mappings (maps) - * 3. Recursively check whether remaining nodes in src_nodes and dest_nodes are mappable. - * - * Once src_nodes is empty then we have successfully mapped all the nodes and maps contains a full - * list of GPU mappings. - * - * If there are no nodes in dest_nodes that can be mapped to the first node in src_nodes, then this - * means we cannot build a full mapping list with the current list of mappings. We backtrack by - * popping the newly generated sub-mappings(new_maps) from existing mappings (maps) and add the two - * nodes back to src_nodes and dest_nodes and return false. When this function returns false, the - * caller function will try a different path by trying to map the first node from src_nodes to the - * next node in dest_nodes. 
- * - * @param src_sys system topology information on source system - * @param dest_sys system topology information on destination system - * @param src_node list of source GPUs that need to be mapped - * @param dest_node list of destination GPUs that need to be mapped - * @param maps list of device maps based on current map path - * @return true if all nodes from src_nodes and dest_nodes are mappable - */ -static bool map_devices(struct tp_system *src_sys, struct tp_system *dest_sys, struct list_head *src_nodes, - struct list_head *dest_nodes, struct device_maps *maps) -{ - struct tp_node *src_node, *dest_node, *dest_node_tmp; - struct device_maps new_maps; - - /* Pick the first src node from the list of nodes and look for a dest node that is mappable. - * If we find a mappable destination node, then we add src node and dest node mapping to - * device_maps and recursively try to map the remaining nodes in the list. - * If there are no more src nodes in the list, then we have found a successful combination - * of src to dest nodes that are mappable. - */ - if (list_empty(src_nodes)) { - pr_debug("All nodes mapped successfully\n"); - return true; - } - - pr_debug("Mapping list src nodes [%s]\n", mapping_list_to_str(src_nodes)); - pr_debug("Mapping list dest nodes [%s]\n", mapping_list_to_str(dest_nodes)); - - src_node = list_first_entry(src_nodes, struct tp_node, listm_mapping); - pr_debug("Looking for match for node 0x%04X\n", src_node->gpu_id); - - list_del(&src_node->listm_mapping); - - list_for_each_entry_safe(dest_node, dest_node_tmp, dest_nodes, listm_mapping) { - maps_init(&new_maps); - if (map_device(src_sys, dest_sys, src_node, dest_node, maps, &new_maps)) { - pr_debug("Matched destination node 0x%04X\n", dest_node->gpu_id); - - /* src node and dest node are mappable, add device_maps generated by - * map_device to list of current valid device_maps, and recursively try to - * map remaining nodes in the list. 
- */ - - list_del(&dest_node->listm_mapping); - if (maps_push(maps, &new_maps)) - return false; - - if (map_devices(src_sys, dest_sys, src_nodes, dest_nodes, maps)) { - pr_debug("Matched nodes 0x%04X and after\n", dest_node->gpu_id); - return true; - } else { - /* We could not map remaining nodes in the list. Add dest node back - * to list and try to map next dest node in list to current src - * node. - */ - pr_debug("Nodes after [0x%04X -> 0x%04X] did not match, " - "adding list back\n", - src_node->gpu_id, dest_node->gpu_id); - - list_add(&dest_node->listm_mapping, dest_nodes); - maps_pop(maps, &new_maps); - } - } - } - pr_debug("Failed to map nodes 0x%04X and after\n", src_node->gpu_id); - - /* Either: We could not find a mappable dest node for current node, or we could not build a - * combination from the remaining nodes in the lists. Add src node back to the list and - * caller function will try next possible combination. - */ - list_add(&src_node->listm_mapping, src_nodes); - - return false; -} - -/** - * @brief Determines whether list of GPUs in src_xgmi_groups are mappable to list of GPUs in - * dest_xgmi_groups - * - * This function will pick the first XGMI group (hive) from src_xgmi_groups and iterate through the - * XGMI groups in dest_xgmi_groups. If the group in dest_xgmi_groups is mappable then this function - * will remove the hives from src_xgmi_groups and dest_xgmi_groups and recursively try to map the - * remaining hives in src_xgmi_groups and dest_xgmi_groups. - * - * If src_xgmi_groups is empty, then this means that we have successfully mapped all the XGMI hives - * and we have a full list of GPU mappings in maps. - * - * If we cannot find a hive inside dest_xgmi_groups that is mappable to the first hive from - * src_xgmi_groups, then this means that this path is not valid and we need to backtrack. We - * backtrack by adding the hives back into src_xgmi_groups and dest_xgmi_groups and returning false. 
- * The caller function will then try a different path by trying to map the first hive in - * src_xgmi_groups to the next hive in dest_xgmi_groups. - * - * @param src_sys system topology information on source system - * @param dest_sys system topology information on destination system - * @param src_xgmi_groups list of source XGMI hives that need to be mapped - * @param dest_xgmi_groups list of destination XGMI hives that need to be mapped - * @param maps list of device maps based on current map path - * @return true if all nodes from src_nodes and dest_nodes are mappable - */ -bool match_xgmi_groups(struct tp_system *src_sys, struct tp_system *dest_sys, struct list_head *src_xgmi_groups, - struct list_head *dest_xgmi_groups, struct device_maps *maps) -{ - struct tp_p2pgroup *src_group; - struct tp_p2pgroup *dest_group; - struct tp_p2pgroup *dest_group_tmp; - - if (list_empty(src_xgmi_groups)) { - pr_debug("All groups matched successfully\n"); - return true; - } - - /* Pick the first src XGMI group from the list. Then try to match src XGMI group with a - * dest XGMI group. If we have a dest XGMI group that is mappable, then we try to - * recursively map the next src XGMI group in the list, with remaining dest XGMI groups. 
- * If there are no more src XGMI groups in the list, then this means we have successfully - * mapped all the groups and we have a valid device_maps - */ - src_group = list_first_entry(src_xgmi_groups, struct tp_p2pgroup, listm_system); - pr_debug("Looking for match for group [%s]\n", p2pgroup_to_str(src_group)); - - list_del(&src_group->listm_system); - - list_for_each_entry_safe(dest_group, dest_group_tmp, dest_xgmi_groups, listm_system) { - struct tp_node *node; - - LIST_HEAD(src_nodes); - LIST_HEAD(dest_nodes); - - if (src_group->num_nodes > dest_group->num_nodes) - continue; - - pr_debug("Trying destination group [%s]\n", p2pgroup_to_str(dest_group)); - - list_for_each_entry(node, &src_group->nodes, listm_p2pgroup) - list_add_tail(&node->listm_mapping, &src_nodes); - - list_for_each_entry(node, &dest_group->nodes, listm_p2pgroup) - list_add_tail(&node->listm_mapping, &dest_nodes); - - /* map_devices will populate maps if successful */ - if (map_devices(src_sys, dest_sys, &src_nodes, &dest_nodes, maps)) { - /* All the nodes in current src XGMI group are mappable with nodes in - * current dest XGMI group. Remove the current groups from the lists - * and recursively try to match remaining groups - */ - list_del(&dest_group->listm_system); - pr_debug("Matched destination group [%s]\n", p2pgroup_to_str(dest_group)); - if (match_xgmi_groups(src_sys, dest_sys, src_xgmi_groups, dest_xgmi_groups, maps)) { - pr_debug("Matched subgroups of [%s]\n", p2pgroup_to_str(dest_group)); - - xfree(src_group); - xfree(dest_group); - return true; - } else { - /* We were not able to map the remaining XGMI groups so we add the - * current dest XGMI group back to the list of unmapped groups, and - * try to map current src XGMI group with the next dest XGMI in the - * list of XGMI groups - */ - list_add(&dest_group->listm_system, dest_xgmi_groups); - } - } - } - - /* We have not found a mappable dest XGMI group. We discard this combination. 
If this is - * the first src XGMI group in the list, then it is not possible to match the XGMI groups. - * If this was a recursive call, then the calling instance of function will try the next - * combination of XGMI groups - */ - - pr_debug("Failed to match groups [%s]\n", p2pgroup_to_str(src_group)); - list_add_tail(&src_group->listm_system, src_xgmi_groups); - - return false; -} - -/** - * @brief Builds a list of GPU mappings from source topology to destination topology - * - * The topology on the destination system may not be identical to the topology on the source - * system, e.g There can be GPUs with different device ID's and they may be enumerated in a - * different order. This function builds a list of GPU mappings from the source topology to the - * destination topology and stores it in maps. - * - * The function will first validate all the iolinks and determine XGMI groups (hives) by calling the - * topology_determine_iolinks(). It will then try to match the GPUs that belong to XGMI hives and - * after that, match the remaining GPUs. - * - * @param src_sys system topology information on source system - * @param dest_sys system topology information on destination system - * @param maps list of device maps that was generated by this function - * @return true if we were able to build a full list of GPU mappings. 
- */ -int set_restore_gpu_maps(struct tp_system *src_sys, struct tp_system *dest_sys, struct device_maps *maps) -{ - struct tp_node *node; - int ret = 0; - int src_num_gpus = 0; - int dest_num_gpus = 0; - - maps_init(maps); - - ret = topology_determine_iolinks(src_sys); - if (ret) { - pr_err("Failed to determine iolinks from source (checkpointed) topology\n"); - return ret; - } - topology_print(src_sys, "Source "); - - ret = topology_determine_iolinks(dest_sys); - if (ret) { - pr_err("Failed to determine iolinks from destination (local) topology\n"); - return ret; - } - topology_print(dest_sys, "Destination"); - - /* Make sure we have same number of GPUs in src and dest */ - list_for_each_entry(node, &src_sys->nodes, listm_system) { - if (NODE_IS_GPU(node)) - src_num_gpus++; - } - list_for_each_entry(node, &dest_sys->nodes, listm_system) { - if (NODE_IS_GPU(node)) - dest_num_gpus++; - } - - if (src_num_gpus != dest_num_gpus) { - pr_err("Number of devices mismatch (checkpointed:%d local:%d)\n", src_num_gpus, dest_num_gpus); - return -EINVAL; - } - - if (src_sys->num_xgmi_groups > dest_sys->num_xgmi_groups) { - pr_err("Number of xgmi groups mismatch (checkpointed:%d local:%d)\n", src_sys->num_xgmi_groups, - dest_sys->num_xgmi_groups); - return -EINVAL; - } - - /* First try to match the XGMI hives */ - if (src_sys->num_xgmi_groups) { - if (!match_xgmi_groups(src_sys, dest_sys, &src_sys->xgmi_groups, &dest_sys->xgmi_groups, maps)) { - pr_err("Failed to match all GPU groups\n"); - return -EINVAL; - } - pr_info("Current maps after XGMI groups matched\n"); - maps_print(maps); - } - - /* We matched all the XGMI hives, now match remaining GPUs */ - LIST_HEAD(src_nodes); - LIST_HEAD(dest_nodes); - - list_for_each_entry(node, &src_sys->nodes, listm_system) { - if (NODE_IS_GPU(node) && !maps_get_dest_gpu(maps, node->gpu_id)) - list_add(&node->listm_mapping, &src_nodes); - } - - list_for_each_entry(node, &dest_sys->nodes, listm_system) { - if (NODE_IS_GPU(node) && 
!maps_dest_gpu_mapped(maps, node->gpu_id)) - list_add(&node->listm_mapping, &dest_nodes); - } - - if (!map_devices(src_sys, dest_sys, &src_nodes, &dest_nodes, maps)) { - pr_err("Failed to match remaining nodes\n"); - return -EINVAL; - } - - pr_info("Maps after all nodes matched\n"); - maps_print(maps); - - return ret; -} - -int topology_gpu_count(struct tp_system *sys) -{ - struct tp_node *node; - int count = 0; - - list_for_each_entry(node, &sys->nodes, listm_system) - if (NODE_IS_GPU(node)) - count++; - return count; -} - diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h deleted file mode 100644 index e19f8e7ce..000000000 --- a/plugins/amdgpu/amdgpu_plugin_topology.h +++ /dev/null @@ -1,132 +0,0 @@ -#ifndef __KFD_PLUGIN_TOPOLOGY_H__ -#define __KFD_PLUGIN_TOPOLOGY_H__ - -#define DRM_FIRST_RENDER_NODE 128 -#define DRM_LAST_RENDER_NODE 255 - -#define TOPO_HEAP_TYPE_PUBLIC 1 /* HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC */ -#define TOPO_HEAP_TYPE_PRIVATE 2 /* HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE */ - -#define TOPO_IOLINK_TYPE_ANY 0 /* HSA_IOLINKTYPE_UNDEFINED */ -#define TOPO_IOLINK_TYPE_PCIE 2 /* HSA_IOLINKTYPE_PCIEXPRESS */ -#define TOPO_IOLINK_TYPE_XGMI 11 /* HSA_IOLINK_TYPE_XGMI */ - -#define NODE_IS_GPU(node) ((node)->gpu_id != 0) -#define INVALID_CPU_ID 0xFFFF - -/*************************************** Structures ***********************************************/ -struct tp_node; - -struct tp_iolink { - struct list_head listm; - uint32_t type; - uint32_t node_to_id; - struct tp_node *node_to; - struct tp_node *node_from; - bool valid; /* Set to false if target node is not accessible */ - struct tp_iolink *peer; /* If link is bi-directional, peer link */ -}; - -struct tp_node { - uint32_t id; - uint32_t gpu_id; - uint32_t cpu_cores_count; - uint32_t simd_count; - uint32_t mem_banks_count; - uint32_t caches_count; - uint32_t io_links_count; - uint32_t max_waves_per_simd; - uint32_t lds_size_in_kb; - uint32_t num_gws; - uint32_t 
wave_front_size; - uint32_t array_count; - uint32_t simd_arrays_per_engine; - uint32_t cu_per_simd_array; - uint32_t simd_per_cu; - uint32_t max_slots_scratch_cu; - uint32_t vendor_id; - uint32_t device_id; - uint32_t domain; - uint32_t drm_render_minor; - uint64_t hive_id; - uint32_t num_sdma_engines; - uint32_t num_sdma_xgmi_engines; - uint32_t num_sdma_queues_per_engine; - uint32_t num_cp_queues; - uint32_t fw_version; - uint32_t capability; - uint32_t sdma_fw_version; - bool vram_public; - uint64_t vram_size; - - struct list_head listm_system; - struct list_head listm_p2pgroup; - struct list_head listm_mapping; /* Used only during device mapping */ - - uint32_t num_valid_iolinks; - struct list_head iolinks; - - int drm_fd; -}; - -struct tp_p2pgroup { - uint32_t type; - uint32_t num_nodes; - struct list_head listm_system; - struct list_head nodes; -}; - -struct tp_system { - bool parsed; - uint32_t num_nodes; - struct list_head nodes; - uint32_t num_xgmi_groups; - struct list_head xgmi_groups; -}; - -struct id_map { - uint32_t src; - uint32_t dest; - - struct list_head listm; -}; - -struct device_maps { - struct list_head cpu_maps; /* CPUs are mapped using node_id */ - struct list_head gpu_maps; - - struct list_head *tail_cpu; /* GPUs are mapped using gpu_id */ - struct list_head *tail_gpu; -}; - -/**************************************** Functions ***********************************************/ -void topology_init(struct tp_system *sys); -void topology_free(struct tp_system *topology); - -int topology_parse(struct tp_system *topology, const char *msg); -int topology_determine_iolinks(struct tp_system *sys); -void topology_print(const struct tp_system *sys, const char *msg); - -int topology_gpu_count(struct tp_system *topology); - -struct id_map *maps_add_gpu_entry(struct device_maps *maps, const uint32_t src_id, const uint32_t dest_id); - -struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id); -struct tp_iolink 
*node_add_iolink(struct tp_node *node, uint32_t type, uint32_t node_to_id); - -struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32_t gpu_id); -struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor); -struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index); - -int open_drm_render_device(int minor); -int node_get_drm_render_device(struct tp_node *node); -void sys_close_drm_render_devices(struct tp_system *sys); - -int set_restore_gpu_maps(struct tp_system *tp_checkpoint, struct tp_system *tp_local, struct device_maps *maps); - -uint32_t maps_get_dest_gpu(const struct device_maps *maps, const uint32_t src_id); - -void maps_init(struct device_maps *maps); -void maps_free(struct device_maps *maps); - -#endif /* __KFD_PLUGIN_TOPOLOGY_H__ */ diff --git a/plugins/amdgpu/amdgpu_plugin_util.c b/plugins/amdgpu/amdgpu_plugin_util.c deleted file mode 100644 index 592562474..000000000 --- a/plugins/amdgpu/amdgpu_plugin_util.c +++ /dev/null @@ -1,330 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include "common/list.h" - -#include -#include - -#include "criu-plugin.h" -#include "plugin.h" -#include "criu-amdgpu.pb-c.h" - -#include "img-streamer.h" -#include "image.h" -#include "cr_options.h" - -#include "xmalloc.h" -#include "criu-log.h" -#include "kfd_ioctl.h" -#include "amdgpu_drm.h" -#include "amdgpu_plugin_util.h" -#include "amdgpu_plugin_topology.h" -#include "amdgpu_plugin_drm.h" - -static LIST_HEAD(dumped_fds); -static LIST_HEAD(shared_bos); -static LIST_HEAD(completed_work); - -/* Helper structures to encode device topology of SRC and DEST platforms */ -struct tp_system src_topology; -struct tp_system dest_topology; - -/* Helper structures to encode device maps during Checkpoint and Restore operations */ -struct device_maps 
checkpoint_maps; -struct device_maps restore_maps; - -int record_dumped_fd(int fd, bool is_drm) -{ - int newfd = dup(fd); - - if (newfd < 0) - return newfd; - struct dumped_fd *st = malloc(sizeof(struct dumped_fd)); - if (!st) - return -1; - st->fd = newfd; - st->is_drm = is_drm; - list_add(&st->l, &dumped_fds); - - return 0; -} - -struct list_head *get_dumped_fds() -{ - return &dumped_fds; -} - -bool shared_bo_has_exporter(int handle) -{ - struct shared_bo *bo; - - if (handle == -1) - return false; - - list_for_each_entry(bo, &shared_bos, l) { - if (bo->handle == handle) { - return bo->has_exporter; - } - } - - return false; -} - -int record_shared_bo(int handle, bool is_imported) -{ - struct shared_bo *bo; - - if (handle == -1) - return 0; - - list_for_each_entry(bo, &shared_bos, l) { - if (bo->handle == handle) { - return 0; - } - } - bo = malloc(sizeof(struct shared_bo)); - if (!bo) - return -1; - bo->handle = handle; - bo->has_exporter = !is_imported; - list_add(&bo->l, &shared_bos); - - return 0; -} - -int handle_for_shared_bo_fd(int fd) -{ - struct dumped_fd *df; - int trial_handle; - amdgpu_device_handle h_dev; - uint32_t major, minor; - struct shared_bo *bo; - - list_for_each_entry(df, &dumped_fds, l) { - /* see if the gem handle for fd using the hdev for df->fd is the - same as bo->handle. 
*/ - - if (!df->is_drm) { - continue; - } - - if (amdgpu_device_initialize(df->fd, &major, &minor, &h_dev)) { - pr_err("Failed to initialize amdgpu device\n"); - continue; - } - - trial_handle = get_gem_handle(h_dev, fd); - if (trial_handle < 0) - continue; - - list_for_each_entry(bo, &shared_bos, l) { - if (bo->handle == trial_handle) - return trial_handle; - } - - amdgpu_device_deinitialize(h_dev); - } - - return -1; -} - -int record_completed_work(int handle, int id) -{ - struct restore_completed_work *work; - - work = malloc(sizeof(struct restore_completed_work)); - if (!work) - return -1; - work->handle = handle; - work->id = id; - list_add(&work->l, &completed_work); - - return 0; -} - -bool work_already_completed(int handle, int id) -{ - struct restore_completed_work *work; - - list_for_each_entry(work, &completed_work, l) { - if (work->handle == handle && work->id == id) { - return true; - } - } - - return false; -} - -void clear_restore_state() -{ - while (!list_empty(&completed_work)) { - struct restore_completed_work *st = list_first_entry(&completed_work, struct restore_completed_work, l); - list_del(&st->l); - free(st); - } -} - -void clear_dumped_fds() -{ - while (!list_empty(&dumped_fds)) { - struct dumped_fd *st = list_first_entry(&dumped_fds, struct dumped_fd, l); - list_del(&st->l); - close(st->fd); - free(st); - } -} - -int read_fp(FILE *fp, void *buf, const size_t buf_len) -{ - size_t len_read; - - len_read = fread(buf, 1, buf_len, fp); - if (len_read != buf_len) { - pr_err("Unable to read file (read:%ld buf_len:%ld)\n", len_read, buf_len); - return -EIO; - } - return 0; -} - -int write_fp(FILE *fp, const void *buf, const size_t buf_len) -{ - size_t len_write; - - len_write = fwrite(buf, 1, buf_len, fp); - if (len_write != buf_len) { - pr_err("Unable to write file (wrote:%ld buf_len:%ld)\n", len_write, buf_len); - return -EIO; - } - return 0; -} - -/** - * @brief Open an image file - * - * We store the size of the actual contents in the first 
8-bytes of - * the file. This allows us to determine the file size when using - * criu_image_streamer when fseek and fstat are not available. The - * FILE * returned is already at the location of the first actual - * contents. - * - * @param path The file path - * @param write False for read, true for write - * @param size Size of actual contents - * @return FILE *if successful, NULL if failed - */ -FILE *open_img_file(char *path, bool write, size_t *size) -{ - FILE *fp = NULL; - int fd, ret; - - if (opts.stream) - fd = img_streamer_open(path, write ? O_DUMP : O_RSTR); - else - fd = openat(criu_get_image_dir(), path, write ? (O_WRONLY | O_CREAT) : O_RDONLY, 0600); - - if (fd < 0) { - pr_err("%s: Failed to open for %s\n", path, write ? "write" : "read"); - return NULL; - } - - fp = fdopen(fd, write ? "w" : "r"); - if (!fp) { - pr_err("%s: Failed get pointer for %s\n", path, write ? "write" : "read"); - return NULL; - } - - if (write) - ret = write_fp(fp, size, sizeof(*size)); - else - ret = read_fp(fp, size, sizeof(*size)); - - if (ret) { - pr_err("%s:Failed to access file size\n", path); - fclose(fp); - return NULL; - } - - pr_debug("%s:Opened file for %s with size:%ld\n", path, write ? "write" : "read", *size); - return fp; -} - -int read_file(const char *file_path, void *buf, const size_t buf_len) -{ - int ret; - FILE *fp; - - fp = fopen(file_path, "r"); - if (!fp) { - pr_err("Cannot fopen %s\n", file_path); - return -errno; - } - - ret = read_fp(fp, buf, buf_len); - fclose(fp); /* this will also close fd */ - return ret; -} - - -/** - * @brief Write an image file - * - * We store the size of the actual contents in the first 8-bytes of the file. This allows us to - * determine the file size when using criu_image_streamer when fseek and fstat are not available. - * - * @param path The file path - * @param buf pointer to data to be written - * @param buf_len size of buf - * @return 0 if successful. 
-errno on failure - */ -int write_img_file(char *path, const void *buf, const size_t buf_len) -{ - int ret; - FILE *fp; - size_t len = buf_len; - - fp = open_img_file(path, true, &len); - if (!fp) - return -errno; - - ret = write_fp(fp, buf, buf_len); - fclose(fp); /* this will also close fd */ - return ret; -} - -void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list) -{ - struct kfd_criu_bo_bucket *bo; - - pr_info("\n"); - for (int idx = 0; idx < bo_cnt; idx++) { - bo = &bo_list[idx]; - pr_info("\n"); - pr_info("%s(), %d. KFD BO Addr: %" PRIx64 " \n", __func__, idx, bo->addr); - pr_info("%s(), %d. KFD BO Size: %" PRIx64 " \n", __func__, idx, bo->size); - pr_info("%s(), %d. KFD BO Offset: %" PRIx64 " \n", __func__, idx, bo->offset); - pr_info("%s(), %d. KFD BO Restored Offset: %" PRIx64 " \n", __func__, idx, bo->restored_offset); - pr_info("%s(), %d. KFD BO Alloc Flags: %x \n", __func__, idx, bo->alloc_flags); - pr_info("%s(), %d. KFD BO Gpu ID: %x \n", __func__, idx, bo->gpu_id); - pr_info("%s(), %d. KFD BO Dmabuf FD: %x \n", __func__, idx, bo->dmabuf_fd); - pr_info("\n"); - } - pr_info("\n"); -} diff --git a/plugins/amdgpu/amdgpu_plugin_util.h b/plugins/amdgpu/amdgpu_plugin_util.h deleted file mode 100644 index f5f752d0b..000000000 --- a/plugins/amdgpu/amdgpu_plugin_util.h +++ /dev/null @@ -1,145 +0,0 @@ -#ifndef __AMDGPU_PLUGIN_UTIL_H__ -#define __AMDGPU_PLUGIN_UTIL_H__ - -#include - -#ifndef _GNU_SOURCE -#define _GNU_SOURCE 1 -#endif - -#ifdef COMPILE_TESTS -#undef pr_err -#define pr_err(format, arg...) fprintf(stdout, "%s:%d ERROR:" format, __FILE__, __LINE__, ##arg) -#undef pr_info -#define pr_info(format, arg...) fprintf(stdout, "%s:%d INFO:" format, __FILE__, __LINE__, ##arg) -#undef pr_debug -#define pr_debug(format, arg...) fprintf(stdout, "%s:%d DBG:" format, __FILE__, __LINE__, ##arg) - -#undef pr_perror -#define pr_perror(format, arg...) 
\ - fprintf(stdout, "%s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) -#endif - -#ifdef LOG_PREFIX -#undef LOG_PREFIX -#endif -#define LOG_PREFIX "amdgpu_plugin: " - -#ifdef DEBUG -#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__) -#else -#define plugin_log_msg(fmt, ...) \ - { \ - } -#endif - - -/* Path where KFD device is surfaced */ -#define AMDGPU_KFD_DEVICE "/dev/kfd" - -/* Path where DRM devices are surfaced */ -#define AMDGPU_DRM_DEVICE "/dev/dri/renderD%d" - -/* Minimum version of KFD IOCTL's that supports C&R */ -#define KFD_IOCTL_MAJOR_VERSION 1 -#define MIN_KFD_IOCTL_MINOR_VERSION 8 - -/* Name of file having serialized data of KFD device */ -#define IMG_KFD_FILE "amdgpu-kfd-%d.img" - -/* Name of file having serialized data of KFD buffer objects (BOs) */ -#define IMG_KFD_PAGES_FILE "amdgpu-pages-%d-%04x.img" - -/* Name of file having serialized data of DRM device */ -#define IMG_DRM_FILE "amdgpu-renderD-%d.img" - -/* Name of file having serialized data of dmabuf meta */ -#define IMG_DMABUF_FILE "amdgpu-dmabuf_%d.img" - -/* Name of file having serialized data of DRM device buffer objects (BOs) */ -#define IMG_DRM_PAGES_FILE "amdgpu-drm-pages-%d-%d-%04x.img" - -/* Helper macros to Checkpoint and Restore a ROCm file */ -#define HSAKMT_SHM_PATH "/dev/shm/hsakmt_shared_mem" -#define HSAKMT_SHM "/hsakmt_shared_mem" -#define HSAKMT_SEM_PATH "/dev/shm/sem.hsakmt_semaphore" -#define HSAKMT_SEM "hsakmt_semaphore" -#define DMABUF_LINK "/dmabuf" - -/* Help macros to build sDMA command packets */ -#define SDMA_PACKET(op, sub_op, e) ((((e)&0xFFFF) << 16) | (((sub_op)&0xFF) << 8) | (((op)&0xFF) << 0)) - -#define SDMA_OPCODE_COPY 1 -#define SDMA_COPY_SUB_OPCODE_LINEAR 0 -#define SDMA_NOP 0 -#define SDMA_LINEAR_COPY_MAX_SIZE (1ULL << 21) - -enum sdma_op_type { - SDMA_OP_VRAM_READ, - SDMA_OP_VRAM_WRITE, -}; - -struct dumped_fd { - struct list_head l; - int fd; - bool is_drm; -}; - -struct shared_bo { - struct list_head 
l; - int handle; - bool has_exporter; -}; - -struct restore_completed_work { - struct list_head l; - int handle; - int id; -}; - -/* Helper structures to encode device topology of SRC and DEST platforms */ -extern struct tp_system src_topology; -extern struct tp_system dest_topology; - -/* Helper structures to encode device maps during Checkpoint and Restore operations */ -extern struct device_maps checkpoint_maps; -extern struct device_maps restore_maps; - -extern int fd_next; - -extern bool kfd_fw_version_check; -extern bool kfd_sdma_fw_version_check; -extern bool kfd_caches_count_check; -extern bool kfd_num_gws_check; -extern bool kfd_vram_size_check; -extern bool kfd_numa_check; -extern bool kfd_capability_check; - -int read_fp(FILE *fp, void *buf, const size_t buf_len); -int write_fp(FILE *fp, const void *buf, const size_t buf_len); -int read_file(const char *file_path, void *buf, const size_t buf_len); -int write_img_file(char *path, const void *buf, const size_t buf_len); -FILE *open_img_file(char *path, bool write, size_t *size); - -int record_dumped_fd(int fd, bool is_drm); -struct list_head *get_dumped_fds(); -void clear_dumped_fds(); - -bool shared_bo_has_exporter(int handle); -int record_shared_bo(int handle, bool is_imported); -int handle_for_shared_bo_fd(int dmabuf_fd); - -int record_completed_work(int handle, int id); -bool work_already_completed(int handle, int id); - -void clear_restore_state(); - -void print_kfd_bo_stat(int bo_cnt, struct kfd_criu_bo_bucket *bo_list); - -int sdma_copy_bo(int shared_fd, uint64_t size, FILE *storage_fp, - void *buffer, size_t buffer_size, amdgpu_device_handle h_dev, - uint64_t max_copy_size, enum sdma_op_type type, bool do_not_free); - -int serve_out_dmabuf_fd(int handle, int fd); - -#endif /* __AMDGPU_PLUGIN_UTIL_H__ */ diff --git a/plugins/amdgpu/amdgpu_socket_utils.c b/plugins/amdgpu/amdgpu_socket_utils.c deleted file mode 100644 index c8bf6d1ba..000000000 --- a/plugins/amdgpu/amdgpu_socket_utils.c +++ /dev/null 
@@ -1,320 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include "amdgpu_socket_utils.h" -#include "criu-log.h" -#include "common/scm.h" -#include "fdstore.h" -#include "util-pie.h" -#include "util.h" - -int parallel_socket_addr_len; -struct sockaddr_un parallel_socket_addr; -int parallel_socket_id = 0; - -static void amdgpu_socket_name_gen(struct sockaddr_un *addr, int *len) -{ - addr->sun_family = AF_UNIX; - snprintf(addr->sun_path, UNIX_PATH_MAX, "x/criu-amdgpu-parallel-%s", criu_run_id); - *len = SUN_LEN(addr); - *addr->sun_path = '\0'; -} - -int install_parallel_sock(void) -{ - int ret = 0; - int sock_fd; - - sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - if (sock_fd < 0) { - pr_perror("socket creation failed"); - return -1; - } - - amdgpu_socket_name_gen(¶llel_socket_addr, ¶llel_socket_addr_len); - ret = bind(sock_fd, (struct sockaddr *)¶llel_socket_addr, parallel_socket_addr_len); - if (ret < 0) { - pr_perror("bind failed"); - goto err; - } - - ret = listen(sock_fd, SOMAXCONN); - if (ret < 0) { - pr_perror("listen failed"); - goto err; - } - - parallel_socket_id = fdstore_add(sock_fd); - if (parallel_socket_id < 0) { - ret = -1; - goto err; - } -err: - close(sock_fd); - return ret; -} - -void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, - parallel_restore_cmd *restore_cmd) -{ - parallel_restore_entry *restore_entry = &restore_cmd->entries[restore_cmd->cmd_head.entry_num]; - restore_entry->gpu_id = gpu_id; - restore_entry->write_id = restore_cmd->cmd_head.fd_write_num; - restore_entry->write_offset = 0; - restore_entry->read_offset = offset; - restore_entry->size = size; - - restore_cmd->fds_write[restore_cmd->cmd_head.fd_write_num] = dmabuf_fd; - - restore_cmd->cmd_head.entry_num += 1; - restore_cmd->cmd_head.fd_write_num += 1; -} - -void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd) -{ - 
restore_cmd->gpu_ids[restore_cmd->cmd_head.gpu_num] = (parallel_gpu_info){ gpu_id, minor }; - restore_cmd->cmd_head.gpu_num += 1; -} - -static int send_metadata(int sock_fd, parallel_restore_cmd *restore_cmd) -{ - if (send(sock_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { - pr_perror("Send parallel restore command head fail"); - return -1; - } - return 0; -} - -static int send_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd) -{ - if (send(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { - pr_perror("Send GPU ids of parallel restore command fail"); - return -1; - } - return 0; -} - -static int send_cmds(int sock_fd, parallel_restore_cmd *restore_cmd) -{ - if (send(sock_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { - pr_perror("Send parallel restore command fail"); - return -1; - } - return 0; -} - -static int send_dmabuf_fds(int sock_fd, parallel_restore_cmd *restore_cmd) -{ - if (send_fds(sock_fd, NULL, 0, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { - pr_perror("Send dmabuf fds fail"); - return -1; - } - return 0; -} - -int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) -{ - int sock_fd; - int ret = 0; - - sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - if (sock_fd < 0) { - pr_perror("Socket creation failed"); - return -1; - } - - ret = connect(sock_fd, (struct sockaddr *)¶llel_socket_addr, parallel_socket_addr_len); - if (ret < 0) { - pr_perror("Connect failed"); - goto err; - } - - ret = send_metadata(sock_fd, restore_cmd); - if (ret) { - goto err; - } - - ret = send_gpu_ids(sock_fd, restore_cmd); - if (ret) { - goto err; - } - - ret = send_cmds(sock_fd, restore_cmd); - if (ret) { - goto err; - } - - ret = send_dmabuf_fds(sock_fd, restore_cmd); - -err: - close(sock_fd); - return ret; -} - -int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd 
*restore_cmd) -{ - restore_cmd->cmd_head.id = id; - restore_cmd->cmd_head.fd_write_num = 0; - restore_cmd->cmd_head.entry_num = 0; - restore_cmd->cmd_head.gpu_num = 0; - - restore_cmd->gpu_ids = xzalloc(gpu_num * sizeof(parallel_gpu_info)); - if (!restore_cmd->gpu_ids) - return -ENOMEM; - restore_cmd->fds_write = xzalloc(num * sizeof(int)); - if (!restore_cmd->fds_write) - return -ENOMEM; - restore_cmd->entries = xzalloc(num * sizeof(parallel_restore_entry)); - if (!restore_cmd->entries) - return -ENOMEM; - return 0; -} - -void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) -{ - if (restore_cmd->gpu_ids) - xfree(restore_cmd->gpu_ids); - if (restore_cmd->fds_write) - xfree(restore_cmd->fds_write); - if (restore_cmd->entries) - xfree(restore_cmd->entries); -} - -static int init_parallel_restore_cmd_by_head(parallel_restore_cmd *restore_cmd) -{ - restore_cmd->gpu_ids = xzalloc(restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info)); - if (!restore_cmd->gpu_ids) - return -ENOMEM; - restore_cmd->fds_write = xzalloc(restore_cmd->cmd_head.fd_write_num * sizeof(int)); - if (!restore_cmd->fds_write) - return -ENOMEM; - restore_cmd->entries = xzalloc(restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry)); - if (!restore_cmd->entries) - return -ENOMEM; - return 0; -} - -static int check_quit_cmd(parallel_restore_cmd *restore_cmd) -{ - return restore_cmd->cmd_head.fd_write_num == 0; -} - -static int recv_metadata(int client_fd, parallel_restore_cmd *restore_cmd) -{ - if (recv(client_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { - pr_perror("Recv parallel restore command head fail"); - return -1; - } - return 0; -} - -static int recv_cmds(int client_fd, parallel_restore_cmd *restore_cmd) -{ - if (recv(client_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) { - pr_perror("Recv parallel restore command fail"); - return -1; - } - return 0; -} - -static int recv_gpu_ids(int 
sock_fd, parallel_restore_cmd *restore_cmd) -{ - if (recv(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) { - pr_perror("Send GPU ids of parallel restore command fail"); - return -1; - } - return 0; -} - -static int recv_dmabuf_fds(int client_fd, parallel_restore_cmd *restore_cmd) -{ - if (recv_fds(client_fd, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) { - pr_perror("Recv dmabuf fds fail"); - return -1; - } - return 0; -} - -int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd) -{ - int sock_fd, client_fd; - int ret = 0; - - sock_fd = fdstore_get(parallel_socket_id); - if (sock_fd < 0) - return -1; - - client_fd = accept(sock_fd, NULL, NULL); - if (client_fd < 0) { - ret = client_fd; - goto err_accept; - } - - ret = recv_metadata(client_fd, restore_cmd); - if (ret) { - goto err; - } - - // Return 1 to quit - if (check_quit_cmd(restore_cmd)) { - ret = 1; - goto err; - } - - ret = init_parallel_restore_cmd_by_head(restore_cmd); - if (ret) { - goto err; - } - - ret = recv_gpu_ids(client_fd, restore_cmd); - if (ret) { - goto err; - } - - ret = recv_cmds(client_fd, restore_cmd); - if (ret) { - goto err; - } - - ret = recv_dmabuf_fds(client_fd, restore_cmd); - -err: - close(client_fd); -err_accept: - close(sock_fd); - return ret; -} - -int close_parallel_restore_server(void) -{ - int sock_fd; - int ret = 0; - parallel_restore_cmd_head cmd_head; - - sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0); - if (sock_fd < 0) { - pr_perror("Socket creation failed"); - return -1; - } - - ret = connect(sock_fd, (struct sockaddr *)¶llel_socket_addr, parallel_socket_addr_len); - if (ret < 0) { - pr_perror("Connect failed"); - goto err; - } - - memset(&cmd_head, 0, sizeof(parallel_restore_cmd_head)); - if (send(sock_fd, &cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) { - pr_perror("Send parallel restore command head fail"); - return -1; - } - -err: - close(sock_fd); - return ret; -} \ 
No newline at end of file diff --git a/plugins/amdgpu/amdgpu_socket_utils.h b/plugins/amdgpu/amdgpu_socket_utils.h deleted file mode 100644 index d7200c6bd..000000000 --- a/plugins/amdgpu/amdgpu_socket_utils.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ -#define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__ - -typedef struct { - int id; - int fd_write_num; /* The number of buffer objects to be restored. */ - int entry_num; /* The number of restore commands.*/ - int gpu_num; -} parallel_restore_cmd_head; - -typedef struct { - int gpu_id; - int minor; -} parallel_gpu_info; - -typedef struct { - int gpu_id; - int write_id; - uint64_t read_offset; - uint64_t write_offset; - uint64_t size; -} parallel_restore_entry; - -typedef struct { - parallel_restore_cmd_head cmd_head; - int *fds_write; - parallel_gpu_info *gpu_ids; - parallel_restore_entry *entries; -} parallel_restore_cmd; - -/* - * For parallel_restore, a background thread in the main CRIU process is used to restore the GPU - * buffer object. However, initially, the ownership of these buffer objects and the metadata for - * restoration are all with the target process. Therefore, we introduce a series of functions to - * help the target process send these tasks to the main CRIU process. 
- */ -int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd); - -void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); - -int install_parallel_sock(void); - -int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); - -int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd); - -void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset, - parallel_restore_cmd *restore_cmd); - -void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd); - -int close_parallel_restore_server(void); - -#endif \ No newline at end of file diff --git a/plugins/amdgpu/criu-amdgpu.proto b/plugins/amdgpu/criu-amdgpu.proto deleted file mode 100644 index 7682a8f21..000000000 --- a/plugins/amdgpu/criu-amdgpu.proto +++ /dev/null @@ -1,95 +0,0 @@ -syntax = "proto2"; - -message dev_iolink { - required uint32 type = 1; - required uint32 node_to_id = 2; -} - -message kfd_device_entry { - required uint32 node_id = 1; - required uint32 gpu_id = 2; - required uint32 cpu_cores_count = 3; - required uint32 simd_count = 4; - required uint32 mem_banks_count = 5; - required uint32 caches_count = 6; - required uint32 io_links_count = 7; - required uint32 max_waves_per_simd = 8; - required uint32 lds_size_in_kb = 9; - required uint32 gds_size_in_kb = 10; - required uint32 num_gws = 11; - required uint32 wave_front_size = 12; - required uint32 array_count = 13; - required uint32 simd_arrays_per_engine = 14; - required uint32 cu_per_simd_array = 15; - required uint32 simd_per_cu = 16; - required uint32 max_slots_scratch_cu = 17; - required uint32 vendor_id = 18; - required uint32 device_id = 19; - required uint32 domain = 20; - required uint32 drm_render_minor = 21; - required uint64 hive_id = 22; - required uint32 num_sdma_engines = 23; - required uint32 num_sdma_xgmi_engines = 24; - required uint32 num_sdma_queues_per_engine = 25; - required uint32 num_cp_queues = 26; - required 
uint32 fw_version = 27; - required uint32 capability = 28; - required uint32 sdma_fw_version = 29; - required uint32 vram_public = 30; - required uint64 vram_size = 31; - repeated dev_iolink iolinks = 32; -} - -message kfd_bo_entry { - required uint64 addr = 1; - required uint64 size = 2; - required uint64 offset = 3; - required uint32 alloc_flags = 4; - required uint32 gpu_id = 5; - required uint32 handle = 6; -} - -message criu_kfd { - required uint32 pid = 1; - required uint32 num_of_gpus = 2; - required uint32 num_of_cpus = 3; - repeated kfd_device_entry device_entries = 4; - required uint64 num_of_bos = 5; - repeated kfd_bo_entry bo_entries = 6; - required uint32 num_of_objects = 7; - required uint64 shared_mem_size = 8; - required uint32 shared_mem_magic = 9; - required bytes priv_data = 10; -} - -message drm_bo_entry { - required uint64 addr = 1; - required uint64 size = 2; - required uint64 offset = 3; - required uint64 alloc_flags = 4; - required uint64 alignment = 5; - required uint32 preferred_domains = 6; - required uint32 handle = 7; - required uint32 is_import = 8; - required uint32 num_of_vms = 9; - repeated drm_vm_entry vm_entries = 10; -} - -message drm_vm_entry { - required uint64 addr = 1; - required uint64 size = 2; - required uint64 offset = 3; - required uint64 flags = 4; -} - -message criu_render_node { - required uint32 gpu_id = 1; - required uint32 id = 2; - required uint32 drm_render_minor = 3; - required uint64 num_of_bos = 4; - repeated drm_bo_entry bo_entries = 5; -} - -message criu_dmabuf_node { - required uint32 gem_handle = 1; -} diff --git a/plugins/amdgpu/drm.h b/plugins/amdgpu/drm.h deleted file mode 100644 index 3cd5cf15e..000000000 --- a/plugins/amdgpu/drm.h +++ /dev/null @@ -1,1476 +0,0 @@ -/* - * Header for the Direct Rendering Manager - * - * Author: Rickard E. (Rik) Faith - * - * Acknowledgments: - * Dec 1999, Richard Henderson , move to generic cmpxchg. 
- */ - -/* - * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. - * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. - * All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#ifndef _DRM_H_ -#define _DRM_H_ - -#if defined(__KERNEL__) - -#include -#include -typedef unsigned int drm_handle_t; - -#elif defined(__linux__) - -#include -#include -typedef unsigned int drm_handle_t; - -#else /* One of the BSDs */ - -#include -#include -#include -typedef int8_t __s8; -typedef uint8_t __u8; -typedef int16_t __s16; -typedef uint16_t __u16; -typedef int32_t __s32; -typedef uint32_t __u32; -typedef int64_t __s64; -typedef uint64_t __u64; -typedef size_t __kernel_size_t; -typedef unsigned long drm_handle_t; - -#endif - -#if defined(__cplusplus) -extern "C" { -#endif - -#define DRM_NAME "drm" /**< Name in kernel, /dev, and /proc */ -#define DRM_MIN_ORDER 5 /**< At least 2^5 bytes = 32 bytes */ -#define DRM_MAX_ORDER 22 /**< Up to 2^22 bytes = 4MB */ -#define DRM_RAM_PERCENT 10 /**< How much system ram can we lock? */ - -#define _DRM_LOCK_HELD 0x80000000U /**< Hardware lock is held */ -#define _DRM_LOCK_CONT 0x40000000U /**< Hardware lock is contended */ -#define _DRM_LOCK_IS_HELD(lock) ((lock) & _DRM_LOCK_HELD) -#define _DRM_LOCK_IS_CONT(lock) ((lock) & _DRM_LOCK_CONT) -#define _DRM_LOCKING_CONTEXT(lock) ((lock) & ~(_DRM_LOCK_HELD|_DRM_LOCK_CONT)) - -typedef unsigned int drm_context_t; -typedef unsigned int drm_drawable_t; -typedef unsigned int drm_magic_t; - -/* - * Cliprect. - * - * \warning: If you change this structure, make sure you change - * XF86DRIClipRectRec in the server as well - * - * \note KW: Actually it's illegal to change either for - * backwards-compatibility reasons. - */ -struct drm_clip_rect { - unsigned short x1; - unsigned short y1; - unsigned short x2; - unsigned short y2; -}; - -/* - * Drawable information. - */ -struct drm_drawable_info { - unsigned int num_rects; - struct drm_clip_rect *rects; -}; - -/* - * Texture region, - */ -struct drm_tex_region { - unsigned char next; - unsigned char prev; - unsigned char in_use; - unsigned char padding; - unsigned int age; -}; - -/* - * Hardware lock. 
- * - * The lock structure is a simple cache-line aligned integer. To avoid - * processor bus contention on a multiprocessor system, there should not be any - * other data stored in the same cache line. - */ -struct drm_hw_lock { - __volatile__ unsigned int lock; /**< lock variable */ - char padding[60]; /**< Pad to cache line */ -}; - -/* - * DRM_IOCTL_VERSION ioctl argument type. - * - * \sa drmGetVersion(). - */ -struct drm_version { - int version_major; /**< Major version */ - int version_minor; /**< Minor version */ - int version_patchlevel; /**< Patch level */ - __kernel_size_t name_len; /**< Length of name buffer */ - char __user *name; /**< Name of driver */ - __kernel_size_t date_len; /**< Length of date buffer */ - char __user *date; /**< User-space buffer to hold date */ - __kernel_size_t desc_len; /**< Length of desc buffer */ - char __user *desc; /**< User-space buffer to hold desc */ -}; - -/* - * DRM_IOCTL_GET_UNIQUE ioctl argument type. - * - * \sa drmGetBusid() and drmSetBusId(). - */ -struct drm_unique { - __kernel_size_t unique_len; /**< Length of unique */ - char __user *unique; /**< Unique name for driver instantiation */ -}; - -struct drm_list { - int count; /**< Length of user-space structures */ - struct drm_version __user *version; -}; - -struct drm_block { - int unused; -}; - -/* - * DRM_IOCTL_CONTROL ioctl argument type. - * - * \sa drmCtlInstHandler() and drmCtlUninstHandler(). - */ -struct drm_control { - enum { - DRM_ADD_COMMAND, - DRM_RM_COMMAND, - DRM_INST_HANDLER, - DRM_UNINST_HANDLER - } func; - int irq; -}; - -/* - * Type of memory to map. - */ -enum drm_map_type { - _DRM_FRAME_BUFFER = 0, /**< WC (no caching), no core dump */ - _DRM_REGISTERS = 1, /**< no caching, no core dump */ - _DRM_SHM = 2, /**< shared, cached */ - _DRM_AGP = 3, /**< AGP/GART */ - _DRM_SCATTER_GATHER = 4, /**< Scatter/gather memory for PCI DMA */ - _DRM_CONSISTENT = 5 /**< Consistent memory for PCI DMA */ -}; - -/* - * Memory mapping flags. 
- */ -enum drm_map_flags { - _DRM_RESTRICTED = 0x01, /**< Cannot be mapped to user-virtual */ - _DRM_READ_ONLY = 0x02, - _DRM_LOCKED = 0x04, /**< shared, cached, locked */ - _DRM_KERNEL = 0x08, /**< kernel requires access */ - _DRM_WRITE_COMBINING = 0x10, /**< use write-combining if available */ - _DRM_CONTAINS_LOCK = 0x20, /**< SHM page that contains lock */ - _DRM_REMOVABLE = 0x40, /**< Removable mapping */ - _DRM_DRIVER = 0x80 /**< Managed by driver */ -}; - -struct drm_ctx_priv_map { - unsigned int ctx_id; /**< Context requesting private mapping */ - void *handle; /**< Handle of map */ -}; - -/* - * DRM_IOCTL_GET_MAP, DRM_IOCTL_ADD_MAP and DRM_IOCTL_RM_MAP ioctls - * argument type. - * - * \sa drmAddMap(). - */ -struct drm_map { - unsigned long offset; /**< Requested physical address (0 for SAREA)*/ - unsigned long size; /**< Requested physical size (bytes) */ - enum drm_map_type type; /**< Type of memory to map */ - enum drm_map_flags flags; /**< Flags */ - void *handle; /**< User-space: "Handle" to pass to mmap() */ - /**< Kernel-space: kernel-virtual address */ - int mtrr; /**< MTRR slot used */ - /* Private data */ -}; - -/* - * DRM_IOCTL_GET_CLIENT ioctl argument type. - */ -struct drm_client { - int idx; /**< Which client desired? */ - int auth; /**< Is client authenticated? 
*/ - unsigned long pid; /**< Process ID */ - unsigned long uid; /**< User ID */ - unsigned long magic; /**< Magic */ - unsigned long iocs; /**< Ioctl count */ -}; - -enum drm_stat_type { - _DRM_STAT_LOCK, - _DRM_STAT_OPENS, - _DRM_STAT_CLOSES, - _DRM_STAT_IOCTLS, - _DRM_STAT_LOCKS, - _DRM_STAT_UNLOCKS, - _DRM_STAT_VALUE, /**< Generic value */ - _DRM_STAT_BYTE, /**< Generic byte counter (1024bytes/K) */ - _DRM_STAT_COUNT, /**< Generic non-byte counter (1000/k) */ - - _DRM_STAT_IRQ, /**< IRQ */ - _DRM_STAT_PRIMARY, /**< Primary DMA bytes */ - _DRM_STAT_SECONDARY, /**< Secondary DMA bytes */ - _DRM_STAT_DMA, /**< DMA */ - _DRM_STAT_SPECIAL, /**< Special DMA (e.g., priority or polled) */ - _DRM_STAT_MISSED /**< Missed DMA opportunity */ - /* Add to the *END* of the list */ -}; - -/* - * DRM_IOCTL_GET_STATS ioctl argument type. - */ -struct drm_stats { - unsigned long count; - struct { - unsigned long value; - enum drm_stat_type type; - } data[15]; -}; - -/* - * Hardware locking flags. - */ -enum drm_lock_flags { - _DRM_LOCK_READY = 0x01, /**< Wait until hardware is ready for DMA */ - _DRM_LOCK_QUIESCENT = 0x02, /**< Wait until hardware quiescent */ - _DRM_LOCK_FLUSH = 0x04, /**< Flush this context's DMA queue first */ - _DRM_LOCK_FLUSH_ALL = 0x08, /**< Flush all DMA queues first */ - /* These *HALT* flags aren't supported yet - -- they will be used to support the - full-screen DGA-like mode. */ - _DRM_HALT_ALL_QUEUES = 0x10, /**< Halt all current and future queues */ - _DRM_HALT_CUR_QUEUES = 0x20 /**< Halt all current queues */ -}; - -/* - * DRM_IOCTL_LOCK, DRM_IOCTL_UNLOCK and DRM_IOCTL_FINISH ioctl argument type. - * - * \sa drmGetLock() and drmUnlock(). - */ -struct drm_lock { - int context; - enum drm_lock_flags flags; -}; - -/* - * DMA flags - * - * \warning - * These values \e must match xf86drm.h. - * - * \sa drm_dma. - */ -enum drm_dma_flags { - /* Flags for DMA buffer dispatch */ - _DRM_DMA_BLOCK = 0x01, /**< - * Block until buffer dispatched. 
- * - * \note The buffer may not yet have - * been processed by the hardware -- - * getting a hardware lock with the - * hardware quiescent will ensure - * that the buffer has been - * processed. - */ - _DRM_DMA_WHILE_LOCKED = 0x02, /**< Dispatch while lock held */ - _DRM_DMA_PRIORITY = 0x04, /**< High priority dispatch */ - - /* Flags for DMA buffer request */ - _DRM_DMA_WAIT = 0x10, /**< Wait for free buffers */ - _DRM_DMA_SMALLER_OK = 0x20, /**< Smaller-than-requested buffers OK */ - _DRM_DMA_LARGER_OK = 0x40 /**< Larger-than-requested buffers OK */ -}; - -/* - * DRM_IOCTL_ADD_BUFS and DRM_IOCTL_MARK_BUFS ioctl argument type. - * - * \sa drmAddBufs(). - */ -struct drm_buf_desc { - int count; /**< Number of buffers of this size */ - int size; /**< Size in bytes */ - int low_mark; /**< Low water mark */ - int high_mark; /**< High water mark */ - enum { - _DRM_PAGE_ALIGN = 0x01, /**< Align on page boundaries for DMA */ - _DRM_AGP_BUFFER = 0x02, /**< Buffer is in AGP space */ - _DRM_SG_BUFFER = 0x04, /**< Scatter/gather memory buffer */ - _DRM_FB_BUFFER = 0x08, /**< Buffer is in frame buffer */ - _DRM_PCI_BUFFER_RO = 0x10 /**< Map PCI DMA buffer read-only */ - } flags; - unsigned long agp_start; /**< - * Start address of where the AGP buffers are - * in the AGP aperture - */ -}; - -/* - * DRM_IOCTL_INFO_BUFS ioctl argument type. - */ -struct drm_buf_info { - int count; /**< Entries in list */ - struct drm_buf_desc __user *list; -}; - -/* - * DRM_IOCTL_FREE_BUFS ioctl argument type. - */ -struct drm_buf_free { - int count; - int __user *list; -}; - -/* - * Buffer information - * - * \sa drm_buf_map. - */ -struct drm_buf_pub { - int idx; /**< Index into the master buffer list */ - int total; /**< Buffer size */ - int used; /**< Amount of buffer in use (for DMA) */ - void __user *address; /**< Address of buffer */ -}; - -/* - * DRM_IOCTL_MAP_BUFS ioctl argument type. 
- */ -struct drm_buf_map { - int count; /**< Length of the buffer list */ -#ifdef __cplusplus - void __user *virt; -#else - void __user *virtual; /**< Mmap'd area in user-virtual */ -#endif - struct drm_buf_pub __user *list; /**< Buffer information */ -}; - -/* - * DRM_IOCTL_DMA ioctl argument type. - * - * Indices here refer to the offset into the buffer list in drm_buf_get. - * - * \sa drmDMA(). - */ -struct drm_dma { - int context; /**< Context handle */ - int send_count; /**< Number of buffers to send */ - int __user *send_indices; /**< List of handles to buffers */ - int __user *send_sizes; /**< Lengths of data to send */ - enum drm_dma_flags flags; /**< Flags */ - int request_count; /**< Number of buffers requested */ - int request_size; /**< Desired size for buffers */ - int __user *request_indices; /**< Buffer information */ - int __user *request_sizes; - int granted_count; /**< Number of buffers granted */ -}; - -enum drm_ctx_flags { - _DRM_CONTEXT_PRESERVED = 0x01, - _DRM_CONTEXT_2DONLY = 0x02 -}; - -/* - * DRM_IOCTL_ADD_CTX ioctl argument type. - * - * \sa drmCreateContext() and drmDestroyContext(). - */ -struct drm_ctx { - drm_context_t handle; - enum drm_ctx_flags flags; -}; - -/* - * DRM_IOCTL_RES_CTX ioctl argument type. - */ -struct drm_ctx_res { - int count; - struct drm_ctx __user *contexts; -}; - -/* - * DRM_IOCTL_ADD_DRAW and DRM_IOCTL_RM_DRAW ioctl argument type. - */ -struct drm_draw { - drm_drawable_t handle; -}; - -/* - * DRM_IOCTL_UPDATE_DRAW ioctl argument type. - */ -typedef enum { - DRM_DRAWABLE_CLIPRECTS -} drm_drawable_info_type_t; - -struct drm_update_draw { - drm_drawable_t handle; - unsigned int type; - unsigned int num; - unsigned long long data; -}; - -/* - * DRM_IOCTL_GET_MAGIC and DRM_IOCTL_AUTH_MAGIC ioctl argument type. - */ -struct drm_auth { - drm_magic_t magic; -}; - -/* - * DRM_IOCTL_IRQ_BUSID ioctl argument type. - * - * \sa drmGetInterruptFromBusID(). 
- */ -struct drm_irq_busid { - int irq; /**< IRQ number */ - int busnum; /**< bus number */ - int devnum; /**< device number */ - int funcnum; /**< function number */ -}; - -enum drm_vblank_seq_type { - _DRM_VBLANK_ABSOLUTE = 0x0, /**< Wait for specific vblank sequence number */ - _DRM_VBLANK_RELATIVE = 0x1, /**< Wait for given number of vblanks */ - /* bits 1-6 are reserved for high crtcs */ - _DRM_VBLANK_HIGH_CRTC_MASK = 0x0000003e, - _DRM_VBLANK_EVENT = 0x4000000, /**< Send event instead of blocking */ - _DRM_VBLANK_FLIP = 0x8000000, /**< Scheduled buffer swap should flip */ - _DRM_VBLANK_NEXTONMISS = 0x10000000, /**< If missed, wait for next vblank */ - _DRM_VBLANK_SECONDARY = 0x20000000, /**< Secondary display controller */ - _DRM_VBLANK_SIGNAL = 0x40000000 /**< Send signal instead of blocking, unsupported */ -}; -#define _DRM_VBLANK_HIGH_CRTC_SHIFT 1 - -#define _DRM_VBLANK_TYPES_MASK (_DRM_VBLANK_ABSOLUTE | _DRM_VBLANK_RELATIVE) -#define _DRM_VBLANK_FLAGS_MASK (_DRM_VBLANK_EVENT | _DRM_VBLANK_SIGNAL | \ - _DRM_VBLANK_SECONDARY | _DRM_VBLANK_NEXTONMISS) - -struct drm_wait_vblank_request { - enum drm_vblank_seq_type type; - unsigned int sequence; - unsigned long signal; -}; - -struct drm_wait_vblank_reply { - enum drm_vblank_seq_type type; - unsigned int sequence; - long tval_sec; - long tval_usec; -}; - -/* - * DRM_IOCTL_WAIT_VBLANK ioctl argument type. - * - * \sa drmWaitVBlank(). - */ -union drm_wait_vblank { - struct drm_wait_vblank_request request; - struct drm_wait_vblank_reply reply; -}; - -#define _DRM_PRE_MODESET 1 -#define _DRM_POST_MODESET 2 - -/* - * DRM_IOCTL_MODESET_CTL ioctl argument type - * - * \sa drmModesetCtl(). - */ -struct drm_modeset_ctl { - __u32 crtc; - __u32 cmd; -}; - -/* - * DRM_IOCTL_AGP_ENABLE ioctl argument type. - * - * \sa drmAgpEnable(). - */ -struct drm_agp_mode { - unsigned long mode; /**< AGP mode */ -}; - -/* - * DRM_IOCTL_AGP_ALLOC and DRM_IOCTL_AGP_FREE ioctls argument type. - * - * \sa drmAgpAlloc() and drmAgpFree(). 
- */ -struct drm_agp_buffer { - unsigned long size; /**< In bytes -- will round to page boundary */ - unsigned long handle; /**< Used for binding / unbinding */ - unsigned long type; /**< Type of memory to allocate */ - unsigned long physical; /**< Physical used by i810 */ -}; - -/* - * DRM_IOCTL_AGP_BIND and DRM_IOCTL_AGP_UNBIND ioctls argument type. - * - * \sa drmAgpBind() and drmAgpUnbind(). - */ -struct drm_agp_binding { - unsigned long handle; /**< From drm_agp_buffer */ - unsigned long offset; /**< In bytes -- will round to page boundary */ -}; - -/* - * DRM_IOCTL_AGP_INFO ioctl argument type. - * - * \sa drmAgpVersionMajor(), drmAgpVersionMinor(), drmAgpGetMode(), - * drmAgpBase(), drmAgpSize(), drmAgpMemoryUsed(), drmAgpMemoryAvail(), - * drmAgpVendorId() and drmAgpDeviceId(). - */ -struct drm_agp_info { - int agp_version_major; - int agp_version_minor; - unsigned long mode; - unsigned long aperture_base; /* physical address */ - unsigned long aperture_size; /* bytes */ - unsigned long memory_allowed; /* bytes */ - unsigned long memory_used; - - /* PCI information */ - unsigned short id_vendor; - unsigned short id_device; -}; - -/* - * DRM_IOCTL_SG_ALLOC ioctl argument type. - */ -struct drm_scatter_gather { - unsigned long size; /**< In bytes -- will round to page boundary */ - unsigned long handle; /**< Used for mapping / unmapping */ -}; - -/* - * DRM_IOCTL_SET_VERSION ioctl argument type. - */ -struct drm_set_version { - int drm_di_major; - int drm_di_minor; - int drm_dd_major; - int drm_dd_minor; -}; - -/** - * struct drm_gem_close - Argument for &DRM_IOCTL_GEM_CLOSE ioctl. - * @handle: Handle of the object to be closed. - * @pad: Padding. - * - * Releases the handle to an mm object. - */ -struct drm_gem_close { - __u32 handle; - __u32 pad; -}; - -/** - * struct drm_gem_flink - Argument for &DRM_IOCTL_GEM_FLINK ioctl. - * @handle: Handle for the object being named. - * @name: Returned global name. 
- * - * Create a global name for an object, returning the name. - * - * Note that the name does not hold a reference; when the object - * is freed, the name goes away. - */ -struct drm_gem_flink { - __u32 handle; - __u32 name; -}; - -/** - * struct drm_gem_open - Argument for &DRM_IOCTL_GEM_OPEN ioctl. - * @name: Name of object being opened. - * @handle: Returned handle for the object. - * @size: Returned size of the object - * - * Open an object using the global name, returning a handle and the size. - * - * This handle (of course) holds a reference to the object, so the object - * will not go away until the handle is deleted. - */ -struct drm_gem_open { - __u32 name; - __u32 handle; - __u64 size; -}; - -/** - * struct drm_gem_change_handle - Argument for &DRM_IOCTL_GEM_CHANGE_HANDLE ioctl. - * @handle: The handle of a gem object. - * @new_handle: An available gem handle. - * - * This ioctl changes the handle of a GEM object to the specified one. - * The new handle must be unused. On success the old handle is closed - * and all further IOCTL should refer to the new handle only. - * Calls to DRM_IOCTL_PRIME_FD_TO_HANDLE will return the new handle. - */ -struct drm_gem_change_handle { - __u32 handle; - __u32 new_handle; -}; - -/** - * DRM_CAP_DUMB_BUFFER - * - * If set to 1, the driver supports creating dumb buffers via the - * &DRM_IOCTL_MODE_CREATE_DUMB ioctl. - */ -#define DRM_CAP_DUMB_BUFFER 0x1 -/** - * DRM_CAP_VBLANK_HIGH_CRTC - * - * If set to 1, the kernel supports specifying a :ref:`CRTC index` - * in the high bits of &drm_wait_vblank_request.type. - * - * Starting kernel version 2.6.39, this capability is always set to 1. - */ -#define DRM_CAP_VBLANK_HIGH_CRTC 0x2 -/** - * DRM_CAP_DUMB_PREFERRED_DEPTH - * - * The preferred bit depth for dumb buffers. - * - * The bit depth is the number of bits used to indicate the color of a single - * pixel excluding any padding. This is different from the number of bits per - * pixel. 
For instance, XRGB8888 has a bit depth of 24 but has 32 bits per - * pixel. - * - * Note that this preference only applies to dumb buffers, it's irrelevant for - * other types of buffers. - */ -#define DRM_CAP_DUMB_PREFERRED_DEPTH 0x3 -/** - * DRM_CAP_DUMB_PREFER_SHADOW - * - * If set to 1, the driver prefers userspace to render to a shadow buffer - * instead of directly rendering to a dumb buffer. For best speed, userspace - * should do streaming ordered memory copies into the dumb buffer and never - * read from it. - * - * Note that this preference only applies to dumb buffers, it's irrelevant for - * other types of buffers. - */ -#define DRM_CAP_DUMB_PREFER_SHADOW 0x4 -/** - * DRM_CAP_PRIME - * - * Bitfield of supported PRIME sharing capabilities. See &DRM_PRIME_CAP_IMPORT - * and &DRM_PRIME_CAP_EXPORT. - * - * Starting from kernel version 6.6, both &DRM_PRIME_CAP_IMPORT and - * &DRM_PRIME_CAP_EXPORT are always advertised. - * - * PRIME buffers are exposed as dma-buf file descriptors. - * See :ref:`prime_buffer_sharing`. - */ -#define DRM_CAP_PRIME 0x5 -/** - * DRM_PRIME_CAP_IMPORT - * - * If this bit is set in &DRM_CAP_PRIME, the driver supports importing PRIME - * buffers via the &DRM_IOCTL_PRIME_FD_TO_HANDLE ioctl. - * - * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. - */ -#define DRM_PRIME_CAP_IMPORT 0x1 -/** - * DRM_PRIME_CAP_EXPORT - * - * If this bit is set in &DRM_CAP_PRIME, the driver supports exporting PRIME - * buffers via the &DRM_IOCTL_PRIME_HANDLE_TO_FD ioctl. - * - * Starting from kernel version 6.6, this bit is always set in &DRM_CAP_PRIME. - */ -#define DRM_PRIME_CAP_EXPORT 0x2 -/** - * DRM_CAP_TIMESTAMP_MONOTONIC - * - * If set to 0, the kernel will report timestamps with ``CLOCK_REALTIME`` in - * struct drm_event_vblank. If set to 1, the kernel will report timestamps with - * ``CLOCK_MONOTONIC``. See ``clock_gettime(2)`` for the definition of these - * clocks. 
- * - * Starting from kernel version 2.6.39, the default value for this capability - * is 1. Starting kernel version 4.15, this capability is always set to 1. - */ -#define DRM_CAP_TIMESTAMP_MONOTONIC 0x6 -/** - * DRM_CAP_ASYNC_PAGE_FLIP - * - * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for legacy - * page-flips. - */ -#define DRM_CAP_ASYNC_PAGE_FLIP 0x7 -/** - * DRM_CAP_CURSOR_WIDTH - * - * The ``CURSOR_WIDTH`` and ``CURSOR_HEIGHT`` capabilities return a valid - * width x height combination for the hardware cursor. The intention is that a - * hardware agnostic userspace can query a cursor plane size to use. - * - * Note that the cross-driver contract is to merely return a valid size; - * drivers are free to attach another meaning on top, eg. i915 returns the - * maximum plane size. - */ -#define DRM_CAP_CURSOR_WIDTH 0x8 -/** - * DRM_CAP_CURSOR_HEIGHT - * - * See &DRM_CAP_CURSOR_WIDTH. - */ -#define DRM_CAP_CURSOR_HEIGHT 0x9 -/** - * DRM_CAP_ADDFB2_MODIFIERS - * - * If set to 1, the driver supports supplying modifiers in the - * &DRM_IOCTL_MODE_ADDFB2 ioctl. - */ -#define DRM_CAP_ADDFB2_MODIFIERS 0x10 -/** - * DRM_CAP_PAGE_FLIP_TARGET - * - * If set to 1, the driver supports the &DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE and - * &DRM_MODE_PAGE_FLIP_TARGET_RELATIVE flags in - * &drm_mode_crtc_page_flip_target.flags for the &DRM_IOCTL_MODE_PAGE_FLIP - * ioctl. - */ -#define DRM_CAP_PAGE_FLIP_TARGET 0x11 -/** - * DRM_CAP_CRTC_IN_VBLANK_EVENT - * - * If set to 1, the kernel supports reporting the CRTC ID in - * &drm_event_vblank.crtc_id for the &DRM_EVENT_VBLANK and - * &DRM_EVENT_FLIP_COMPLETE events. - * - * Starting kernel version 4.12, this capability is always set to 1. - */ -#define DRM_CAP_CRTC_IN_VBLANK_EVENT 0x12 -/** - * DRM_CAP_SYNCOBJ - * - * If set to 1, the driver supports sync objects. See :ref:`drm_sync_objects`. 
- */ -#define DRM_CAP_SYNCOBJ 0x13 -/** - * DRM_CAP_SYNCOBJ_TIMELINE - * - * If set to 1, the driver supports timeline operations on sync objects. See - * :ref:`drm_sync_objects`. - */ -#define DRM_CAP_SYNCOBJ_TIMELINE 0x14 -/** - * DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP - * - * If set to 1, the driver supports &DRM_MODE_PAGE_FLIP_ASYNC for atomic - * commits. - */ -#define DRM_CAP_ATOMIC_ASYNC_PAGE_FLIP 0x15 - -/* DRM_IOCTL_GET_CAP ioctl argument type */ -struct drm_get_cap { - __u64 capability; - __u64 value; -}; - -/** - * DRM_CLIENT_CAP_STEREO_3D - * - * If set to 1, the DRM core will expose the stereo 3D capabilities of the - * monitor by advertising the supported 3D layouts in the flags of struct - * drm_mode_modeinfo. See ``DRM_MODE_FLAG_3D_*``. - * - * This capability is always supported for all drivers starting from kernel - * version 3.13. - */ -#define DRM_CLIENT_CAP_STEREO_3D 1 - -/** - * DRM_CLIENT_CAP_UNIVERSAL_PLANES - * - * If set to 1, the DRM core will expose all planes (overlay, primary, and - * cursor) to userspace. - * - * This capability has been introduced in kernel version 3.15. Starting from - * kernel version 3.17, this capability is always supported for all drivers. - */ -#define DRM_CLIENT_CAP_UNIVERSAL_PLANES 2 - -/** - * DRM_CLIENT_CAP_ATOMIC - * - * If set to 1, the DRM core will expose atomic properties to userspace. This - * implicitly enables &DRM_CLIENT_CAP_UNIVERSAL_PLANES and - * &DRM_CLIENT_CAP_ASPECT_RATIO. - * - * If the driver doesn't support atomic mode-setting, enabling this capability - * will fail with -EOPNOTSUPP. - * - * This capability has been introduced in kernel version 4.0. Starting from - * kernel version 4.2, this capability is always supported for atomic-capable - * drivers. - */ -#define DRM_CLIENT_CAP_ATOMIC 3 - -/** - * DRM_CLIENT_CAP_ASPECT_RATIO - * - * If set to 1, the DRM core will provide aspect ratio information in modes. - * See ``DRM_MODE_FLAG_PIC_AR_*``. 
- * - * This capability is always supported for all drivers starting from kernel - * version 4.18. - */ -#define DRM_CLIENT_CAP_ASPECT_RATIO 4 - -/** - * DRM_CLIENT_CAP_WRITEBACK_CONNECTORS - * - * If set to 1, the DRM core will expose special connectors to be used for - * writing back to memory the scene setup in the commit. The client must enable - * &DRM_CLIENT_CAP_ATOMIC first. - * - * This capability is always supported for atomic-capable drivers starting from - * kernel version 4.19. - */ -#define DRM_CLIENT_CAP_WRITEBACK_CONNECTORS 5 - -/** - * DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT - * - * Drivers for para-virtualized hardware (e.g. vmwgfx, qxl, virtio and - * virtualbox) have additional restrictions for cursor planes (thus - * making cursor planes on those drivers not truly universal,) e.g. - * they need cursor planes to act like one would expect from a mouse - * cursor and have correctly set hotspot properties. - * If this client cap is not set the DRM core will hide cursor plane on - * those virtualized drivers because not setting it implies that the - * client is not capable of dealing with those extra restictions. - * Clients which do set cursor hotspot and treat the cursor plane - * like a mouse cursor should set this property. - * The client must enable &DRM_CLIENT_CAP_ATOMIC first. - * - * Setting this property on drivers which do not special case - * cursor planes (i.e. non-virtualized drivers) will return - * EOPNOTSUPP, which can be used by userspace to gauge - * requirements of the hardware/drivers they're running on. - * - * This capability is always supported for atomic-capable virtualized - * drivers starting from kernel version 6.6. - */ -#define DRM_CLIENT_CAP_CURSOR_PLANE_HOTSPOT 6 - -/* DRM_IOCTL_SET_CLIENT_CAP ioctl argument type */ -struct drm_set_client_cap { - __u64 capability; - __u64 value; -}; - -#define DRM_RDWR O_RDWR -#define DRM_CLOEXEC O_CLOEXEC -struct drm_prime_handle { - __u32 handle; - - /** Flags.. 
only applicable for handle->fd */ - __u32 flags; - - /** Returned dmabuf file descriptor */ - __s32 fd; -}; - -struct drm_syncobj_create { - __u32 handle; -#define DRM_SYNCOBJ_CREATE_SIGNALED (1 << 0) - __u32 flags; -}; - -struct drm_syncobj_destroy { - __u32 handle; - __u32 pad; -}; - -#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0) -#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_TIMELINE (1 << 1) -#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0) -#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_TIMELINE (1 << 1) -struct drm_syncobj_handle { - __u32 handle; - __u32 flags; - - __s32 fd; - __u32 pad; - - __u64 point; -}; - -struct drm_syncobj_transfer { - __u32 src_handle; - __u32 dst_handle; - __u64 src_point; - __u64 dst_point; - __u32 flags; - __u32 pad; -}; - -#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL (1 << 0) -#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT (1 << 1) -#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE (1 << 2) /* wait for time point to become available */ -#define DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE (1 << 3) /* set fence deadline to deadline_nsec */ -struct drm_syncobj_wait { - __u64 handles; - /* absolute timeout */ - __s64 timeout_nsec; - __u32 count_handles; - __u32 flags; - __u32 first_signaled; /* only valid when not waiting all */ - __u32 pad; - /** - * @deadline_nsec - fence deadline hint - * - * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing - * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is - * set. - */ - __u64 deadline_nsec; -}; - -struct drm_syncobj_timeline_wait { - __u64 handles; - /* wait on specific timeline point for every handles*/ - __u64 points; - /* absolute timeout */ - __s64 timeout_nsec; - __u32 count_handles; - __u32 flags; - __u32 first_signaled; /* only valid when not waiting all */ - __u32 pad; - /** - * @deadline_nsec - fence deadline hint - * - * Deadline hint, in absolute CLOCK_MONOTONIC, to set on backing - * fence(s) if the DRM_SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE flag is - * set. 
- */ - __u64 deadline_nsec; -}; - -/** - * struct drm_syncobj_eventfd - * @handle: syncobj handle. - * @flags: Zero to wait for the point to be signalled, or - * &DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE to wait for a fence to be - * available for the point. - * @point: syncobj timeline point (set to zero for binary syncobjs). - * @fd: Existing eventfd to sent events to. - * @pad: Must be zero. - * - * Register an eventfd to be signalled by a syncobj. The eventfd counter will - * be incremented by one. - */ -struct drm_syncobj_eventfd { - __u32 handle; - __u32 flags; - __u64 point; - __s32 fd; - __u32 pad; -}; - - -struct drm_syncobj_array { - __u64 handles; - __u32 count_handles; - __u32 pad; -}; - -#define DRM_SYNCOBJ_QUERY_FLAGS_LAST_SUBMITTED (1 << 0) /* last available point on timeline syncobj */ -struct drm_syncobj_timeline_array { - __u64 handles; - __u64 points; - __u32 count_handles; - __u32 flags; -}; - - -/* Query current scanout sequence number */ -struct drm_crtc_get_sequence { - __u32 crtc_id; /* requested crtc_id */ - __u32 active; /* return: crtc output is active */ - __u64 sequence; /* return: most recent vblank sequence */ - __s64 sequence_ns; /* return: most recent time of first pixel out */ -}; - -/* Queue event to be delivered at specified sequence. Time stamp marks - * when the first pixel of the refresh cycle leaves the display engine - * for the display - */ -#define DRM_CRTC_SEQUENCE_RELATIVE 0x00000001 /* sequence is relative to current */ -#define DRM_CRTC_SEQUENCE_NEXT_ON_MISS 0x00000002 /* Use next sequence if we've missed */ - -struct drm_crtc_queue_sequence { - __u32 crtc_id; - __u32 flags; - __u64 sequence; /* on input, target sequence. 
on output, actual sequence */ - __u64 user_data; /* user data passed to event */ -}; - -#define DRM_CLIENT_NAME_MAX_LEN 64 -struct drm_set_client_name { - __u64 name_len; - __u64 name; -}; - - -#if defined(__cplusplus) -} -#endif - -#include "drm_mode.h" - -#if defined(__cplusplus) -extern "C" { -#endif - -#define DRM_IOCTL_BASE 'd' -#define DRM_IO(nr) _IO(DRM_IOCTL_BASE,nr) -#define DRM_IOR(nr,type) _IOR(DRM_IOCTL_BASE,nr,type) -#define DRM_IOW(nr,type) _IOW(DRM_IOCTL_BASE,nr,type) -#define DRM_IOWR(nr,type) _IOWR(DRM_IOCTL_BASE,nr,type) - -#define DRM_IOCTL_VERSION DRM_IOWR(0x00, struct drm_version) -#define DRM_IOCTL_GET_UNIQUE DRM_IOWR(0x01, struct drm_unique) -#define DRM_IOCTL_GET_MAGIC DRM_IOR( 0x02, struct drm_auth) -#define DRM_IOCTL_IRQ_BUSID DRM_IOWR(0x03, struct drm_irq_busid) -#define DRM_IOCTL_GET_MAP DRM_IOWR(0x04, struct drm_map) -#define DRM_IOCTL_GET_CLIENT DRM_IOWR(0x05, struct drm_client) -#define DRM_IOCTL_GET_STATS DRM_IOR( 0x06, struct drm_stats) -#define DRM_IOCTL_SET_VERSION DRM_IOWR(0x07, struct drm_set_version) -#define DRM_IOCTL_MODESET_CTL DRM_IOW(0x08, struct drm_modeset_ctl) -/** - * DRM_IOCTL_GEM_CLOSE - Close a GEM handle. - * - * GEM handles are not reference-counted by the kernel. User-space is - * responsible for managing their lifetime. For example, if user-space imports - * the same memory object twice on the same DRM file description, the same GEM - * handle is returned by both imports, and user-space needs to ensure - * &DRM_IOCTL_GEM_CLOSE is performed once only. The same situation can happen - * when a memory object is allocated, then exported and imported again on the - * same DRM file description. The &DRM_IOCTL_MODE_GETFB2 IOCTL is an exception - * and always returns fresh new GEM handles even if an existing GEM handle - * already refers to the same memory object before the IOCTL is performed. 
- */ -#define DRM_IOCTL_GEM_CLOSE DRM_IOW (0x09, struct drm_gem_close) -#define DRM_IOCTL_GEM_FLINK DRM_IOWR(0x0a, struct drm_gem_flink) -#define DRM_IOCTL_GEM_OPEN DRM_IOWR(0x0b, struct drm_gem_open) -#define DRM_IOCTL_GET_CAP DRM_IOWR(0x0c, struct drm_get_cap) -#define DRM_IOCTL_SET_CLIENT_CAP DRM_IOW( 0x0d, struct drm_set_client_cap) - -#define DRM_IOCTL_SET_UNIQUE DRM_IOW( 0x10, struct drm_unique) -#define DRM_IOCTL_AUTH_MAGIC DRM_IOW( 0x11, struct drm_auth) -#define DRM_IOCTL_BLOCK DRM_IOWR(0x12, struct drm_block) -#define DRM_IOCTL_UNBLOCK DRM_IOWR(0x13, struct drm_block) -#define DRM_IOCTL_CONTROL DRM_IOW( 0x14, struct drm_control) -#define DRM_IOCTL_ADD_MAP DRM_IOWR(0x15, struct drm_map) -#define DRM_IOCTL_ADD_BUFS DRM_IOWR(0x16, struct drm_buf_desc) -#define DRM_IOCTL_MARK_BUFS DRM_IOW( 0x17, struct drm_buf_desc) -#define DRM_IOCTL_INFO_BUFS DRM_IOWR(0x18, struct drm_buf_info) -#define DRM_IOCTL_MAP_BUFS DRM_IOWR(0x19, struct drm_buf_map) -#define DRM_IOCTL_FREE_BUFS DRM_IOW( 0x1a, struct drm_buf_free) - -#define DRM_IOCTL_RM_MAP DRM_IOW( 0x1b, struct drm_map) - -#define DRM_IOCTL_SET_SAREA_CTX DRM_IOW( 0x1c, struct drm_ctx_priv_map) -#define DRM_IOCTL_GET_SAREA_CTX DRM_IOWR(0x1d, struct drm_ctx_priv_map) - -#define DRM_IOCTL_SET_MASTER DRM_IO(0x1e) -#define DRM_IOCTL_DROP_MASTER DRM_IO(0x1f) - -#define DRM_IOCTL_ADD_CTX DRM_IOWR(0x20, struct drm_ctx) -#define DRM_IOCTL_RM_CTX DRM_IOWR(0x21, struct drm_ctx) -#define DRM_IOCTL_MOD_CTX DRM_IOW( 0x22, struct drm_ctx) -#define DRM_IOCTL_GET_CTX DRM_IOWR(0x23, struct drm_ctx) -#define DRM_IOCTL_SWITCH_CTX DRM_IOW( 0x24, struct drm_ctx) -#define DRM_IOCTL_NEW_CTX DRM_IOW( 0x25, struct drm_ctx) -#define DRM_IOCTL_RES_CTX DRM_IOWR(0x26, struct drm_ctx_res) -#define DRM_IOCTL_ADD_DRAW DRM_IOWR(0x27, struct drm_draw) -#define DRM_IOCTL_RM_DRAW DRM_IOWR(0x28, struct drm_draw) -#define DRM_IOCTL_DMA DRM_IOWR(0x29, struct drm_dma) -#define DRM_IOCTL_LOCK DRM_IOW( 0x2a, struct drm_lock) -#define DRM_IOCTL_UNLOCK 
DRM_IOW( 0x2b, struct drm_lock) -#define DRM_IOCTL_FINISH DRM_IOW( 0x2c, struct drm_lock) - -/** - * DRM_IOCTL_PRIME_HANDLE_TO_FD - Convert a GEM handle to a DMA-BUF FD. - * - * User-space sets &drm_prime_handle.handle with the GEM handle to export and - * &drm_prime_handle.flags, and gets back a DMA-BUF file descriptor in - * &drm_prime_handle.fd. - * - * The export can fail for any driver-specific reason, e.g. because export is - * not supported for this specific GEM handle (but might be for others). - * - * Support for exporting DMA-BUFs is advertised via &DRM_PRIME_CAP_EXPORT. - */ -#define DRM_IOCTL_PRIME_HANDLE_TO_FD DRM_IOWR(0x2d, struct drm_prime_handle) -/** - * DRM_IOCTL_PRIME_FD_TO_HANDLE - Convert a DMA-BUF FD to a GEM handle. - * - * User-space sets &drm_prime_handle.fd with a DMA-BUF file descriptor to - * import, and gets back a GEM handle in &drm_prime_handle.handle. - * &drm_prime_handle.flags is unused. - * - * If an existing GEM handle refers to the memory object backing the DMA-BUF, - * that GEM handle is returned. Therefore user-space which needs to handle - * arbitrary DMA-BUFs must have a user-space lookup data structure to manually - * reference-count duplicated GEM handles. For more information see - * &DRM_IOCTL_GEM_CLOSE. - * - * The import can fail for any driver-specific reason, e.g. because import is - * only supported for DMA-BUFs allocated on this DRM device. - * - * Support for importing DMA-BUFs is advertised via &DRM_PRIME_CAP_IMPORT. 
- */ -#define DRM_IOCTL_PRIME_FD_TO_HANDLE DRM_IOWR(0x2e, struct drm_prime_handle) - -#define DRM_IOCTL_AGP_ACQUIRE DRM_IO( 0x30) -#define DRM_IOCTL_AGP_RELEASE DRM_IO( 0x31) -#define DRM_IOCTL_AGP_ENABLE DRM_IOW( 0x32, struct drm_agp_mode) -#define DRM_IOCTL_AGP_INFO DRM_IOR( 0x33, struct drm_agp_info) -#define DRM_IOCTL_AGP_ALLOC DRM_IOWR(0x34, struct drm_agp_buffer) -#define DRM_IOCTL_AGP_FREE DRM_IOW( 0x35, struct drm_agp_buffer) -#define DRM_IOCTL_AGP_BIND DRM_IOW( 0x36, struct drm_agp_binding) -#define DRM_IOCTL_AGP_UNBIND DRM_IOW( 0x37, struct drm_agp_binding) - -#define DRM_IOCTL_SG_ALLOC DRM_IOWR(0x38, struct drm_scatter_gather) -#define DRM_IOCTL_SG_FREE DRM_IOW( 0x39, struct drm_scatter_gather) - -#define DRM_IOCTL_WAIT_VBLANK DRM_IOWR(0x3a, union drm_wait_vblank) - -#define DRM_IOCTL_CRTC_GET_SEQUENCE DRM_IOWR(0x3b, struct drm_crtc_get_sequence) -#define DRM_IOCTL_CRTC_QUEUE_SEQUENCE DRM_IOWR(0x3c, struct drm_crtc_queue_sequence) - -#define DRM_IOCTL_UPDATE_DRAW DRM_IOW(0x3f, struct drm_update_draw) - -#define DRM_IOCTL_MODE_GETRESOURCES DRM_IOWR(0xA0, struct drm_mode_card_res) -#define DRM_IOCTL_MODE_GETCRTC DRM_IOWR(0xA1, struct drm_mode_crtc) -#define DRM_IOCTL_MODE_SETCRTC DRM_IOWR(0xA2, struct drm_mode_crtc) -#define DRM_IOCTL_MODE_CURSOR DRM_IOWR(0xA3, struct drm_mode_cursor) -#define DRM_IOCTL_MODE_GETGAMMA DRM_IOWR(0xA4, struct drm_mode_crtc_lut) -#define DRM_IOCTL_MODE_SETGAMMA DRM_IOWR(0xA5, struct drm_mode_crtc_lut) -#define DRM_IOCTL_MODE_GETENCODER DRM_IOWR(0xA6, struct drm_mode_get_encoder) -#define DRM_IOCTL_MODE_GETCONNECTOR DRM_IOWR(0xA7, struct drm_mode_get_connector) -#define DRM_IOCTL_MODE_ATTACHMODE DRM_IOWR(0xA8, struct drm_mode_mode_cmd) /* deprecated (never worked) */ -#define DRM_IOCTL_MODE_DETACHMODE DRM_IOWR(0xA9, struct drm_mode_mode_cmd) /* deprecated (never worked) */ - -#define DRM_IOCTL_MODE_GETPROPERTY DRM_IOWR(0xAA, struct drm_mode_get_property) -#define DRM_IOCTL_MODE_SETPROPERTY DRM_IOWR(0xAB, struct 
drm_mode_connector_set_property) -#define DRM_IOCTL_MODE_GETPROPBLOB DRM_IOWR(0xAC, struct drm_mode_get_blob) -#define DRM_IOCTL_MODE_GETFB DRM_IOWR(0xAD, struct drm_mode_fb_cmd) -#define DRM_IOCTL_MODE_ADDFB DRM_IOWR(0xAE, struct drm_mode_fb_cmd) -/** - * DRM_IOCTL_MODE_RMFB - Remove a framebuffer. - * - * This removes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL - * argument is a framebuffer object ID. - * - * Warning: removing a framebuffer currently in-use on an enabled plane will - * disable that plane. The CRTC the plane is linked to may also be disabled - * (depending on driver capabilities). - */ -#define DRM_IOCTL_MODE_RMFB DRM_IOWR(0xAF, unsigned int) -#define DRM_IOCTL_MODE_PAGE_FLIP DRM_IOWR(0xB0, struct drm_mode_crtc_page_flip) -#define DRM_IOCTL_MODE_DIRTYFB DRM_IOWR(0xB1, struct drm_mode_fb_dirty_cmd) - -/** - * DRM_IOCTL_MODE_CREATE_DUMB - Create a new dumb buffer object. - * - * KMS dumb buffers provide a very primitive way to allocate a buffer object - * suitable for scanout and map it for software rendering. KMS dumb buffers are - * not suitable for hardware-accelerated rendering nor video decoding. KMS dumb - * buffers are not suitable to be displayed on any other device than the KMS - * device where they were allocated from. Also see - * :ref:`kms_dumb_buffer_objects`. - * - * The IOCTL argument is a struct drm_mode_create_dumb. - * - * User-space is expected to create a KMS dumb buffer via this IOCTL, then add - * it as a KMS framebuffer via &DRM_IOCTL_MODE_ADDFB and map it via - * &DRM_IOCTL_MODE_MAP_DUMB. - * - * &DRM_CAP_DUMB_BUFFER indicates whether this IOCTL is supported. - * &DRM_CAP_DUMB_PREFERRED_DEPTH and &DRM_CAP_DUMB_PREFER_SHADOW indicate - * driver preferences for dumb buffers. 
- */ -#define DRM_IOCTL_MODE_CREATE_DUMB DRM_IOWR(0xB2, struct drm_mode_create_dumb) -#define DRM_IOCTL_MODE_MAP_DUMB DRM_IOWR(0xB3, struct drm_mode_map_dumb) -#define DRM_IOCTL_MODE_DESTROY_DUMB DRM_IOWR(0xB4, struct drm_mode_destroy_dumb) -#define DRM_IOCTL_MODE_GETPLANERESOURCES DRM_IOWR(0xB5, struct drm_mode_get_plane_res) -#define DRM_IOCTL_MODE_GETPLANE DRM_IOWR(0xB6, struct drm_mode_get_plane) -#define DRM_IOCTL_MODE_SETPLANE DRM_IOWR(0xB7, struct drm_mode_set_plane) -#define DRM_IOCTL_MODE_ADDFB2 DRM_IOWR(0xB8, struct drm_mode_fb_cmd2) -#define DRM_IOCTL_MODE_OBJ_GETPROPERTIES DRM_IOWR(0xB9, struct drm_mode_obj_get_properties) -#define DRM_IOCTL_MODE_OBJ_SETPROPERTY DRM_IOWR(0xBA, struct drm_mode_obj_set_property) -#define DRM_IOCTL_MODE_CURSOR2 DRM_IOWR(0xBB, struct drm_mode_cursor2) -#define DRM_IOCTL_MODE_ATOMIC DRM_IOWR(0xBC, struct drm_mode_atomic) -#define DRM_IOCTL_MODE_CREATEPROPBLOB DRM_IOWR(0xBD, struct drm_mode_create_blob) -#define DRM_IOCTL_MODE_DESTROYPROPBLOB DRM_IOWR(0xBE, struct drm_mode_destroy_blob) - -#define DRM_IOCTL_SYNCOBJ_CREATE DRM_IOWR(0xBF, struct drm_syncobj_create) -#define DRM_IOCTL_SYNCOBJ_DESTROY DRM_IOWR(0xC0, struct drm_syncobj_destroy) -#define DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD DRM_IOWR(0xC1, struct drm_syncobj_handle) -#define DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE DRM_IOWR(0xC2, struct drm_syncobj_handle) -#define DRM_IOCTL_SYNCOBJ_WAIT DRM_IOWR(0xC3, struct drm_syncobj_wait) -#define DRM_IOCTL_SYNCOBJ_RESET DRM_IOWR(0xC4, struct drm_syncobj_array) -#define DRM_IOCTL_SYNCOBJ_SIGNAL DRM_IOWR(0xC5, struct drm_syncobj_array) - -#define DRM_IOCTL_MODE_CREATE_LEASE DRM_IOWR(0xC6, struct drm_mode_create_lease) -#define DRM_IOCTL_MODE_LIST_LESSEES DRM_IOWR(0xC7, struct drm_mode_list_lessees) -#define DRM_IOCTL_MODE_GET_LEASE DRM_IOWR(0xC8, struct drm_mode_get_lease) -#define DRM_IOCTL_MODE_REVOKE_LEASE DRM_IOWR(0xC9, struct drm_mode_revoke_lease) - -#define DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT DRM_IOWR(0xCA, struct drm_syncobj_timeline_wait) 
-#define DRM_IOCTL_SYNCOBJ_QUERY DRM_IOWR(0xCB, struct drm_syncobj_timeline_array) -#define DRM_IOCTL_SYNCOBJ_TRANSFER DRM_IOWR(0xCC, struct drm_syncobj_transfer) -#define DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL DRM_IOWR(0xCD, struct drm_syncobj_timeline_array) - -/** - * DRM_IOCTL_MODE_GETFB2 - Get framebuffer metadata. - * - * This queries metadata about a framebuffer. User-space fills - * &drm_mode_fb_cmd2.fb_id as the input, and the kernels fills the rest of the - * struct as the output. - * - * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles - * will be filled with GEM buffer handles. Fresh new GEM handles are always - * returned, even if another GEM handle referring to the same memory object - * already exists on the DRM file description. The caller is responsible for - * removing the new handles, e.g. via the &DRM_IOCTL_GEM_CLOSE IOCTL. The same - * new handle will be returned for multiple planes in case they use the same - * memory object. Planes are valid until one has a zero handle -- this can be - * used to compute the number of planes. - * - * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid - * until one has a zero &drm_mode_fb_cmd2.pitches. - * - * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set - * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the - * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier. - * - * To obtain DMA-BUF FDs for each plane without leaking GEM handles, user-space - * can export each handle via &DRM_IOCTL_PRIME_HANDLE_TO_FD, then immediately - * close each unique handle via &DRM_IOCTL_GEM_CLOSE, making sure to not - * double-close handles which are specified multiple times in the array. - */ -#define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2) - -#define DRM_IOCTL_SYNCOBJ_EVENTFD DRM_IOWR(0xCF, struct drm_syncobj_eventfd) - -/** - * DRM_IOCTL_MODE_CLOSEFB - Close a framebuffer. 
- * - * This closes a framebuffer previously added via ADDFB/ADDFB2. The IOCTL - * argument is a framebuffer object ID. - * - * This IOCTL is similar to &DRM_IOCTL_MODE_RMFB, except it doesn't disable - * planes and CRTCs. As long as the framebuffer is used by a plane, it's kept - * alive. When the plane no longer uses the framebuffer (because the - * framebuffer is replaced with another one, or the plane is disabled), the - * framebuffer is cleaned up. - * - * This is useful to implement flicker-free transitions between two processes. - * - * Depending on the threat model, user-space may want to ensure that the - * framebuffer doesn't expose any sensitive user information: closed - * framebuffers attached to a plane can be read back by the next DRM master. - */ -#define DRM_IOCTL_MODE_CLOSEFB DRM_IOWR(0xD0, struct drm_mode_closefb) - -/** - * DRM_IOCTL_SET_CLIENT_NAME - Attach a name to a drm_file - * - * Having a name allows for easier tracking and debugging. - * The length of the name (without null ending char) must be - * <= DRM_CLIENT_NAME_MAX_LEN. - * The call will fail if the name contains whitespaces or non-printable chars. - */ -#define DRM_IOCTL_SET_CLIENT_NAME DRM_IOWR(0xD1, struct drm_set_client_name) - -/** - * DRM_IOCTL_GEM_CHANGE_HANDLE - Move an object to a different handle - * - * Some applications (notably CRIU) need objects to have specific gem handles. - * This ioctl changes the object at one gem handle to use a new gem handle. - */ -#define DRM_IOCTL_GEM_CHANGE_HANDLE DRM_IOWR(0xD2, struct drm_gem_change_handle) - -/* - * Device specific ioctls should only be in their respective headers - * The device specific ioctl range is from 0x40 to 0x9f. - * Generic IOCTLS restart at 0xA0. - * - * \sa drmCommandNone(), drmCommandRead(), drmCommandWrite(), and - * drmCommandReadWrite(). - */ -#define DRM_COMMAND_BASE 0x40 -#define DRM_COMMAND_END 0xA0 - -/** - * struct drm_event - Header for DRM events - * @type: event type. 
- * @length: total number of payload bytes (including header). - * - * This struct is a header for events written back to user-space on the DRM FD. - * A read on the DRM FD will always only return complete events: e.g. if the - * read buffer is 100 bytes large and there are two 64 byte events pending, - * only one will be returned. - * - * Event types 0 - 0x7fffffff are generic DRM events, 0x80000000 and - * up are chipset specific. Generic DRM events include &DRM_EVENT_VBLANK, - * &DRM_EVENT_FLIP_COMPLETE and &DRM_EVENT_CRTC_SEQUENCE. - */ -struct drm_event { - __u32 type; - __u32 length; -}; - -/** - * DRM_EVENT_VBLANK - vertical blanking event - * - * This event is sent in response to &DRM_IOCTL_WAIT_VBLANK with the - * &_DRM_VBLANK_EVENT flag set. - * - * The event payload is a struct drm_event_vblank. - */ -#define DRM_EVENT_VBLANK 0x01 -/** - * DRM_EVENT_FLIP_COMPLETE - page-flip completion event - * - * This event is sent in response to an atomic commit or legacy page-flip with - * the &DRM_MODE_PAGE_FLIP_EVENT flag set. - * - * The event payload is a struct drm_event_vblank. - */ -#define DRM_EVENT_FLIP_COMPLETE 0x02 -/** - * DRM_EVENT_CRTC_SEQUENCE - CRTC sequence event - * - * This event is sent in response to &DRM_IOCTL_CRTC_QUEUE_SEQUENCE. - * - * The event payload is a struct drm_event_crtc_sequence. - */ -#define DRM_EVENT_CRTC_SEQUENCE 0x03 - -struct drm_event_vblank { - struct drm_event base; - __u64 user_data; - __u32 tv_sec; - __u32 tv_usec; - __u32 sequence; - __u32 crtc_id; /* 0 on older kernels that do not support this */ -}; - -/* Event delivered at sequence. 
Time stamp marks when the first pixel - * of the refresh cycle leaves the display engine for the display - */ -struct drm_event_crtc_sequence { - struct drm_event base; - __u64 user_data; - __s64 time_ns; - __u64 sequence; -}; - -/* typedef area */ -#ifndef __KERNEL__ -typedef struct drm_clip_rect drm_clip_rect_t; -typedef struct drm_drawable_info drm_drawable_info_t; -typedef struct drm_tex_region drm_tex_region_t; -typedef struct drm_hw_lock drm_hw_lock_t; -typedef struct drm_version drm_version_t; -typedef struct drm_unique drm_unique_t; -typedef struct drm_list drm_list_t; -typedef struct drm_block drm_block_t; -typedef struct drm_control drm_control_t; -typedef enum drm_map_type drm_map_type_t; -typedef enum drm_map_flags drm_map_flags_t; -typedef struct drm_ctx_priv_map drm_ctx_priv_map_t; -typedef struct drm_map drm_map_t; -typedef struct drm_client drm_client_t; -typedef enum drm_stat_type drm_stat_type_t; -typedef struct drm_stats drm_stats_t; -typedef enum drm_lock_flags drm_lock_flags_t; -typedef struct drm_lock drm_lock_t; -typedef enum drm_dma_flags drm_dma_flags_t; -typedef struct drm_buf_desc drm_buf_desc_t; -typedef struct drm_buf_info drm_buf_info_t; -typedef struct drm_buf_free drm_buf_free_t; -typedef struct drm_buf_pub drm_buf_pub_t; -typedef struct drm_buf_map drm_buf_map_t; -typedef struct drm_dma drm_dma_t; -typedef union drm_wait_vblank drm_wait_vblank_t; -typedef struct drm_agp_mode drm_agp_mode_t; -typedef enum drm_ctx_flags drm_ctx_flags_t; -typedef struct drm_ctx drm_ctx_t; -typedef struct drm_ctx_res drm_ctx_res_t; -typedef struct drm_draw drm_draw_t; -typedef struct drm_update_draw drm_update_draw_t; -typedef struct drm_auth drm_auth_t; -typedef struct drm_irq_busid drm_irq_busid_t; -typedef enum drm_vblank_seq_type drm_vblank_seq_type_t; - -typedef struct drm_agp_buffer drm_agp_buffer_t; -typedef struct drm_agp_binding drm_agp_binding_t; -typedef struct drm_agp_info drm_agp_info_t; -typedef struct drm_scatter_gather 
drm_scatter_gather_t; -typedef struct drm_set_version drm_set_version_t; -#endif - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/plugins/amdgpu/drm_mode.h b/plugins/amdgpu/drm_mode.h deleted file mode 100644 index c082810c0..000000000 --- a/plugins/amdgpu/drm_mode.h +++ /dev/null @@ -1,1362 +0,0 @@ -/* - * Copyright (c) 2007 Dave Airlie - * Copyright (c) 2007 Jakob Bornecrantz - * Copyright (c) 2008 Red Hat Inc. - * Copyright (c) 2007-2008 Tungsten Graphics, Inc., Cedar Park, TX., USA - * Copyright (c) 2007-2008 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef _DRM_MODE_H -#define _DRM_MODE_H - -#include "drm.h" - -#if defined(__cplusplus) -extern "C" { -#endif - -/** - * DOC: overview - * - * DRM exposes many UAPI and structure definitions to have a consistent - * and standardized interface with users. 
- * Userspace can refer to these structure definitions and UAPI formats - * to communicate to drivers. - */ - -#define DRM_CONNECTOR_NAME_LEN 32 -#define DRM_DISPLAY_MODE_LEN 32 -#define DRM_PROP_NAME_LEN 32 - -#define DRM_MODE_TYPE_BUILTIN (1<<0) /* deprecated */ -#define DRM_MODE_TYPE_CLOCK_C ((1<<1) | DRM_MODE_TYPE_BUILTIN) /* deprecated */ -#define DRM_MODE_TYPE_CRTC_C ((1<<2) | DRM_MODE_TYPE_BUILTIN) /* deprecated */ -#define DRM_MODE_TYPE_PREFERRED (1<<3) -#define DRM_MODE_TYPE_DEFAULT (1<<4) /* deprecated */ -#define DRM_MODE_TYPE_USERDEF (1<<5) -#define DRM_MODE_TYPE_DRIVER (1<<6) - -#define DRM_MODE_TYPE_ALL (DRM_MODE_TYPE_PREFERRED | \ - DRM_MODE_TYPE_USERDEF | \ - DRM_MODE_TYPE_DRIVER) - -/* Video mode flags */ -/* bit compatible with the xrandr RR_ definitions (bits 0-13) - * - * ABI warning: Existing userspace really expects - * the mode flags to match the xrandr definitions. Any - * changes that don't match the xrandr definitions will - * likely need a new client cap or some other mechanism - * to avoid breaking existing userspace. This includes - * allocating new flags in the previously unused bits! - */ -#define DRM_MODE_FLAG_PHSYNC (1<<0) -#define DRM_MODE_FLAG_NHSYNC (1<<1) -#define DRM_MODE_FLAG_PVSYNC (1<<2) -#define DRM_MODE_FLAG_NVSYNC (1<<3) -#define DRM_MODE_FLAG_INTERLACE (1<<4) -#define DRM_MODE_FLAG_DBLSCAN (1<<5) -#define DRM_MODE_FLAG_CSYNC (1<<6) -#define DRM_MODE_FLAG_PCSYNC (1<<7) -#define DRM_MODE_FLAG_NCSYNC (1<<8) -#define DRM_MODE_FLAG_HSKEW (1<<9) /* hskew provided */ -#define DRM_MODE_FLAG_BCAST (1<<10) /* deprecated */ -#define DRM_MODE_FLAG_PIXMUX (1<<11) /* deprecated */ -#define DRM_MODE_FLAG_DBLCLK (1<<12) -#define DRM_MODE_FLAG_CLKDIV2 (1<<13) - /* - * When adding a new stereo mode don't forget to adjust DRM_MODE_FLAGS_3D_MAX - * (define not exposed to user space). 
- */ -#define DRM_MODE_FLAG_3D_MASK (0x1f<<14) -#define DRM_MODE_FLAG_3D_NONE (0<<14) -#define DRM_MODE_FLAG_3D_FRAME_PACKING (1<<14) -#define DRM_MODE_FLAG_3D_FIELD_ALTERNATIVE (2<<14) -#define DRM_MODE_FLAG_3D_LINE_ALTERNATIVE (3<<14) -#define DRM_MODE_FLAG_3D_SIDE_BY_SIDE_FULL (4<<14) -#define DRM_MODE_FLAG_3D_L_DEPTH (5<<14) -#define DRM_MODE_FLAG_3D_L_DEPTH_GFX_GFX_DEPTH (6<<14) -#define DRM_MODE_FLAG_3D_TOP_AND_BOTTOM (7<<14) -#define DRM_MODE_FLAG_3D_SIDE_BY_SIDE_HALF (8<<14) - -/* Picture aspect ratio options */ -#define DRM_MODE_PICTURE_ASPECT_NONE 0 -#define DRM_MODE_PICTURE_ASPECT_4_3 1 -#define DRM_MODE_PICTURE_ASPECT_16_9 2 -#define DRM_MODE_PICTURE_ASPECT_64_27 3 -#define DRM_MODE_PICTURE_ASPECT_256_135 4 - -/* Content type options */ -#define DRM_MODE_CONTENT_TYPE_NO_DATA 0 -#define DRM_MODE_CONTENT_TYPE_GRAPHICS 1 -#define DRM_MODE_CONTENT_TYPE_PHOTO 2 -#define DRM_MODE_CONTENT_TYPE_CINEMA 3 -#define DRM_MODE_CONTENT_TYPE_GAME 4 - -/* Aspect ratio flag bitmask (4 bits 22:19) */ -#define DRM_MODE_FLAG_PIC_AR_MASK (0x0F<<19) -#define DRM_MODE_FLAG_PIC_AR_NONE \ - (DRM_MODE_PICTURE_ASPECT_NONE<<19) -#define DRM_MODE_FLAG_PIC_AR_4_3 \ - (DRM_MODE_PICTURE_ASPECT_4_3<<19) -#define DRM_MODE_FLAG_PIC_AR_16_9 \ - (DRM_MODE_PICTURE_ASPECT_16_9<<19) -#define DRM_MODE_FLAG_PIC_AR_64_27 \ - (DRM_MODE_PICTURE_ASPECT_64_27<<19) -#define DRM_MODE_FLAG_PIC_AR_256_135 \ - (DRM_MODE_PICTURE_ASPECT_256_135<<19) - -#define DRM_MODE_FLAG_ALL (DRM_MODE_FLAG_PHSYNC | \ - DRM_MODE_FLAG_NHSYNC | \ - DRM_MODE_FLAG_PVSYNC | \ - DRM_MODE_FLAG_NVSYNC | \ - DRM_MODE_FLAG_INTERLACE | \ - DRM_MODE_FLAG_DBLSCAN | \ - DRM_MODE_FLAG_CSYNC | \ - DRM_MODE_FLAG_PCSYNC | \ - DRM_MODE_FLAG_NCSYNC | \ - DRM_MODE_FLAG_HSKEW | \ - DRM_MODE_FLAG_DBLCLK | \ - DRM_MODE_FLAG_CLKDIV2 | \ - DRM_MODE_FLAG_3D_MASK) - -/* DPMS flags */ -/* bit compatible with the xorg definitions. 
*/ -#define DRM_MODE_DPMS_ON 0 -#define DRM_MODE_DPMS_STANDBY 1 -#define DRM_MODE_DPMS_SUSPEND 2 -#define DRM_MODE_DPMS_OFF 3 - -/* Scaling mode options */ -#define DRM_MODE_SCALE_NONE 0 /* Unmodified timing (display or - software can still scale) */ -#define DRM_MODE_SCALE_FULLSCREEN 1 /* Full screen, ignore aspect */ -#define DRM_MODE_SCALE_CENTER 2 /* Centered, no scaling */ -#define DRM_MODE_SCALE_ASPECT 3 /* Full screen, preserve aspect */ - -/* Dithering mode options */ -#define DRM_MODE_DITHERING_OFF 0 -#define DRM_MODE_DITHERING_ON 1 -#define DRM_MODE_DITHERING_AUTO 2 - -/* Dirty info options */ -#define DRM_MODE_DIRTY_OFF 0 -#define DRM_MODE_DIRTY_ON 1 -#define DRM_MODE_DIRTY_ANNOTATE 2 - -/* Link Status options */ -#define DRM_MODE_LINK_STATUS_GOOD 0 -#define DRM_MODE_LINK_STATUS_BAD 1 - -/* - * DRM_MODE_ROTATE_ - * - * Signals that a drm plane is been rotated degrees in counter - * clockwise direction. - * - * This define is provided as a convenience, looking up the property id - * using the name->prop id lookup is the preferred method. - */ -#define DRM_MODE_ROTATE_0 (1<<0) -#define DRM_MODE_ROTATE_90 (1<<1) -#define DRM_MODE_ROTATE_180 (1<<2) -#define DRM_MODE_ROTATE_270 (1<<3) - -/* - * DRM_MODE_ROTATE_MASK - * - * Bitmask used to look for drm plane rotations. - */ -#define DRM_MODE_ROTATE_MASK (\ - DRM_MODE_ROTATE_0 | \ - DRM_MODE_ROTATE_90 | \ - DRM_MODE_ROTATE_180 | \ - DRM_MODE_ROTATE_270) - -/* - * DRM_MODE_REFLECT_ - * - * Signals that the contents of a drm plane is reflected along the axis, - * in the same way as mirroring. - * See kerneldoc chapter "Plane Composition Properties" for more details. - * - * This define is provided as a convenience, looking up the property id - * using the name->prop id lookup is the preferred method. - */ -#define DRM_MODE_REFLECT_X (1<<4) -#define DRM_MODE_REFLECT_Y (1<<5) - -/* - * DRM_MODE_REFLECT_MASK - * - * Bitmask used to look for drm plane reflections. 
- */ -#define DRM_MODE_REFLECT_MASK (\ - DRM_MODE_REFLECT_X | \ - DRM_MODE_REFLECT_Y) - -/* Content Protection Flags */ -#define DRM_MODE_CONTENT_PROTECTION_UNDESIRED 0 -#define DRM_MODE_CONTENT_PROTECTION_DESIRED 1 -#define DRM_MODE_CONTENT_PROTECTION_ENABLED 2 - -/** - * struct drm_mode_modeinfo - Display mode information. - * @clock: pixel clock in kHz - * @hdisplay: horizontal display size - * @hsync_start: horizontal sync start - * @hsync_end: horizontal sync end - * @htotal: horizontal total size - * @hskew: horizontal skew - * @vdisplay: vertical display size - * @vsync_start: vertical sync start - * @vsync_end: vertical sync end - * @vtotal: vertical total size - * @vscan: vertical scan - * @vrefresh: approximate vertical refresh rate in Hz - * @flags: bitmask of misc. flags, see DRM_MODE_FLAG_* defines - * @type: bitmask of type flags, see DRM_MODE_TYPE_* defines - * @name: string describing the mode resolution - * - * This is the user-space API display mode information structure. For the - * kernel version see struct drm_display_mode. 
- */ -struct drm_mode_modeinfo { - __u32 clock; - __u16 hdisplay; - __u16 hsync_start; - __u16 hsync_end; - __u16 htotal; - __u16 hskew; - __u16 vdisplay; - __u16 vsync_start; - __u16 vsync_end; - __u16 vtotal; - __u16 vscan; - - __u32 vrefresh; - - __u32 flags; - __u32 type; - char name[DRM_DISPLAY_MODE_LEN]; -}; - -struct drm_mode_card_res { - __u64 fb_id_ptr; - __u64 crtc_id_ptr; - __u64 connector_id_ptr; - __u64 encoder_id_ptr; - __u32 count_fbs; - __u32 count_crtcs; - __u32 count_connectors; - __u32 count_encoders; - __u32 min_width; - __u32 max_width; - __u32 min_height; - __u32 max_height; -}; - -struct drm_mode_crtc { - __u64 set_connectors_ptr; - __u32 count_connectors; - - __u32 crtc_id; /**< Id */ - __u32 fb_id; /**< Id of framebuffer */ - - __u32 x; /**< x Position on the framebuffer */ - __u32 y; /**< y Position on the framebuffer */ - - __u32 gamma_size; - __u32 mode_valid; - struct drm_mode_modeinfo mode; -}; - -#define DRM_MODE_PRESENT_TOP_FIELD (1<<0) -#define DRM_MODE_PRESENT_BOTTOM_FIELD (1<<1) - -/* Planes blend with or override other bits on the CRTC */ -struct drm_mode_set_plane { - __u32 plane_id; - __u32 crtc_id; - __u32 fb_id; /* fb object contains surface format type */ - __u32 flags; /* see above flags */ - - /* Signed dest location allows it to be partially off screen */ - __s32 crtc_x; - __s32 crtc_y; - __u32 crtc_w; - __u32 crtc_h; - - /* Source values are 16.16 fixed point */ - __u32 src_x; - __u32 src_y; - __u32 src_h; - __u32 src_w; -}; - -/** - * struct drm_mode_get_plane - Get plane metadata. - * - * Userspace can perform a GETPLANE ioctl to retrieve information about a - * plane. - * - * To retrieve the number of formats supported, set @count_format_types to zero - * and call the ioctl. @count_format_types will be updated with the value. - * - * To retrieve these formats, allocate an array with the memory needed to store - * @count_format_types formats. 
Point @format_type_ptr to this array and call - * the ioctl again (with @count_format_types still set to the value returned in - * the first ioctl call). - */ -struct drm_mode_get_plane { - /** - * @plane_id: Object ID of the plane whose information should be - * retrieved. Set by caller. - */ - __u32 plane_id; - - /** @crtc_id: Object ID of the current CRTC. */ - __u32 crtc_id; - /** @fb_id: Object ID of the current fb. */ - __u32 fb_id; - - /** - * @possible_crtcs: Bitmask of CRTC's compatible with the plane. CRTC's - * are created and they receive an index, which corresponds to their - * position in the bitmask. Bit N corresponds to - * :ref:`CRTC index` N. - */ - __u32 possible_crtcs; - /** @gamma_size: Never used. */ - __u32 gamma_size; - - /** @count_format_types: Number of formats. */ - __u32 count_format_types; - /** - * @format_type_ptr: Pointer to ``__u32`` array of formats that are - * supported by the plane. These formats do not require modifiers. - */ - __u64 format_type_ptr; -}; - -struct drm_mode_get_plane_res { - __u64 plane_id_ptr; - __u32 count_planes; -}; - -#define DRM_MODE_ENCODER_NONE 0 -#define DRM_MODE_ENCODER_DAC 1 -#define DRM_MODE_ENCODER_TMDS 2 -#define DRM_MODE_ENCODER_LVDS 3 -#define DRM_MODE_ENCODER_TVDAC 4 -#define DRM_MODE_ENCODER_VIRTUAL 5 -#define DRM_MODE_ENCODER_DSI 6 -#define DRM_MODE_ENCODER_DPMST 7 -#define DRM_MODE_ENCODER_DPI 8 - -struct drm_mode_get_encoder { - __u32 encoder_id; - __u32 encoder_type; - - __u32 crtc_id; /**< Id of crtc */ - - __u32 possible_crtcs; - __u32 possible_clones; -}; - -/* This is for connectors with multiple signal types. */ -/* Try to match DRM_MODE_CONNECTOR_X as closely as possible. 
*/ -enum drm_mode_subconnector { - DRM_MODE_SUBCONNECTOR_Automatic = 0, /* DVI-I, TV */ - DRM_MODE_SUBCONNECTOR_Unknown = 0, /* DVI-I, TV, DP */ - DRM_MODE_SUBCONNECTOR_VGA = 1, /* DP */ - DRM_MODE_SUBCONNECTOR_DVID = 3, /* DVI-I DP */ - DRM_MODE_SUBCONNECTOR_DVIA = 4, /* DVI-I */ - DRM_MODE_SUBCONNECTOR_Composite = 5, /* TV */ - DRM_MODE_SUBCONNECTOR_SVIDEO = 6, /* TV */ - DRM_MODE_SUBCONNECTOR_Component = 8, /* TV */ - DRM_MODE_SUBCONNECTOR_SCART = 9, /* TV */ - DRM_MODE_SUBCONNECTOR_DisplayPort = 10, /* DP */ - DRM_MODE_SUBCONNECTOR_HDMIA = 11, /* DP */ - DRM_MODE_SUBCONNECTOR_Native = 15, /* DP */ - DRM_MODE_SUBCONNECTOR_Wireless = 18, /* DP */ -}; - -#define DRM_MODE_CONNECTOR_Unknown 0 -#define DRM_MODE_CONNECTOR_VGA 1 -#define DRM_MODE_CONNECTOR_DVII 2 -#define DRM_MODE_CONNECTOR_DVID 3 -#define DRM_MODE_CONNECTOR_DVIA 4 -#define DRM_MODE_CONNECTOR_Composite 5 -#define DRM_MODE_CONNECTOR_SVIDEO 6 -#define DRM_MODE_CONNECTOR_LVDS 7 -#define DRM_MODE_CONNECTOR_Component 8 -#define DRM_MODE_CONNECTOR_9PinDIN 9 -#define DRM_MODE_CONNECTOR_DisplayPort 10 -#define DRM_MODE_CONNECTOR_HDMIA 11 -#define DRM_MODE_CONNECTOR_HDMIB 12 -#define DRM_MODE_CONNECTOR_TV 13 -#define DRM_MODE_CONNECTOR_eDP 14 -#define DRM_MODE_CONNECTOR_VIRTUAL 15 -#define DRM_MODE_CONNECTOR_DSI 16 -#define DRM_MODE_CONNECTOR_DPI 17 -#define DRM_MODE_CONNECTOR_WRITEBACK 18 -#define DRM_MODE_CONNECTOR_SPI 19 -#define DRM_MODE_CONNECTOR_USB 20 - -/** - * struct drm_mode_get_connector - Get connector metadata. - * - * User-space can perform a GETCONNECTOR ioctl to retrieve information about a - * connector. User-space is expected to retrieve encoders, modes and properties - * by performing this ioctl at least twice: the first time to retrieve the - * number of elements, the second time to retrieve the elements themselves. 
- * - * To retrieve the number of elements, set @count_props and @count_encoders to - * zero, set @count_modes to 1, and set @modes_ptr to a temporary struct - * drm_mode_modeinfo element. - * - * To retrieve the elements, allocate arrays for @encoders_ptr, @modes_ptr, - * @props_ptr and @prop_values_ptr, then set @count_modes, @count_props and - * @count_encoders to their capacity. - * - * Performing the ioctl only twice may be racy: the number of elements may have - * changed with a hotplug event in-between the two ioctls. User-space is - * expected to retry the last ioctl until the number of elements stabilizes. - * The kernel won't fill any array which doesn't have the expected length. - * - * **Force-probing a connector** - * - * If the @count_modes field is set to zero and the DRM client is the current - * DRM master, the kernel will perform a forced probe on the connector to - * refresh the connector status, modes and EDID. A forced-probe can be slow, - * might cause flickering and the ioctl will block. - * - * User-space needs to force-probe connectors to ensure their metadata is - * up-to-date at startup and after receiving a hot-plug event. User-space - * may perform a forced-probe when the user explicitly requests it. User-space - * shouldn't perform a forced-probe in other situations. - */ -struct drm_mode_get_connector { - /** @encoders_ptr: Pointer to ``__u32`` array of object IDs. */ - __u64 encoders_ptr; - /** @modes_ptr: Pointer to struct drm_mode_modeinfo array. */ - __u64 modes_ptr; - /** @props_ptr: Pointer to ``__u32`` array of property IDs. */ - __u64 props_ptr; - /** @prop_values_ptr: Pointer to ``__u64`` array of property values. */ - __u64 prop_values_ptr; - - /** @count_modes: Number of modes. */ - __u32 count_modes; - /** @count_props: Number of properties. */ - __u32 count_props; - /** @count_encoders: Number of encoders. */ - __u32 count_encoders; - - /** @encoder_id: Object ID of the current encoder. 
*/ - __u32 encoder_id; - /** @connector_id: Object ID of the connector. */ - __u32 connector_id; - /** - * @connector_type: Type of the connector. - * - * See DRM_MODE_CONNECTOR_* defines. - */ - __u32 connector_type; - /** - * @connector_type_id: Type-specific connector number. - * - * This is not an object ID. This is a per-type connector number. Each - * (type, type_id) combination is unique across all connectors of a DRM - * device. - * - * The (type, type_id) combination is not a stable identifier: the - * type_id can change depending on the driver probe order. - */ - __u32 connector_type_id; - - /** - * @connection: Status of the connector. - * - * See enum drm_connector_status. - */ - __u32 connection; - /** @mm_width: Width of the connected sink in millimeters. */ - __u32 mm_width; - /** @mm_height: Height of the connected sink in millimeters. */ - __u32 mm_height; - /** - * @subpixel: Subpixel order of the connected sink. - * - * See enum subpixel_order. - */ - __u32 subpixel; - - /** @pad: Padding, must be zero. */ - __u32 pad; -}; - -#define DRM_MODE_PROP_PENDING (1<<0) /* deprecated, do not use */ -#define DRM_MODE_PROP_RANGE (1<<1) -#define DRM_MODE_PROP_IMMUTABLE (1<<2) -#define DRM_MODE_PROP_ENUM (1<<3) /* enumerated type with text strings */ -#define DRM_MODE_PROP_BLOB (1<<4) -#define DRM_MODE_PROP_BITMASK (1<<5) /* bitmask of enumerated types */ - -/* non-extended types: legacy bitmask, one bit per type: */ -#define DRM_MODE_PROP_LEGACY_TYPE ( \ - DRM_MODE_PROP_RANGE | \ - DRM_MODE_PROP_ENUM | \ - DRM_MODE_PROP_BLOB | \ - DRM_MODE_PROP_BITMASK) - -/* extended-types: rather than continue to consume a bit per type, - * grab a chunk of the bits to use as integer type id. 
- */ -#define DRM_MODE_PROP_EXTENDED_TYPE 0x0000ffc0 -#define DRM_MODE_PROP_TYPE(n) ((n) << 6) -#define DRM_MODE_PROP_OBJECT DRM_MODE_PROP_TYPE(1) -#define DRM_MODE_PROP_SIGNED_RANGE DRM_MODE_PROP_TYPE(2) - -/* the PROP_ATOMIC flag is used to hide properties from userspace that - * is not aware of atomic properties. This is mostly to work around - * older userspace (DDX drivers) that read/write each prop they find, - * without being aware that this could be triggering a lengthy modeset. - */ -#define DRM_MODE_PROP_ATOMIC 0x80000000 - -/** - * struct drm_mode_property_enum - Description for an enum/bitfield entry. - * @value: numeric value for this enum entry. - * @name: symbolic name for this enum entry. - * - * See struct drm_property_enum for details. - */ -struct drm_mode_property_enum { - __u64 value; - char name[DRM_PROP_NAME_LEN]; -}; - -/** - * struct drm_mode_get_property - Get property metadata. - * - * User-space can perform a GETPROPERTY ioctl to retrieve information about a - * property. The same property may be attached to multiple objects, see - * "Modeset Base Object Abstraction". - * - * The meaning of the @values_ptr field changes depending on the property type. - * See &drm_property.flags for more details. - * - * The @enum_blob_ptr and @count_enum_blobs fields are only meaningful when the - * property has the type &DRM_MODE_PROP_ENUM or &DRM_MODE_PROP_BITMASK. For - * backwards compatibility, the kernel will always set @count_enum_blobs to - * zero when the property has the type &DRM_MODE_PROP_BLOB. User-space must - * ignore these two fields if the property has a different type. - * - * User-space is expected to retrieve values and enums by performing this ioctl - * at least twice: the first time to retrieve the number of elements, the - * second time to retrieve the elements themselves. - * - * To retrieve the number of elements, set @count_values and @count_enum_blobs - * to zero, then call the ioctl. 
@count_values will be updated with the number - * of elements. If the property has the type &DRM_MODE_PROP_ENUM or - * &DRM_MODE_PROP_BITMASK, @count_enum_blobs will be updated as well. - * - * To retrieve the elements themselves, allocate an array for @values_ptr and - * set @count_values to its capacity. If the property has the type - * &DRM_MODE_PROP_ENUM or &DRM_MODE_PROP_BITMASK, allocate an array for - * @enum_blob_ptr and set @count_enum_blobs to its capacity. Calling the ioctl - * again will fill the arrays. - */ -struct drm_mode_get_property { - /** @values_ptr: Pointer to a ``__u64`` array. */ - __u64 values_ptr; - /** @enum_blob_ptr: Pointer to a struct drm_mode_property_enum array. */ - __u64 enum_blob_ptr; - - /** - * @prop_id: Object ID of the property which should be retrieved. Set - * by the caller. - */ - __u32 prop_id; - /** - * @flags: ``DRM_MODE_PROP_*`` bitfield. See &drm_property.flags for - * a definition of the flags. - */ - __u32 flags; - /** - * @name: Symbolic property name. User-space should use this field to - * recognize properties. - */ - char name[DRM_PROP_NAME_LEN]; - - /** @count_values: Number of elements in @values_ptr. */ - __u32 count_values; - /** @count_enum_blobs: Number of elements in @enum_blob_ptr. 
*/ - __u32 count_enum_blobs; -}; - -struct drm_mode_connector_set_property { - __u64 value; - __u32 prop_id; - __u32 connector_id; -}; - -#define DRM_MODE_OBJECT_CRTC 0xcccccccc -#define DRM_MODE_OBJECT_CONNECTOR 0xc0c0c0c0 -#define DRM_MODE_OBJECT_ENCODER 0xe0e0e0e0 -#define DRM_MODE_OBJECT_MODE 0xdededede -#define DRM_MODE_OBJECT_PROPERTY 0xb0b0b0b0 -#define DRM_MODE_OBJECT_FB 0xfbfbfbfb -#define DRM_MODE_OBJECT_BLOB 0xbbbbbbbb -#define DRM_MODE_OBJECT_PLANE 0xeeeeeeee -#define DRM_MODE_OBJECT_ANY 0 - -struct drm_mode_obj_get_properties { - __u64 props_ptr; - __u64 prop_values_ptr; - __u32 count_props; - __u32 obj_id; - __u32 obj_type; -}; - -struct drm_mode_obj_set_property { - __u64 value; - __u32 prop_id; - __u32 obj_id; - __u32 obj_type; -}; - -struct drm_mode_get_blob { - __u32 blob_id; - __u32 length; - __u64 data; -}; - -struct drm_mode_fb_cmd { - __u32 fb_id; - __u32 width; - __u32 height; - __u32 pitch; - __u32 bpp; - __u32 depth; - /* driver specific handle */ - __u32 handle; -}; - -#define DRM_MODE_FB_INTERLACED (1<<0) /* for interlaced framebuffers */ -#define DRM_MODE_FB_MODIFIERS (1<<1) /* enables ->modifier[] */ - -/** - * struct drm_mode_fb_cmd2 - Frame-buffer metadata. - * - * This struct holds frame-buffer metadata. There are two ways to use it: - * - * - User-space can fill this struct and perform a &DRM_IOCTL_MODE_ADDFB2 - * ioctl to register a new frame-buffer. The new frame-buffer object ID will - * be set by the kernel in @fb_id. - * - User-space can set @fb_id and perform a &DRM_IOCTL_MODE_GETFB2 ioctl to - * fetch metadata about an existing frame-buffer. - * - * In case of planar formats, this struct allows up to 4 buffer objects with - * offsets and pitches per plane. The pitch and offset order are dictated by - * the format FourCC as defined by ``drm_fourcc.h``, e.g. 
NV12 is described as: - * - * YUV 4:2:0 image with a plane of 8-bit Y samples followed by an - * interleaved U/V plane containing 8-bit 2x2 subsampled colour difference - * samples. - * - * So it would consist of a Y plane at ``offsets[0]`` and a UV plane at - * ``offsets[1]``. - * - * To accommodate tiled, compressed, etc formats, a modifier can be specified. - * For more information see the "Format Modifiers" section. Note that even - * though it looks like we have a modifier per-plane, we in fact do not. The - * modifier for each plane must be identical. Thus all combinations of - * different data layouts for multi-plane formats must be enumerated as - * separate modifiers. - * - * All of the entries in @handles, @pitches, @offsets and @modifier must be - * zero when unused. Warning, for @offsets and @modifier zero can't be used to - * figure out whether the entry is used or not since it's a valid value (a zero - * offset is common, and a zero modifier is &DRM_FORMAT_MOD_LINEAR). - */ -struct drm_mode_fb_cmd2 { - /** @fb_id: Object ID of the frame-buffer. */ - __u32 fb_id; - /** @width: Width of the frame-buffer. */ - __u32 width; - /** @height: Height of the frame-buffer. */ - __u32 height; - /** - * @pixel_format: FourCC format code, see ``DRM_FORMAT_*`` constants in - * ``drm_fourcc.h``. - */ - __u32 pixel_format; - /** - * @flags: Frame-buffer flags (see &DRM_MODE_FB_INTERLACED and - * &DRM_MODE_FB_MODIFIERS). - */ - __u32 flags; - - /** - * @handles: GEM buffer handle, one per plane. Set to 0 if the plane is - * unused. The same handle can be used for multiple planes. - */ - __u32 handles[4]; - /** @pitches: Pitch (aka. stride) in bytes, one per plane. */ - __u32 pitches[4]; - /** @offsets: Offset into the buffer in bytes, one per plane. */ - __u32 offsets[4]; - /** - * @modifier: Format modifier, one per plane. See ``DRM_FORMAT_MOD_*`` - * constants in ``drm_fourcc.h``. All planes must use the same - * modifier. 
Ignored unless &DRM_MODE_FB_MODIFIERS is set in @flags. - */ - __u64 modifier[4]; -}; - -#define DRM_MODE_FB_DIRTY_ANNOTATE_COPY 0x01 -#define DRM_MODE_FB_DIRTY_ANNOTATE_FILL 0x02 -#define DRM_MODE_FB_DIRTY_FLAGS 0x03 - -#define DRM_MODE_FB_DIRTY_MAX_CLIPS 256 - -/* - * Mark a region of a framebuffer as dirty. - * - * Some hardware does not automatically update display contents - * as a hardware or software draw to a framebuffer. This ioctl - * allows userspace to tell the kernel and the hardware what - * regions of the framebuffer have changed. - * - * The kernel or hardware is free to update more then just the - * region specified by the clip rects. The kernel or hardware - * may also delay and/or coalesce several calls to dirty into a - * single update. - * - * Userspace may annotate the updates, the annotates are a - * promise made by the caller that the change is either a copy - * of pixels or a fill of a single color in the region specified. - * - * If the DRM_MODE_FB_DIRTY_ANNOTATE_COPY flag is given then - * the number of updated regions are half of num_clips given, - * where the clip rects are paired in src and dst. The width and - * height of each one of the pairs must match. - * - * If the DRM_MODE_FB_DIRTY_ANNOTATE_FILL flag is given the caller - * promises that the region specified of the clip rects is filled - * completely with a single color as given in the color argument. - */ - -struct drm_mode_fb_dirty_cmd { - __u32 fb_id; - __u32 flags; - __u32 color; - __u32 num_clips; - __u64 clips_ptr; -}; - -struct drm_mode_mode_cmd { - __u32 connector_id; - struct drm_mode_modeinfo mode; -}; - -#define DRM_MODE_CURSOR_BO 0x01 -#define DRM_MODE_CURSOR_MOVE 0x02 -#define DRM_MODE_CURSOR_FLAGS 0x03 - -/* - * depending on the value in flags different members are used. 
- * - * CURSOR_BO uses - * crtc_id - * width - * height - * handle - if 0 turns the cursor off - * - * CURSOR_MOVE uses - * crtc_id - * x - * y - */ -struct drm_mode_cursor { - __u32 flags; - __u32 crtc_id; - __s32 x; - __s32 y; - __u32 width; - __u32 height; - /* driver specific handle */ - __u32 handle; -}; - -struct drm_mode_cursor2 { - __u32 flags; - __u32 crtc_id; - __s32 x; - __s32 y; - __u32 width; - __u32 height; - /* driver specific handle */ - __u32 handle; - __s32 hot_x; - __s32 hot_y; -}; - -struct drm_mode_crtc_lut { - __u32 crtc_id; - __u32 gamma_size; - - /* pointers to arrays */ - __u64 red; - __u64 green; - __u64 blue; -}; - -struct drm_color_ctm { - /* - * Conversion matrix in S31.32 sign-magnitude - * (not two's complement!) format. - * - * out matrix in - * |R| |0 1 2| |R| - * |G| = |3 4 5| x |G| - * |B| |6 7 8| |B| - */ - __u64 matrix[9]; -}; - -struct drm_color_lut { - /* - * Values are mapped linearly to 0.0 - 1.0 range, with 0x0 == 0.0 and - * 0xffff == 1.0. - */ - __u16 red; - __u16 green; - __u16 blue; - __u16 reserved; -}; - -/** - * struct drm_plane_size_hint - Plane size hints - * @width: The width of the plane in pixel - * @height: The height of the plane in pixel - * - * The plane SIZE_HINTS property blob contains an - * array of struct drm_plane_size_hint. - */ -struct drm_plane_size_hint { - __u16 width; - __u16 height; -}; - -/** - * struct hdr_metadata_infoframe - HDR Metadata Infoframe Data. - * - * HDR Metadata Infoframe as per CTA 861.G spec. This is expected - * to match exactly with the spec. - * - * Userspace is expected to pass the metadata information as per - * the format described in this structure. - */ -struct hdr_metadata_infoframe { - /** - * @eotf: Electro-Optical Transfer Function (EOTF) - * used in the stream. - */ - __u8 eotf; - /** - * @metadata_type: Static_Metadata_Descriptor_ID. - */ - __u8 metadata_type; - /** - * @display_primaries: Color Primaries of the Data. 
- * These are coded as unsigned 16-bit values in units of - * 0.00002, where 0x0000 represents zero and 0xC350 - * represents 1.0000. - * @display_primaries.x: X coordinate of color primary. - * @display_primaries.y: Y coordinate of color primary. - */ - struct { - __u16 x, y; - } display_primaries[3]; - /** - * @white_point: White Point of Colorspace Data. - * These are coded as unsigned 16-bit values in units of - * 0.00002, where 0x0000 represents zero and 0xC350 - * represents 1.0000. - * @white_point.x: X coordinate of whitepoint of color primary. - * @white_point.y: Y coordinate of whitepoint of color primary. - */ - struct { - __u16 x, y; - } white_point; - /** - * @max_display_mastering_luminance: Max Mastering Display Luminance. - * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, - * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. - */ - __u16 max_display_mastering_luminance; - /** - * @min_display_mastering_luminance: Min Mastering Display Luminance. - * This value is coded as an unsigned 16-bit value in units of - * 0.0001 cd/m2, where 0x0001 represents 0.0001 cd/m2 and 0xFFFF - * represents 6.5535 cd/m2. - */ - __u16 min_display_mastering_luminance; - /** - * @max_cll: Max Content Light Level. - * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, - * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. - */ - __u16 max_cll; - /** - * @max_fall: Max Frame Average Light Level. - * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, - * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. - */ - __u16 max_fall; -}; - -/** - * struct hdr_output_metadata - HDR output metadata - * - * Metadata Information to be passed from userspace - */ -struct hdr_output_metadata { - /** - * @metadata_type: Static_Metadata_Descriptor_ID. - */ - __u32 metadata_type; - /** - * @hdmi_metadata_type1: HDR Metadata Infoframe. 
- */ - union { - struct hdr_metadata_infoframe hdmi_metadata_type1; - }; -}; - -/** - * DRM_MODE_PAGE_FLIP_EVENT - * - * Request that the kernel sends back a vblank event (see - * struct drm_event_vblank) with the &DRM_EVENT_FLIP_COMPLETE type when the - * page-flip is done. - */ -#define DRM_MODE_PAGE_FLIP_EVENT 0x01 -/** - * DRM_MODE_PAGE_FLIP_ASYNC - * - * Request that the page-flip is performed as soon as possible, ie. with no - * delay due to waiting for vblank. This may cause tearing to be visible on - * the screen. - * - * When used with atomic uAPI, the driver will return an error if the hardware - * doesn't support performing an asynchronous page-flip for this update. - * User-space should handle this, e.g. by falling back to a regular page-flip. - * - * Note, some hardware might need to perform one last synchronous page-flip - * before being able to switch to asynchronous page-flips. As an exception, - * the driver will return success even though that first page-flip is not - * asynchronous. - */ -#define DRM_MODE_PAGE_FLIP_ASYNC 0x02 -#define DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE 0x4 -#define DRM_MODE_PAGE_FLIP_TARGET_RELATIVE 0x8 -#define DRM_MODE_PAGE_FLIP_TARGET (DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE | \ - DRM_MODE_PAGE_FLIP_TARGET_RELATIVE) -/** - * DRM_MODE_PAGE_FLIP_FLAGS - * - * Bitmask of flags suitable for &drm_mode_crtc_page_flip_target.flags. - */ -#define DRM_MODE_PAGE_FLIP_FLAGS (DRM_MODE_PAGE_FLIP_EVENT | \ - DRM_MODE_PAGE_FLIP_ASYNC | \ - DRM_MODE_PAGE_FLIP_TARGET) - -/* - * Request a page flip on the specified crtc. - * - * This ioctl will ask KMS to schedule a page flip for the specified - * crtc. Once any pending rendering targeting the specified fb (as of - * ioctl time) has completed, the crtc will be reprogrammed to display - * that fb after the next vertical refresh. The ioctl returns - * immediately, but subsequent rendering to the current fb will block - * in the execbuffer ioctl until the page flip happens. 
If a page - * flip is already pending as the ioctl is called, EBUSY will be - * returned. - * - * Flag DRM_MODE_PAGE_FLIP_EVENT requests that drm sends back a vblank - * event (see drm.h: struct drm_event_vblank) when the page flip is - * done. The user_data field passed in with this ioctl will be - * returned as the user_data field in the vblank event struct. - * - * Flag DRM_MODE_PAGE_FLIP_ASYNC requests that the flip happen - * 'as soon as possible', meaning that it not delay waiting for vblank. - * This may cause tearing on the screen. - * - * The reserved field must be zero. - */ - -struct drm_mode_crtc_page_flip { - __u32 crtc_id; - __u32 fb_id; - __u32 flags; - __u32 reserved; - __u64 user_data; -}; - -/* - * Request a page flip on the specified crtc. - * - * Same as struct drm_mode_crtc_page_flip, but supports new flags and - * re-purposes the reserved field: - * - * The sequence field must be zero unless either of the - * DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags is specified. When - * the ABSOLUTE flag is specified, the sequence field denotes the absolute - * vblank sequence when the flip should take effect. When the RELATIVE - * flag is specified, the sequence field denotes the relative (to the - * current one when the ioctl is called) vblank sequence when the flip - * should take effect. NOTE: DRM_IOCTL_WAIT_VBLANK must still be used to - * make sure the vblank sequence before the target one has passed before - * calling this ioctl. The purpose of the - * DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE/RELATIVE flags is merely to clarify - * the target for when code dealing with a page flip runs during a - * vertical blank period. - */ - -struct drm_mode_crtc_page_flip_target { - __u32 crtc_id; - __u32 fb_id; - __u32 flags; - __u32 sequence; - __u64 user_data; -}; - -/** - * struct drm_mode_create_dumb - Create a KMS dumb buffer for scanout. 
- * @height: buffer height in pixels - * @width: buffer width in pixels - * @bpp: bits per pixel - * @flags: must be zero - * @handle: buffer object handle - * @pitch: number of bytes between two consecutive lines - * @size: size of the whole buffer in bytes - * - * User-space fills @height, @width, @bpp and @flags. If the IOCTL succeeds, - * the kernel fills @handle, @pitch and @size. - */ -struct drm_mode_create_dumb { - __u32 height; - __u32 width; - __u32 bpp; - __u32 flags; - - __u32 handle; - __u32 pitch; - __u64 size; -}; - -/* set up for mmap of a dumb scanout buffer */ -struct drm_mode_map_dumb { - /** Handle for the object being mapped. */ - __u32 handle; - __u32 pad; - /** - * Fake offset to use for subsequent mmap call - * - * This is a fixed-size type for 32/64 compatibility. - */ - __u64 offset; -}; - -struct drm_mode_destroy_dumb { - __u32 handle; -}; - -/** - * DRM_MODE_ATOMIC_TEST_ONLY - * - * Do not apply the atomic commit, instead check whether the hardware supports - * this configuration. - * - * See &drm_mode_config_funcs.atomic_check for more details on test-only - * commits. - */ -#define DRM_MODE_ATOMIC_TEST_ONLY 0x0100 -/** - * DRM_MODE_ATOMIC_NONBLOCK - * - * Do not block while applying the atomic commit. The &DRM_IOCTL_MODE_ATOMIC - * IOCTL returns immediately instead of waiting for the changes to be applied - * in hardware. Note, the driver will still check that the update can be - * applied before retuning. - */ -#define DRM_MODE_ATOMIC_NONBLOCK 0x0200 -/** - * DRM_MODE_ATOMIC_ALLOW_MODESET - * - * Allow the update to result in temporary or transient visible artifacts while - * the update is being applied. Applying the update may also take significantly - * more time than a page flip. All visual artifacts will disappear by the time - * the update is completed, as signalled through the vblank event's timestamp - * (see struct drm_event_vblank). - * - * This flag must be set when the KMS update might cause visible artifacts. 
- * Without this flag such KMS update will return a EINVAL error. What kind of - * update may cause visible artifacts depends on the driver and the hardware. - * User-space that needs to know beforehand if an update might cause visible - * artifacts can use &DRM_MODE_ATOMIC_TEST_ONLY without - * &DRM_MODE_ATOMIC_ALLOW_MODESET to see if it fails. - * - * To the best of the driver's knowledge, visual artifacts are guaranteed to - * not appear when this flag is not set. Some sinks might display visual - * artifacts outside of the driver's control. - */ -#define DRM_MODE_ATOMIC_ALLOW_MODESET 0x0400 - -/** - * DRM_MODE_ATOMIC_FLAGS - * - * Bitfield of flags accepted by the &DRM_IOCTL_MODE_ATOMIC IOCTL in - * &drm_mode_atomic.flags. - */ -#define DRM_MODE_ATOMIC_FLAGS (\ - DRM_MODE_PAGE_FLIP_EVENT |\ - DRM_MODE_PAGE_FLIP_ASYNC |\ - DRM_MODE_ATOMIC_TEST_ONLY |\ - DRM_MODE_ATOMIC_NONBLOCK |\ - DRM_MODE_ATOMIC_ALLOW_MODESET) - -struct drm_mode_atomic { - __u32 flags; - __u32 count_objs; - __u64 objs_ptr; - __u64 count_props_ptr; - __u64 props_ptr; - __u64 prop_values_ptr; - __u64 reserved; - __u64 user_data; -}; - -struct drm_format_modifier_blob { -#define FORMAT_BLOB_CURRENT 1 - /* Version of this blob format */ - __u32 version; - - /* Flags */ - __u32 flags; - - /* Number of fourcc formats supported */ - __u32 count_formats; - - /* Where in this blob the formats exist (in bytes) */ - __u32 formats_offset; - - /* Number of drm_format_modifiers */ - __u32 count_modifiers; - - /* Where in this blob the modifiers exist (in bytes) */ - __u32 modifiers_offset; - - /* __u32 formats[] */ - /* struct drm_format_modifier modifiers[] */ -}; - -struct drm_format_modifier { - /* Bitmask of formats in get_plane format list this info applies to. The - * offset allows a sliding window of which 64 formats (bits). 
- * - * Some examples: - * In today's world with < 65 formats, and formats 0, and 2 are - * supported - * 0x0000000000000005 - * ^-offset = 0, formats = 5 - * - * If the number formats grew to 128, and formats 98-102 are - * supported with the modifier: - * - * 0x0000007c00000000 0000000000000000 - * ^ - * |__offset = 64, formats = 0x7c00000000 - * - */ - __u64 formats; - __u32 offset; - __u32 pad; - - /* The modifier that applies to the >get_plane format list bitmask. */ - __u64 modifier; -}; - -/** - * struct drm_mode_create_blob - Create New blob property - * - * Create a new 'blob' data property, copying length bytes from data pointer, - * and returning new blob ID. - */ -struct drm_mode_create_blob { - /** @data: Pointer to data to copy. */ - __u64 data; - /** @length: Length of data to copy. */ - __u32 length; - /** @blob_id: Return: new property ID. */ - __u32 blob_id; -}; - -/** - * struct drm_mode_destroy_blob - Destroy user blob - * @blob_id: blob_id to destroy - * - * Destroy a user-created blob property. - * - * User-space can release blobs as soon as they do not need to refer to them by - * their blob object ID. For instance, if you are using a MODE_ID blob in an - * atomic commit and you will not make another commit re-using the same ID, you - * can destroy the blob as soon as the commit has been issued, without waiting - * for it to complete. - */ -struct drm_mode_destroy_blob { - __u32 blob_id; -}; - -/** - * struct drm_mode_create_lease - Create lease - * - * Lease mode resources, creating another drm_master. - * - * The @object_ids array must reference at least one CRTC, one connector and - * one plane if &DRM_CLIENT_CAP_UNIVERSAL_PLANES is enabled. Alternatively, - * the lease can be completely empty. 
- */ -struct drm_mode_create_lease { - /** @object_ids: Pointer to array of object ids (__u32) */ - __u64 object_ids; - /** @object_count: Number of object ids */ - __u32 object_count; - /** @flags: flags for new FD (O_CLOEXEC, etc) */ - __u32 flags; - - /** @lessee_id: Return: unique identifier for lessee. */ - __u32 lessee_id; - /** @fd: Return: file descriptor to new drm_master file */ - __u32 fd; -}; - -/** - * struct drm_mode_list_lessees - List lessees - * - * List lesses from a drm_master. - */ -struct drm_mode_list_lessees { - /** - * @count_lessees: Number of lessees. - * - * On input, provides length of the array. - * On output, provides total number. No - * more than the input number will be written - * back, so two calls can be used to get - * the size and then the data. - */ - __u32 count_lessees; - /** @pad: Padding. */ - __u32 pad; - - /** - * @lessees_ptr: Pointer to lessees. - * - * Pointer to __u64 array of lessee ids - */ - __u64 lessees_ptr; -}; - -/** - * struct drm_mode_get_lease - Get Lease - * - * Get leased objects. - */ -struct drm_mode_get_lease { - /** - * @count_objects: Number of leased objects. - * - * On input, provides length of the array. - * On output, provides total number. No - * more than the input number will be written - * back, so two calls can be used to get - * the size and then the data. - */ - __u32 count_objects; - /** @pad: Padding. */ - __u32 pad; - - /** - * @objects_ptr: Pointer to objects. - * - * Pointer to __u32 array of object ids. - */ - __u64 objects_ptr; -}; - -/** - * struct drm_mode_revoke_lease - Revoke lease - */ -struct drm_mode_revoke_lease { - /** @lessee_id: Unique ID of lessee */ - __u32 lessee_id; -}; - -/** - * struct drm_mode_rect - Two dimensional rectangle. - * @x1: Horizontal starting coordinate (inclusive). - * @y1: Vertical starting coordinate (inclusive). - * @x2: Horizontal ending coordinate (exclusive). - * @y2: Vertical ending coordinate (exclusive). 
- * - * With drm subsystem using struct drm_rect to manage rectangular area this - * export it to user-space. - * - * Currently used by drm_mode_atomic blob property FB_DAMAGE_CLIPS. - */ -struct drm_mode_rect { - __s32 x1; - __s32 y1; - __s32 x2; - __s32 y2; -}; - -/** - * struct drm_mode_closefb - * @fb_id: Framebuffer ID. - * @pad: Must be zero. - */ -struct drm_mode_closefb { - __u32 fb_id; - __u32 pad; -}; - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/plugins/amdgpu/kfd_ioctl.h b/plugins/amdgpu/kfd_ioctl.h deleted file mode 100644 index a63d453f0..000000000 --- a/plugins/amdgpu/kfd_ioctl.h +++ /dev/null @@ -1,788 +0,0 @@ -/* - * Copyright 2014 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#ifndef KFD_IOCTL_H_INCLUDED -#define KFD_IOCTL_H_INCLUDED - -#include - -/* Define __user as empty for kernel headers in user-space */ -#define __user -#include "drm.h" - -/* - * - 1.1 - initial version - * - 1.3 - Add SMI events support - * - 1.4 - Indicate new SRAM EDC bit in device properties - * - 1.5 - Add SVM API - * - 1.6 - Query clear flags in SVM get_attr API - * - 1.7 - Checkpoint Restore (CRIU) API - * - 1.8 - CRIU - Support for SDMA transfers with GTT BOs - */ -#define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 8 - -struct kfd_ioctl_get_version_args { - uint32_t major_version; /* from KFD */ - uint32_t minor_version; /* from KFD */ -}; - -/* For kfd_ioctl_create_queue_args.queue_type. */ -#define KFD_IOC_QUEUE_TYPE_COMPUTE 0x0 -#define KFD_IOC_QUEUE_TYPE_SDMA 0x1 -#define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL 0x2 -#define KFD_IOC_QUEUE_TYPE_SDMA_XGMI 0x3 - -#define KFD_MAX_QUEUE_PERCENTAGE 100 -#define KFD_MAX_QUEUE_PRIORITY 15 - -struct kfd_ioctl_create_queue_args { - uint64_t ring_base_address; /* to KFD */ - uint64_t write_pointer_address; /* from KFD */ - uint64_t read_pointer_address; /* from KFD */ - uint64_t doorbell_offset; /* from KFD */ - - uint32_t ring_size; /* to KFD */ - uint32_t gpu_id; /* to KFD */ - uint32_t queue_type; /* to KFD */ - uint32_t queue_percentage; /* to KFD */ - uint32_t queue_priority; /* to KFD */ - uint32_t queue_id; /* from KFD */ - - uint64_t eop_buffer_address; /* to KFD */ - uint64_t eop_buffer_size; /* to KFD */ - uint64_t ctx_save_restore_address; /* to KFD */ - uint32_t ctx_save_restore_size; /* to KFD */ - uint32_t ctl_stack_size; /* to KFD */ -}; - -struct kfd_ioctl_destroy_queue_args { - uint32_t queue_id; /* to KFD */ - uint32_t pad; -}; - -struct kfd_ioctl_update_queue_args { - uint64_t ring_base_address; /* to KFD */ - - uint32_t queue_id; /* to KFD */ - uint32_t ring_size; /* to KFD */ - uint32_t queue_percentage; /* to KFD */ - uint32_t queue_priority; /* to KFD */ -}; - -struct 
kfd_ioctl_set_cu_mask_args { - uint32_t queue_id; /* to KFD */ - uint32_t num_cu_mask; /* to KFD */ - uint64_t cu_mask_ptr; /* to KFD */ -}; - -struct kfd_ioctl_get_queue_wave_state_args { - uint64_t ctl_stack_address; /* to KFD */ - uint32_t ctl_stack_used_size; /* from KFD */ - uint32_t save_area_used_size; /* from KFD */ - uint32_t queue_id; /* to KFD */ - uint32_t pad; -}; - -/* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ -#define KFD_IOC_CACHE_POLICY_COHERENT 0 -#define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 - -struct kfd_ioctl_set_memory_policy_args { - uint64_t alternate_aperture_base; /* to KFD */ - uint64_t alternate_aperture_size; /* to KFD */ - - uint32_t gpu_id; /* to KFD */ - uint32_t default_policy; /* to KFD */ - uint32_t alternate_policy; /* to KFD */ - uint32_t pad; -}; - -/* - * All counters are monotonic. They are used for profiling of compute jobs. - * The profiling is done by userspace. - * - * In case of GPU reset, the counter should not be affected. - */ - -struct kfd_ioctl_get_clock_counters_args { - uint64_t gpu_clock_counter; /* from KFD */ - uint64_t cpu_clock_counter; /* from KFD */ - uint64_t system_clock_counter; /* from KFD */ - uint64_t system_clock_freq; /* from KFD */ - - uint32_t gpu_id; /* to KFD */ - uint32_t pad; -}; - -struct kfd_process_device_apertures { - uint64_t lds_base; /* from KFD */ - uint64_t lds_limit; /* from KFD */ - uint64_t scratch_base; /* from KFD */ - uint64_t scratch_limit; /* from KFD */ - uint64_t gpuvm_base; /* from KFD */ - uint64_t gpuvm_limit; /* from KFD */ - uint32_t gpu_id; /* from KFD */ - uint32_t pad; -}; - -/* - * AMDKFD_IOC_GET_PROCESS_APERTURES is deprecated. Use - * AMDKFD_IOC_GET_PROCESS_APERTURES_NEW instead, which supports an - * unlimited number of GPUs. 
- */ -#define NUM_OF_SUPPORTED_GPUS 7 -struct kfd_ioctl_get_process_apertures_args { - struct kfd_process_device_apertures process_apertures[NUM_OF_SUPPORTED_GPUS]; /* from KFD */ - - /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */ - uint32_t num_of_nodes; - uint32_t pad; -}; - -struct kfd_ioctl_get_process_apertures_new_args { - /* User allocated. Pointer to struct kfd_process_device_apertures - * filled in by Kernel - */ - uint64_t kfd_process_device_apertures_ptr; - /* to KFD - indicates amount of memory present in kfd_process_device_apertures_ptr - * from KFD - Number of entries filled by KFD. - */ - uint32_t num_of_nodes; - uint32_t pad; -}; - -#define MAX_ALLOWED_NUM_POINTS 100 -#define MAX_ALLOWED_AW_BUFF_SIZE 4096 -#define MAX_ALLOWED_WAC_BUFF_SIZE 128 - -struct kfd_ioctl_dbg_register_args { - uint32_t gpu_id; /* to KFD */ - uint32_t pad; -}; - -struct kfd_ioctl_dbg_unregister_args { - uint32_t gpu_id; /* to KFD */ - uint32_t pad; -}; - -struct kfd_ioctl_dbg_address_watch_args { - uint64_t content_ptr; /* a pointer to the actual content */ - uint32_t gpu_id; /* to KFD */ - uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ -}; - -struct kfd_ioctl_dbg_wave_control_args { - uint64_t content_ptr; /* a pointer to the actual content */ - uint32_t gpu_id; /* to KFD */ - uint32_t buf_size_in_bytes; /*including gpu_id and buf_size */ -}; - -#define KFD_INVALID_FD 0xffffffff - -/* Matching HSA_EVENTTYPE */ -#define KFD_IOC_EVENT_SIGNAL 0 -#define KFD_IOC_EVENT_NODECHANGE 1 -#define KFD_IOC_EVENT_DEVICESTATECHANGE 2 -#define KFD_IOC_EVENT_HW_EXCEPTION 3 -#define KFD_IOC_EVENT_SYSTEM_EVENT 4 -#define KFD_IOC_EVENT_DEBUG_EVENT 5 -#define KFD_IOC_EVENT_PROFILE_EVENT 6 -#define KFD_IOC_EVENT_QUEUE_EVENT 7 -#define KFD_IOC_EVENT_MEMORY 8 - -#define KFD_IOC_WAIT_RESULT_COMPLETE 0 -#define KFD_IOC_WAIT_RESULT_TIMEOUT 1 -#define KFD_IOC_WAIT_RESULT_FAIL 2 - -#define KFD_SIGNAL_EVENT_LIMIT 4096 - -/* For 
kfd_event_data.hw_exception_data.reset_type. */ -#define KFD_HW_EXCEPTION_WHOLE_GPU_RESET 0 -#define KFD_HW_EXCEPTION_PER_ENGINE_RESET 1 - -/* For kfd_event_data.hw_exception_data.reset_cause. */ -#define KFD_HW_EXCEPTION_GPU_HANG 0 -#define KFD_HW_EXCEPTION_ECC 1 - -/* For kfd_hsa_memory_exception_data.ErrorType */ -#define KFD_MEM_ERR_NO_RAS 0 -#define KFD_MEM_ERR_SRAM_ECC 1 -#define KFD_MEM_ERR_POISON_CONSUMED 2 -#define KFD_MEM_ERR_GPU_HANG 3 - -struct kfd_ioctl_create_event_args { - uint64_t event_page_offset; /* from KFD */ - uint32_t event_trigger_data; /* from KFD - signal events only */ - uint32_t event_type; /* to KFD */ - uint32_t auto_reset; /* to KFD */ - uint32_t node_id; /* to KFD - only valid for certain event types */ - uint32_t event_id; /* from KFD */ - uint32_t event_slot_index; /* from KFD */ -}; - -struct kfd_ioctl_destroy_event_args { - uint32_t event_id; /* to KFD */ - uint32_t pad; -}; - -struct kfd_ioctl_set_event_args { - uint32_t event_id; /* to KFD */ - uint32_t pad; -}; - -struct kfd_ioctl_reset_event_args { - uint32_t event_id; /* to KFD */ - uint32_t pad; -}; - -struct kfd_memory_exception_failure { - uint32_t NotPresent; /* Page not present or supervisor privilege */ - uint32_t ReadOnly; /* Write access to a read-only page */ - uint32_t NoExecute; /* Execute access to a page marked NX */ - uint32_t imprecise; /* Can't determine the exact fault address */ -}; - -/* memory exception data */ -struct kfd_hsa_memory_exception_data { - struct kfd_memory_exception_failure failure; - uint64_t va; - uint32_t gpu_id; - uint32_t ErrorType; /* 0 = no RAS error, - * 1 = ECC_SRAM, - * 2 = Link_SYNFLOOD (poison), - * 3 = GPU hang (not attributable to a specific cause), - * other values reserved - */ -}; - -/* hw exception data */ -struct kfd_hsa_hw_exception_data { - uint32_t reset_type; - uint32_t reset_cause; - uint32_t memory_lost; - uint32_t gpu_id; -}; - -/* Event data */ -struct kfd_event_data { - union { - struct 
kfd_hsa_memory_exception_data memory_exception_data; - struct kfd_hsa_hw_exception_data hw_exception_data; - }; /* From KFD */ - uint64_t kfd_event_data_ext; /* pointer to an extension structure for future exception types */ - uint32_t event_id; /* to KFD */ - uint32_t pad; -}; - -struct kfd_ioctl_wait_events_args { - uint64_t events_ptr; /* pointed to struct kfd_event_data array, to KFD */ - uint32_t num_events; /* to KFD */ - uint32_t wait_for_all; /* to KFD */ - uint32_t timeout; /* to KFD */ - uint32_t wait_result; /* from KFD */ -}; - -struct kfd_ioctl_set_scratch_backing_va_args { - uint64_t va_addr; /* to KFD */ - uint32_t gpu_id; /* to KFD */ - uint32_t pad; -}; - -struct kfd_ioctl_get_tile_config_args { - /* to KFD: pointer to tile array */ - uint64_t tile_config_ptr; - /* to KFD: pointer to macro tile array */ - uint64_t macro_tile_config_ptr; - /* to KFD: array size allocated by user mode - * from KFD: array size filled by kernel - */ - uint32_t num_tile_configs; - /* to KFD: array size allocated by user mode - * from KFD: array size filled by kernel - */ - uint32_t num_macro_tile_configs; - - uint32_t gpu_id; /* to KFD */ - uint32_t gb_addr_config; /* from KFD */ - uint32_t num_banks; /* from KFD */ - uint32_t num_ranks; /* from KFD */ - - /* struct size can be extended later if needed without breaking ABI compatibility */ -}; - -struct kfd_ioctl_set_trap_handler_args { - uint64_t tba_addr; /* to KFD */ - uint64_t tma_addr; /* to KFD */ - uint32_t gpu_id; /* to KFD */ - uint32_t pad; -}; - -struct kfd_ioctl_acquire_vm_args { - uint32_t drm_fd; /* to KFD */ - uint32_t gpu_id; /* to KFD */ -}; - -/* Allocation flags: memory types */ -#define KFD_IOC_ALLOC_MEM_FLAGS_VRAM (1 << 0) -#define KFD_IOC_ALLOC_MEM_FLAGS_GTT (1 << 1) -#define KFD_IOC_ALLOC_MEM_FLAGS_USERPTR (1 << 2) -#define KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL (1 << 3) -#define KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP (1 << 4) -/* Allocation flags: attributes/access options */ -#define 
KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE (1 << 31) -#define KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE (1 << 30) -#define KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC (1 << 29) -#define KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE (1 << 28) -#define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM (1 << 27) -#define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT (1 << 26) -#define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED (1 << 25) - -/* Allocate memory for later SVM (shared virtual memory) mapping. - * - * @va_addr: virtual address of the memory to be allocated - * all later mappings on all GPUs will use this address - * @size: size in bytes - * @handle: buffer handle returned to user mode, used to refer to - * this allocation for mapping, unmapping and freeing - * @mmap_offset: for CPU-mapping the allocation by mmapping a render node - * for userptrs this is overloaded to specify the CPU address - * @gpu_id: device identifier - * @flags: memory type and attributes. See KFD_IOC_ALLOC_MEM_FLAGS above - */ -struct kfd_ioctl_alloc_memory_of_gpu_args { - uint64_t va_addr; /* to KFD */ - uint64_t size; /* to KFD */ - uint64_t handle; /* from KFD */ - uint64_t mmap_offset; /* to KFD (userptr), from KFD (mmap offset) */ - uint32_t gpu_id; /* to KFD */ - uint32_t flags; -}; - -/* Free memory allocated with kfd_ioctl_alloc_memory_of_gpu - * - * @handle: memory handle returned by alloc - */ -struct kfd_ioctl_free_memory_of_gpu_args { - uint64_t handle; /* to KFD */ -}; - -/* Map memory to one or more GPUs - * - * @handle: memory handle returned by alloc - * @device_ids_array_ptr: array of gpu_ids (uint32_t per device) - * @n_devices: number of devices in the array - * @n_success: number of devices mapped successfully - * - * @n_success returns information to the caller how many devices from - * the start of the array have mapped the buffer successfully. It can - * be passed into a subsequent retry call to skip those devices. For - * the first call the caller should initialize it to 0. 
- * - * If the ioctl completes with return code 0 (success), n_success == - * n_devices. - */ -struct kfd_ioctl_map_memory_to_gpu_args { - uint64_t handle; /* to KFD */ - uint64_t device_ids_array_ptr; /* to KFD */ - uint32_t n_devices; /* to KFD */ - uint32_t n_success; /* to/from KFD */ -}; - -/* Unmap memory from one or more GPUs - * - * same arguments as for mapping - */ -struct kfd_ioctl_unmap_memory_from_gpu_args { - uint64_t handle; /* to KFD */ - uint64_t device_ids_array_ptr; /* to KFD */ - uint32_t n_devices; /* to KFD */ - uint32_t n_success; /* to/from KFD */ -}; - -/* Allocate GWS for specific queue - * - * @queue_id: queue's id that GWS is allocated for - * @num_gws: how many GWS to allocate - * @first_gws: index of the first GWS allocated. - * only support contiguous GWS allocation - */ -struct kfd_ioctl_alloc_queue_gws_args { - uint32_t queue_id; /* to KFD */ - uint32_t num_gws; /* to KFD */ - uint32_t first_gws; /* from KFD */ - uint32_t pad; -}; - -struct kfd_ioctl_get_dmabuf_info_args { - uint64_t size; /* from KFD */ - uint64_t metadata_ptr; /* to KFD */ - uint32_t metadata_size; /* to KFD (space allocated by user) - * from KFD (actual metadata size) - */ - uint32_t gpu_id; /* from KFD */ - uint32_t flags; /* from KFD (KFD_IOC_ALLOC_MEM_FLAGS) */ - uint32_t dmabuf_fd; /* to KFD */ -}; - -struct kfd_ioctl_import_dmabuf_args { - uint64_t va_addr; /* to KFD */ - uint64_t handle; /* from KFD */ - uint32_t gpu_id; /* to KFD */ - uint32_t dmabuf_fd; /* to KFD */ -}; - -/* - * KFD SMI(System Management Interface) events - */ -enum kfd_smi_event { - KFD_SMI_EVENT_NONE = 0, /* not used */ - KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */ - KFD_SMI_EVENT_THERMAL_THROTTLE = 2, - KFD_SMI_EVENT_GPU_PRE_RESET = 3, - KFD_SMI_EVENT_GPU_POST_RESET = 4, -}; - -#define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i)-1)) -#define KFD_SMI_EVENT_MSG_SIZE 96 - -struct kfd_ioctl_smi_events_args { - uint32_t gpuid; /* to KFD */ - uint32_t anon_fd; /* from KFD 
*/ -}; - -/************************************************************************************************** - * CRIU IOCTLs (Checkpoint Restore In Userspace) - * - * When checkpointing a process, the userspace application will perform: - * 1. PROCESS_INFO op to determine current process information. This pauses execution and evicts - * all the queues. - * 2. CHECKPOINT op to checkpoint process contents (BOs, queues, events, svm-ranges) - * 3. UNPAUSE op to un-evict all the queues - * - * When restoring a process, the CRIU userspace application will perform: - * - * 1. RESTORE op to restore process contents - * 2. RESUME op to start the process - * - * Note: Queues are forced into an evicted state after a successful PROCESS_INFO. User - * application needs to perform an UNPAUSE operation after calling PROCESS_INFO. - */ - -enum kfd_criu_op { - KFD_CRIU_OP_PROCESS_INFO, - KFD_CRIU_OP_CHECKPOINT, - KFD_CRIU_OP_UNPAUSE, - KFD_CRIU_OP_RESTORE, - KFD_CRIU_OP_RESUME, -}; - -/** - * kfd_ioctl_criu_args - Arguments perform CRIU operation - * @devices: [in/out] User pointer to memory location for devices information. - * This is an array of type kfd_criu_device_bucket. - * @bos: [in/out] User pointer to memory location for BOs information - * This is an array of type kfd_criu_bo_bucket. - * @priv_data: [in/out] User pointer to memory location for private data - * @priv_data_size: [in/out] Size of priv_data in bytes - * @num_devices: [in/out] Number of GPUs used by process. Size of @devices array. - * @num_bos [in/out] Number of BOs used by process. Size of @bos array. - * @num_objects: [in/out] Number of objects used by process. Objects are opaque to user application. 
- * @pid: [in/out] PID of the process being checkpointed - * @op [in] Type of operation (kfd_criu_op) - * - * Return: 0 on success, -errno on failure - */ -struct kfd_ioctl_criu_args { - uint64_t devices; /* Used during ops: CHECKPOINT, RESTORE */ - uint64_t bos; /* Used during ops: CHECKPOINT, RESTORE */ - uint64_t priv_data; /* Used during ops: CHECKPOINT, RESTORE */ - uint64_t priv_data_size; /* Used during ops: PROCESS_INFO, RESTORE */ - uint32_t num_devices; /* Used during ops: PROCESS_INFO, RESTORE */ - uint32_t num_bos; /* Used during ops: PROCESS_INFO, RESTORE */ - uint32_t num_objects; /* Used during ops: PROCESS_INFO, RESTORE */ - uint32_t pid; /* Used during ops: PROCESS_INFO, RESUME */ - uint32_t op; -}; - -struct kfd_criu_device_bucket { - uint32_t user_gpu_id; - uint32_t actual_gpu_id; - uint32_t drm_fd; - uint32_t pad; -}; - -struct kfd_criu_bo_bucket { - uint64_t addr; - uint64_t size; - uint64_t offset; - uint64_t restored_offset; /* During restore, updated offset for BO */ - uint32_t gpu_id; /* This is the user_gpu_id */ - uint32_t alloc_flags; - uint32_t dmabuf_fd; - uint32_t pad; -}; - -/* CRIU IOCTLs - END */ -/**************************************************************************************************/ - -/* Register offset inside the remapped mmio page - */ -enum kfd_mmio_remap { - KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL = 0, - KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL = 4, -}; - -/* Guarantee host access to memory */ -#define KFD_IOCTL_SVM_FLAG_HOST_ACCESS 0x00000001 -/* Fine grained coherency between all devices with access */ -#define KFD_IOCTL_SVM_FLAG_COHERENT 0x00000002 -/* Use any GPU in same hive as preferred device */ -#define KFD_IOCTL_SVM_FLAG_HIVE_LOCAL 0x00000004 -/* GPUs only read, allows replication */ -#define KFD_IOCTL_SVM_FLAG_GPU_RO 0x00000008 -/* Allow execution on GPU */ -#define KFD_IOCTL_SVM_FLAG_GPU_EXEC 0x00000010 -/* GPUs mostly read, may allow similar optimizations as RO, but writes fault */ -#define 
KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY 0x00000020 - -/** - * kfd_ioctl_svm_op - SVM ioctl operations - * - * @KFD_IOCTL_SVM_OP_SET_ATTR: Modify one or more attributes - * @KFD_IOCTL_SVM_OP_GET_ATTR: Query one or more attributes - */ -enum kfd_ioctl_svm_op { KFD_IOCTL_SVM_OP_SET_ATTR, KFD_IOCTL_SVM_OP_GET_ATTR }; - -/** kfd_ioctl_svm_location - Enum for preferred and prefetch locations - * - * GPU IDs are used to specify GPUs as preferred and prefetch locations. - * Below definitions are used for system memory or for leaving the preferred - * location unspecified. - */ -enum kfd_ioctl_svm_location { KFD_IOCTL_SVM_LOCATION_SYSMEM = 0, KFD_IOCTL_SVM_LOCATION_UNDEFINED = 0xffffffff }; - -/** - * kfd_ioctl_svm_attr_type - SVM attribute types - * - * @KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: gpuid of the preferred location, 0 for - * system memory - * @KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: gpuid of the prefetch location, 0 for - * system memory. Setting this triggers an - * immediate prefetch (migration). - * @KFD_IOCTL_SVM_ATTR_ACCESS: - * @KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: - * @KFD_IOCTL_SVM_ATTR_NO_ACCESS: specify memory access for the gpuid given - * by the attribute value - * @KFD_IOCTL_SVM_ATTR_SET_FLAGS: bitmask of flags to set (see - * KFD_IOCTL_SVM_FLAG_...) - * @KFD_IOCTL_SVM_ATTR_CLR_FLAGS: bitmask of flags to clear - * @KFD_IOCTL_SVM_ATTR_GRANULARITY: migration granularity - * (log2 num pages) - */ -enum kfd_ioctl_svm_attr_type { - KFD_IOCTL_SVM_ATTR_PREFERRED_LOC, - KFD_IOCTL_SVM_ATTR_PREFETCH_LOC, - KFD_IOCTL_SVM_ATTR_ACCESS, - KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE, - KFD_IOCTL_SVM_ATTR_NO_ACCESS, - KFD_IOCTL_SVM_ATTR_SET_FLAGS, - KFD_IOCTL_SVM_ATTR_CLR_FLAGS, - KFD_IOCTL_SVM_ATTR_GRANULARITY -}; - -/** - * kfd_ioctl_svm_attribute - Attributes as pairs of type and value - * - * The meaning of the @value depends on the attribute type. 
- * - * @type: attribute type (see enum @kfd_ioctl_svm_attr_type) - * @value: attribute value - */ -struct kfd_ioctl_svm_attribute { - uint32_t type; - uint32_t value; -}; - -/** - * kfd_ioctl_svm_args - Arguments for SVM ioctl - * - * @op specifies the operation to perform (see enum - * @kfd_ioctl_svm_op). @start_addr and @size are common for all - * operations. - * - * A variable number of attributes can be given in @attrs. - * @nattr specifies the number of attributes. New attributes can be - * added in the future without breaking the ABI. If unknown attributes - * are given, the function returns -EINVAL. - * - * @KFD_IOCTL_SVM_OP_SET_ATTR sets attributes for a virtual address - * range. It may overlap existing virtual address ranges. If it does, - * the existing ranges will be split such that the attribute changes - * only apply to the specified address range. - * - * @KFD_IOCTL_SVM_OP_GET_ATTR returns the intersection of attributes - * over all memory in the given range and returns the result as the - * attribute value. If different pages have different preferred or - * prefetch locations, 0xffffffff will be returned for - * @KFD_IOCTL_SVM_ATTR_PREFERRED_LOC or - * @KFD_IOCTL_SVM_ATTR_PREFETCH_LOC resepctively. For - * @KFD_IOCTL_SVM_ATTR_SET_FLAGS, flags of all pages will be - * aggregated by bitwise AND. That means, a flag will be set in the - * output, if that flag is set for all pages in the range. For - * @KFD_IOCTL_SVM_ATTR_CLR_FLAGS, flags of all pages will be - * aggregated by bitwise NOR. That means, a flag will be set in the - * output, if that flag is clear for all pages in the range. - * The minimum migration granularity throughout the range will be - * returned for @KFD_IOCTL_SVM_ATTR_GRANULARITY. - * - * Querying of accessibility attributes works by initializing the - * attribute type to @KFD_IOCTL_SVM_ATTR_ACCESS and the value to the - * GPUID being queried. Multiple attributes can be given to allow - * querying multiple GPUIDs. 
The ioctl function overwrites the - * attribute type to indicate the access for the specified GPU. - */ -struct kfd_ioctl_svm_args { - uint64_t start_addr; - uint64_t size; - uint32_t op; - uint32_t nattr; - /* Variable length array of attributes */ - struct kfd_ioctl_svm_attribute attrs[0]; -}; - -/** - * kfd_ioctl_set_xnack_mode_args - Arguments for set_xnack_mode - * - * @xnack_enabled: [in/out] Whether to enable XNACK mode for this process - * - * @xnack_enabled indicates whether recoverable page faults should be - * enabled for the current process. 0 means disabled, positive means - * enabled, negative means leave unchanged. If enabled, virtual address - * translations on GFXv9 and later AMD GPUs can return XNACK and retry - * the access until a valid PTE is available. This is used to implement - * device page faults. - * - * On output, @xnack_enabled returns the (new) current mode (0 or - * positive). Therefore, a negative input value can be used to query - * the current mode without changing it. - * - * The XNACK mode fundamentally changes the way SVM managed memory works - * in the driver, with subtle effects on application performance and - * functionality. - * - * Enabling XNACK mode requires shader programs to be compiled - * differently. Furthermore, not all GPUs support changing the mode - * per-process. Therefore changing the mode is only allowed while no - * user mode queues exist in the process. This ensure that no shader - * code is running that may be compiled for the wrong mode. And GPUs - * that cannot change to the requested mode will prevent the XNACK - * mode from occurring. All GPUs used by the process must be in the - * same XNACK mode. - * - * GFXv8 or older GPUs do not support 48 bit virtual addresses or SVM. - * Therefore those GPUs are not considered for the XNACK mode switch. 
- * - * Return: 0 on success, -errno on failure - */ -struct kfd_ioctl_set_xnack_mode_args { - __s32 xnack_enabled; -}; - -#define AMDKFD_IOCTL_BASE 'K' -#define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr) -#define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type) -#define AMDKFD_IOW(nr, type) _IOW(AMDKFD_IOCTL_BASE, nr, type) -#define AMDKFD_IOWR(nr, type) _IOWR(AMDKFD_IOCTL_BASE, nr, type) - -#define AMDKFD_IOC_GET_VERSION AMDKFD_IOR(0x01, struct kfd_ioctl_get_version_args) - -#define AMDKFD_IOC_CREATE_QUEUE AMDKFD_IOWR(0x02, struct kfd_ioctl_create_queue_args) - -#define AMDKFD_IOC_DESTROY_QUEUE AMDKFD_IOWR(0x03, struct kfd_ioctl_destroy_queue_args) - -#define AMDKFD_IOC_SET_MEMORY_POLICY AMDKFD_IOW(0x04, struct kfd_ioctl_set_memory_policy_args) - -#define AMDKFD_IOC_GET_CLOCK_COUNTERS AMDKFD_IOWR(0x05, struct kfd_ioctl_get_clock_counters_args) - -#define AMDKFD_IOC_GET_PROCESS_APERTURES AMDKFD_IOR(0x06, struct kfd_ioctl_get_process_apertures_args) - -#define AMDKFD_IOC_UPDATE_QUEUE AMDKFD_IOW(0x07, struct kfd_ioctl_update_queue_args) - -#define AMDKFD_IOC_CREATE_EVENT AMDKFD_IOWR(0x08, struct kfd_ioctl_create_event_args) - -#define AMDKFD_IOC_DESTROY_EVENT AMDKFD_IOW(0x09, struct kfd_ioctl_destroy_event_args) - -#define AMDKFD_IOC_SET_EVENT AMDKFD_IOW(0x0A, struct kfd_ioctl_set_event_args) - -#define AMDKFD_IOC_RESET_EVENT AMDKFD_IOW(0x0B, struct kfd_ioctl_reset_event_args) - -#define AMDKFD_IOC_WAIT_EVENTS AMDKFD_IOWR(0x0C, struct kfd_ioctl_wait_events_args) - -#define AMDKFD_IOC_DBG_REGISTER_DEPRECATED AMDKFD_IOW(0x0D, struct kfd_ioctl_dbg_register_args) - -#define AMDKFD_IOC_DBG_UNREGISTER_DEPRECATED AMDKFD_IOW(0x0E, struct kfd_ioctl_dbg_unregister_args) - -#define AMDKFD_IOC_DBG_ADDRESS_WATCH_DEPRECATED AMDKFD_IOW(0x0F, struct kfd_ioctl_dbg_address_watch_args) - -#define AMDKFD_IOC_DBG_WAVE_CONTROL_DEPRECATED AMDKFD_IOW(0x10, struct kfd_ioctl_dbg_wave_control_args) - -#define AMDKFD_IOC_SET_SCRATCH_BACKING_VA AMDKFD_IOWR(0x11, struct 
kfd_ioctl_set_scratch_backing_va_args) - -#define AMDKFD_IOC_GET_TILE_CONFIG AMDKFD_IOWR(0x12, struct kfd_ioctl_get_tile_config_args) - -#define AMDKFD_IOC_SET_TRAP_HANDLER AMDKFD_IOW(0x13, struct kfd_ioctl_set_trap_handler_args) - -#define AMDKFD_IOC_GET_PROCESS_APERTURES_NEW AMDKFD_IOWR(0x14, struct kfd_ioctl_get_process_apertures_new_args) - -#define AMDKFD_IOC_ACQUIRE_VM AMDKFD_IOW(0x15, struct kfd_ioctl_acquire_vm_args) - -#define AMDKFD_IOC_ALLOC_MEMORY_OF_GPU AMDKFD_IOWR(0x16, struct kfd_ioctl_alloc_memory_of_gpu_args) - -#define AMDKFD_IOC_FREE_MEMORY_OF_GPU AMDKFD_IOW(0x17, struct kfd_ioctl_free_memory_of_gpu_args) - -#define AMDKFD_IOC_MAP_MEMORY_TO_GPU AMDKFD_IOWR(0x18, struct kfd_ioctl_map_memory_to_gpu_args) - -#define AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU AMDKFD_IOWR(0x19, struct kfd_ioctl_unmap_memory_from_gpu_args) - -#define AMDKFD_IOC_SET_CU_MASK AMDKFD_IOW(0x1A, struct kfd_ioctl_set_cu_mask_args) - -#define AMDKFD_IOC_GET_QUEUE_WAVE_STATE AMDKFD_IOWR(0x1B, struct kfd_ioctl_get_queue_wave_state_args) - -#define AMDKFD_IOC_GET_DMABUF_INFO AMDKFD_IOWR(0x1C, struct kfd_ioctl_get_dmabuf_info_args) - -#define AMDKFD_IOC_IMPORT_DMABUF AMDKFD_IOWR(0x1D, struct kfd_ioctl_import_dmabuf_args) - -#define AMDKFD_IOC_ALLOC_QUEUE_GWS AMDKFD_IOWR(0x1E, struct kfd_ioctl_alloc_queue_gws_args) - -#define AMDKFD_IOC_SMI_EVENTS AMDKFD_IOWR(0x1F, struct kfd_ioctl_smi_events_args) - -#define AMDKFD_IOC_SVM AMDKFD_IOWR(0x20, struct kfd_ioctl_svm_args) - -#define AMDKFD_IOC_SET_XNACK_MODE AMDKFD_IOWR(0x21, struct kfd_ioctl_set_xnack_mode_args) - -#define AMDKFD_IOC_CRIU_OP AMDKFD_IOWR(0x22, struct kfd_ioctl_criu_args) - -#define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x23 - -#endif diff --git a/plugins/amdgpu/tests/test_topology_remap.c b/plugins/amdgpu/tests/test_topology_remap.c deleted file mode 100644 index 69c2a4827..000000000 --- a/plugins/amdgpu/tests/test_topology_remap.c +++ /dev/null @@ -1,1119 +0,0 @@ 
-/************************************************************************************************** - * GPU groups remapping unit tests - * - * Test cases for GPU topology group remapping when there are P2P iolinks between the GPUs. GPUs are - * considered to be grouped when they are connected via a XGMI bridge. - * - * When a GPU has large BAR enabled and its full address space can be accessed by another GPU (i.e - * its address is within 40/44/48-bits address limitation of the other GPU), then - * the other GPU will have an P2P-PCIe to this GPU. - * - * When GPUs have large BAR, but one GPU cannot address the full address another, then there is no - * iolink-PCIe from this GPU to the other GPU. This GPU still has to be remapped to a GPU with - * large BAR on restore. The other GPU could still have an iolink-PCIe to this GPU. This would - * result in a uni-directional P2P-PCIe. - * - * - * In general, the GPU ID's have the following format to ease debugging: - * WXYZ where: - * W = A in the source topology, B in the destination topology - * X = Unused - * Y = Hive number - * Z = GPU number - * - * e.g A017 = GPU in source topology, Hive number 1 (2nd hive), GPU number 7 (8th GPU) - * - * - * Test 0: 8 GPUs in 2 XGMI hives (full P2P-PCIe) - * 2 XGMI hives of 4 GPUs - * Each hives have different type of GPUs - * All 8 GPUs have P2P-PCIe links - * 2 CPU's - each cpu can access alternate GPUs - * - * Src Topology: - * Hive-0 has 4 GPUs device-id 0xD000 - * Hive-1 has 4 GPUs device-id 0xD001 - * - * Dest Topology: - * Hive-0 has 4 GPUs device-id 0xD001 - * Hive-1 has 4 GPUs device-id 0xD000 - * - * EXPECT: SUCCESS - * GPUs in Hive-0 in Src Topology should be mapped to GPUs in Hive-1 in Dest Topology and - * GPUs in Hive-1 in Src Topology should be mapped to GPUs in Hive-0. So that the device-id's - * match. 
- * - * - * Test 1: 8 GPUs in 2 XGMI hives (partial P2P-PCIe case 1) - * 2 XGMI hives of 4 GPUs - * All 8 GPUs have the same device-id - * - * Src Topology: - * Hive-0 has 4 GPUs - * Hive-1 has 4 GPUs - * GPUs 0, 1, 2, 3, 4, 5 have P2P-PCIe links - * GPUs 7, 8 do not have P2P-PCIe links - * - * Dest Topology: - * Same as Src Topology - * - * EXPECT: SUCCESS - * - * - * Test 2: 8 GPUs in 2 XGMI hives (partial P2P-PCIe case 1) - * Same as test 1 but each hive have different device-id's for each hive. - * - * Src Topology: - * Hive-0 has 4 GPUs with device-id 0xD000 - * Hive-1 has 4 GPUs with device-id 0xD001 - * GPUs 0, 1, 2, 3, 4, 5 have P2P-PCIe links - * GPUs 7, 8 do not have P2P-PCIe links - * - * Dest Topology: - * Hive-0 has 4 GPUs with device-id 0xD001 - * Hive-1 has 4 GPUs with device-id 0xD000 - * GPUs 0, 1, 2, 3, 4, 5 have P2P-PCIe links - * GPUs 7, 8 do not have P2P-PCIe links - * - * EXPECT: FAIL - * It is not possible to map the GPUs because Src:Hive-0 would have to be mapped to - * Dest:Hive-1 to be able to match the P2P-PCIe links, but Src:Hive-0 and Dest:Hive-1 have - * different device-id's. - * - * - * Test 3:8 GPUs in 2 XGMI hives (partial P2P-PCIe case 2) - * 2 XGMI hives of 4 GPUs - * 1 GPU in each hive has a P2P-PCIe link but at different indexes within the hive. - * - * Src Topology: - * Hive-0: Has 4 GPUs with device-id 0xD000. - * Only GPU-2(A002) has bi-directional P2P-PCIe link. - * - * Hive-1: Has 4 GPUs with device-id 0xD001. - * Only GPU-7(A017) has bi-directional P2P-PCIe link. - * - * Dest Topology: - * Hive-0: Has 4 GPUs with device-id 0xD000. - * Only GPU-0(B000) has bi-directional P2P-PCIe link. - * - * Hive-1: Has 4 GPUs with device-id 0xD001. - * Only GPU-5(B015) has bi-directional P2P-PCIe link. - * - * - * EXPECT: SUCCESS - * Only possible map for A002 is B000 because B000 is the only dest GPU with device-id 0xD000 - * that has a P2P-PCIe link. 
- * Only possible map for A017 is B015 because B015 is the only dest GPU with device-id 0xD001 - * that has a P2P-PCIe link. - * - * - * Test 4:8 GPU's in 2 XGMI hives (partial P2P-PCIe case 2) - * Similar to Test 3 but not possible to map because one CPU iolink is uni-directional - * - * Src Topology: - * Hive-0: Has 4 GPUs with device-id 0xD000. - * Only GPU-2(A002) has bi-directional P2P-PCIe link. - * - * Hive-1: Has 4 GPUs with device-id 0xD001. - * Only GPU-7(A017) has bi-directional P2P-PCIe link. - * - * Dest Topology: - * Hive-0: Has 4 GPUs with device-id 0xD000. - * Only GPU-0(B000) has bi-directional P2P-PCIe link. - * - * Hive-1: Has 4 GPUs with device-id 0xD001. - * Only GPU-5(B014) has uni-directional P2P-PCIe link. - * - * EXPECT: FAIL - * - * - * Test 5: 8 GPUs with 1 XGMI hive - * 4 GPUs in 1 XGMI hive, 4 GPUs have no XGMI bridge. Tests combination of XGMI and non-XGMI. - * All 8 GPUs have P2P-PCIe links - * - * Src Topology: - * Hive-0: Has 4 GPUs - * 4 Other GPUs are not part of a XGMI hive - * - * Dest Topology: - * Same as src topology - * - * EXPECT: SUCCESS - * - * - * Test 6: 5 GPUs (mix and match GPU types and partial P2P-PCIe links) - * No XGMI bridges - * First 4 GPUs have P2P-PCIe links - * 1 GPU has different device-id's at different locations - * - * Src Topology: - * GPU-0, GPU-1, GPU-3 has device-id 0xD000 - * GPU-2, GPU-4 has device-id 0xD001 - * - * Dest Topology: - * GPU-0, GPU-2, GPU-3 has device-id 0xD000 - * GPU-1, GPU-4 has device-id 0xD001 - * - * EXPECT: SUCCESS - * Mapping needs to be able to map A001->B002 and A002->B001. 
- * - * - ************************************************************************************************** - * Tests where restore node is more capable than checkpointed node - * In the following tests, the destination topology is more P2P-links than the source topology, so - * the mapping should succeed, even though the user application will probably not be able to take - * advantage of the extra links. - * - * Test 7: 8 GPUs (ignore XGMI bridge on restore node) - * - * Src Topology: - * Hive-0: GPU-0, GPU-1, GPU-2, GPU-3 - * No XGMI bridge: GPU-4, GPU-5, GPU-6, GPU-7 - * - * Dest Topology: - * Hive-0: GPU-0, GPU-1, GPU-2, GPU-3 - * Hive-1: GPU-4, GPU-5, GPU-6, GPU-7 - * - * EXPECT: SUCCESS - * Mapping should succeed, because destination GPUs GPU-4, GPU-5, GPU-6, GPU-7 are more - * capable than source GPUs GPU-4, GPU-5, GPU-6, GPU-7. - * User application will probably not take advantage of XGMI links in Hive-1. - * - * - * Test 8: 5 GPUs (4 GPUs with P2P-PCIe links vs 3 GPUs with P2P-PCIe links) - * - * Src Topology: - * GPU-0, GPU-1, GPU-2 have bi-directional links - * GPU-3, GPU-4 have unidirectional links - * - * Dest Topology: - * GPU-0, GPU-1, GPU-2, GPU-3 have bi-directional links - * GPU-4 have unidirectional links - * - * EXPECT: SUCCESS - * Mapping should succeed because destination GPU-3 can replace source GPU-3. - * - **************************************************************************************************/ - -#include -#include -#include -#include -#include -#include "common/list.h" - -#include "amdgpu_plugin_topology.h" - -#define pr_err(...) fprintf(stdout, "ERR:" __VA_ARGS__) -#define pr_info(...) fprintf(stdout, "INFO:" __VA_ARGS__) -#define pr_debug(...) fprintf(stdout, "DBG:" __VA_ARGS__) - -int verify_maps(const struct device_maps *maps, uint32_t num_cpus, uint32_t num_gpus) -{ - struct id_map *map; - - /* TODO: This merely checks that all nodes have been mapped. 
We should add individual - * verification functions for each test to tests the mappings are correct - */ - - list_for_each_entry(map, &maps->cpu_maps, listm) { - if (num_cpus-- == 0) { - pr_err("Results had more mappings than number of CPUs\n"); - return -EINVAL; - } - } - if (num_cpus > 0) { - pr_err("Results did not map all CPUs\n"); - return -EINVAL; - } - - list_for_each_entry(map, &maps->gpu_maps, listm) { - if (num_gpus-- == 0) { - pr_err("Results had more mappings than number of GPUs\n"); - return -EINVAL; - } - } - if (num_gpus > 0) { - pr_err("Results did not map all GPUs\n"); - return -EINVAL; - } - return 0; -} - -int test_0(void) -{ - int ret = 0; - struct device_maps maps; - - struct tp_node *node_gpus[8]; - struct tp_node *node_cpus[2]; - - struct tp_system tp_src = { 0 }; - struct tp_system tp_dest = { 0 }; - - /* Fill src struct */ - topology_init(&tp_src); - tp_src.parsed = true; - - node_cpus[0] = sys_add_node(&tp_src, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - node_cpus[1] = sys_add_node(&tp_src, 1, 0); - node_cpus[1]->cpu_cores_count = 1; - - for (int i = 0; i < 4; i++) { - node_gpus[i] = sys_add_node(&tp_src, i + 2, 0xA000 + i); - node_gpus[i]->device_id = 0xD000; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - node_add_iolink(node_cpus[i & 1], TOPO_IOLINK_TYPE_PCIE, i + 2); - } - for (int i = 0; i < 4; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 2); - for (int j = 6; j < 10; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, j); - } - - for (int i = 4; i < 8; i++) { - node_gpus[i] = sys_add_node(&tp_src, i + 2, 0xA010 + i); - node_gpus[i]->device_id = 0xD001; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - node_add_iolink(node_cpus[i & 1], TOPO_IOLINK_TYPE_PCIE, i + 2); - } - for (int i = 4; i < 8; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 6); - for (int j = 2; j < 6; 
j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, j); - } - - /* Fill dest struct */ - topology_init(&tp_dest); - tp_dest.parsed = true; - - node_cpus[0] = sys_add_node(&tp_dest, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - node_cpus[1] = sys_add_node(&tp_dest, 1, 0); - node_cpus[1]->cpu_cores_count = 1; - - for (int i = 0; i < 4; i++) { - node_gpus[i] = sys_add_node(&tp_dest, i + 2, 0xB000 + i); - node_gpus[i]->device_id = 0xD001; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - node_add_iolink(node_cpus[i & 1], TOPO_IOLINK_TYPE_PCIE, i + 2); - } - for (int i = 0; i < 4; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 2); - for (int j = 6; j < 10; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, j); - } - - for (int i = 4; i < 8; i++) { - node_gpus[i] = sys_add_node(&tp_dest, i + 2, 0xB010 + i); - node_gpus[i]->device_id = 0xD000; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - node_add_iolink(node_cpus[i & 1], TOPO_IOLINK_TYPE_PCIE, i + 2); - } - for (int i = 4; i < 8; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 6); - for (int j = 2; j < 6; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, j); - } - - ret = set_restore_gpu_maps(&tp_src, &tp_dest, &maps); - if (!ret) { - if (verify_maps(&maps, ARRAY_SIZE(node_cpus), ARRAY_SIZE(node_gpus))) { - pr_err("Mapping returned success, but results had errors\n"); - ret = -1; - } - } - topology_free(&tp_src); - topology_free(&tp_dest); - maps_free(&maps); - return ret; -} - -int test_1(void) -{ - int ret = 0; - struct device_maps maps; - - struct tp_node *node_gpus[8]; - struct tp_node *node_cpus[2]; - - struct tp_system tp_src = { 0 }; - struct tp_system tp_dest = { 0 }; - - /* Fill src struct */ - topology_init(&tp_src); - tp_src.parsed = true; - - node_cpus[0] = sys_add_node(&tp_src, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - 
node_cpus[1] = sys_add_node(&tp_src, 1, 0); - node_cpus[1]->cpu_cores_count = 1; - - for (int i = 0; i < 4; i++) { - node_gpus[i] = sys_add_node(&tp_src, i + 2, 0xA000 + i); - node_gpus[i]->device_id = 0xD000; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - node_add_iolink(node_cpus[i & 1], TOPO_IOLINK_TYPE_PCIE, i + 2); - } - for (int i = 0; i < 4; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 2); - for (int j = 6; j < 8; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, j); - } - - for (int i = 4; i < 8; i++) { - node_gpus[i] = sys_add_node(&tp_src, i + 2, 0xA010 + i); - node_gpus[i]->device_id = 0xD000; - - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - if (i < 6) - node_add_iolink(node_cpus[i & 1], TOPO_IOLINK_TYPE_PCIE, i + 2); - } - for (int i = 4; i < 8; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 6); - - for (int j = 2; j < 6; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, j); - } - - /* Fill dest struct */ - topology_init(&tp_dest); - tp_dest.parsed = true; - - node_cpus[0] = sys_add_node(&tp_dest, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - node_cpus[1] = sys_add_node(&tp_dest, 1, 0); - node_cpus[1]->cpu_cores_count = 1; - - for (int i = 0; i < 4; i++) { - node_gpus[i] = sys_add_node(&tp_dest, i + 2, 0xB000 + i); - node_gpus[i]->device_id = 0xD000; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - node_add_iolink(node_cpus[i & 1], TOPO_IOLINK_TYPE_PCIE, i + 2); - } - for (int i = 0; i < 4; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 2); - for (int j = 6; j < 8; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, j); - } - - for (int i = 4; i < 8; i++) { - node_gpus[i] = sys_add_node(&tp_dest, i + 2, 0xB010 + i); - node_gpus[i]->device_id = 0xD000; - - node_add_iolink(node_gpus[i], 
TOPO_IOLINK_TYPE_PCIE, i & 1); - if (i < 6) - node_add_iolink(node_cpus[i & 1], TOPO_IOLINK_TYPE_PCIE, i + 2); - } - for (int i = 4; i < 8; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 6); - for (int j = 2; j < 6; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, j); - } - - ret = set_restore_gpu_maps(&tp_src, &tp_dest, &maps); - if (!ret) { - if (verify_maps(&maps, ARRAY_SIZE(node_cpus), ARRAY_SIZE(node_gpus))) { - pr_err("Mapping returned success, but results had errors\n"); - ret = -1; - } - } - topology_free(&tp_src); - topology_free(&tp_dest); - maps_free(&maps); - return ret; -} - -int test_2(void) -{ - int ret = 0; - struct device_maps maps; - - struct tp_node *node_gpus[8]; - struct tp_node *node_cpus[2]; - - struct tp_system tp_src = { 0 }; - struct tp_system tp_dest = { 0 }; - - /* Fill src struct */ - topology_init(&tp_src); - tp_src.parsed = true; - - node_cpus[0] = sys_add_node(&tp_src, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - node_cpus[1] = sys_add_node(&tp_src, 1, 0); - node_cpus[1]->cpu_cores_count = 1; - - for (int i = 0; i < 4; i++) { - node_gpus[i] = sys_add_node(&tp_src, i + 2, 0xA000 + i); - node_gpus[i]->device_id = 0xD000; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - node_add_iolink(node_cpus[i & 1], TOPO_IOLINK_TYPE_PCIE, i + 2); - } - for (int i = 0; i < 4; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 2); - for (int j = 6; j < 8; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, j); - } - - for (int i = 4; i < 8; i++) { - node_gpus[i] = sys_add_node(&tp_src, i + 2, 0xA010 + i); - node_gpus[i]->device_id = 0xD001; - - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - if (i < 6) - node_add_iolink(node_cpus[i & 1], TOPO_IOLINK_TYPE_PCIE, i + 2); - } - for (int i = 4; i < 8; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], 
TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 6); - for (int j = 2; j < 6; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, j); - } - - /* Fill dest struct */ - topology_init(&tp_dest); - tp_dest.parsed = true; - - node_cpus[0] = sys_add_node(&tp_dest, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - node_cpus[1] = sys_add_node(&tp_dest, 1, 0); - node_cpus[1]->cpu_cores_count = 1; - - for (int i = 0; i < 4; i++) { - node_gpus[i] = sys_add_node(&tp_dest, i + 2, 0xB000 + i); - node_gpus[i]->device_id = 0xD001; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - node_add_iolink(node_cpus[i & 1], TOPO_IOLINK_TYPE_PCIE, i + 2); - } - for (int i = 0; i < 4; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 2); - for (int j = 6; j < 8; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, j); - } - - for (int i = 4; i < 8; i++) { - node_gpus[i] = sys_add_node(&tp_dest, i + 2, 0xB010 + i); - node_gpus[i]->device_id = 0xD000; - - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - if (i < 6) - node_add_iolink(node_cpus[i & 1], TOPO_IOLINK_TYPE_PCIE, i + 2); - } - for (int i = 4; i < 8; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 6); - for (int j = 2; j < 6; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, j); - } - - ret = set_restore_gpu_maps(&tp_src, &tp_dest, &maps); - if (!ret) { - if (verify_maps(&maps, ARRAY_SIZE(node_cpus), ARRAY_SIZE(node_gpus))) { - pr_err("Mapping returned success, but results had errors\n"); - ret = -1; - } - } - topology_free(&tp_src); - topology_free(&tp_dest); - maps_free(&maps); - return ret; -} - -int test_3(void) -{ - int ret = 0; - struct device_maps maps; - - struct tp_node *node_gpus[8]; - struct tp_node *node_cpus[2]; - - struct tp_system tp_src = { 0 }; - struct tp_system tp_dest = { 0 }; - - /* Fill src struct */ - topology_init(&tp_src); - tp_src.parsed = true; - - 
node_cpus[0] = sys_add_node(&tp_src, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - node_cpus[1] = sys_add_node(&tp_src, 1, 0); - node_cpus[1]->cpu_cores_count = 1; - - for (int i = 0; i < 4; i++) { - node_gpus[i] = sys_add_node(&tp_src, i + 2, 0xA000 + i); - node_gpus[i]->device_id = 0xD000; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - } - for (int i = 0; i < 4; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 2); - } - - for (int i = 4; i < 8; i++) { - node_gpus[i] = sys_add_node(&tp_src, i + 2, 0xA010 + i); - node_gpus[i]->device_id = 0xD001; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - } - for (int i = 4; i < 8; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 6); - } - - node_add_iolink(node_cpus[0], TOPO_IOLINK_TYPE_PCIE, 4); - node_add_iolink(node_cpus[1], TOPO_IOLINK_TYPE_PCIE, 9); - - /* Fill dest struct */ - topology_init(&tp_dest); - tp_dest.parsed = true; - - node_cpus[0] = sys_add_node(&tp_dest, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - node_cpus[1] = sys_add_node(&tp_dest, 1, 0); - node_cpus[1]->cpu_cores_count = 1; - - for (int i = 0; i < 4; i++) { - node_gpus[i] = sys_add_node(&tp_dest, i + 2, 0xB000 + i); - node_gpus[i]->device_id = 0xD000; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - } - for (int i = 0; i < 4; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 2); - } - - for (int i = 4; i < 8; i++) { - node_gpus[i] = sys_add_node(&tp_dest, i + 2, 0xB010 + i); - node_gpus[i]->device_id = 0xD001; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - } - for (int i = 4; i < 8; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 6); - } - - node_add_iolink(node_cpus[0], TOPO_IOLINK_TYPE_PCIE, 2); - node_add_iolink(node_cpus[1], TOPO_IOLINK_TYPE_PCIE, 
7); - - ret = set_restore_gpu_maps(&tp_src, &tp_dest, &maps); - if (!ret) { - if (verify_maps(&maps, ARRAY_SIZE(node_cpus), ARRAY_SIZE(node_gpus))) { - pr_err("Mapping returned success, but results had errors\n"); - ret = -1; - } - } - topology_free(&tp_src); - topology_free(&tp_dest); - maps_free(&maps); - return ret; -} - -int test_4(void) -{ - int ret = 0; - struct device_maps maps; - - struct tp_node *node_gpus[8]; - struct tp_node *node_cpus[2]; - - struct tp_system tp_src = { 0 }; - struct tp_system tp_dest = { 0 }; - - /* Fill src struct */ - topology_init(&tp_src); - tp_src.parsed = true; - - node_cpus[0] = sys_add_node(&tp_src, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - node_cpus[1] = sys_add_node(&tp_src, 1, 0); - node_cpus[1]->cpu_cores_count = 1; - - for (int i = 0; i < 4; i++) { - node_gpus[i] = sys_add_node(&tp_src, i + 2, 0xA000 + i); - node_gpus[i]->device_id = 0xD000; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - } - for (int i = 0; i < 4; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 2); - } - - for (int i = 4; i < 8; i++) { - node_gpus[i] = sys_add_node(&tp_src, i + 2, 0xA010 + i); - node_gpus[i]->device_id = 0xD001; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - } - for (int i = 4; i < 8; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 6); - } - - node_add_iolink(node_cpus[0], TOPO_IOLINK_TYPE_PCIE, 4); - node_add_iolink(node_cpus[1], TOPO_IOLINK_TYPE_PCIE, 9); - - /* Fill dest struct */ - topology_init(&tp_dest); - tp_dest.parsed = true; - - node_cpus[0] = sys_add_node(&tp_dest, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - node_cpus[1] = sys_add_node(&tp_dest, 1, 0); - node_cpus[1]->cpu_cores_count = 1; - - for (int i = 0; i < 4; i++) { - node_gpus[i] = sys_add_node(&tp_dest, i + 2, 0xB000 + i); - node_gpus[i]->device_id = 0xD000; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, 
i & 1); - } - for (int i = 0; i < 4; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 2); - } - - for (int i = 4; i < 8; i++) { - node_gpus[i] = sys_add_node(&tp_dest, i + 2, 0xB010 + i); - node_gpus[i]->device_id = 0xD001; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - } - for (int i = 4; i < 8; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 6); - } - - node_add_iolink(node_cpus[0], TOPO_IOLINK_TYPE_PCIE, 2); - node_add_iolink(node_cpus[1], TOPO_IOLINK_TYPE_PCIE, 6); - - ret = set_restore_gpu_maps(&tp_src, &tp_dest, &maps); - if (!ret) { - if (verify_maps(&maps, ARRAY_SIZE(node_cpus), ARRAY_SIZE(node_gpus))) { - pr_err("Mapping returned success, but results had errors\n"); - ret = -1; - } - } - topology_free(&tp_src); - topology_free(&tp_dest); - maps_free(&maps); - return ret; -} - -int test_5(void) -{ - int ret = 0; - struct device_maps maps; - - struct tp_node *node_gpus[8]; - struct tp_node *node_cpus[2]; - - struct tp_system tp_src = { 0 }; - struct tp_system tp_dest = { 0 }; - - /* Fill src struct */ - topology_init(&tp_src); - tp_src.parsed = true; - - node_cpus[0] = sys_add_node(&tp_src, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - node_cpus[1] = sys_add_node(&tp_src, 1, 0); - node_cpus[1]->cpu_cores_count = 1; - - for (int i = 0; i < 4; i++) { - node_gpus[i] = sys_add_node(&tp_src, i + 2, 0xA000 + i); - node_gpus[i]->device_id = 0xD000; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - node_add_iolink(node_cpus[i & 1], TOPO_IOLINK_TYPE_PCIE, i + 2); - } - for (int i = 0; i < 4; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 2); - for (int j = 6; j < 10; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, j); - } - - for (int i = 4; i < 8; i++) { - node_gpus[i] = sys_add_node(&tp_src, i + 2, 0xA010 + i); - node_gpus[i]->device_id = 
0xD000; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - node_add_iolink(node_cpus[i & 1], TOPO_IOLINK_TYPE_PCIE, i + 2); - } - for (int i = 4; i < 8; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, ((i + j) % 4) + 6); - for (int j = 2; j < 6; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, j); - } - - /* Fill dest struct */ - topology_init(&tp_dest); - tp_dest.parsed = true; - - node_cpus[0] = sys_add_node(&tp_dest, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - node_cpus[1] = sys_add_node(&tp_dest, 1, 0); - node_cpus[1]->cpu_cores_count = 1; - - for (int i = 0; i < 4; i++) { - node_gpus[i] = sys_add_node(&tp_dest, i + 2, 0xB000 + i); - node_gpus[i]->device_id = 0xD000; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - node_add_iolink(node_cpus[i & 1], TOPO_IOLINK_TYPE_PCIE, i + 2); - } - for (int i = 0; i < 4; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 2); - for (int j = 6; j < 10; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, j); - } - - for (int i = 4; i < 8; i++) { - node_gpus[i] = sys_add_node(&tp_dest, i + 2, 0xB010 + i); - node_gpus[i]->device_id = 0xD000; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - node_add_iolink(node_cpus[i & 1], TOPO_IOLINK_TYPE_PCIE, i + 2); - } - for (int i = 4; i < 8; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, ((i + j) % 4) + 6); - for (int j = 2; j < 6; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, j); - } - - ret = set_restore_gpu_maps(&tp_src, &tp_dest, &maps); - if (!ret) { - if (verify_maps(&maps, ARRAY_SIZE(node_cpus), ARRAY_SIZE(node_gpus))) { - pr_err("Mapping returned success, but results had errors\n"); - ret = -1; - } - } - topology_free(&tp_src); - topology_free(&tp_dest); - maps_free(&maps); - return ret; -} - -int test_6(void) -{ - int ret = 0; - struct device_maps maps; - - 
struct tp_node *node_gpus[5]; - struct tp_node *node_cpus[1]; - - struct tp_system tp_src = { 0 }; - struct tp_system tp_dest = { 0 }; - - /* Fill src struct */ - topology_init(&tp_src); - tp_src.parsed = true; - - node_cpus[0] = sys_add_node(&tp_src, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - - for (int i = 0; i < 5; i++) { - node_gpus[i] = sys_add_node(&tp_src, i + 1, 0xA000 + i); - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, 0); - node_add_iolink(node_cpus[0], TOPO_IOLINK_TYPE_PCIE, i + 1); - } - node_gpus[0]->device_id = 0xD000; - node_gpus[1]->device_id = 0xD000; - node_gpus[2]->device_id = 0xD001; - node_gpus[3]->device_id = 0xD000; - node_gpus[4]->device_id = 0xD001; - - node_add_iolink(node_gpus[0], TOPO_IOLINK_TYPE_PCIE, 2); - node_add_iolink(node_gpus[0], TOPO_IOLINK_TYPE_PCIE, 3); - node_add_iolink(node_gpus[0], TOPO_IOLINK_TYPE_PCIE, 4); - - node_add_iolink(node_gpus[1], TOPO_IOLINK_TYPE_PCIE, 1); - node_add_iolink(node_gpus[1], TOPO_IOLINK_TYPE_PCIE, 3); - node_add_iolink(node_gpus[1], TOPO_IOLINK_TYPE_PCIE, 4); - - node_add_iolink(node_gpus[2], TOPO_IOLINK_TYPE_PCIE, 1); - node_add_iolink(node_gpus[2], TOPO_IOLINK_TYPE_PCIE, 2); - node_add_iolink(node_gpus[2], TOPO_IOLINK_TYPE_PCIE, 4); - - node_add_iolink(node_gpus[3], TOPO_IOLINK_TYPE_PCIE, 1); - node_add_iolink(node_gpus[3], TOPO_IOLINK_TYPE_PCIE, 2); - node_add_iolink(node_gpus[3], TOPO_IOLINK_TYPE_PCIE, 3); - - node_add_iolink(node_gpus[4], TOPO_IOLINK_TYPE_PCIE, 1); - node_add_iolink(node_gpus[4], TOPO_IOLINK_TYPE_PCIE, 2); - node_add_iolink(node_gpus[4], TOPO_IOLINK_TYPE_PCIE, 3); - node_add_iolink(node_gpus[4], TOPO_IOLINK_TYPE_PCIE, 4); - - /* Fill dest struct */ - topology_init(&tp_dest); - tp_dest.parsed = true; - - node_cpus[0] = sys_add_node(&tp_dest, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - - for (int i = 0; i < 5; i++) { - node_gpus[i] = sys_add_node(&tp_dest, i + 1, 0xB000 + i); - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, 0); - node_add_iolink(node_cpus[0], 
TOPO_IOLINK_TYPE_PCIE, i + 1); - } - node_gpus[0]->device_id = 0xD000; - node_gpus[1]->device_id = 0xD001; - node_gpus[2]->device_id = 0xD000; - node_gpus[3]->device_id = 0xD000; - node_gpus[4]->device_id = 0xD001; - - node_add_iolink(node_gpus[0], TOPO_IOLINK_TYPE_PCIE, 2); - node_add_iolink(node_gpus[0], TOPO_IOLINK_TYPE_PCIE, 3); - node_add_iolink(node_gpus[0], TOPO_IOLINK_TYPE_PCIE, 4); - - node_add_iolink(node_gpus[1], TOPO_IOLINK_TYPE_PCIE, 1); - node_add_iolink(node_gpus[1], TOPO_IOLINK_TYPE_PCIE, 3); - node_add_iolink(node_gpus[1], TOPO_IOLINK_TYPE_PCIE, 4); - - node_add_iolink(node_gpus[2], TOPO_IOLINK_TYPE_PCIE, 1); - node_add_iolink(node_gpus[2], TOPO_IOLINK_TYPE_PCIE, 2); - node_add_iolink(node_gpus[2], TOPO_IOLINK_TYPE_PCIE, 4); - - node_add_iolink(node_gpus[3], TOPO_IOLINK_TYPE_PCIE, 1); - node_add_iolink(node_gpus[3], TOPO_IOLINK_TYPE_PCIE, 2); - node_add_iolink(node_gpus[3], TOPO_IOLINK_TYPE_PCIE, 3); - - node_add_iolink(node_gpus[4], TOPO_IOLINK_TYPE_PCIE, 1); - node_add_iolink(node_gpus[4], TOPO_IOLINK_TYPE_PCIE, 2); - node_add_iolink(node_gpus[4], TOPO_IOLINK_TYPE_PCIE, 3); - node_add_iolink(node_gpus[4], TOPO_IOLINK_TYPE_PCIE, 4); - - ret = set_restore_gpu_maps(&tp_src, &tp_dest, &maps); - if (!ret) { - if (verify_maps(&maps, ARRAY_SIZE(node_cpus), ARRAY_SIZE(node_gpus))) { - pr_err("Mapping returned success, but results had errors\n"); - ret = -1; - } - } - topology_free(&tp_src); - topology_free(&tp_dest); - maps_free(&maps); - return ret; -} - -int test_7(void) -{ - int ret = 0; - struct device_maps maps; - - struct tp_node *node_gpus[8]; - struct tp_node *node_cpus[2]; - - struct tp_system tp_src = { 0 }; - struct tp_system tp_dest = { 0 }; - - /* Fill src struct */ - topology_init(&tp_src); - tp_src.parsed = true; - - node_cpus[0] = sys_add_node(&tp_src, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - node_cpus[1] = sys_add_node(&tp_src, 1, 0); - node_cpus[1]->cpu_cores_count = 1; - - for (int i = 0; i < 4; i++) { - node_gpus[i] = 
sys_add_node(&tp_src, i + 2, 0xA000 + i); - node_gpus[i]->device_id = 0xD000; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - } - for (int i = 0; i < 4; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 2); - } - - for (int i = 4; i < 8; i++) { - node_gpus[i] = sys_add_node(&tp_src, i + 2, 0xA010 + i); - node_gpus[i]->device_id = 0xD000; - } - - /* Fill dest struct */ - topology_init(&tp_dest); - tp_dest.parsed = true; - - node_cpus[0] = sys_add_node(&tp_dest, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - node_cpus[1] = sys_add_node(&tp_dest, 1, 0); - node_cpus[1]->cpu_cores_count = 1; - - for (int i = 0; i < 4; i++) { - node_gpus[i] = sys_add_node(&tp_dest, i + 2, 0xB000 + i); - node_gpus[i]->device_id = 0xD000; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, i & 1); - } - for (int i = 0; i < 4; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 2); - } - - for (int i = 4; i < 8; i++) { - node_gpus[i] = sys_add_node(&tp_dest, i + 2, 0xB010 + i); - node_gpus[i]->device_id = 0xD000; - } - for (int i = 4; i < 8; i++) { - for (int j = 1; j < 4; j++) - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_XGMI, ((i + j) % 4) + 6); - } - - ret = set_restore_gpu_maps(&tp_src, &tp_dest, &maps); - - if (!ret) { - if (verify_maps(&maps, ARRAY_SIZE(node_cpus), ARRAY_SIZE(node_gpus))) { - pr_err("Mapping returned success, but results had errors\n"); - ret = -1; - } - } - topology_free(&tp_src); - topology_free(&tp_dest); - maps_free(&maps); - return ret; -} - -int test_8(void) -{ - int ret = 0; - struct device_maps maps; - - struct tp_node *node_gpus[5]; - struct tp_node *node_cpus[1]; - - struct tp_system tp_src = { 0 }; - struct tp_system tp_dest = { 0 }; - - /* Fill src struct */ - topology_init(&tp_src); - tp_src.parsed = true; - - node_cpus[0] = sys_add_node(&tp_src, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - - for (int i = 0; i < 5; i++) 
{ - node_gpus[i] = sys_add_node(&tp_src, i + 1, 0xA000 + i); - node_gpus[i]->device_id = 0xD000; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, 0); - node_add_iolink(node_cpus[0], TOPO_IOLINK_TYPE_PCIE, i + 1); - } - - node_add_iolink(node_gpus[0], TOPO_IOLINK_TYPE_PCIE, 2); - node_add_iolink(node_gpus[0], TOPO_IOLINK_TYPE_PCIE, 3); - - node_add_iolink(node_gpus[1], TOPO_IOLINK_TYPE_PCIE, 1); - node_add_iolink(node_gpus[1], TOPO_IOLINK_TYPE_PCIE, 3); - - node_add_iolink(node_gpus[2], TOPO_IOLINK_TYPE_PCIE, 1); - node_add_iolink(node_gpus[2], TOPO_IOLINK_TYPE_PCIE, 2); - - node_add_iolink(node_gpus[3], TOPO_IOLINK_TYPE_PCIE, 1); - node_add_iolink(node_gpus[3], TOPO_IOLINK_TYPE_PCIE, 2); - node_add_iolink(node_gpus[3], TOPO_IOLINK_TYPE_PCIE, 3); - - node_add_iolink(node_gpus[4], TOPO_IOLINK_TYPE_PCIE, 1); - node_add_iolink(node_gpus[4], TOPO_IOLINK_TYPE_PCIE, 2); - node_add_iolink(node_gpus[4], TOPO_IOLINK_TYPE_PCIE, 3); - - /* Fill dest struct */ - topology_init(&tp_dest); - tp_dest.parsed = true; - - node_cpus[0] = sys_add_node(&tp_dest, 0, 0); - node_cpus[0]->cpu_cores_count = 1; - - for (int i = 0; i < 5; i++) { - node_gpus[i] = sys_add_node(&tp_dest, i + 1, 0xB000 + i); - node_gpus[i]->device_id = 0xD000; - node_add_iolink(node_gpus[i], TOPO_IOLINK_TYPE_PCIE, 0); - node_add_iolink(node_cpus[0], TOPO_IOLINK_TYPE_PCIE, i + 1); - } - - node_add_iolink(node_gpus[0], TOPO_IOLINK_TYPE_PCIE, 2); - node_add_iolink(node_gpus[0], TOPO_IOLINK_TYPE_PCIE, 3); - node_add_iolink(node_gpus[0], TOPO_IOLINK_TYPE_PCIE, 4); - - node_add_iolink(node_gpus[1], TOPO_IOLINK_TYPE_PCIE, 1); - node_add_iolink(node_gpus[1], TOPO_IOLINK_TYPE_PCIE, 3); - node_add_iolink(node_gpus[1], TOPO_IOLINK_TYPE_PCIE, 4); - - node_add_iolink(node_gpus[2], TOPO_IOLINK_TYPE_PCIE, 1); - node_add_iolink(node_gpus[2], TOPO_IOLINK_TYPE_PCIE, 2); - node_add_iolink(node_gpus[2], TOPO_IOLINK_TYPE_PCIE, 4); - - node_add_iolink(node_gpus[3], TOPO_IOLINK_TYPE_PCIE, 1); - node_add_iolink(node_gpus[3], 
TOPO_IOLINK_TYPE_PCIE, 2); - node_add_iolink(node_gpus[3], TOPO_IOLINK_TYPE_PCIE, 3); - - node_add_iolink(node_gpus[4], TOPO_IOLINK_TYPE_PCIE, 1); - node_add_iolink(node_gpus[4], TOPO_IOLINK_TYPE_PCIE, 2); - node_add_iolink(node_gpus[4], TOPO_IOLINK_TYPE_PCIE, 3); - node_add_iolink(node_gpus[4], TOPO_IOLINK_TYPE_PCIE, 4); - - ret = set_restore_gpu_maps(&tp_src, &tp_dest, &maps); - - if (!ret) { - if (verify_maps(&maps, ARRAY_SIZE(node_cpus), ARRAY_SIZE(node_gpus))) { - pr_err("Mapping returned success, but results had errors\n"); - ret = -1; - } - } - topology_free(&tp_src); - topology_free(&tp_dest); - maps_free(&maps); - return ret; -} - -struct test { - int (*test_func)(void); - bool success; /* true if we expect function to return 0 */ -}; - -int main(int argc, char **argv) -{ - int ret; - int result = 0; - - struct test tests[] = { - { test_0, true }, { test_1, true }, { test_2, false }, { test_3, true }, { test_4, false }, - { test_5, true }, { test_6, true }, { test_7, true }, { test_8, true }, - }; - - if (argc > 1) { - int run; - - if (sscanf(argv[1], "%d", &run) != 1 || (run >= ARRAY_SIZE(tests))) { - pr_err("Usage: test_topology_remap [test_number]\n"); - pr_err(" Test number range:0-%ld\n", ARRAY_SIZE(tests) - 1); - pr_err(" Return codes:\n"); - pr_err(" 0 All tests pass\n"); - pr_err(" 1 At least one test failed\n"); - pr_err(" 2 Invalid parameters\n"); - return 2; - } - pr_info("======================================================================\n"); - pr_info("Starting test %d\n", run); - ret = tests[run].test_func(); - pr_info("\n\nTest %d: %s\n", run, (!ret == tests[run].success) ? "PASS" : "FAILED"); - pr_info("======================================================================\n"); - return (!ret == tests[run].success) ? 
0 : 1; - } - - for (int i = 0; i < ARRAY_SIZE(tests); i++) { - pr_info("======================================================================\n"); - pr_info("Starting test %d\n", i); - ret = tests[i].test_func(); - pr_info("\n\nTest %d: %s\n", i, (!ret == tests[i].success) ? "PASS" : "FAILED"); - pr_info("======================================================================\n"); - if (!ret != tests[i].success) - result = 1; - } - return result; -} diff --git a/plugins/cuda/Makefile b/plugins/cuda/Makefile deleted file mode 100644 index 2c1944a34..000000000 --- a/plugins/cuda/Makefile +++ /dev/null @@ -1,40 +0,0 @@ -PLUGIN_NAME := cuda_plugin -PLUGIN_SOBJ := cuda_plugin.so - -DEPS_CUDA := $(PLUGIN_SOBJ) - -PLUGIN_INCLUDE := -iquote../../include -PLUGIN_INCLUDE += -iquote../../criu/include -PLUGIN_INCLUDE += -iquote../../criu/arch/$(ARCH)/include/ -PLUGIN_INCLUDE += -iquote../../ - -COMPEL := ../../compel/compel-host - -PLUGIN_CFLAGS := -g -Wall -Werror -shared -nostartfiles -fPIC - -__nmk_dir ?= ../../scripts/nmk/scripts/ -include $(__nmk_dir)msg.mk - -all: $(DEPS_CUDA) - -cuda_plugin.so: cuda_plugin.c - $(call msg-gen, $@) - $(Q) $(CC) $(PLUGIN_CFLAGS) $(DEFINES) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) - -clean: - $(call msg-clean, $@) - $(Q) $(RM) $(PLUGIN_SOBJ) -.PHONY: clean - -mrproper: clean - -install: - $(Q) mkdir -p $(DESTDIR)$(PLUGINDIR) - $(E) " INSTALL " $(PLUGIN_NAME) - $(Q) install -m 755 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR) -.PHONY: install - -uninstall: - $(E) " UNINSTALL" $(PLUGIN_NAME) - $(Q) $(RM) $(DESTDIR)$(PLUGINDIR)/$(PLUGIN_SOBJ) -.PHONY: uninstall diff --git a/plugins/cuda/README.md b/plugins/cuda/README.md deleted file mode 100644 index 7b91f6998..000000000 --- a/plugins/cuda/README.md +++ /dev/null @@ -1,59 +0,0 @@ -Checkpoint and Restore for CUDA applications with CRIU -====================================================== - -# Requirements -The cuda-checkpoint utility should be placed somewhere in 
your $PATH and an r555 -or higher GPU driver is required for CUDA CRIU integration support. - -## cuda-checkpoint -The cuda-checkpoint utility can be found at: -https://github.com/NVIDIA/cuda-checkpoint - -cuda-checkpoint is a binary utility used to issue checkpointing commands to CUDA -applications. Updating the cuda-checkpoint utility between driver releases -should not be necessary as the utility simply exposes some extra driver behavior -so driver updates are all that's needed to get access to newer features. - -# Checkpointing Procedure -cuda-checkpoint exposes 4 actions used in the checkpointing process: lock, -checkpoint, restore, unlock. - -* lock - Used with the PAUSE_DEVICES hook while a process is still running to - quiesce the application into a state where it can be checkpointed -* checkpoint - Used with the CHECKPOINT_DEVICES hook once a process has been - seized/frozen to perform the actual checkpointing operation -* restore/unlock - Used with the RESUME_DEVICES_LATE hook to restore the CUDA - state and release the process back to it's running state - -These actions are facilitated by a CUDA checkpoint+restore thread that the CUDA -plugin will re-wake when needed. - -# Known Limitations -* Currently GPU memory contents are brought into main system memory and CRIU - then checkpoints that as part of the normal procedure. On systems with many - GPU's with high GPU memory usage this can cause memory thrashing. A future - CUDA release will add support for dumping the memory contents to files to - alleviate this as well as support in the CRIU plugin. -* There's currently a small race between when a PAUSE_DEVICES hook is called on - a running process and a process calls cuInit() and finishes initializing CUDA - after the PAUSE is issued but before the process is frozen to checkpoint. 
This - will cause cuda-checkpoint to report that the process is in an illegal state - for checkpointing and it's recommended to just attempt the CRIU procedure - again, this should be very rare. -* Applications that use NVML will leave some leftover device references as NVML - is not currently supported for checkpointing. There will be support for this - in later drivers. A possible temporary workaround is to have the - {DUMP,RESTORE}_EXT_FILE hook just ignore /dev/nvidiactl and /dev/nvidia{0..N} - remaining references for these applications as in most cases NVML is used to - get info such as gpu count and some capabilities and these values are never - accessed again and unlikely to change. -* CUDA applications that fork() but don't call exec() but also don't issue any - CUDA API calls will have some leftover references to /dev/nvidia* and fail to - checkpoint as a result. This can be worked around in a similar fashion to the - NVML case where the leftover references can be ignored as CUDA is not fork() - safe anyway. -* Restore currently requires that you restore on a system with similar GPU's and - same GPU count. -* NVIDIA UVM Managed Memory, MIG (Multi Instance GPU), and MPS (Multi-Process - Service) are currently not supported for checkpointing. Future CUDA releases - will add support for these. 
diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c deleted file mode 100644 index 9ccb04224..000000000 --- a/plugins/cuda/cuda_plugin.c +++ /dev/null @@ -1,631 +0,0 @@ -#include "criu-log.h" -#include "plugin.h" -#include "util.h" -#include "cr_options.h" -#include "pid.h" -#include "proc_parse.h" -#include "seize.h" -#include "fault-injection.h" - -#include -#include - -#include -#include -#include -#include -#include -#include - -/* cuda-checkpoint binary should live in your PATH */ -#define CUDA_CHECKPOINT "cuda-checkpoint" - -/* cuda-checkpoint --action flags */ -#define ACTION_LOCK "lock" -#define ACTION_CHECKPOINT "checkpoint" -#define ACTION_RESTORE "restore" -#define ACTION_UNLOCK "unlock" - -typedef enum { - CUDA_TASK_RUNNING = 0, - CUDA_TASK_LOCKED, - CUDA_TASK_CHECKPOINTED, - CUDA_TASK_UNKNOWN = -1 -} cuda_task_state_t; - -#define CUDA_CKPT_BUF_SIZE (128) - -#ifdef LOG_PREFIX -#undef LOG_PREFIX -#endif -#define LOG_PREFIX "cuda_plugin: " - -/* Disable plugin functionality if cuda-checkpoint is not in $PATH or driver - * version doesn't support --action flag - */ -bool plugin_disabled = false; - -bool plugin_added_to_inventory = false; - -struct pid_info { - int pid; - char checkpointed; - cuda_task_state_t initial_task_state; - struct list_head list; -}; - -/* Used to track which PID's we've paused CUDA operations on so far so we can - * release them after we're done with the DUMP - */ -static LIST_HEAD(cuda_pids); - -static void dealloc_pid_buffer(struct list_head *pid_buf) -{ - struct pid_info *info; - struct pid_info *n; - - list_for_each_entry_safe(info, n, pid_buf, list) { - list_del(&info->list); - xfree(info); - } -} - -static int add_pid_to_buf(struct list_head *pid_buf, int pid, cuda_task_state_t state) -{ - struct pid_info *new = xmalloc(sizeof(*new)); - - if (new == NULL) { - return -1; - } - - new->pid = pid; - new->checkpointed = 0; - new->initial_task_state = state; - list_add_tail(&new->list, pid_buf); - - return 0; -} - 
-static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size) -{ -#define READ 0 -#define WRITE 1 - int fd[2], buf_off; - - if (pipe(fd) != 0) { - pr_perror("Couldn't create pipes for reading cuda-checkpoint output"); - return -1; - } - - buf[0] = '\0'; - - int child_pid = fork(); - if (child_pid == -1) { - pr_perror("Failed to fork to exec cuda-checkpoint"); - close(fd[READ]); - close(fd[WRITE]); - return -1; - } - - if (child_pid == 0) { // child - if (dup2(fd[WRITE], STDOUT_FILENO) == -1) { - pr_perror("unable to clone fd %d->%d", fd[WRITE], STDOUT_FILENO); - _exit(EXIT_FAILURE); - } - if (dup2(fd[WRITE], STDERR_FILENO) == -1) { - pr_perror("unable to clone fd %d->%d", fd[WRITE], STDERR_FILENO); - _exit(EXIT_FAILURE); - } - close(fd[READ]); - - close_fds(STDERR_FILENO + 1); - - execvp(args[0], (char **)args); - - /* We can't use pr_error() as log file fd is closed. */ - fprintf(stderr, "execvp(\"%s\") failed: %s\n", args[0], strerror(errno)); - - _exit(EXIT_FAILURE); - } - - close(fd[WRITE]); - buf_off = 0; - /* Reserve one byte for the null charracter. 
*/ - buf_size--; - while (buf_off < buf_size) { - int bytes_read; - bytes_read = read(fd[READ], buf + buf_off, buf_size - buf_off); - if (bytes_read == -1) { - pr_perror("Unable to read output of cuda-checkpoint"); - goto err; - } - if (bytes_read == 0) - break; - buf_off += bytes_read; - } - buf[buf_off] = '\0'; - - /* Clear out any of the remaining output in the pipe in case the buffer wasn't large enough */ - while (true) { - char scratch[1024]; - int bytes_read; - bytes_read = read(fd[READ], scratch, sizeof(scratch)); - if (bytes_read == -1) { - pr_perror("Unable to read output of cuda-checkpoint"); - goto err; - } - if (bytes_read == 0) - break; - } - close(fd[READ]); - - int status, exit_code = -1; - if (waitpid(child_pid, &status, 0) == -1) { - pr_perror("Unable to wait for the cuda-checkpoint process %d", child_pid); - goto err; - } - if (WIFSIGNALED(status)) { - int sig = WTERMSIG(status); - pr_err("cuda-checkpoint unexpectedly signaled with %d: %s\n", sig, strsignal(sig)); - } else if (WIFEXITED(status)) { - exit_code = WEXITSTATUS(status); - } else { - pr_err("cuda-checkpoint exited improperly: %u\n", status); - } - - if (exit_code != EXIT_SUCCESS) - pr_debug("cuda-checkpoint output ===>\n%s\n" - "<=== cuda-checkpoint output\n", - buf); - - return exit_code; -err: - kill(child_pid, SIGKILL); - waitpid(child_pid, NULL, 0); - return -1; -} - -/** - * Checks if a given flag is supported by the cuda-checkpoint utility - * - * Returns: - * 1 if the flag is supported, - * 0 if the flag is not supported, - * -1 if there was an error launching the cuda-checkpoint utility. 
- */ -static int cuda_checkpoint_supports_flag(const char *flag) -{ - char msg_buf[2048]; - const char *args[] = { CUDA_CHECKPOINT, "-h", NULL }; - - if (launch_cuda_checkpoint(args, msg_buf, sizeof(msg_buf)) != 0) - return -1; - - if (strstr(msg_buf, flag) == NULL) - return 0; - - return 1; -} - -/* Retrieve the cuda restore thread TID from the root pid */ -static int get_cuda_restore_tid(int root_pid) -{ - char pid_buf[16]; - char pid_out[CUDA_CKPT_BUF_SIZE]; - - snprintf(pid_buf, sizeof(pid_buf), "%d", root_pid); - - const char *args[] = { CUDA_CHECKPOINT, "--get-restore-tid", "--pid", pid_buf, NULL }; - int ret = launch_cuda_checkpoint(args, pid_out, sizeof(pid_out)); - if (ret != 0) { - pr_err("Failed to launch cuda-checkpoint to retrieve restore tid: %s\n", pid_out); - return -1; - } - - return atoi(pid_out); -} - -static cuda_task_state_t get_task_state_enum(const char *state_str) -{ - if (strncmp(state_str, "running", 7) == 0) - return CUDA_TASK_RUNNING; - - if (strncmp(state_str, "locked", 6) == 0) - return CUDA_TASK_LOCKED; - - if (strncmp(state_str, "checkpointed", 12) == 0) - return CUDA_TASK_CHECKPOINTED; - - pr_err("Unknown CUDA state: %s\n", state_str); - return CUDA_TASK_UNKNOWN; -} - -static cuda_task_state_t get_cuda_state(pid_t pid) -{ - char pid_buf[16]; - char state_str[CUDA_CKPT_BUF_SIZE]; - const char *args[] = { CUDA_CHECKPOINT, "--get-state", "--pid", pid_buf, NULL }; - - snprintf(pid_buf, sizeof(pid_buf), "%d", pid); - - if (launch_cuda_checkpoint(args, state_str, sizeof(state_str))) { - pr_err("Failed to launch cuda-checkpoint to retrieve state: %s\n", state_str); - return CUDA_TASK_UNKNOWN; - } - - return get_task_state_enum(state_str); -} - -static int cuda_process_checkpoint_action(int pid, const char *action, unsigned int timeout, char *msg_buf, - int buf_size) -{ - char pid_buf[16]; - char timeout_buf[16]; - - snprintf(pid_buf, sizeof(pid_buf), "%d", pid); - - const char *args[] = { CUDA_CHECKPOINT, "--action", action, "--pid", 
pid_buf, NULL /* --timeout */, - NULL /* timeout_val */, NULL }; - if (timeout > 0) { - snprintf(timeout_buf, sizeof(timeout_buf), "%d", timeout); - args[5] = "--timeout"; - args[6] = timeout_buf; - } - - return launch_cuda_checkpoint(args, msg_buf, buf_size); -} - -static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigset) -{ - /* Since we resumed a thread that CRIU previously already froze we need to - * INTERRUPT it once again, task was already SEIZE'd so we don't need to do - * a compel_interrupt_task() - */ - if (ptrace(PTRACE_INTERRUPT, restore_tid, NULL, 0)) { - pr_perror("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state", - restore_tid); - return -1; - } - - struct proc_status_creds creds; - if (compel_wait_task(restore_tid, -1, parse_pid_status, NULL, &creds.s, NULL) != COMPEL_TASK_ALIVE) { - pr_err("compel_wait_task failed after interrupt\n"); - return -1; - } - - if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD)) { - pr_perror("Failed to set ptrace options on interrupt for restore tid %d", restore_tid); - return -1; - } - - if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(*restore_sigset), restore_sigset)) { - pr_perror("Unable to restore original sigmask to restore tid %d", restore_tid); - return -1; - } - - return 0; -} - -static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset) -{ - k_rtsigset_t block; - - if (ptrace(PTRACE_GETSIGMASK, restore_tid, sizeof(*save_sigset), save_sigset)) { - pr_perror("Failed to get current sigmask for restore tid %d", restore_tid); - return -1; - } - - ksigfillset(&block); - ksigdelset(&block, SIGTRAP); - - if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(block), &block)) { - pr_perror("Failed to block signals on restore tid %d", restore_tid); - return -1; - } - - // Clear out PTRACE_O_SUSPEND_SECCOMP when we resume the restore thread - if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, 
0)) { - pr_perror("Could not clear ptrace options on restore tid %d", restore_tid); - return -1; - } - - if (ptrace(PTRACE_CONT, restore_tid, NULL, 0)) { - pr_perror("Could not resume cuda restore tid %d", restore_tid); - return -1; - } - - return 0; -} - -int cuda_plugin_checkpoint_devices(int pid) -{ - int restore_tid; - char msg_buf[CUDA_CKPT_BUF_SIZE]; - int int_ret; - int status; - k_rtsigset_t save_sigset; - struct pid_info *task_info; - bool pid_found = false; - - if (plugin_disabled) { - return -ENOTSUP; - } - - restore_tid = get_cuda_restore_tid(pid); - - /* We can possibly hit a race with cuInit() where we are past the point of - * locking the process but at lock time cuInit() hadn't completed in which - * case cuda-checkpoint will report that we're in an invalid state to - * checkpoint - */ - if (restore_tid == -1) { - pr_info("No need to checkpoint devices on pid %d\n", pid); - return 0; - } - - /* Check if the process is already in a checkpointed state */ - list_for_each_entry(task_info, &cuda_pids, list) { - if (task_info->pid == pid) { - if (task_info->initial_task_state == CUDA_TASK_CHECKPOINTED) { - pr_info("pid %d already in a checkpointed state\n", pid); - return 0; - } - pid_found = true; - break; - } - } - - if (pid_found == false) { - /* We return an error here. The task should be restored - * to its original state at cuda_plugin_fini(). 
- */ - pr_err("Failed to track pid %d\n", pid); - return -1; - } - - pr_info("Checkpointing CUDA devices on pid %d restore_tid %d\n", pid, restore_tid); - /* We need to resume the checkpoint thread to prepare the mappings for - * checkpointing - */ - if (resume_restore_thread(restore_tid, &save_sigset)) { - return -1; - } - - task_info->checkpointed = 1; - status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf)); - if (status) { - pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf); - } - - int_ret = interrupt_restore_thread(restore_tid, &save_sigset); - return status != 0 ? -1 : int_ret; -} -CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices); - -int cuda_plugin_pause_devices(int pid) -{ - int restore_tid; - char msg_buf[CUDA_CKPT_BUF_SIZE]; - cuda_task_state_t task_state; - - if (plugin_disabled) { - return -ENOTSUP; - } - - restore_tid = get_cuda_restore_tid(pid); - - if (restore_tid == -1) { - pr_info("no need to pause devices on pid %d\n", pid); - return 0; - } - - task_state = get_cuda_state(restore_tid); - if (task_state == CUDA_TASK_UNKNOWN) { - pr_err("Failed to get CUDA state for PID %d\n", restore_tid); - return -1; - } - - if (!plugin_added_to_inventory) { - if (add_inventory_plugin(CR_PLUGIN_DESC.name)) { - pr_err("Failed to add CUDA plugin to inventory image\n"); - return -1; - } - plugin_added_to_inventory = true; - } - - if (task_state == CUDA_TASK_LOCKED) { - pr_info("pid %d already in a locked state\n", pid); - /* Leave this PID in a "locked" state at resume_device() */ - add_pid_to_buf(&cuda_pids, pid, CUDA_TASK_LOCKED); - return 0; - } - - if (task_state == CUDA_TASK_CHECKPOINTED) { - /* We need to skip this PID in cuda_plugin_checkpoint_devices(), - * and leave it in a "checkpoined" state at resume_device(). 
*/ - add_pid_to_buf(&cuda_pids, pid, CUDA_TASK_CHECKPOINTED); - return 0; - } - - pr_info("pausing devices on pid %d\n", pid); - int status = cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf)); - if (status) { - pr_err("PAUSE_DEVICES failed with %s\n", msg_buf); - if (alarm_timeouted()) - goto unlock; - return -1; - } - - if (add_pid_to_buf(&cuda_pids, pid, CUDA_TASK_RUNNING)) { - pr_err("unable to track paused pid %d\n", pid); - goto unlock; - } - - return 0; -unlock: - status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); - if (status) { - pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid); - } - return -1; -} -CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices) - -int resume_device(int pid, int checkpointed, cuda_task_state_t initial_task_state) -{ - char msg_buf[CUDA_CKPT_BUF_SIZE]; - int status; - int ret = 0; - int int_ret; - k_rtsigset_t save_sigset; - - if (initial_task_state == CUDA_TASK_UNKNOWN) { - pr_info("skip resume for PID %d (unknown state)\n", pid); - return 0; - } - - int restore_tid = get_cuda_restore_tid(pid); - if (restore_tid == -1) { - pr_info("No need to resume devices on pid %d\n", pid); - return 0; - } - - pr_info("resuming devices on pid %d\n", pid); - /* The resuming process has to stay frozen during this time otherwise - * attempting to access a UVM pointer will crash if we haven't restored the - * underlying mappings yet - */ - pr_debug("Restore thread pid %d found for real pid %d\n", restore_tid, pid); - /* wakeup the restore thread so we can handle the restore for this pid, - * rseq_cs has to be restored before execution - */ - if (resume_restore_thread(restore_tid, &save_sigset)) { - return -1; - } - - if (checkpointed && (initial_task_state == CUDA_TASK_RUNNING || initial_task_state == CUDA_TASK_LOCKED)) { - /* If the process was "locked" or "running" before checkpointing it, we need to restore 
it */ - status = cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf)); - if (status) { - pr_err("RESUME_DEVICES RESTORE failed with %s\n", msg_buf); - ret = -1; - goto interrupt; - } - } - - if (initial_task_state == CUDA_TASK_RUNNING) { - /* If the process was "running" before we paused it, we need to unlock it */ - status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)); - if (status) { - pr_err("RESUME_DEVICES UNLOCK failed with %s\n", msg_buf); - ret = -1; - } - } - -interrupt: - int_ret = interrupt_restore_thread(restore_tid, &save_sigset); - - return ret != 0 ? ret : int_ret; -} - -int cuda_plugin_resume_devices_late(int pid) -{ - if (plugin_disabled) { - return -ENOTSUP; - } - - /* RESUME_DEVICES_LATE is used during `criu restore`. - * Here, we assume that users expect the target process - * to be in a "running" state after restore, even if it was - * in a "locked" or "checkpointed" state during `criu dump`. - */ - return resume_device(pid, 1, CUDA_TASK_RUNNING); -} -CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late) - -/** - * Check if a CUDA device is available on the system - */ -static bool is_cuda_device_available(void) -{ - const char *gpu_path = "/proc/driver/nvidia/gpus/"; - struct stat sb; - - if (stat(gpu_path, &sb) != 0) - return false; - - return S_ISDIR(sb.st_mode); -} - -int cuda_plugin_init(int stage) -{ - int ret; - - /* Disable CUDA checkpointing with pre-dump */ - if (stage == CR_PLUGIN_STAGE__PRE_DUMP) { - plugin_disabled = true; - return 0; - } - - if (stage == CR_PLUGIN_STAGE__RESTORE) { - if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) { - plugin_disabled = true; - return 0; - } - } - - if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && !is_cuda_device_available()) { - pr_info("No GPU device found; CUDA plugin is disabled\n"); - plugin_disabled = true; - return 0; - } - - ret = 
cuda_checkpoint_supports_flag("--action"); - if (ret == -1) { - pr_warn("check that %s is present in $PATH\n", CUDA_CHECKPOINT); - plugin_disabled = true; - return 0; - } - - if (ret == 0) { - pr_warn("cuda-checkpoint --action flag not supported, an r555 or higher version driver is required. Disabling CUDA plugin\n"); - plugin_disabled = true; - return 0; - } - - pr_info("initialized: %s stage %d\n", CR_PLUGIN_DESC.name, stage); - - /* In the DUMP stage track all the PID's we've paused CUDA operations on to - * release them when we're done if the user requested the leave-running option - */ - if (stage == CR_PLUGIN_STAGE__DUMP) { - INIT_LIST_HEAD(&cuda_pids); - } - - set_compel_interrupt_only_mode(); - - return 0; -} - -void cuda_plugin_fini(int stage, int ret) -{ - if (plugin_disabled) { - return; - } - - pr_info("finished %s stage %d err %d\n", CR_PLUGIN_DESC.name, stage, ret); - - /* Release all the paused PID's at the end of the DUMP stage in case the - * user provides the -R (leave-running) flag or an error occurred - */ - if (stage == CR_PLUGIN_STAGE__DUMP && (opts.final_state == TASK_ALIVE || ret != 0)) { - struct pid_info *info; - list_for_each_entry(info, &cuda_pids, list) { - resume_device(info->pid, info->checkpointed, info->initial_task_state); - } - } - if (stage == CR_PLUGIN_STAGE__DUMP) { - dealloc_pid_buffer(&cuda_pids); - } -} -CR_PLUGIN_REGISTER("cuda_plugin", cuda_plugin_init, cuda_plugin_fini) diff --git a/scripts/build/Dockerfile.aarch64-stable-cross.hdr b/scripts/build/Dockerfile.aarch64-cross.hdr similarity index 100% rename from scripts/build/Dockerfile.aarch64-stable-cross.hdr rename to scripts/build/Dockerfile.aarch64-cross.hdr diff --git a/scripts/build/Dockerfile.aarch64-cross.tmpl b/scripts/build/Dockerfile.aarch64-cross.tmpl new file mode 120000 index 000000000..50eff9213 --- /dev/null +++ b/scripts/build/Dockerfile.aarch64-cross.tmpl @@ -0,0 +1 @@ +Dockerfile.cross.tmpl \ No newline at end of file diff --git 
a/scripts/build/Dockerfile.aarch64-stable-cross.tmpl b/scripts/build/Dockerfile.aarch64-stable-cross.tmpl deleted file mode 120000 index 81ef22980..000000000 --- a/scripts/build/Dockerfile.aarch64-stable-cross.tmpl +++ /dev/null @@ -1 +0,0 @@ -Dockerfile.stable-cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.aarch64-unstable-cross.hdr b/scripts/build/Dockerfile.aarch64-unstable-cross.hdr deleted file mode 100644 index c61d2af27..000000000 --- a/scripts/build/Dockerfile.aarch64-unstable-cross.hdr +++ /dev/null @@ -1,5 +0,0 @@ -FROM docker.io/dockcross/base:latest - -ENV ARCH=aarch64 -ENV DEBIAN_ARCH=arm64 -ENV CROSS_TRIPLET=aarch64-linux-gnu diff --git a/scripts/build/Dockerfile.aarch64-unstable-cross.tmpl b/scripts/build/Dockerfile.aarch64-unstable-cross.tmpl deleted file mode 120000 index 955ae1fd4..000000000 --- a/scripts/build/Dockerfile.aarch64-unstable-cross.tmpl +++ /dev/null @@ -1 +0,0 @@ -Dockerfile.unstable-cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index ed883f300..a6579c0bb 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -1,14 +1,50 @@ FROM alpine ARG CC=gcc +RUN apk update && apk add \ + $CC \ + bash \ + build-base \ + coreutils \ + git \ + gnutls-dev \ + libaio-dev \ + libcap-dev \ + libnet-dev \ + libnl3-dev \ + nftables \ + nftables-dev \ + pkgconfig \ + protobuf-c-dev \ + protobuf-dev \ + py3-pip \ + py3-protobuf \ + python3 \ + sudo + COPY . 
/criu WORKDIR /criu - -RUN apk add --no-cache "$CC" && /criu/contrib/dependencies/apk-packages.sh - RUN make mrproper && date && make -j $(nproc) CC="$CC" && date +RUN apk add \ + ip6tables \ + iptables \ + nftables \ + iproute2 \ + tar \ + bash \ + go \ + e2fsprogs \ + py-yaml \ + py3-flake8 \ + asciidoctor + # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 -D test +RUN pip3 install junit_xml + +# For zdtm we need an unversioned python binary +RUN ln -s /usr/bin/python3 /usr/bin/python + RUN make -C test/zdtm diff --git a/scripts/build/Dockerfile.amd-rocm b/scripts/build/Dockerfile.amd-rocm deleted file mode 100644 index ed66ae4fe..000000000 --- a/scripts/build/Dockerfile.amd-rocm +++ /dev/null @@ -1,97 +0,0 @@ -FROM rocm/pytorch:latest - -ARG CC=gcc - -# Environment -ENV BRANCH=$BRANCH \ - DEBIAN_FRONTEND=noninteractive \ - LC_ALL=en_US.UTF-8 \ - LANG=en_US.UTF-8 \ - LANGUAGE=en_US.UTF-8 - -# -# Package installation -# -RUN apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-install-recommends \ - --no-upgrade -yq -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" \ - apt-utils \ - apt-transport-https\ - gnupg \ - gnupg2 \ - gettext \ - locales \ - iproute2 \ - iputils-ping \ - moreutils \ - net-tools \ - psmisc\ - supervisor \ - cifs-utils \ - nfs-common \ - systemd \ - fuse \ - xmlto \ - autossh \ - netbase \ - libnet-dev \ - libnl-route-3-dev \ - $CC \ - bsdmainutils \ - ca-certificates \ - build-essential \ - git-core \ - iptables \ - libaio-dev \ - libbsd-dev \ - libcap-dev \ - libgnutls28-dev \ - libgnutls30 \ - libnl-3-dev \ - libprotobuf-c-dev \ - libprotobuf-dev \ - libselinux-dev \ - pkg-config \ - protobuf-c-compiler \ - protobuf-compiler \ - python-protobuf \ - python3-minimal \ - python-ipaddress \ - uuid-dev \ - curl \ - wget \ - vim \ - openssl \ - openssh-server \ - python \ - sudo \ - libnuma1 \ - libdrm-dev \ - libdrm-amdgpu1 \ - asciidoc \ - && \ - rm -rf 
/var/lib/apt/lists/* && \ - apt-get purge --auto-remove && \ - apt-get clean - -# Clone latest criu code -COPY . /criu -WORKDIR /criu - -RUN make mrproper && date && \ -# Check single object build - make -j $(nproc) CC="$CC" criu/parasite-syscall.o && \ -# Compile criu - make -j $(nproc) CC="$CC" && \ - date && echo BUILD_OK && \ -# Install criu - make -j $(nproc) install && \ - date && echo INSTALL_OK - -WORKDIR /root/criu_build_dir -RUN git clone --recursive -b cl/rocm-transformers https://github.com/lcskrishna/transformers.git && \ - cd transformers && wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json && \ - wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json && \ - wget https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py -ENV SQUAD_DIR=/root/criu_build_dir/transformers -WORKDIR /root/criu_build_dir/transformers -RUN pip3 install tensorboard tensorboardX && pip3 install . diff --git a/scripts/build/Dockerfile.archlinux b/scripts/build/Dockerfile.archlinux index 261bd2d79..d226244ee 100644 --- a/scripts/build/Dockerfile.archlinux +++ b/scripts/build/Dockerfile.archlinux @@ -2,14 +2,39 @@ FROM docker.io/library/archlinux:latest ARG CC=gcc -# Initialize machine ID -RUN systemd-machine-id-setup +RUN pacman -Syu --noconfirm \ + $CC \ + bash \ + make \ + coreutils \ + git \ + gnutls \ + libaio \ + libcap \ + libnet \ + libnl \ + nftables \ + pkgconfig \ + protobuf-c \ + protobuf \ + python-pip \ + python-protobuf \ + which \ + sudo \ + iptables \ + nftables \ + iproute2 \ + tar \ + bash \ + go \ + python-yaml \ + flake8 \ + asciidoctor \ + python-junit-xml \ + diffutils COPY . 
/criu WORKDIR /criu - -RUN pacman -Syu --noconfirm "$CC" && contrib/dependencies/pacman-packages.sh - RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user diff --git a/scripts/build/Dockerfile.armv7-stable-cross.hdr b/scripts/build/Dockerfile.armv7-cross.hdr similarity index 100% rename from scripts/build/Dockerfile.armv7-stable-cross.hdr rename to scripts/build/Dockerfile.armv7-cross.hdr diff --git a/scripts/build/Dockerfile.armv7-cross.tmpl b/scripts/build/Dockerfile.armv7-cross.tmpl new file mode 120000 index 000000000..50eff9213 --- /dev/null +++ b/scripts/build/Dockerfile.armv7-cross.tmpl @@ -0,0 +1 @@ +Dockerfile.cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.armv7-stable-cross.tmpl b/scripts/build/Dockerfile.armv7-stable-cross.tmpl deleted file mode 120000 index 81ef22980..000000000 --- a/scripts/build/Dockerfile.armv7-stable-cross.tmpl +++ /dev/null @@ -1 +0,0 @@ -Dockerfile.stable-cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.armv7-unstable-cross.hdr b/scripts/build/Dockerfile.armv7-unstable-cross.hdr deleted file mode 100644 index f96dc51f7..000000000 --- a/scripts/build/Dockerfile.armv7-unstable-cross.hdr +++ /dev/null @@ -1,6 +0,0 @@ -FROM docker.io/dockcross/base:latest - -ENV ARCH=arm -ENV SUBARCH=armv7 -ENV DEBIAN_ARCH=armhf -ENV CROSS_TRIPLET=arm-linux-gnueabihf diff --git a/scripts/build/Dockerfile.armv7-unstable-cross.tmpl b/scripts/build/Dockerfile.armv7-unstable-cross.tmpl deleted file mode 120000 index 955ae1fd4..000000000 --- a/scripts/build/Dockerfile.armv7-unstable-cross.tmpl +++ /dev/null @@ -1 +0,0 @@ -Dockerfile.unstable-cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.centos7 b/scripts/build/Dockerfile.centos7 new file mode 100644 index 000000000..21e70ff0e --- /dev/null +++ b/scripts/build/Dockerfile.centos7 @@ -0,0 +1,45 @@ +FROM centos:7 + +ARG CC=gcc + +RUN yum install 
-y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm +RUN yum install -y \ + findutils \ + gcc \ + git \ + gnutls-devel \ + iproute \ + iptables \ + libaio-devel \ + libasan \ + libcap-devel \ + libnet-devel \ + libnl3-devel \ + make \ + procps-ng \ + protobuf-c-devel \ + protobuf-devel \ + protobuf-python \ + python \ + python-flake8 \ + python-ipaddress \ + python2-future \ + python2-junit_xml \ + python-yaml \ + python-six \ + sudo \ + tar \ + which \ + e2fsprogs \ + python2-pip \ + rubygem-asciidoctor + +COPY . /criu +WORKDIR /criu + +RUN make mrproper && date && make -j $(nproc) CC="$CC" && date + +# The rpc test cases are running as user #1000, let's add the user +RUN adduser -u 1000 test + +RUN make -C test/zdtm -j $(nproc) diff --git a/scripts/build/Dockerfile.centos8 b/scripts/build/Dockerfile.centos8 new file mode 100644 index 000000000..488f95d65 --- /dev/null +++ b/scripts/build/Dockerfile.centos8 @@ -0,0 +1,52 @@ +FROM registry.centos.org/centos/centos:8 + +ARG CC=gcc + +RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm dnf-plugins-core +RUN yum config-manager --set-enabled powertools +RUN yum install -y --allowerasing \ + asciidoc \ + coreutils \ + chkconfig \ + diffutils \ + findutils \ + gcc \ + git \ + gnutls-devel \ + iproute \ + iptables \ + libaio-devel \ + libasan \ + libcap-devel \ + libnet-devel \ + libnl3-devel \ + libselinux-devel \ + make \ + procps-ng \ + protobuf-c-devel \ + protobuf-devel \ + python3-devel \ + python3-flake8 \ + python3-PyYAML \ + python3-future \ + python3-protobuf \ + python3-pip \ + sudo \ + tar \ + which \ + xmlto + +RUN alternatives --set python /usr/bin/python3 +ENV PYTHON=python3 + +COPY . 
/criu +WORKDIR /criu + +RUN make mrproper && date && make -j $(nproc) CC="$CC" && date + +# The rpc test cases are running as user #1000, let's add the user +RUN adduser -u 1000 test + +RUN pip3 install junit_xml + +RUN make -C test/zdtm -j $(nproc) diff --git a/scripts/build/Dockerfile.cross.tmpl b/scripts/build/Dockerfile.cross.tmpl new file mode 100644 index 000000000..8b95fbb1c --- /dev/null +++ b/scripts/build/Dockerfile.cross.tmpl @@ -0,0 +1,43 @@ +COPY scripts/ci/apt-install /bin/apt-install + +# Add the cross compiler sources +RUN echo "deb http://deb.debian.org/debian/ buster main" >> /etc/apt/sources.list && \ + dpkg --add-architecture ${DEBIAN_ARCH} && \ + apt-install emdebian-archive-keyring + +RUN apt-install \ + crossbuild-essential-${DEBIAN_ARCH} \ + libc6-dev-${DEBIAN_ARCH}-cross \ + libc6-${DEBIAN_ARCH}-cross \ + libbz2-dev:${DEBIAN_ARCH} \ + libexpat1-dev:${DEBIAN_ARCH} \ + ncurses-dev:${DEBIAN_ARCH} \ + libssl-dev:${DEBIAN_ARCH} \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-protobuf \ + libnl-3-dev:${DEBIAN_ARCH} \ + libprotobuf-dev:${DEBIAN_ARCH} \ + libnet-dev:${DEBIAN_ARCH} \ + libprotobuf-c-dev:${DEBIAN_ARCH} \ + libcap-dev:${DEBIAN_ARCH} \ + libaio-dev:${DEBIAN_ARCH} \ + libnl-route-3-dev:${DEBIAN_ARCH} + +ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ + CROSS_ROOT=/usr/${CROSS_TRIPLET} \ + AS=/usr/bin/${CROSS_TRIPLET}-as \ + AR=/usr/bin/${CROSS_TRIPLET}-ar \ + CC=/usr/bin/${CROSS_TRIPLET}-gcc \ + CPP=/usr/bin/${CROSS_TRIPLET}-cpp \ + CXX=/usr/bin/${CROSS_TRIPLET}-g++ \ + LD=/usr/bin/${CROSS_TRIPLET}-ld \ + FC=/usr/bin/${CROSS_TRIPLET}-gfortran + +ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ + PKG_CONFIG_PATH=/usr/lib/${CROSS_TRIPLET}/pkgconfig + +COPY . 
/criu +WORKDIR /criu + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.fedora.tmpl b/scripts/build/Dockerfile.fedora.tmpl index c26a5fd57..9d3bb0f87 100644 --- a/scripts/build/Dockerfile.fedora.tmpl +++ b/scripts/build/Dockerfile.fedora.tmpl @@ -1,10 +1,11 @@ ARG CC=gcc +COPY scripts/ci/prepare-for-fedora-rawhide.sh /bin/prepare-for-fedora-rawhide.sh +RUN /bin/prepare-for-fedora-rawhide.sh + COPY . /criu WORKDIR /criu -RUN dnf install -y "$CC" && scripts/ci/prepare-for-fedora-rawhide.sh - RUN make mrproper && date && make -j $(nproc) CC="$CC" && date # The rpc test cases are running as user #1000, let's add the user diff --git a/scripts/build/Dockerfile.hotspot-alpine b/scripts/build/Dockerfile.hotspot-alpine deleted file mode 100644 index cd632dddf..000000000 --- a/scripts/build/Dockerfile.hotspot-alpine +++ /dev/null @@ -1,11 +0,0 @@ -FROM docker.io/library/eclipse-temurin:11-alpine -ARG CC=gcc - -COPY . /criu -WORKDIR /criu - -RUN apk add --no-cache maven "$CC" && contrib/dependencies/apk-packages.sh - -RUN make mrproper && make -j $(nproc) CC="$CC" - -ENTRYPOINT ["mvn", "-q", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.hotspot-ubuntu b/scripts/build/Dockerfile.hotspot-ubuntu deleted file mode 100644 index a459e1ec7..000000000 --- a/scripts/build/Dockerfile.hotspot-ubuntu +++ /dev/null @@ -1,11 +0,0 @@ -FROM docker.io/library/eclipse-temurin:11-jammy -ARG CC=gcc - -COPY . 
/criu -WORKDIR /criu - -RUN contrib/apt-install maven "$CC" && contrib/dependencies/apt-packages.sh - -RUN make mrproper && make -j $(nproc) CC="$CC" - -ENTRYPOINT ["mvn", "-q", "-f", "test/javaTests/pom.xml", "test"] diff --git a/scripts/build/Dockerfile.linux32.tmpl b/scripts/build/Dockerfile.linux32.tmpl index a37f16e49..a15038631 100644 --- a/scripts/build/Dockerfile.linux32.tmpl +++ b/scripts/build/Dockerfile.linux32.tmpl @@ -1,10 +1,32 @@ ARG CC=gcc +COPY scripts/ci/apt-install /bin/apt-install + +RUN apt-install \ + libnet-dev \ + libnl-route-3-dev \ + $CC \ + bsdmainutils \ + build-essential \ + git-core \ + iptables \ + libaio-dev \ + libcap-dev \ + libgnutls28-dev \ + libgnutls30 \ + libnl-3-dev \ + libprotobuf-c-dev \ + libprotobuf-dev \ + libselinux-dev \ + pkg-config \ + protobuf-c-compiler \ + protobuf-compiler \ + python3-minimal \ + python3-future + COPY . /criu WORKDIR /criu -RUN contrib/apt-install "$CC" && contrib/dependencies/apt-packages.sh - RUN uname -m && setarch linux32 uname -m && setarch --list RUN make mrproper && date && \ diff --git a/scripts/build/Dockerfile.mips64el-stable-cross.hdr b/scripts/build/Dockerfile.mips64el-cross.hdr similarity index 100% rename from scripts/build/Dockerfile.mips64el-stable-cross.hdr rename to scripts/build/Dockerfile.mips64el-cross.hdr diff --git a/scripts/build/Dockerfile.mips64el-cross.tmpl b/scripts/build/Dockerfile.mips64el-cross.tmpl new file mode 120000 index 000000000..50eff9213 --- /dev/null +++ b/scripts/build/Dockerfile.mips64el-cross.tmpl @@ -0,0 +1 @@ +Dockerfile.cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.mips64el-stable-cross.tmpl b/scripts/build/Dockerfile.mips64el-stable-cross.tmpl deleted file mode 120000 index 81ef22980..000000000 --- a/scripts/build/Dockerfile.mips64el-stable-cross.tmpl +++ /dev/null @@ -1 +0,0 @@ -Dockerfile.stable-cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.mips64el-unstable-cross.hdr 
b/scripts/build/Dockerfile.mips64el-unstable-cross.hdr deleted file mode 100644 index e78c94aa5..000000000 --- a/scripts/build/Dockerfile.mips64el-unstable-cross.hdr +++ /dev/null @@ -1,6 +0,0 @@ -FROM dockcross/base:latest - -ENV ARCH=mips -ENV SUBARCH=mips -ENV DEBIAN_ARCH=mips64el -ENV CROSS_TRIPLET=mips64el-linux-gnuabi64 diff --git a/scripts/build/Dockerfile.mips64el-unstable-cross.tmpl b/scripts/build/Dockerfile.mips64el-unstable-cross.tmpl deleted file mode 120000 index 955ae1fd4..000000000 --- a/scripts/build/Dockerfile.mips64el-unstable-cross.tmpl +++ /dev/null @@ -1 +0,0 @@ -Dockerfile.unstable-cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.openj9-alpine b/scripts/build/Dockerfile.openj9-alpine new file mode 100644 index 000000000..f92011283 --- /dev/null +++ b/scripts/build/Dockerfile.openj9-alpine @@ -0,0 +1,32 @@ +# FIXME: Replace with eclipse-temurin once Alpine support has been added. +# https://github.com/adoptium/containers/pull/60 +FROM adoptopenjdk/openjdk8-openj9:alpine +ARG CC=gcc + +RUN apk update && apk add \ + bash \ + build-base \ + coreutils \ + git \ + gnutls-dev \ + libaio-dev \ + libcap-dev \ + libnet-dev \ + libnl3-dev \ + pkgconfig \ + protobuf-c-dev \ + protobuf-dev \ + python3 \ + sudo \ + maven \ + ip6tables \ + iptables \ + bash + +COPY . 
/criu +WORKDIR /criu + +RUN make mrproper && make -j $(nproc) CC="$CC" + +ENTRYPOINT mvn -q -f test/javaTests/pom.xml test + diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu index 18664f100..8936adf81 100644 --- a/scripts/build/Dockerfile.openj9-ubuntu +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -1,12 +1,34 @@ -FROM docker.io/library/ibm-semeru-runtimes:open-11-jdk-jammy +FROM docker.io/library/eclipse-temurin:8-focal ARG CC=gcc -RUN mkdir -p /etc/criu && echo 'ghost-limit 16777216' > /etc/criu/default.conf +COPY scripts/ci/apt-install /bin/apt-install + +RUN apt-install protobuf-c-compiler \ + libprotobuf-c-dev \ + libaio-dev \ + python3-future \ + libprotobuf-dev \ + protobuf-compiler \ + libcap-dev \ + libnl-3-dev \ + gdb \ + bash \ + python3-protobuf \ + python3-yaml \ + libnet-dev \ + libnl-route-3-dev \ + libbsd-dev \ + make \ + git \ + pkg-config \ + iptables \ + gcc \ + maven + COPY . /criu WORKDIR /criu -RUN contrib/apt-install maven "$CC" && contrib/dependencies/apt-packages.sh - RUN make mrproper && make -j $(nproc) CC="$CC" -ENTRYPOINT ["mvn", "-f", "test/javaTests/pom.xml", "test"] +ENTRYPOINT mvn -q -f test/javaTests/pom.xml test + diff --git a/scripts/build/Dockerfile.ppc64-stable-cross.hdr b/scripts/build/Dockerfile.ppc64-cross.hdr similarity index 100% rename from scripts/build/Dockerfile.ppc64-stable-cross.hdr rename to scripts/build/Dockerfile.ppc64-cross.hdr diff --git a/scripts/build/Dockerfile.ppc64-cross.tmpl b/scripts/build/Dockerfile.ppc64-cross.tmpl new file mode 120000 index 000000000..50eff9213 --- /dev/null +++ b/scripts/build/Dockerfile.ppc64-cross.tmpl @@ -0,0 +1 @@ +Dockerfile.cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.ppc64-stable-cross.tmpl b/scripts/build/Dockerfile.ppc64-stable-cross.tmpl deleted file mode 120000 index 81ef22980..000000000 --- a/scripts/build/Dockerfile.ppc64-stable-cross.tmpl +++ /dev/null @@ -1 +0,0 @@ 
-Dockerfile.stable-cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.ppc64-unstable-cross.hdr b/scripts/build/Dockerfile.ppc64-unstable-cross.hdr deleted file mode 100644 index 38547ac55..000000000 --- a/scripts/build/Dockerfile.ppc64-unstable-cross.hdr +++ /dev/null @@ -1,5 +0,0 @@ -FROM dockcross/base:latest - -ENV ARCH=ppc64 -ENV DEBIAN_ARCH=ppc64el -ENV CROSS_TRIPLET=powerpc64le-linux-gnu diff --git a/scripts/build/Dockerfile.ppc64-unstable-cross.tmpl b/scripts/build/Dockerfile.ppc64-unstable-cross.tmpl deleted file mode 120000 index 955ae1fd4..000000000 --- a/scripts/build/Dockerfile.ppc64-unstable-cross.tmpl +++ /dev/null @@ -1 +0,0 @@ -Dockerfile.unstable-cross.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.hdr b/scripts/build/Dockerfile.riscv64-stable-cross.hdr deleted file mode 100644 index d4c414023..000000000 --- a/scripts/build/Dockerfile.riscv64-stable-cross.hdr +++ /dev/null @@ -1,5 +0,0 @@ -FROM ubuntu:jammy - -ENV ARCH=riscv64 -ENV DEBIAN_ARCH=riscv64 -ENV CROSS_TRIPLET=riscv64-linux-gnu diff --git a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl b/scripts/build/Dockerfile.riscv64-stable-cross.tmpl deleted file mode 100644 index 8933a6c82..000000000 --- a/scripts/build/Dockerfile.riscv64-stable-cross.tmpl +++ /dev/null @@ -1,31 +0,0 @@ -# Add the cross compiler sources -RUN apt-get clean -y && apt-get update -y && apt-get install -y --no-install-recommends gnupg2 - -RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 871920D1991BC93C 8D69674688B6CB36 B523E5F3FC4E5F2C - -COPY scripts/ci/riscv64-cross/amd64-sources.list /etc/apt/sources.list - -COPY scripts/ci/riscv64-cross/riscv64-sources.list /etc/apt/sources.list.d/ - -RUN dpkg --add-architecture ${DEBIAN_ARCH} && \ - apt-get update -y - -ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ - CROSS_ROOT=/usr/${CROSS_TRIPLET} \ - AS=/usr/bin/${CROSS_TRIPLET}-as \ - AR=/usr/bin/${CROSS_TRIPLET}-ar \ - 
CC=/usr/bin/${CROSS_TRIPLET}-gcc \ - CPP=/usr/bin/${CROSS_TRIPLET}-cpp \ - CXX=/usr/bin/${CROSS_TRIPLET}-g++ \ - LD=/usr/bin/${CROSS_TRIPLET}-ld \ - FC=/usr/bin/${CROSS_TRIPLET}-gfortran - -ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ - PKG_CONFIG_PATH=/usr/lib/${CROSS_TRIPLET}/pkgconfig - -COPY . /criu -WORKDIR /criu - -RUN contrib/dependencies/apt-cross-packages.sh - -RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.stable-cross.tmpl b/scripts/build/Dockerfile.stable-cross.tmpl deleted file mode 100644 index 56104081f..000000000 --- a/scripts/build/Dockerfile.stable-cross.tmpl +++ /dev/null @@ -1,29 +0,0 @@ -# Add the cross compiler sources -RUN echo "deb http://deb.debian.org/debian/ stable main" >> /etc/apt/sources.list && \ - dpkg --add-architecture ${DEBIAN_ARCH} - -ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ - CROSS_ROOT=/usr/${CROSS_TRIPLET} \ - AS=/usr/bin/${CROSS_TRIPLET}-as \ - AR=/usr/bin/${CROSS_TRIPLET}-ar \ - CC=/usr/bin/${CROSS_TRIPLET}-gcc \ - CPP=/usr/bin/${CROSS_TRIPLET}-cpp \ - CXX=/usr/bin/${CROSS_TRIPLET}-g++ \ - LD=/usr/bin/${CROSS_TRIPLET}-ld \ - FC=/usr/bin/${CROSS_TRIPLET}-gfortran - -ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ - PKG_CONFIG_PATH=/usr/lib/${CROSS_TRIPLET}/pkgconfig - -COPY . /criu -WORKDIR /criu - -RUN contrib/dependencies/apt-cross-packages.sh - -# amdgpu_plugin with armv7 is not supported -RUN make mrproper && date && \ - make -j $(nproc) && \ - if [ "$SUBARCH" != "armv7" ]; then \ - make -j $(nproc) amdgpu_plugin; \ - fi && \ - make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.tmpl b/scripts/build/Dockerfile.tmpl index 498b99be9..e0e72372d 100644 --- a/scripts/build/Dockerfile.tmpl +++ b/scripts/build/Dockerfile.tmpl @@ -1,11 +1,40 @@ ARG CC=gcc -COPY . 
/criu -WORKDIR /criu +COPY scripts/ci/apt-install /bin/apt-install # On Ubuntu, kernel modules such as ip_tables and xt_mark may not be loaded by default # We need to install kmod to enable iptables to load these modules for us. -RUN contrib/apt-install "$CC" && contrib/dependencies/apt-packages.sh +RUN apt-install \ + libnet-dev \ + libnl-route-3-dev \ + $CC \ + bsdmainutils \ + build-essential \ + git-core \ + iptables \ + libaio-dev \ + libbsd-dev \ + libcap-dev \ + libgnutls28-dev \ + libgnutls30 \ + libnftables-dev \ + libnl-3-dev \ + libprotobuf-c-dev \ + libprotobuf-dev \ + libselinux-dev \ + iproute2 \ + kmod \ + pkg-config \ + protobuf-c-compiler \ + protobuf-compiler \ + python-is-python3 \ + python3-minimal \ + python3-protobuf \ + python3-yaml \ + python3-future + +COPY . /criu +WORKDIR /criu RUN git clean -dfx && date && \ # Check single object build diff --git a/scripts/build/Dockerfile.unstable-cross.tmpl b/scripts/build/Dockerfile.unstable-cross.tmpl deleted file mode 100644 index 7edb289b6..000000000 --- a/scripts/build/Dockerfile.unstable-cross.tmpl +++ /dev/null @@ -1,23 +0,0 @@ -# Add the cross compiler sources -RUN echo "deb http://deb.debian.org/debian/ unstable main" >> /etc/apt/sources.list && \ - dpkg --add-architecture ${DEBIAN_ARCH} - -ENV CROSS_COMPILE=${CROSS_TRIPLET}- \ - CROSS_ROOT=/usr/${CROSS_TRIPLET} \ - AS=/usr/bin/${CROSS_TRIPLET}-as \ - AR=/usr/bin/${CROSS_TRIPLET}-ar \ - CC=/usr/bin/${CROSS_TRIPLET}-gcc \ - CPP=/usr/bin/${CROSS_TRIPLET}-cpp \ - CXX=/usr/bin/${CROSS_TRIPLET}-g++ \ - LD=/usr/bin/${CROSS_TRIPLET}-ld \ - FC=/usr/bin/${CROSS_TRIPLET}-gfortran - -ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ - PKG_CONFIG_PATH=/usr/lib/${CROSS_TRIPLET}/pkgconfig - -COPY . 
/criu -WORKDIR /criu - -RUN contrib/dependencies/apt-cross-packages.sh - -RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.x86_64.hdr b/scripts/build/Dockerfile.x86_64.hdr index a666f6c26..32fc2978a 100644 --- a/scripts/build/Dockerfile.x86_64.hdr +++ b/scripts/build/Dockerfile.x86_64.hdr @@ -1,5 +1,5 @@ -FROM ubuntu:24.04 +FROM ubuntu:focal -COPY contrib/apt-install /bin/apt-install +COPY scripts/ci/apt-install /bin/apt-install RUN apt-install gcc-multilib diff --git a/scripts/build/Makefile b/scripts/build/Makefile index a420cea94..62e3a9920 100644 --- a/scripts/build/Makefile +++ b/scripts/build/Makefile @@ -1,7 +1,5 @@ -ARCHES := x86_64 fedora-asan fedora-rawhide armv7hf -STABLE_CROSS_ARCHES := armv7-stable-cross aarch64-stable-cross ppc64-stable-cross mips64el-stable-cross riscv64-stable-cross -UNSTABLE_CROSS_ARCHES := armv7-unstable-cross aarch64-unstable-cross ppc64-unstable-cross mips64el-unstable-cross -NON_CLANG := $(UNSTABLE_CROSS_ARCHES) $(STABLE_CROSS_ARCHES) +ARCHES := x86_64 fedora-asan fedora-rawhide centos7 armv7hf centos8 +NON_CLANG := armv7-cross aarch64-cross ppc64-cross mips64el-cross CREATE_DOCKERFILES := $(ARCHES) $(NON_CLANG) TARGETS := $(ARCHES) alpine archlinux TARGETS_CLANG := $(addsuffix $(TARGETS),-clang) diff --git a/scripts/ci/Makefile b/scripts/ci/Makefile index bad8065f2..02b4d871c 100644 --- a/scripts/ci/Makefile +++ b/scripts/ci/Makefile @@ -11,14 +11,22 @@ ifdef CLANG target-suffix = -clang endif -TARGETS := alpine fedora-rawhide archlinux -ZDTM_OPTS := +TARGETS := alpine fedora-rawhide centos7 centos8 archlinux +ZDTM_OPTIONS := UNAME := $(shell uname -m) export UNAME CONTAINER_RUNTIME := docker export CONTAINER_RUNTIME -alpine: ZDTM_OPTS=-x zdtm/static/binfmt_misc -x zdtm/static/sched_policy00 +alpine: ZDTM_OPTIONS=-x zdtm/static/binfmt_misc -x zdtm/static/netns-nf -x zdtm/static/sched_policy00 -x zdtm/static/seccomp_strict -x zdtm/static/sigaltstack -x zdtm/static/signalfd00 -x 
zdtm/static/config_inotify_irmap + +define DOCKER_JSON +{ + "storage-driver": "devicemapper" +} +endef + +export DOCKER_JSON ifeq ($(GITHUB_ACTIONS),true) # GitHub Actions does not give us a real TTY and errors out with @@ -30,29 +38,41 @@ endif export CONTAINER_TERMINAL -# Here we assume that any CPU architecture besides x86_64 is running in containers -# that may not support running docker with '--privileged'. ifeq ($(UNAME),x86_64) - CONTAINER_OPTS := --rm $(CONTAINER_TERMINAL) --privileged --userns=host --cgroupns=host -v /lib/modules:/lib/modules --tmpfs /run + # On anything besides x86_64 Travis is running unprivileged LXD + # containers which do not support running docker with '--privileged'. + CONTAINER_OPTS := --rm $(CONTAINER_TERMINAL) --privileged -v /lib/modules:/lib/modules --tmpfs /run else CONTAINER_OPTS := --rm -v /lib/modules:/lib/modules --tmpfs /run endif ifeq ($(CONTAINER_RUNTIME),podman) + # Just as Docker needs to use devicemapper Podman needs vfs + # as graphdriver as overlayfs does not support all test cases + STORAGE_DRIVER := vfs # Podman limits the number of processes in a container using cgroups. 
# Disable it as it breaks the thread-bomb test CONTAINER_OPTS += --pids-limit=0 endif -export ZDTM_OPTS +export STORAGE_DRIVER -$(TARGETS): - $(MAKE) -C ../build $@$(target-suffix) - $(CONTAINER_RUNTIME) run --env-file docker.env -v `pwd`/../../:/criu $(if $(ZDTM_OPTS),-e ZDTM_OPTS) $(CONTAINER_OPTS) criu-$@ scripts/ci/run-ci-tests.sh +restart-docker: + if [ "$$UNAME" = "x86_64" ] && [ "$$CONTAINER_RUNTIME" = "docker" ]; then \ + echo "$$DOCKER_JSON" > /etc/docker/daemon.json; \ + cat /etc/docker/daemon.json; \ + systemctl status docker; \ + systemctl restart docker; \ + systemctl status docker; \ + fi -fedora-asan: +$(TARGETS): restart-docker $(MAKE) -C ../build $@$(target-suffix) - $(CONTAINER_RUNTIME) run $(CONTAINER_OPTS) -v `pwd`/../../:/criu criu-$@ ./scripts/ci/asan.sh $(ZDTM_OPTS) + $(CONTAINER_RUNTIME) run --env-file docker.env $(CONTAINER_OPTS) criu-$@ scripts/ci/run-ci-tests.sh + +fedora-asan: restart-docker + $(MAKE) -C ../build $@$(target-suffix) + $(CONTAINER_RUNTIME) run $(CONTAINER_OPTS) criu-$@ ./scripts/ci/asan.sh $(ZDTM_OPTIONS) docker-test: ./docker-test.sh @@ -60,8 +80,11 @@ docker-test: podman-test: ./podman-test.sh -java-test: - ./java-test.sh +# overlayfs behaves differently on Ubuntu and breaks CRIU +# https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1857257 +# Switch to devicemapper +openj9-test: restart-docker + ./openj9-test.sh setup-vagrant: ./vagrant.sh setup @@ -69,26 +92,7 @@ setup-vagrant: vagrant-fedora-no-vdso: setup-vagrant ./vagrant.sh fedora-no-vdso -vagrant-fedora-rawhide: setup-vagrant - ./vagrant.sh fedora-rawhide - -vagrant-fedora-non-root: setup-vagrant - ./vagrant.sh fedora-non-root - -.PHONY: setup-vagrant vagrant-fedora-no-vdso vagrant-fedora-rawhide vagrant-fedora-non-root - -check-commit: - ($(MAKE) -j $$(nproc) -C ../.. 
&& \ - echo "Commit $$(git rev-parse --short HEAD) built successfully") || \ - (echo "Build failed for $$(git rev-list -n 1 --pretty HEAD)" && \ - exit 1) - -.PHONY: check-commit - -loongarch64-qemu-test: - ./loongarch64-qemu-test.sh - -.PHONY: loongarch64-qemu-test +.PHONY: setup-vagrant vagrant-fedora-no-vdso %: $(MAKE) -C ../build $@$(target-suffix) diff --git a/contrib/apt-install b/scripts/ci/apt-install similarity index 80% rename from contrib/apt-install rename to scripts/ci/apt-install index 676e0f794..5a790901a 100755 --- a/contrib/apt-install +++ b/scripts/ci/apt-install @@ -15,7 +15,8 @@ while true; do if [ "${install_retry_counter}" -gt "${max_apt_retries}" ]; then exit 1 fi - apt-get update -y && apt-get install -y --no-install-recommends "$@" && break + # shellcheck disable=SC2068 + apt-get clean -qqy && apt-get update -qqy && apt-get install -qqy --no-install-recommends $@ && break # In case it is a network error let's wait a bit. echo "Retrying attempt ${install_retry_counter}" diff --git a/scripts/ci/asan.sh b/scripts/ci/asan.sh index 8b72fa5f1..8113b9b19 100755 --- a/scripts/ci/asan.sh +++ b/scripts/ci/asan.sh @@ -1,12 +1,11 @@ #!/bin/bash +# shellcheck disable=2044 + set -x cat /proc/self/mountinfo -time make ASAN=1 -j 4 V=1 -time make -j4 -C test/zdtm V=1 - chmod 0777 test chmod 0777 test/zdtm/transition/ chmod 0777 test/zdtm/static @@ -14,8 +13,7 @@ chmod 0777 test/zdtm/static ./test/zdtm.py run -a --keep-going -k always --parallel 4 -x zdtm/static/rtc "$@" ret=$? 
-shopt -s globstar nullglob -for i in /**/asan.log*; do +for i in $(find / -name 'asan.log*'); do echo "$i" echo ======================================== cat "$i" diff --git a/scripts/ci/docker-test.sh b/scripts/ci/docker-test.sh index c1c745544..d4b11bd55 100755 --- a/scripts/ci/docker-test.sh +++ b/scripts/ci/docker-test.sh @@ -1,35 +1,43 @@ #!/bin/bash +# shellcheck disable=SC1091,SC2015 + set -x -e -o pipefail -# Workaround: Docker 28.x and 29.x has a known regression that breaks the checkpoint and -# restore (C/R) feature. Let's install previous, or next major version. See -# https://github.com/moby/moby/issues/50750 for details on the bug. -export DEBIAN_FRONTEND=noninteractive -apt remove -y docker-ce docker-ce-cli -../../contrib/apt-install -y ca-certificates curl -install -m 0755 -d /etc/apt/keyrings -curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc -chmod a+r /etc/apt/keyrings/docker.asc -# shellcheck disable=SC1091 -echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ - $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" > /etc/apt/sources.list.d/docker.list -apt update -y -apt-cache madison docker-ce | awk '{ print $3 }' -verstr="$(apt-cache madison docker-ce | awk '{ print $3 }' | sort | grep -Ev ':(28|29)\.'| tail -n 1)" -../../contrib/apt-install -y "docker-ce=$verstr" "docker-ce-cli=$verstr" +./apt-install \ + apt-transport-https \ + ca-certificates \ + curl \ + software-properties-common + +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - + +add-apt-repository \ + "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ + $(lsb_release -cs) \ + stable test" + +./apt-install docker-ce + +. 
/etc/lsb-release -# docker checkpoint and restore is an experimental feature echo '{ "experimental": true }' > /etc/docker/daemon.json -service docker restart CRIU_LOG='/criu.log' mkdir -p /etc/criu echo "log-file=$CRIU_LOG" > /etc/criu/runc.conf -# Test checkpoint/restore with action script -echo "action-script /usr/bin/true" | sudo tee /etc/criu/default.conf +service docker stop +systemctl stop containerd.service + +# Always use the latest containerd release. +# Restore with containerd versions after v1.2.14 and before v1.5.0-beta.0 are broken. +# https://github.com/checkpoint-restore/criu/issues/1223 +CONTAINERD_DOWNLOAD_URL=$(curl -s https://api.github.com/repos/containerd/containerd/releases/latest | grep '"browser_download_url":.*/containerd-.*-linux-amd64.tar.gz.$' | cut -d\" -f4) +wget -nv "$CONTAINERD_DOWNLOAD_URL" -O - | tar -xz -C /usr/ + +systemctl restart containerd.service +service docker restart export SKIP_CI_TEST=1 @@ -77,35 +85,17 @@ checkpoint_container () { docker wait cr } -print_logs () { +restore_container () { + CHECKPOINT_NAME=$1 + + docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log || { cat "$(grep log 'log file:' | sed 's/log file:\s*//')" || true docker logs cr || true cat $CRIU_LOG || true dmesg docker ps exit 1 -} - -declare -i max_restore_container_tries=3 - -restore_container () { - CHECKPOINT_NAME=$1 - - for i in $(seq $max_restore_container_tries); do - docker start --checkpoint "$CHECKPOINT_NAME" cr 2>&1 | tee log && break - - # FIXME: There is a race condition in docker/containerd that causes - # docker to occasionally fail when starting a container from a - # checkpoint immediately after the checkpoint has been created. 
- # https://github.com/moby/moby/issues/42900 - if grep -Eq '^Error response from daemon: failed to upload checkpoint to containerd: commit failed: content sha256:.*: already exists$' log; then - echo "Retry container restore: $i/$max_restore_container_tries" - sleep 1; - else - print_logs - fi - - done + } } # Scenario: Create multiple containers and checkpoint and restore them once diff --git a/scripts/ci/java-test.sh b/scripts/ci/java-test.sh deleted file mode 100755 index a5b13a107..000000000 --- a/scripts/ci/java-test.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -cd ../.. || exit 1 - -sudo modprobe iptable_filter - -failures="" - -docker build -t criu-openj9-ubuntu-test:latest -f scripts/build/Dockerfile.openj9-ubuntu . -if ! docker run --rm --privileged criu-openj9-ubuntu-test:latest; then - failures="$failures openj9-ubuntu" -fi - -docker build -t criu-hotspot-alpine-test:latest -f scripts/build/Dockerfile.hotspot-alpine . -if ! docker run --rm --privileged criu-hotspot-alpine-test:latest; then - failures="$failures hotspot-alpine" -fi - -docker build -t criu-hotspot-ubuntu-test:latest -f scripts/build/Dockerfile.hotspot-ubuntu . -if ! 
docker run --rm --privileged criu-hotspot-ubuntu-test:latest; then - failures="$failures hotspot-ubuntu" -fi - -if [ -n "$failures" ]; then - echo "Tests failed on $failures" - exit 1 -fi diff --git a/scripts/ci/loongarch64-qemu-test.sh b/scripts/ci/loongarch64-qemu-test.sh deleted file mode 100755 index 7e00ab65a..000000000 --- a/scripts/ci/loongarch64-qemu-test.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash - -set -o nounset -set -o errexit -set -x - -../../contrib/apt-install \ - apt-transport-https \ - ca-certificates \ - curl \ - software-properties-common \ - sshpass \ - openssh-client - -curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - - -add-apt-repository \ - "deb [arch=amd64] https://download.docker.com/linux/ubuntu \ - $(lsb_release -cs) \ - stable test" - -../../contrib/apt-install docker-ce - -# shellcheck source=/dev/null -. /etc/lsb-release - -# docker checkpoint and restore is an experimental feature -echo '{ "experimental": true }' > /etc/docker/daemon.json -service docker restart - -docker info - -# run a loongarch64 vm - -PORT='2222' -USER='root' -PASSWORD='loongarch64' -NAME='vm' - -docker run \ - -d \ - --net host \ - --name $NAME \ - merore/archlinux-loongarch64 - -run() { - if [ -z "$1" ]; then - echo "Command cannot be empty." - exit 1 - fi - sshpass -p $PASSWORD ssh -o StrictHostKeyChecking=no -p $PORT $USER@127.0.0.1 "$1" -} - -# wait vm to start -while (! run "uname -a") -do - echo "Wait vm to start..." - sleep 1 -done -echo "The loongarch64 vm is started!" 
- -# Tar criu and send to vm -tar -cf criu.tar ../../../criu -sshpass -p $PASSWORD scp -o StrictHostKeyChecking=no -P $PORT criu.tar $USER@127.0.0.1:/root - -# build and test -run 'cd /root; tar -xf criu.tar' -run 'cd /root/criu; make -j4 && make -j4 -C test/zdtm' -run "cd /root/criu; ./test/zdtm.py run -t zdtm/static/maps02 -t zdtm/static/maps05 -t zdtm/static/maps06 -t zdtm/static/maps10 -t zdtm/static/maps_file_prot -t zdtm/static/memfd00 -t zdtm/transition/fork -t zdtm/transition/fork2 -t zdtm/transition/shmem -f h" diff --git a/scripts/ci/openj9-test.sh b/scripts/ci/openj9-test.sh new file mode 100755 index 000000000..b8c07f180 --- /dev/null +++ b/scripts/ci/openj9-test.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +cd ../.. || exit 1 + +failures="" + +docker build -t criu-openj9-ubuntu-test:latest -f scripts/build/Dockerfile.openj9-ubuntu . +if ! docker run --rm --privileged criu-openj9-ubuntu-test:latest; then + failures="$failures ubuntu" +fi + +docker build -t criu-openj9-alpine-test:latest -f scripts/build/Dockerfile.openj9-alpine . +if ! 
docker run --rm --privileged criu-openj9-alpine-test:latest; then + failures="$failures alpine" +fi + +if [ -n "$failures" ]; then + echo "Tests failed on $failures" + exit 1 +fi diff --git a/scripts/ci/podman-test.sh b/scripts/ci/podman-test.sh index 185783011..5e5eb764d 100755 --- a/scripts/ci/podman-test.sh +++ b/scripts/ci/podman-test.sh @@ -7,27 +7,29 @@ export SKIP_CI_TEST=1 cd ../../ -make install PREFIX=/usr +make install criu --version -# FIXME: Disable checkpoint/restore of cgroups -# https://github.com/checkpoint-restore/criu/issues/2091 -mkdir -p /etc/criu -echo "manage-cgroups ignore" > /etc/criu/runc.conf -sed -i 's/#runtime\s*=\s*.*/runtime = "runc"/' /usr/share/containers/containers.conf +# Install crun build dependencies +scripts/ci/apt-install libyajl-dev libseccomp-dev libsystemd-dev -# Test checkpoint/restore with action script -echo "action-script /usr/bin/true" | sudo tee /etc/criu/default.conf +# Install crun from source to test libcriu integration +tmp_dir=$(mktemp -d -t ci-XXXXXXXXXX) +pushd "${tmp_dir}" +git clone --depth=1 https://github.com/containers/crun +cd crun +./autogen.sh && ./configure --prefix=/usr +make -j"$(nproc)" +make install +popd +rm -rf "${tmp_dir}" -cat /proc/self/mountinfo podman info +# shellcheck disable=SC2016 podman run --name cr -d docker.io/library/alpine /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done' -# Show criu logs in case of error -trap 'cat /var/lib/containers/storage/overlay-containers/*/userdata/*.log' EXIT - sleep 1 for i in $(seq 20); do echo "Test $i for podman container checkpoint" @@ -68,5 +70,3 @@ for i in $(seq 20); do podman ps -a rm -f /tmp/chkpt.tar.gz done - -trap 'echo PASS' EXIT \ No newline at end of file diff --git a/scripts/ci/prepare-for-fedora-rawhide.sh b/scripts/ci/prepare-for-fedora-rawhide.sh index b0b45fcc3..e5900e563 100755 --- a/scripts/ci/prepare-for-fedora-rawhide.sh +++ b/scripts/ci/prepare-for-fedora-rawhide.sh @@ -1,21 +1,39 @@ #!/bin/bash set -e -x 
-contrib/dependencies/dnf-packages.sh dnf install -y \ diffutils \ - e2fsprogs \ findutils \ - gawk \ + gcc \ + git \ + gnutls-devel \ gzip \ - kmod \ - libselinux-utils \ + iproute \ + iptables \ + nftables \ + nftables-devel \ + libaio-devel \ + libasan \ + libcap-devel \ + libnet-devel \ + libnl3-devel \ + make \ procps-ng \ - python3-pip \ + protobuf-c-devel \ + protobuf-devel \ + python3-flake8 \ + python3-PyYAML \ + python3-future \ + python3-protobuf \ + python3-junit_xml \ python-unversioned-command \ redhat-rpm-config \ sudo \ - tar + tar \ + which \ + e2fsprogs \ + rubygem-asciidoctor \ + kmod # /tmp is no longer 755 in the rawhide container image and breaks CI - fix it chmod 1777 /tmp diff --git a/scripts/ci/riscv64-cross/amd64-sources.list b/scripts/ci/riscv64-cross/amd64-sources.list deleted file mode 100644 index 72dad920c..000000000 --- a/scripts/ci/riscv64-cross/amd64-sources.list +++ /dev/null @@ -1,10 +0,0 @@ -deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy main restricted -deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-updates main restricted -deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy universe -deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-updates universe -deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy multiverse -deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-updates multiverse -deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse -deb [arch=amd64] http://security.ubuntu.com/ubuntu/ jammy-security main restricted -deb [arch=amd64] http://security.ubuntu.com/ubuntu/ jammy-security universe -deb [arch=amd64] http://security.ubuntu.com/ubuntu/ jammy-security multiverse \ No newline at end of file diff --git a/scripts/ci/riscv64-cross/riscv64-sources.list b/scripts/ci/riscv64-cross/riscv64-sources.list deleted file mode 100644 index 67b8067b6..000000000 --- a/scripts/ci/riscv64-cross/riscv64-sources.list +++ /dev/null @@ -1,42 +0,0 @@ 
-# See http://help.ubuntu.com/community/UpgradeNotes for how to upgrade to -# newer versions of the distribution. -deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy main restricted -# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy main restricted - -## Major bug fix updates produced after the final release of the -## distribution. -deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-updates main restricted -# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-updates main restricted - -## N.B. software from this repository is ENTIRELY UNSUPPORTED by the Ubuntu -## team. Also, please note that software in universe WILL NOT receive any -## review or updates from the Ubuntu security team. -deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy universe -# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy universe -deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-updates universe -# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-updates universe - -## N.B. software from this repository is ENTIRELY UNSUPPORTED by the Ubuntu -## team, and may not be under a free licence. Please satisfy yourself as to -## your rights to use the software. Also, please note that software in -## multiverse WILL NOT receive any review or updates from the Ubuntu -## security team. -deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy multiverse -# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy multiverse -deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-updates multiverse -# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-updates multiverse - -## N.B. software from this repository may not have been tested as -## extensively as that contained in the main release, although it includes -## newer versions of some applications which may provide useful features. -## Also, please note that software in backports WILL NOT receive any review -## or updates from the Ubuntu security team. 
-deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-backports main restricted universe multiverse -# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-backports main restricted universe multiverse - -deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-security main restricted -# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-security main restricted -deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-security universe -# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-security universe -deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ jammy-security multiverse -# deb-src http://ports.ubuntu.com/ubuntu-ports/ jammy-security multiverse \ No newline at end of file diff --git a/scripts/ci/run-ci-tests.sh b/scripts/ci/run-ci-tests.sh index 05a3b71e8..7c66e6802 100755 --- a/scripts/ci/run-ci-tests.sh +++ b/scripts/ci/run-ci-tests.sh @@ -1,20 +1,25 @@ #!/bin/bash set -x -e -CI_PKGS=() +CI_PKGS="protobuf-c-compiler libprotobuf-c-dev libaio-dev libgnutls28-dev + libgnutls30 libprotobuf-dev protobuf-compiler libcap-dev + libnl-3-dev gdb bash libnet-dev util-linux asciidoctor + libnl-route-3-dev time flake8 libbsd-dev python3-yaml + libperl-dev pkg-config python3-future python3-protobuf + python3-junit.xml" -X86_64_PKGS=(gcc-multilib) - -# Convert from string to array. -IFS=" " read -r -a ZDTM_OPTS <<< "$ZDTM_OPTS" +X86_64_PKGS="gcc-multilib" UNAME_M=$(uname -m) if [ "$UNAME_M" != "x86_64" ]; then - # Some tests rely on kernel features that may not be available - # when running in a container. Here we assume that x86_64 systems - # are baremetal, and skip the tests for all other CPU architectures. - # The RUN_TESTS environment variable can override this, e.g., for aarch64. + # For Travis only x86_64 seems to be baremetal. Other + # architectures are running in unprivileged LXD containers. + # That seems to block most of CRIU's interfaces. 
+ + # But with the introduction of baremetal aarch64 systems in + # Travis (arch: arm64-graviton2) we can override this using + # an evironment variable [ -n "$RUN_TESTS" ] || SKIP_CI_TEST=1 fi @@ -28,13 +33,9 @@ ci_prep () { # not run anymore with 'sudo -u \#1000' if the UID does not exist. adduser -u 1000 --disabled-password --gecos "criutest" criutest || : - # This can fail on aarch64 + # This can fail on aarch64 travis service apport stop || : - # Ubuntu has set up AppArmor in 24.04 so that it blocks use of user - # namespaces by unprivileged users. We need this for some of our tests. - sysctl kernel.apparmor_restrict_unprivileged_userns=0 || : - if [ "$CLANG" = "1" ]; then # clang support CC=clang @@ -45,28 +46,27 @@ ci_prep () { else CC=gcc fi - CI_PKGS+=("$CC") + CI_PKGS="$CI_PKGS $CC" # Do not install x86_64 specific packages on other architectures if [ "$UNAME_M" = "x86_64" ]; then - CI_PKGS+=("${X86_64_PKGS[@]}") + CI_PKGS="$CI_PKGS $X86_64_PKGS" fi - contrib/dependencies/apt-packages.sh - contrib/apt-install "${CI_PKGS[@]}" + scripts/ci/apt-install "$CI_PKGS" chmod a+x "$HOME" + + # zdtm uses an unversioned python binary to run the tests. + # let's point python to python3 + ln -sf /usr/bin/python3 /usr/bin/python } test_stream() { - # Testing CRIU streaming to criu-image-streamer - - # FIXME: Currently, hugetlb mappings is not premapped, so in the restore content - # phase, we skip page read these pages, enqueue the iovec for later reading in - # restorer and eventually close the page read. However, image-streamer expects the - # whole image to be read and the image is not reopened, sent twice. These MAP_HUGETLB - # test cases will result in EPIPE error at the moment. - STREAM_TEST_EXCLUDE=(-x maps09 -x maps10) - ./test/zdtm.py run --stream -p 2 --keep-going -a "${STREAM_TEST_EXCLUDE[@]}" "${ZDTM_OPTS[@]}" + # We must test CRIU features that dump content into an image file to ensure + # streaming compatibility. 
+ STREAM_TEST_PATTERN='.*(ghost|fifo|unlink|memfd|shmem|socket_queue).*' + # shellcheck disable=SC2086 + ./test/zdtm.py run --stream -p 2 --keep-going -T "$STREAM_TEST_PATTERN" $ZDTM_OPTS } print_header() { @@ -83,11 +83,9 @@ print_env() { print_header "uname -a" uname -a || : print_header "Mounted file systems" - cat /proc/self/mountinfo || : + mount || : print_header "Kernel command line" cat /proc/cmdline || : - print_header "Kernel modules" - lsmod || : print_header "Distribution information" [ -e /etc/lsb-release ] && cat /etc/lsb-release [ -e /etc/redhat-release ] && cat /etc/redhat-release @@ -106,9 +104,6 @@ print_env() { set -x } -# FIXME: workaround for the issue https://github.com/checkpoint-restore/criu/issues/1866 -modprobe -v sit || : - print_env ci_prep @@ -118,14 +113,8 @@ if [ "${CD_TO_TOP}" = "1" ]; then fi export GCOV CC -if [ -z "$COMPILE_FLAGS" ]; then - LOCAL_COMPILE_FLAGS=("V=1") -else - IFS=" " read -r -a LOCAL_COMPILE_FLAGS <<< "$COMPILE_FLAGS" - LOCAL_COMPILE_FLAGS=("V=1" "${LOCAL_COMPILE_FLAGS[@]}") -fi $CC --version -time make CC="$CC" -j4 "${LOCAL_COMPILE_FLAGS[@]}" +time make CC="$CC" -j4 V=1 ./criu/criu -v4 cpuinfo dump || : ./criu/criu -v4 cpuinfo check || : @@ -143,21 +132,8 @@ time make unittest [ -n "$SKIP_CI_TEST" ] && exit 0 -# Umount cpuset in cgroupv1 to make it move to cgroupv2 -if [ -d /sys/fs/cgroup/cpuset ]; then - umount /sys/fs/cgroup/cpuset -fi - ulimit -c unlimited -cgid=$$ -cleanup_cgroup() { - ./test/zdtm_umount_cgroups $cgid - dmesg -} -trap cleanup_cgroup EXIT -./test/zdtm_mount_cgroups $cgid - echo "|$(pwd)/test/abrt.sh %P %p %s %e" > /proc/sys/kernel/core_pattern if [ "${COMPAT_TEST}x" = "yx" ] ; then @@ -167,20 +143,21 @@ if [ "${COMPAT_TEST}x" = "yx" ] ; then # for 32-bit tests. A better way would involve launching docker.. # But it would require making zdtm.py aware of docker and launching # tests inside the CT. 
- INCOMPATIBLE_LIBS=(libaio-dev libcap-dev libnl-3-dev libnl-route-3-dev) - IA32_PKGS=() + INCOMPATIBLE_LIBS="libaio-dev libcap-dev libnl-3-dev libnl-route-3-dev" + IA32_PKGS="" REFUGE=64-refuge mkdir "$REFUGE" - for i in "${INCOMPATIBLE_LIBS[@]}" ; do + for i in $INCOMPATIBLE_LIBS ; do for j in $(dpkg --listfiles "$i" | grep '\.so$') ; do cp "$j" "$REFUGE/" done - IA32_PKGS+=("$i:i386") + IA32_PKGS="$IA32_PKGS $i:i386" done - apt-get remove "${INCOMPATIBLE_LIBS[@]}" + # shellcheck disable=SC2086 + apt-get remove $INCOMPATIBLE_LIBS dpkg --add-architecture i386 - contrib/apt-install "${IA32_PKGS[@]}" + scripts/ci/apt-install "$IA32_PKGS" mkdir -p /usr/lib/x86_64-linux-gnu/ mv "$REFUGE"/* /usr/lib/x86_64-linux-gnu/ fi @@ -217,27 +194,22 @@ if [ "${STREAM_TEST}" = "1" ]; then exit 0 fi -./test/zdtm.py run -a -p 2 --keep-going "${ZDTM_OPTS[@]}" -if criu/criu check --feature move_mount_set_group; then - ./test/zdtm.py run -a -p 2 --mntns-compat-mode --keep-going "${ZDTM_OPTS[@]}" -fi +# shellcheck disable=SC2086 +./test/zdtm.py run -a -p 2 --keep-going $ZDTM_OPTS -./test/zdtm.py run -a -p 2 --keep-going --criu-config "${ZDTM_OPTS[@]}" - -# Newer kernels are blocking access to userfaultfd: -# uffd: Set unprivileged_userfaultfd sysctl knob to 1 if kernel faults must be handled without obtaining CAP_SYS_PTRACE capability -if [ -e /proc/sys/vm/unprivileged_userfaultfd ]; then - echo 1 > /proc/sys/vm/unprivileged_userfaultfd -fi - -LAZY_EXCLUDE=(-x maps04 -x cmdlinenv00 -x maps007) +LAZY_EXCLUDE="-x maps04 -x cmdlinenv00 -x maps007" LAZY_TESTS='.*(maps0|uffd-events|lazy-thp|futex|fork).*' -LAZY_OPTS=(-p 2 -T "$LAZY_TESTS" "${LAZY_EXCLUDE[@]}" "${ZDTM_OPTS[@]}") +LAZY_OPTS="-p 2 -T $LAZY_TESTS $LAZY_EXCLUDE $ZDTM_OPTS" -./test/zdtm.py run "${LAZY_OPTS[@]}" --lazy-pages -./test/zdtm.py run "${LAZY_OPTS[@]}" --remote-lazy-pages -./test/zdtm.py run "${LAZY_OPTS[@]}" --remote-lazy-pages --tls +# shellcheck disable=SC2086 +./test/zdtm.py run $LAZY_OPTS --lazy-pages +# shellcheck 
disable=SC2086 +./test/zdtm.py run $LAZY_OPTS --remote-lazy-pages +# FIXME: post-copy migration of THP over TLS (sometimes) fails with: +# Error (criu/tls.c:321): tls: Pull callback recv failed: Connection reset by peer +# shellcheck disable=SC2086 +./test/zdtm.py run $LAZY_OPTS --remote-lazy-pages --tls -x lazy-thp bash -x ./test/jenkins/criu-fault.sh if [ "$UNAME_M" == "x86_64" ]; then @@ -249,31 +221,22 @@ bash -x ./test/jenkins/criu-inhfd.sh if [ -z "$SKIP_EXT_DEV_TEST" ]; then make -C test/others/mnt-ext-dev/ run - if criu/criu check --feature move_mount_set_group; then - EXTRA_OPTS=--mntns-compat-mode make -C test/others/mnt-ext-dev/ run - fi fi make -C test/others/make/ run CC="$CC" -if [ -n "$CIRCLECI" ]; then +if [ -n "$TRAVIS" ] || [ -n "$CIRCLECI" ]; then # GitHub Actions (and Cirrus CI) does not provide a real TTY and CRIU will fail with: # Error (criu/tty.c:1014): tty: Don't have tty to inherit session from, aborting make -C test/others/shell-job/ run fi -make -C test/others/criu-ns/ run -make -C test/others/skip-file-rwx-check/ run make -C test/others/rpc/ run ./test/zdtm.py run -t zdtm/static/env00 --sibling -./test/zdtm.py run -t zdtm/static/maps00 --preload-libfault -./test/zdtm.py run -t zdtm/static/maps02 --preload-libfault - ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --dedup ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --noauto-dedup ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server ./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --page-server --dedup -./test/zdtm.py run -t zdtm/transition/maps007 --pre 2 --pre-dump-mode read ./test/zdtm.py run -t zdtm/transition/pid_reuse --pre 2 # start time based pid reuse detection ./test/zdtm.py run -t zdtm/transition/pidfd_store_sk --rpc --pre 2 # pidfd based pid reuse detection @@ -293,51 +256,9 @@ ip net add test ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/transition/fork -t zdtm/static/ghost_holes00 -t zdtm/static/socket-tcp -t zdtm/static/msgque -k always 
./test/crit-recode.py -# Rootless tests -# Check if cap_checkpoint_restore is supported and also if unshare -c is supported. -# -# Do not run this test in a container (see https://github.com/checkpoint-restore/criu/issues/2312). -# Before v6.8-rc1~215^2~6, the kernel currently did not show correct device and -# inode numbers in /proc/pid/maps for stackable file systems. -skip=0 -findmnt -no FSTYPE / | grep overlay && { - ./criu/criu check --feature overlayfs_maps || skip=1 -} -unshare -c /bin/true || skip=1 -capsh --supports=cap_checkpoint_restore || skip=1 - -if [ "$skip" == 0 ]; then - make -C test/zdtm/ cleanout - rm -rf test/dump - setcap cap_checkpoint_restore,cap_sys_ptrace+eip criu/criu - if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then - # Note: selinux in Enforcing mode prevents us from calling clone3() or writing to ns_last_pid on restore; hence set to Permissive for the test and then set back. - selinuxmode=$(getenforce) - if [ "$selinuxmode" != "Disabled" ]; then - setenforce Permissive - fi - - fi - # Run it as non-root in a user namespace. Since CAP_CHECKPOINT_RESTORE behaves differently in non-user namespaces (e.g. no access to map_files) this tests that we can dump and restore - # under those conditions. Note that the "... && true" part is necessary; we need at least one statement after the tests so that bash can reap zombies in the user namespace, - # otherwise it will exec the last statement and get replaced and nobody will be left to reap our zombies. 
- sudo --user=#65534 --group=#65534 unshare -Ucfpm --mount-proc -- bash -c "./test/zdtm.py run -t zdtm/static/maps00 -f h --rootless && true" - if [ -d /sys/fs/selinux ] && command -v getenforce &>/dev/null; then - if [ "$selinuxmode" != "Disabled" ]; then - setenforce "$selinuxmode" - fi - fi - setcap -r criu/criu -else - echo "Skipping unprivileged mode tests" -fi - # more crit testing make -C test/others/crit run -# coredump testing -make -C test/others/criu-coredump run - # libcriu testing make -C test/others/libcriu run @@ -347,9 +268,6 @@ make -C test/others/ns_ext run # config file parser and parameter testing make -C test/others/config-file run -# action script testing -make -C test/others/action-script run - # Skip all further tests when running with GCOV=1 # The one test which currently cannot handle GCOV testing is compel/test # Probably because the GCOV Makefile infrastructure does not exist in compel @@ -357,15 +275,3 @@ make -C test/others/action-script run # compel testing make -C compel/test - -# amdgpu and cuda plugin testing -make amdgpu_plugin -make -C plugins/amdgpu/ test_topology_remap -./plugins/amdgpu/test_topology_remap - -./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin cuda -./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu -./test/zdtm.py run -t zdtm/static/maps00 -t zdtm/static/maps02 --criu-plugin amdgpu cuda -./test/zdtm.py run -t zdtm/static/busyloop00 --criu-plugin inventory_test_enabled inventory_test_disabled - -./test/zdtm.py run -t zdtm/static/sigpending -t zdtm/static/pthread00 --mocked-cuda-checkpoint --fault 138 diff --git a/scripts/ci/vagrant.sh b/scripts/ci/vagrant.sh index 5f2de32b8..839b100c8 100755 --- a/scripts/ci/vagrant.sh +++ b/scripts/ci/vagrant.sh @@ -1,47 +1,47 @@ #!/bin/bash -# This script is used to run vagrant based tests on Cirrus CI. -# This script is started via .cirrus.yml +# This script is used to run vagrant based tests on Travis. 
+# This script is started via sudo from .travis.yml set -e set -x -VAGRANT_VERSION=2.4.7 -FEDORA_VERSION=42 -FEDORA_BOX_VERSION=1.1.0 +VAGRANT_VERSION=2.2.16 +FEDORA_VERSION=34 +FEDORA_BOX_VERSION=34.20210423.0 setup() { + if [ -n "$TRAVIS" ]; then + # Load the kvm modules for vagrant to use qemu + modprobe kvm kvm_intel + fi + # Tar up the git checkout to have vagrant rsync it to the VM - tar cf /tmp/criu.tar -C ../../../ criu + tar cf criu.tar ../../../criu # Cirrus has problems with the following certificate. - wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}-1_"$(dpkg --print-architecture)".deb -O /tmp/vagrant.deb && \ + wget --no-check-certificate https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}_"$(uname -m)".deb -O /tmp/vagrant.deb && \ dpkg -i /tmp/vagrant.deb - ../../contrib/apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu-system \ - ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base openssh-client + ./apt-install libvirt-clients libvirt-daemon-system libvirt-dev qemu-utils qemu \ + ruby build-essential libxml2-dev qemu-kvm rsync ebtables dnsmasq-base \ + openssh-client systemctl restart libvirtd vagrant plugin install vagrant-libvirt - vagrant init cloud-image/fedora-${FEDORA_VERSION} --box-version ${FEDORA_BOX_VERSION} - + vagrant init fedora/${FEDORA_VERSION}-cloud-base --box-version ${FEDORA_BOX_VERSION} # The default libvirt Vagrant VM uses 512MB. - # VMs in our CI typically have around 16GB. + # Travis VMs should have around 7.5GB. # Increasing it to 4GB should work. 
sed -i Vagrantfile -e 's,^end$, config.vm.provider :libvirt do |libvirt|'"\n"' libvirt.memory = 4096;end'"\n"'end,g' - # Sync /tmp/criu.tar into the VM - # We want to use $HOME without expansion - # shellcheck disable=SC2016 - sed -i Vagrantfile -e 's|^end$| config.vm.provision "file", source: "/tmp/criu.tar", destination: "$HOME/criu.tar"'"\n"'end|g' - vagrant up --provider=libvirt --no-tty mkdir -p /root/.ssh vagrant ssh-config >> /root/.ssh/config - + ssh default sudo dnf upgrade -y + ssh default sudo dnf install -y gcc git gnutls-devel nftables-devel libaio-devel \ + libasan libcap-devel libnet-devel libnl3-devel make protobuf-c-devel \ + protobuf-devel python3-flake8 python3-future python3-protobuf \ + python3-junit_xml rubygem-asciidoctor iptables libselinux-devel libbpf-devel # Disable sssd to avoid zdtm test failures in pty04 due to sssd socket ssh default sudo systemctl mask sssd - - ssh default 'sudo mkdir -p --mode=777 /vagrant && mv $HOME/criu.tar /vagrant && cd /vagrant && tar xf criu.tar' - ssh default sudo dnf upgrade -y - ssh default sudo /vagrant/criu/contrib/dependencies/dnf-packages.sh ssh default cat /proc/cmdline } @@ -49,53 +49,13 @@ fedora-no-vdso() { ssh default sudo grubby --update-kernel ALL --args="vdso=0" vagrant reload ssh default cat /proc/cmdline - ssh default 'cd /vagrant/criu; make -j' - ssh default 'cd /vagrant/criu/test; sudo ./zdtm.py run -a --keep-going' + ssh default 'cd /vagrant; tar xf criu.tar; cd criu; make -j 4' + # BPF tests are failing see: https://github.com/checkpoint-restore/criu/issues/1354 + # Needs to be fixed, skip for now + ssh default 'cd /vagrant/criu/test; sudo ./zdtm.py run -a --keep-going -x zdtm/static/bpf_hash -x zdtm/static/bpf_array' # This test (pidfd_store_sk) requires pidfd_getfd syscall which is guaranteed in Fedora 33. 
# It is also skipped from -a because it runs in RPC mode only ssh default 'cd /vagrant/criu/test; sudo ./zdtm.py run -t zdtm/transition/pidfd_store_sk --rpc --pre 2' } -fedora-rawhide() { - # Upgrade the kernel to the latest vanilla one - ssh default sudo dnf -y copr enable @kernel-vanilla/stable - ssh default sudo dnf upgrade -y - - # The 6.2 kernel of Fedora 38 in combination with rawhide userspace breaks - # zdtm/static/socket-tcp-nfconntrack. To activate the new kernel previously - # installed this reboots the VM. - vagrant reload - ssh default uname -a - # - # Workaround the problem: - # error running container: error from /usr/bin/crun creating container for [...]: sd-bus call: Transport endpoint is not connected - # Let's just use runc instead of crun - # see also https://github.com/kata-containers/tests/issues/4283 - # - ssh default 'sudo dnf remove -y crun || true' - ssh default sudo dnf install -y podman runc - # Some tests in the container need selinux to be disabled. - # In the container it is not possible to change the state of selinux. - # Let's just disable it for this test run completely. 
- ssh default 'sudo setenforce Permissive' - ssh default 'cd /vagrant/criu; sudo -E make -C scripts/ci fedora-rawhide CONTAINER_RUNTIME=podman BUILD_OPTIONS="--security-opt seccomp=unconfined"' -} - -fedora-non-root() { - ssh default uname -a - ssh default 'cd /vagrant/criu; make -j' - # Setting the capability should be the only line needed to run as non-root on Fedora - # In other environments either set /proc/sys/kernel/yama/ptrace_scope to 0 or grant cap_sys_ptrace to criu - ssh default 'sudo setcap cap_checkpoint_restore+eip /vagrant/criu/criu/criu' - # Run it once as non-root - ssh default 'cd /vagrant/criu; criu/criu check --unprivileged; ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h --rootless' - # Run it as root with '--rootless' - ssh default 'cd /vagrant/criu; sudo ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h; sudo chmod 777 test/dump/zdtm/static/{env00,pthread00}; sudo ./test/zdtm.py run -t zdtm/static/env00 -t zdtm/static/pthread00 -f h --rootless' - # Run it as non-root in a user namespace. Since CAP_CHECKPOINT_RESTORE behaves differently in non-user namespaces (e.g. no access to map_files) this tests that we can dump and restore - # under those conditions. Note that the "... && true" part is necessary; we need at least one statement after the tests so that bash can reap zombies in the user namespace, - # otherwise it will exec the last statement and get replaced and nobody will be left to reap our zombies. - # Note: selinux in Enforcing mode prevents us from calling clone3() or writing to ns_last_pid on restore; hence set to Permissive for the test and then set back. 
- ssh default 'cd /vagrant/criu; selinuxmode=`getenforce` && sudo setenforce Permissive && unshare -Ucfpm --mount-proc bash -c "./test/zdtm.py run -t zdtm/static/maps00 -f h --rootless && true" && sudo setenforce $selinuxmode' -} - $1 diff --git a/scripts/crit-setup.py b/scripts/crit-setup.py new file mode 100644 index 000000000..871e55921 --- /dev/null +++ b/scripts/crit-setup.py @@ -0,0 +1,11 @@ +from distutils.core import setup + +setup(name="crit", + version="0.0.1", + description="CRiu Image Tool", + author="CRIU team", + author_email="criu@openvz.org", + url="https://github.com/checkpoint-restore/criu", + package_dir={'pycriu': 'lib/py'}, + packages=["pycriu", "pycriu.images"], + scripts=["crit/crit"]) diff --git a/scripts/criu-ns b/scripts/criu-ns index 5950d7c50..a97c0002b 100755 --- a/scripts/criu-ns +++ b/scripts/criu-ns @@ -1,12 +1,9 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python import ctypes import ctypes.util import errno import sys import os -import fcntl -import termios -import time # constants for unshare CLONE_NEWNS = 0x00020000 @@ -71,19 +68,7 @@ def _wait_for_process_status(criu_pid): try: (pid, status) = os.wait() if pid == criu_pid: - # The following code block is based on - # os.waitstatus_to_exitcode() introduced in Python 3.9 - # and we implement this for comparability with older - # versions of Python. 
- if os.WIFSIGNALED(status): - return os.WTERMSIG(status) - elif os.WIFEXITED(status): - return os.WEXITSTATUS(status) - elif os.WIFSTOPPED(status): - return os.WSTOPSIG(status) - else: - raise Exception("CRIU was terminated by an " - "unidentified reason") + return os.WEXITSTATUS(status) except OSError: return -251 @@ -93,42 +78,8 @@ def run_criu(args): Spawn CRIU binary """ print(sys.argv) - - if "--criu-binary" in args: - try: - opt_index = args.index("--criu-binary") - path = args[opt_index + 1] - del args[opt_index:opt_index + 2] - args.insert(0, "criu") - os.execv(path, args) - raise OSError(errno.ENOENT, "No such command") - except (ValueError, IndexError, FileNotFoundError): - raise OSError(errno.ENOENT, "--criu-binary missing argument") - else: - args.insert(0, "criu") - os.execvp("criu", args) - raise OSError(errno.ENOENT, "No such command") - - -# pidns_holder creates a process that is reparented to the init. -# -# The init process can exit if it doesn't have any child processes and its -# pidns is destroyed in this case. CRIU dump is running in the target pid -# namespace and it kills dumped processes at the end. We need to create a -# holder process to be sure that the pid namespace will not be destroy before -# criu exits. -def pidns_holder(): - r, w = os.pipe() - pid = os.fork() - if pid == 0: - pid = os.fork() - if pid == 0: - os.close(w) - # The write end is owned by the parent process and it is closed by - # kernel when the parent process exits. 
- os.read(r, 1) - sys.exit(0) - os.waitpid(pid, 0) + os.execlp('criu', *['criu'] + args) + raise OSError(errno.ENOENT, "No such command") def wrap_restore(): @@ -136,8 +87,8 @@ def wrap_restore(): if '--restore-sibling' in restore_args: raise OSError(errno.EINVAL, "--restore-sibling is not supported") - # Unshare pid namespace - if _unshare(CLONE_NEWPID) != 0: + # Unshare pid and mount namespaces + if _unshare(CLONE_NEWNS | CLONE_NEWPID) != 0: _errno = ctypes.get_errno() raise OSError(_errno, errno.errorcode[_errno]) @@ -149,65 +100,11 @@ def wrap_restore(): restore_detached = True restore_args.remove('--restore-detached') - restore_pidfile = None - if '--pidfile' in restore_args: - try: - opt_index = restore_args.index('--pidfile') - restore_pidfile = restore_args[opt_index + 1] - del restore_args[opt_index:opt_index + 2] - except (ValueError, IndexError, FileNotFoundError): - raise OSError(errno.ENOENT, "--pidfile missing argument") - - if not restore_pidfile.startswith('/'): - for base_dir_opt in ['--work-dir', '-W', '--images-dir', '-D']: - if base_dir_opt in restore_args: - try: - opt_index = restore_args.index(base_dir_opt) - restore_pidfile = os.path.join(restore_args[opt_index + 1], restore_pidfile) - break - except (ValueError, IndexError, FileNotFoundError): - raise OSError(errno.ENOENT, base_dir_opt + " missing argument") - criu_pid = os.fork() if criu_pid == 0: - # Unshare mount namespace - if _unshare(CLONE_NEWNS) != 0: - _errno = ctypes.get_errno() - raise OSError(_errno, errno.errorcode[_errno]) - - os.setsid() - # Set stdin tty to be a controlling tty of our new session, this is - # required by --shell-job option, as for it CRIU would try to set a - # process group of restored root task to be a foreground group on the - # terminal. 
- if '--shell-job' in restore_args or '-j' in restore_args: - if os.isatty(sys.stdin.fileno()): - fcntl.ioctl(sys.stdin.fileno(), termios.TIOCSCTTY, 1) - else: - raise OSError(errno.EINVAL, 'The stdin is not a tty for a --shell-job') - _mount_new_proc() run_criu(restore_args) - if restore_pidfile: - restored_pid = None - retry = 5 - - while not restored_pid and retry: - with open('/proc/%d/task/%d/children' % (criu_pid, criu_pid)) as f: - line = f.readline().strip() - if len(line): - restored_pid = line - break - retry -= 1 - time.sleep(1) - - if restored_pid: - with open(restore_pidfile, 'w+') as f: - f.write(restored_pid) - else: - print("Warn: Search of restored pid for --pidfile option timeouted") - if restore_detached: return 0 @@ -216,7 +113,7 @@ def wrap_restore(): def get_varg(args): for i in range(1, len(sys.argv)): - if sys.argv[i] not in args: + if not sys.argv[i] in args: continue if i + 1 >= len(sys.argv): @@ -234,9 +131,9 @@ def _set_namespace(fd): raise OSError(_errno, errno.errorcode[_errno]) -def is_my_namespace(fd, ns): +def is_my_namespace(fd): """Returns True if fd refers to current namespace""" - return os.stat('/proc/self/ns/%s' % ns).st_ino == os.fstat(fd).st_ino + return os.stat('/proc/self/ns/pid').st_ino != os.fstat(fd).st_ino def set_pidns(tpid, pid_idx): @@ -246,7 +143,7 @@ def set_pidns(tpid, pid_idx): pid namespace. """ ns_fd = os.open('/proc/%s/ns/pid' % tpid, os.O_RDONLY) - if not is_my_namespace(ns_fd, "pid"): + if is_my_namespace(ns_fd): for line in open('/proc/%s/status' % tpid): if not line.startswith('NSpid:'): continue @@ -271,7 +168,7 @@ def set_mntns(tpid): will be the same in target mntns. 
""" ns_fd = os.open('/proc/%s/ns/mnt' % tpid, os.O_RDONLY) - if not is_my_namespace(ns_fd, "mnt"): + if is_my_namespace(ns_fd): root_st = os.stat('/') cwd_st = os.stat('.') cwd_path = os.path.realpath('.') @@ -302,12 +199,10 @@ def wrap_dump(): set_pidns(pid, pid_idx) set_mntns(pid) - pidns_holder() - criu_pid = os.fork() if criu_pid == 0: run_criu(sys.argv[1:]) - return _wait_for_process_status(criu_pid) + return _wait_for_process_status(pid) def show_usage(): diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak index 727e9689e..8df20afb7 100644 --- a/scripts/feature-tests.mak +++ b/scripts/feature-tests.mak @@ -35,6 +35,34 @@ int main(void) } endef +define FEATURE_TEST_STRLCPY + +#include + +#ifdef CONFIG_HAS_LIBBSD +# include +#endif + +int main(void) +{ + return strlcpy(NULL, NULL, 0); +} +endef + +define FEATURE_TEST_STRLCAT + +#include + +#ifdef CONFIG_HAS_LIBBSD +# include +#endif + +int main(void) +{ + return strlcat(NULL, NULL, 0); +} +endef + define FEATURE_TEST_PTRACE_PEEKSIGINFO #include @@ -109,6 +137,19 @@ ENTRY(main) END(main) endef +define FEATURE_TEST_FSCONFIG + +#include + +int main(void) +{ + if (FSCONFIG_CMD_CREATE > 0) + return 0; + return 0; +} + +endef + define FEATURE_TEST_NFTABLES_LIB_API_0 #include @@ -143,34 +184,3 @@ int main(void) return memfd_create(NULL, 0); } endef - -define FEATURE_TEST_OPENAT2 - -#include - -int main(void) -{ - if (RESOLVE_NO_XDEV > 0) - return 0; - return 0; -} -endef - -define FEATURE_TEST_NO_LIBC_RSEQ_DEFS - -#ifdef __has_include -#if __has_include(\"sys/rseq.h\") -#include -#endif -#endif - -enum rseq_cpu_id_state { - RSEQ_CPU_ID_UNINITIALIZED = -1, - RSEQ_CPU_ID_REGISTRATION_FAILED = -2, -}; - -int main(void) -{ - return 0; -} -endef diff --git a/scripts/fetch-clang-format.sh b/scripts/fetch-clang-format.sh index 5b6037d61..c9006c518 100755 --- a/scripts/fetch-clang-format.sh +++ b/scripts/fetch-clang-format.sh @@ -8,11 +8,8 @@ 
URL="https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/.c curl -s "${URL}" | sed -e " s,^\( *\)#\([A-Z]\),\1\2,g; s,ControlStatements,ControlStatementsExceptForEachMacros,g; - s,ColumnLimit: 80,ColumnLimit: 0,g; + s,ColumnLimit: 80,ColumnLimit: 120,g; s,Intended for clang-format >= 4,Intended for clang-format >= 11,g; - s,ForEachMacros:,ForEachMacros:\n - 'for_each_bit',g; s,ForEachMacros:,ForEachMacros:\n - 'for_each_pstree_item',g; s,\(AlignTrailingComments:.*\)$,\1\nAlignConsecutiveMacros: true,g; - s,AlignTrailingComments: false,AlignTrailingComments: true,g; - s,\(IndentCaseLabels: false\),\1\nIndentGotoLabels: false,g; " > .clang-format diff --git a/scripts/flake8.cfg b/scripts/flake8.cfg index bd4f95bb2..b6a587729 100644 --- a/scripts/flake8.cfg +++ b/scripts/flake8.cfg @@ -2,5 +2,3 @@ # E501 line too long # W504 line break after binary operator ignore = E501,W504 -# F401: imported but unused -per-file-ignores = __init__.py:F401 diff --git a/scripts/github-indent-warnings.py b/scripts/github-indent-warnings.py deleted file mode 100755 index 04f82d6c1..000000000 --- a/scripts/github-indent-warnings.py +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/python3 -import sys -import re - -re_file = r'^diff --git a/(\S\S*)\s.*$' -re_line = r'^@@ -(\d\d*)\D.*@@.*$' - -if __name__ == '__main__': - if len(sys.argv) != 1 and len(sys.argv) != 2: - print(f'usage: {sys.argv[0]} ') - print(f'usage: | {sys.argv[0]}') - exit(1) - - input_file = sys.stdin.fileno() - if len(sys.argv) == 2: - input_file = sys.argv[1] - - with open(input_file, 'r') as fi: - file_name = None - line_number = None - for line in fi: - file_matches = re.findall(re_file, line) - if len(file_matches) == 1: - file_name = file_matches[0] - continue - - if file_name is None: - continue - - line_matches = re.findall(re_line, line) - if len(line_matches) == 1: - line_number = int(line_matches[0]) + 3 - print(f'::warning file={file_name},line={line_number}::clang-format: Possible coding style 
problem (https://github.com/checkpoint-restore/criu/blob/criu-dev/CONTRIBUTING.md#automatic-tools-to-fix-coding-style)') diff --git a/scripts/install-debian-pkgs.sh b/scripts/install-debian-pkgs.sh new file mode 100755 index 000000000..540c2c094 --- /dev/null +++ b/scripts/install-debian-pkgs.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Install required packages for development environment in Debian Distro + +REQ_PKGS=${REQ_PKGS:=contrib/debian/dev-packages.lst} + +help_msg="Install required packages for development environment in Debian Distro +Usage: + scripts/install-debian-pkgs.sh" + +function print_help() +{ + exec echo -e "$help_msg" +} + +function process() +{ + sudo apt-get update + sudo apt-get install -yq "$( sed 's/\#.*$//' ${REQ_PKGS} )" +} + +if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then + print_help +else + process +fi diff --git a/scripts/magic-gen.py b/scripts/magic-gen.py index 38dff1424..3b1f29fb5 100755 --- a/scripts/magic-gen.py +++ b/scripts/magic-gen.py @@ -1,4 +1,4 @@ -#!/bin/env python3 +#!/bin/env python2 import sys diff --git a/scripts/nmk/scripts/include.mk b/scripts/nmk/scripts/include.mk index 603c322cf..c1c1e94af 100644 --- a/scripts/nmk/scripts/include.mk +++ b/scripts/nmk/scripts/include.mk @@ -20,9 +20,7 @@ ARCH ?= $(shell echo $(SUBARCH) | sed \ -e s/ppc64.*/ppc64/ \ -e s/mips.*/mips/ \ -e s/sh[234].*/sh/ \ - -e s/aarch64.*/aarch64/ \ - -e s/riscv64.*/riscv64/ \ - -e s/loongarch64.*/loongarch64/) + -e s/aarch64.*/aarch64/) export SUBARCH ARCH diff --git a/scripts/nmk/scripts/macro.mk b/scripts/nmk/scripts/macro.mk index 1dcbbd6ec..b36d5b26a 100644 --- a/scripts/nmk/scripts/macro.mk +++ b/scripts/nmk/scripts/macro.mk @@ -10,7 +10,7 @@ define include-once endef # Helper to build built-in target in directory. 
-# $(eval $(call gen-built-in,,,)) +# $(eval $(call gen-built-in,,,)) define gen-built-in $(1)/%: $(2) $$(Q) $$(MAKE) $$(build)=$(1) $$@ diff --git a/scripts/nmk/scripts/main.mk b/scripts/nmk/scripts/main.mk index 7f11bda23..493a164f8 100644 --- a/scripts/nmk/scripts/main.mk +++ b/scripts/nmk/scripts/main.mk @@ -1,7 +1,7 @@ ifndef ____nmk_defined__main # -# General inclusion statement +# Genaral inclusion statement ifndef ____nmk_defined__include include $(__nmk_dir)include.mk diff --git a/scripts/nmk/scripts/msg.mk b/scripts/nmk/scripts/msg.mk index 38fd3cb7e..d07f21607 100644 --- a/scripts/nmk/scripts/msg.mk +++ b/scripts/nmk/scripts/msg.mk @@ -59,7 +59,7 @@ define newline endef -# map function: +# map funciton: # $1 - func to call # $2 - list over which map the $1 func # result is divided with newlines diff --git a/scripts/nmk/scripts/tools.mk b/scripts/nmk/scripts/tools.mk index de5782c13..1681d4e90 100644 --- a/scripts/nmk/scripts/tools.mk +++ b/scripts/nmk/scripts/tools.mk @@ -23,7 +23,7 @@ MAKE := make MKDIR := mkdir -p AWK := awk PERL := perl -FULL_PYTHON := $(shell command -v python3 2>/dev/null) +FULL_PYTHON := $(shell which python3 2>/dev/null || which python2 2>/dev/null) PYTHON ?= $(shell basename $(FULL_PYTHON)) FIND := find SH := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ @@ -36,7 +36,7 @@ CTAGS := ctags export RM HOSTLD LD HOSTCC CC CPP AS AR STRIP OBJCOPY OBJDUMP export NM SH MAKE MKDIR AWK PERL PYTHON SH CSCOPE -export USE_ASCIIDOCTOR ?= $(shell command -v asciidoctor 2>/dev/null) +export USE_ASCIIDOCTOR ?= $(shell which asciidoctor 2>/dev/null) # # Footer. 
diff --git a/scripts/protobuf-gen.sh b/scripts/protobuf-gen.sh index 25d2feaeb..0c738f13a 100644 --- a/scripts/protobuf-gen.sh +++ b/scripts/protobuf-gen.sh @@ -1,15 +1,15 @@ #!/bin/bash +# shellcheck disable=SC2013,SC1004 + TR="y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/" -sed -n '/PB_AUTOGEN_START/,/PB_AUTOGEN_STOP/ { +for x in $(sed -n '/PB_AUTOGEN_START/,/PB_AUTOGEN_STOP/ { /PB_AUTOGEN_ST/d; - /^[ \t]*$/d; s/,.*$//; s/\tPB_//; p; - }' criu/include/protobuf-desc.h | \ -while IFS= read -r x; do + }' criu/include/protobuf-desc.h); do x_la=$(echo "$x" | sed $TR) x_uf=$(echo "$x" | sed -nr 's/^./&#\\\ /; diff --git a/scripts/ruff.toml b/scripts/ruff.toml deleted file mode 100644 index 2b0385976..000000000 --- a/scripts/ruff.toml +++ /dev/null @@ -1,4 +0,0 @@ -# Ignore `E401` (import violations) in all `__init__.py` files -[lint.per-file-ignores] -"__init__.py" = ["F401"] - diff --git a/scripts/systemd-autofs-restart.sh b/scripts/systemd-autofs-restart.sh index b35adc94f..4d2be1c10 100755 --- a/scripts/systemd-autofs-restart.sh +++ b/scripts/systemd-autofs-restart.sh @@ -3,11 +3,11 @@ # This script can be used as a workaround for systemd autofs mount migration. # The problem is that systemd is a clever guy: before mounting of actual file # system on top of autofs mount, it first checks that device number of autofs -# mount is equal to the one, stored in systemd internals. If they do not match, +# mount is equal to the one, stored in sytemd internals. If they do not match, # systemd ignores kernel request. # The problem happens each time autofs is restored (new device number for # autofs superblock) and can't be properly solved without some kind of "device -# namespaces", where device number can be preserved. +# namespaces", where device number can be preseved. # But some of systemd services can be painlessly restarted. Like # proc-sys-fs-binfmt_misc. 
# diff --git a/scripts/uninstall_module.py b/scripts/uninstall_module.py deleted file mode 100755 index 2da63c800..000000000 --- a/scripts/uninstall_module.py +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/python3 -""" -`pip uninstall` doesn't support `--prefix`. -https://github.com/pypa/pip/issues/11213 -""" -import argparse -import os -import shutil -import site -import subprocess -import sys - -# With Python 3.13 the subprocess module now uses the `posix_spawn()` -# function which requires loading the `signal` module: -# https://docs.python.org/3/whatsnew/3.13.html#subprocess -# -# We need to load this module here, before PYTHONPATH and sys.path -# have been modified to use the path specified with `--prefix`. -# -# flake8: noqa: F401 -import signal - -import importlib_metadata - - -def add_site_dir(prefix: str): - """ - Add site directory with prefix to sys.path and update PYTHONPATH. - """ - # If prefix is used, we need to make sure that we - # do not uninstall other packages from the system paths. - sys.path = [] - site.PREFIXES = [prefix] - pkgs = site.getsitepackages() - for path in pkgs: - site.addsitedir(path) - if 'dist-packages' in path: - # Ubuntu / Debian might use both dist- and site- packages. - site.addsitedir(path.replace('dist-packages', 'site-packages')) - os.environ['PYTHONPATH'] = os.pathsep.join(sys.path) - - -def uninstall_module(package_name: str, prefix=None): - """ - Enable support for '--prefix' with 'pip uninstall'. 
- """ - dist_info_path = None - if prefix: - add_site_dir(prefix) - try: - distribution = next(importlib_metadata.Distribution.discover(name=package_name)) - dist_info_path = str(distribution._path) - except StopIteration: - print(f"Skipping {package_name} as it is not installed.") - sys.exit(0) - - command = [sys.executable, '-m', 'pip', 'uninstall', '-y', package_name] - try: - subprocess.check_call(command, env=os.environ) - if dist_info_path and os.path.isdir(dist_info_path): - # .dist-info files are not cleaned up when the package - # has been installed with --prefix. - # https://github.com/pypa/pip/issues/5573 - shutil.rmtree(dist_info_path) - if 'dist-packages' in dist_info_path: - shutil.rmtree(dist_info_path.replace('dist-packages', 'site-packages')) - except subprocess.CalledProcessError as err: - print(f'Error uninstalling package {package_name}: {err}') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('module_name', help='The name of the module to uninstall') - parser.add_argument('--prefix', help='The prefix where the module was installed') - args = parser.parse_args() - uninstall_module(args.module_name, args.prefix) diff --git a/soccr/soccr.c b/soccr/soccr.c index 8e1ce1c63..f6fb1946b 100644 --- a/soccr/soccr.c +++ b/soccr/soccr.c @@ -216,7 +216,7 @@ static int refresh_sk(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, str data->unsq_len = size; if (data->state == TCP_CLOSE) { - /* A connection could be reset. In this case a sent queue + /* A connection could be reseted. In thise case a sent queue * may contain some data. A user can't read this data, so let's * ignore them. Otherwise we will need to add a logic whether * the send queue contains a fin packet or not and decide whether @@ -227,7 +227,7 @@ static int refresh_sk(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, str data->outq_len = 0; } - /* Don't account the fin packet. It doesn't contain real data. 
*/ + /* Don't account the fin packet. It doesn't countain real data. */ if ((1 << data->state) & (SNDQ_FIRST_FIN | SNDQ_SECOND_FIN)) { if (data->outq_len) data->outq_len--; @@ -441,7 +441,7 @@ union libsoccr_addr *libsoccr_get_addr(struct libsoccr_sk *sk, int self, unsigne if (flags & ~GET_SA_FLAGS) return NULL; - /* FIXME -- implemented in CRIU, makes sense to have it here too */ + /* FIXME -- implemeted in CRIU, makes sence to have it here too */ return NULL; } @@ -503,7 +503,7 @@ static int libsoccr_set_sk_data_noq(struct libsoccr_sk *sk, struct libsoccr_sk_d if (mstate & (RCVQ_FIRST_FIN | RCVQ_SECOND_FIN)) data->inq_seq--; - /* outq_seq is adjusted due to not accounting the fin packet */ + /* outq_seq is adjusted due to not accointing the fin packet */ if (mstate & (SNDQ_FIRST_FIN | SNDQ_SECOND_FIN)) data->outq_seq--; @@ -609,8 +609,8 @@ static int send_fin(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsig libnet_type = LIBNET_RAW4; l = libnet_init(libnet_type, /* injection type */ - NULL, /* network interface */ - errbuf); /* errbuf */ + NULL, /* network interface */ + errbuf); /* errbuf */ if (l == NULL) { loge("libnet_init failed (%s)\n", errbuf); return -1; @@ -623,17 +623,17 @@ static int send_fin(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsig ret = libnet_build_tcp(ntohs(sk->dst_addr->v4.sin_port), /* source port */ ntohs(sk->src_addr->v4.sin_port), /* destination port */ - data->inq_seq, /* sequence number */ - data->outq_seq - data->outq_len, /* acknowledgement num */ - flags, /* control flags */ - data->rcv_wnd, /* window size */ - 0, /* checksum */ - 10, /* urgent pointer */ - LIBNET_TCP_H + 20, /* TCP packet size */ - NULL, /* payload */ - 0, /* payload size */ - l, /* libnet handle */ - 0); /* libnet id */ + data->inq_seq, /* sequence number */ + data->outq_seq - data->outq_len, /* acknowledgement num */ + flags, /* control flags */ + data->rcv_wnd, /* window size */ + 0, /* checksum */ + 10, /* urgent pointer */ + 
LIBNET_TCP_H + 20, /* TCP packet size */ + NULL, /* payload */ + 0, /* payload size */ + l, /* libnet handle */ + 0); /* libnet id */ if (ret == -1) { loge("Can't build TCP header: %s\n", libnet_geterror(l)); goto err; @@ -646,28 +646,28 @@ static int send_fin(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsig memcpy(&src, &sk->src_addr->v6.sin6_addr, sizeof(src)); ret = libnet_build_ipv6(0, 0, LIBNET_TCP_H, /* length */ - IPPROTO_TCP, /* protocol */ - 64, /* hop limit */ - dst, /* source IP */ - src, /* destination IP */ - NULL, /* payload */ - 0, /* payload size */ - l, /* libnet handle */ - 0); /* libnet id */ + IPPROTO_TCP, /* protocol */ + 64, /* hop limit */ + dst, /* source IP */ + src, /* destination IP */ + NULL, /* payload */ + 0, /* payload size */ + l, /* libnet handle */ + 0); /* libnet id */ } else if (family == AF_INET) ret = libnet_build_ipv4(LIBNET_IPV4_H + LIBNET_TCP_H + 20, /* length */ - 0, /* TOS */ - 242, /* IP ID */ - 0, /* IP Frag */ - 64, /* TTL */ - IPPROTO_TCP, /* protocol */ - 0, /* checksum */ - dst_v4, /* source IP */ - src_v4, /* destination IP */ - NULL, /* payload */ - 0, /* payload size */ - l, /* libnet handle */ - 0); /* libnet id */ + 0, /* TOS */ + 242, /* IP ID */ + 0, /* IP Frag */ + 64, /* TTL */ + IPPROTO_TCP, /* protocol */ + 0, /* checksum */ + dst_v4, /* source IP */ + src_v4, /* destination IP */ + NULL, /* payload */ + 0, /* payload size */ + l, /* libnet handle */ + 0); /* libnet id */ else { loge("Unknown socket family\n"); goto err; @@ -781,7 +781,7 @@ int libsoccr_restore(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsi return 0; } -static int __send_queue(struct libsoccr_sk *sk, const char *queue, char *buf, __u32 len) +static int __send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) { int ret, err = -1, max_chunk; int off; @@ -816,7 +816,7 @@ static int __send_queue(struct libsoccr_sk *sk, const char *queue, char *buf, __ continue; } - logerr("Can't restore %s queue data (%d), 
want (%d-%d:%d:%d)", queue, ret, off, chunk, len, max_chunk); + logerr("Can't restore %d queue data (%d), want (%d:%d:%d)", queue, ret, chunk, len, max_chunk); goto err; } off += ret; @@ -837,7 +837,7 @@ static int send_queue(struct libsoccr_sk *sk, int queue, char *buf, __u32 len) return -1; } - return __send_queue(sk, queue == TCP_RECV_QUEUE ? "recv" : "send", buf, len); + return __send_queue(sk, queue, buf, len); } static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigned data_size, int queue, @@ -876,7 +876,7 @@ static int libsoccr_restore_queue(struct libsoccr_sk *sk, struct libsoccr_sk_dat * they can be restored without any tricks. */ tcp_repair_off(sk->fd); - if (__send_queue(sk, "not-sent send", buf + len, ulen)) + if (__send_queue(sk, TCP_SEND_QUEUE, buf + len, ulen)) return -3; if (tcp_repair_on(sk->fd)) return -4; diff --git a/soccr/soccr.h b/soccr/soccr.h index 68ddb577b..934d43827 100644 --- a/soccr/soccr.h +++ b/soccr/soccr.h @@ -1,9 +1,9 @@ #ifndef __LIBSOCCR_H__ #define __LIBSOCCR_H__ -#include /* sockaddr_in, sockaddr_in6 */ +#include /* sockaddr_in, sockaddr_in6 */ #include /* TCP_REPAIR_WINDOW, TCP_TIMESTAMP */ -#include /* uint32_t */ -#include /* sockaddr */ +#include /* uint32_t */ +#include /* sockaddr */ #include "common/config.h" @@ -171,8 +171,8 @@ int libsoccr_save(struct libsoccr_sk *sk, struct libsoccr_sk_data *data, unsigne * Get a pointer on the contents of queues. The amount of bytes is * determined from the filled libsoccr_sk_data by queue_id. * - * For TCP_RECV_QUEUE the length is .inq_len - * For TCP_SEND_QUEUE the length is .outq_len + * For TCP_RECV_QUEUE the lenght is .inq_len + * For TCP_SEND_QUEUE the lenght is .outq_len * * For any other queues returns NULL. 
* diff --git a/soccr/test/Makefile b/soccr/test/Makefile index 499901b0c..458540045 100644 --- a/soccr/test/Makefile +++ b/soccr/test/Makefile @@ -21,6 +21,7 @@ tcp-conn-v6: tcp-conn-v6.c test: tcp-constructor tcp-conn tcp-conn-v6 unshare -n sh -c "ip link set up dev lo; ./tcp-conn" unshare -n sh -c "ip link set up dev lo; ./tcp-conn-v6" - python3 run.py ./$(RUN) + python run.py ./$(RUN) .PHONY: test + diff --git a/soccr/test/run.py b/soccr/test/run.py index 57c556e36..1ffe58a58 100644 --- a/soccr/test/run.py +++ b/soccr/test/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python import sys, os import hashlib diff --git a/soccr/test/tcp-test.py b/soccr/test/tcp-test.py index b48f532eb..ff3fe29dc 100755 --- a/soccr/test/tcp-test.py +++ b/soccr/test/tcp-test.py @@ -1,5 +1,6 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python2 +from __future__ import print_function import sys, socket import hashlib diff --git a/test/Makefile b/test/Makefile index 0bfdab680..cf7dacac4 100644 --- a/test/Makefile +++ b/test/Makefile @@ -5,14 +5,13 @@ export ZDTM_ARGS all: $(MAKE) zdtm - $(MAKE) zdtm-config $(MAKE) zdtm-pre-dump $(MAKE) zdtm-snapshot $(MAKE) zdtm-iter $(MAKE) zdtm-freezer .PHONY: all -TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job criu-ns skip-file-rwx-check +TESTS = unix-callback mem-snap rpc libcriu mounts/ext security pipes crit socketpairs overlayfs mnt-ext-dev shell-job other: for t in $(TESTS); do \ @@ -24,10 +23,6 @@ zdtm: ./zdtm.py run -a --parallel 2 .PHONY: zdtm -zdtm-config: - ./zdtm.py run -a --parallel 2 --criu-config -.PHONY: zdtm-config - zdtm-pre-dump: ./zdtm.py run --pre 2:1 -t zdtm/transition/fork -f uns .PHONY: zdtm-pre-dump @@ -45,6 +40,10 @@ zdtm-freezer: ./zdtm.py run --test zdtm/transition/thread-bomb --pre 3 --freezecg zdtm:f .PHONY: zdtm-freezer +fault-injection: + $(MAKE) -C fault-injection +.PHONY: fault-injection + override CFLAGS += -D_GNU_SOURCE clean_root: @@ 
-52,7 +51,7 @@ clean_root: .PHONY: clean_root clean: clean_root - $(RM) zdtm_ct zdtm-tst-list umount2 + $(RM) zdtm_ct zdtm-tst-list umount2 zdtm_test_config.conf $(Q) $(RM) *.log $(Q) $(RM) -r ./dump/ $(Q) $(MAKE) -C zdtm cleandep clean cleanout diff --git a/test/check_actions.py b/test/check_actions.py new file mode 100755 index 000000000..4973e3938 --- /dev/null +++ b/test/check_actions.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python + +import sys +import os + +actions = set(['pre-dump', 'pre-restore', 'post-dump', 'setup-namespaces', \ + 'post-setup-namespaces', 'post-restore', 'post-resume', \ + 'network-lock', 'network-unlock' ]) +errors = [] +af = os.path.dirname(os.path.abspath(__file__)) + '/actions_called.txt' + +for act in open(af): + act = act.strip().split() + act.append('EMPTY') + act.append('EMPTY') + + if act[0] == 'EMPTY': + raise Exception("Error in test, bogus actions line") + + if act[1] == 'EMPTY': + errors.append('Action %s misses CRTOOLS_IMAGE_DIR' % act[0]) + + if act[0] in ('post-dump', 'setup-namespaces', 'post-setup-namespaces', \ + 'post-restore', 'post-resume', 'network-lock', 'network-unlock'): + if act[2] == 'EMPTY': + errors.append('Action %s misses CRTOOLS_INIT_PID' % act[0]) + elif not act[2].isdigit() or int(act[2]) == 0: + errors.append('Action %s PID is not number (%s)' % + (act[0], act[2])) + + actions -= set([act[0]]) + +if actions: + errors.append('Not all actions called: %r' % actions) + +if errors: + for x in errors: + print(x) + sys.exit(1) + +print('PASS') diff --git a/test/crit-recode.py b/test/crit-recode.py index f119271d8..4135681e1 100755 --- a/test/crit-recode.py +++ b/test/crit-recode.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python import pycriu import sys import os diff --git a/test/cuda-checkpoint/.gitignore b/test/cuda-checkpoint/.gitignore deleted file mode 100644 index 717fb7028..000000000 --- a/test/cuda-checkpoint/.gitignore +++ /dev/null @@ -1 +0,0 @@ -cuda-checkpoint diff --git 
a/test/cuda-checkpoint/Makefile b/test/cuda-checkpoint/Makefile deleted file mode 100644 index c59dadddc..000000000 --- a/test/cuda-checkpoint/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) - -BIN := cuda-checkpoint -SRC := cuda-checkpoint.c -DEP := $(SRC:%.c=%.d) -OBJ := $(SRC:%.c=%.o) -TARGETS := $(BIN) - -include ../zdtm/Makefile.inc - -all: $(TARGETS) -.PHONY: all - -clean-more: - $(RM) $(TARGETS) -.PHONY: clean-more -clean: clean-more diff --git a/test/cuda-checkpoint/cuda-checkpoint.c b/test/cuda-checkpoint/cuda-checkpoint.c deleted file mode 100644 index 3b7ce8b9f..000000000 --- a/test/cuda-checkpoint/cuda-checkpoint.c +++ /dev/null @@ -1,57 +0,0 @@ -/* The mocked version of cuda-checkpoint. */ -#include -#include -#include - -int main(int argc, char *argv[]) -{ - int c; - - while (1) { - int option_index = 0; - static struct option long_options[] = { - { "pid", required_argument, 0, 'p' }, - { "get-state", no_argument, 0, 's' }, - { "get-restore-tid", no_argument, 0, 'g' }, - { "action", required_argument, 0, 'a' }, - { "timeout", required_argument, 0, 't' }, - { "help", no_argument, 0, 'h' }, - { 0, 0, 0, 0 } - }; - - c = getopt_long(argc, argv, "p:ga:ht:", - long_options, &option_index); - if (c == -1) - break; - - switch (c) { - case 'p': - printf("%s\n", optarg); - break; - case 'g': - case 'a': - case 't': - break; - case 's': - printf("running\n"); - break; - case 'h': - printf("--action - execute an action"); - break; - - default: - fprintf(stderr, "getopt returned character code 0%o ??\n", c); - return 1; - } - } - - if (optind < argc) { - fprintf(stderr, "non-option ARGV-elements: "); - while (optind < argc) - fprintf(stderr, "%s ", argv[optind++]); - fprintf(stderr, "\n"); - return 1; - } - - return 0; -} diff --git a/test/exhaustive/pipe.py b/test/exhaustive/pipe.py index afe20846a..fdadc480c 100755 --- a/test/exhaustive/pipe.py +++ b/test/exhaustive/pipe.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env 
python import argparse import os @@ -75,7 +75,7 @@ def get_pipe_rw(pid, fd): def check_pipe_y(pid, fd, rw, inos): ino = get_pipe_ino(pid, fd) - if ino is None: + if ino == None: return 'missing ' if not inos.has_key(fd): inos[fd] = ino @@ -89,7 +89,7 @@ def check_pipe_y(pid, fd, rw, inos): def check_pipe_n(pid, fd): ino = get_pipe_ino(pid, fd) - if ino is None: + if ino == None: return None else: return 'present ' @@ -102,7 +102,7 @@ def check_pipe_end(kids, fd, comb, rw, inos): res = check_pipe_y(t_pid, fd, rw, inos) else: res = check_pipe_n(t_pid, fd) - if res is not None: + if res != None: return res + 'kid(%d)' % t_nr t_nr += 1 return None @@ -111,7 +111,7 @@ def check_pipe_end(kids, fd, comb, rw, inos): def check_pipe(kids, fds, comb, inos): for e in (0, 1): # 0 == R, 1 == W, see get_pipe_rw() res = check_pipe_end(kids, fds[e], comb[e], e, inos) - if res is not None: + if res != None: return res + 'end(%d)' % e return None @@ -124,7 +124,7 @@ def check_pipes(kids, pipes, comb): p_inos = {} for p_fds in pipes: res = check_pipe(kids, p_fds, comb[p_nr], p_inos) - if res is not None: + if res != None: return res + 'pipe(%d)' % p_nr p_nr += 1 @@ -182,7 +182,7 @@ def make_comb(comb, opts, status_pipe): if v == '0': print('\tCheck pipes') res = check_pipes(kids, pipes, comb) - if res is None: + if res == None: ex_code = 0 else: print('\tFAIL %s' % res) diff --git a/test/exhaustive/unix.py b/test/exhaustive/unix.py index 689b1fb3a..98dbbb7b0 100755 --- a/test/exhaustive/unix.py +++ b/test/exhaustive/unix.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python import sys import os @@ -261,7 +261,7 @@ class sock: # that hasn't contributed to some new states is # just waste of time, so we close only connected # sockets or listeners that has at least one - # incoming connection pending or served + # incoming connection pendig or served if self.listen: if self.icons: @@ -304,7 +304,7 @@ class sock: for psk in st.sockets: if psk == self: continue - if psk.peer is 
not None and psk.peer != self.sk_id: + if psk.peer != None and psk.peer != self.sk_id: # Peer by someone else, can do nothing continue @@ -356,11 +356,11 @@ class sock: i_dsc += sock.name_of(psk) dsc += '-I%s' % i_dsc if self.inqueue: - from_set = set() + froms = set() for m in self.inqueue: - from_set.add(m[0]) + froms.add(m[0]) q_dsc = '' - for f in from_set: + for f in froms: fsk = st.get_socket(f, True) q_dsc += sock.name_of(fsk) dsc += '-M%s' % q_dsc @@ -462,7 +462,7 @@ fail_desc = { def chk_real_state(st): - # Before anything else -- check that we still have + # Before enything else -- check that we still have # all the sockets at hands for sk in st.sockets: if not sk.visible: diff --git a/test/inhfd/memfd.py.checkskip b/test/inhfd/memfd.py.checkskip index 32c57d929..252778969 100755 --- a/test/inhfd/memfd.py.checkskip +++ b/test/inhfd/memfd.py.checkskip @@ -1,7 +1,7 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python import ctypes libc = ctypes.CDLL(None) -# libc may not have memfd_create (e.g., centos) +# libc may not have memfd_create (e.g., centos on travis) libc.memfd_create("test".encode('utf8'), 0) diff --git a/test/javaTests/pom.xml b/test/javaTests/pom.xml index ddb6c89cf..faae44d1b 100644 --- a/test/javaTests/pom.xml +++ b/test/javaTests/pom.xml @@ -38,7 +38,7 @@ org.testng testng - 7.7.0 + 6.3.1 diff --git a/test/javaTests/src/org/criu/java/tests/Sockets.java b/test/javaTests/src/org/criu/java/tests/Sockets.java index 160bc90e6..94cc217c4 100644 --- a/test/javaTests/src/org/criu/java/tests/Sockets.java +++ b/test/javaTests/src/org/criu/java/tests/Sockets.java @@ -129,7 +129,7 @@ class Sockets { StringWriter writer = new StringWriter(); PrintWriter printWriter = new PrintWriter(writer); e.printStackTrace(printWriter); - logger.log(Level.SEVERE, "Exception occurred:" + e); + logger.log(Level.SEVERE, "Exception occured:" + e); logger.log(Level.FINE, writer.toString()); } if (b != null) { diff --git 
a/test/javaTests/src/org/criu/java/tests/SocketsClient.java b/test/javaTests/src/org/criu/java/tests/SocketsClient.java index 40ffe1d3f..1c8e7b9a1 100644 --- a/test/javaTests/src/org/criu/java/tests/SocketsClient.java +++ b/test/javaTests/src/org/criu/java/tests/SocketsClient.java @@ -121,7 +121,7 @@ class SocketsClient { StringWriter writer = new StringWriter(); PrintWriter printWriter = new PrintWriter(writer); exception.printStackTrace(printWriter); - logger.log(Level.SEVERE, "Exception occurred:" + exception); + logger.log(Level.SEVERE, "Exception occured:" + exception); logger.log(Level.FINE, writer.toString()); } diff --git a/test/javaTests/src/org/criu/java/tests/SocketsConnect.java b/test/javaTests/src/org/criu/java/tests/SocketsConnect.java index b64d2eee1..164c21089 100644 --- a/test/javaTests/src/org/criu/java/tests/SocketsConnect.java +++ b/test/javaTests/src/org/criu/java/tests/SocketsConnect.java @@ -80,7 +80,7 @@ class SocketsConnect { } if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL) { logger.log(Level.SEVERE, "Killing the server process and client process"); - logger.log(Level.SEVERE, "Exception occurred in the client or server process: check their log for details"); + logger.log(Level.SEVERE, "Exception occured in the client or server process: check their log for details"); serverProcess.destroy(); clientProcess.destroy(); b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); @@ -145,7 +145,7 @@ class SocketsConnect { StringWriter writer = new StringWriter(); PrintWriter printWriter = new PrintWriter(writer); e.printStackTrace(printWriter); - logger.log(Level.SEVERE, "Exception occurred:" + e); + logger.log(Level.SEVERE, "Exception occured:" + e); logger.log(Level.FINE, writer.toString()); } if (b != null) { diff --git a/test/javaTests/src/org/criu/java/tests/SocketsConnectClient.java b/test/javaTests/src/org/criu/java/tests/SocketsConnectClient.java index e72165413..ed1c7fab3 100644 --- 
a/test/javaTests/src/org/criu/java/tests/SocketsConnectClient.java +++ b/test/javaTests/src/org/criu/java/tests/SocketsConnectClient.java @@ -68,7 +68,7 @@ class SocketsConnectClient { try { socket = new Socket(SocketHelper.IP_ADDRESS, port); } catch (Exception e) { - logger.log(Level.SEVERE, "Exception occurred when connecting to port: " + e); + logger.log(Level.SEVERE, "Exception occured when connecting to port: " + e); socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); System.exit(1); } @@ -117,7 +117,7 @@ class SocketsConnectClient { StringWriter writer = new StringWriter(); PrintWriter printWriter = new PrintWriter(writer); exception.printStackTrace(printWriter); - logger.log(Level.SEVERE, "Exception occurred:" + exception); + logger.log(Level.SEVERE, "Exception occured:" + exception); logger.log(Level.FINE, writer.toString()); } diff --git a/test/javaTests/src/org/criu/java/tests/SocketsConnectServer.java b/test/javaTests/src/org/criu/java/tests/SocketsConnectServer.java index 39918cec0..1e4cf3aeb 100644 --- a/test/javaTests/src/org/criu/java/tests/SocketsConnectServer.java +++ b/test/javaTests/src/org/criu/java/tests/SocketsConnectServer.java @@ -55,14 +55,14 @@ class SocketsConnectServer { socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); } - ServerSocket s = new ServerSocket(port); + ServerSocket ser = new ServerSocket(port); logger.log(Level.INFO, "Server will be listening on Port: " + port); /* * Timeout after 7 sec if client does not connect */ try { - s.setSoTimeout(7 * 1000); + ser.setSoTimeout(7 * 1000); } catch (SocketException e) { logger.log(Level.SEVERE, "Cannot set timeout!"); @@ -73,14 +73,14 @@ class SocketsConnectServer { try { if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { - s.close(); + ser.close(); System.exit(1); } /* * Checkpoint when server is listening for connections, and no client has connected to the 
server. */ socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_CHECKPOINT); - socket = s.accept(); + socket = ser.accept(); SocketHelper.socketWaitForRestore(socketMappedBuffer, logger); } catch (Exception e) { @@ -88,13 +88,13 @@ class SocketsConnectServer { socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); } - if (!s.isBound()) { + if (!ser.isBound()) { logger.log(Level.SEVERE, "Server is not bound to a port"); socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); System.exit(1); } - if (s.getLocalPort() != port) { + if (ser.getLocalPort() != port) { logger.log(Level.SEVERE, "Server is not listening on correct port"); socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); System.exit(1); diff --git a/test/javaTests/src/org/criu/java/tests/SocketsData.java b/test/javaTests/src/org/criu/java/tests/SocketsData.java index bc8470cef..67d8cef0e 100644 --- a/test/javaTests/src/org/criu/java/tests/SocketsData.java +++ b/test/javaTests/src/org/criu/java/tests/SocketsData.java @@ -80,7 +80,7 @@ class SocketsData { } if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL) { logger.log(Level.SEVERE, "Killing the server process and client process"); - logger.log(Level.SEVERE, "Exception occurred in the client or server process: check their log for details"); + logger.log(Level.SEVERE, "Exception occured in the client or server process: check their log for details"); serverProcess.destroy(); clientProcess.destroy(); b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); @@ -144,7 +144,7 @@ class SocketsData { StringWriter writer = new StringWriter(); PrintWriter printWriter = new PrintWriter(writer); e.printStackTrace(printWriter); - logger.log(Level.SEVERE, "Exception occurred:" + e); + logger.log(Level.SEVERE, "Exception occured:" + e); logger.log(Level.FINE, writer.toString()); } if (b != null) { diff --git a/test/javaTests/src/org/criu/java/tests/SocketsDataClient.java 
b/test/javaTests/src/org/criu/java/tests/SocketsDataClient.java index fa3aa2e76..49885a886 100644 --- a/test/javaTests/src/org/criu/java/tests/SocketsDataClient.java +++ b/test/javaTests/src/org/criu/java/tests/SocketsDataClient.java @@ -74,7 +74,7 @@ class SocketsDataClient { try { socket = new Socket(SocketHelper.IP_ADDRESS, port); } catch (IOException e) { - logger.log(Level.SEVERE, "Exception occurred when connecting to port: " + e); + logger.log(Level.SEVERE, "Exception occured when connecting to port: " + e); socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); System.exit(1); } @@ -129,7 +129,7 @@ class SocketsDataClient { StringWriter writer = new StringWriter(); PrintWriter printWriter = new PrintWriter(writer); exception.printStackTrace(printWriter); - logger.log(Level.SEVERE, "Exception occurred:" + exception); + logger.log(Level.SEVERE, "Exception occured:" + exception); logger.log(Level.FINE, writer.toString()); } if (socketMappedBuffer != null) { diff --git a/test/javaTests/src/org/criu/java/tests/SocketsDataServer.java b/test/javaTests/src/org/criu/java/tests/SocketsDataServer.java index d332b1a1a..65fe92a9d 100644 --- a/test/javaTests/src/org/criu/java/tests/SocketsDataServer.java +++ b/test/javaTests/src/org/criu/java/tests/SocketsDataServer.java @@ -50,14 +50,14 @@ class SocketsDataServer { System.exit(1); } - ServerSocket s = new ServerSocket(port); + ServerSocket ser = new ServerSocket(port); logger.log(Level.INFO, "Server will be listening on Port " + port); /* * Wait for 7 seconds for client to connect, else throw a timeout exception */ try { - s.setSoTimeout(7 * 1000); + ser.setSoTimeout(7 * 1000); } catch (SocketException e) { logger.log(Level.SEVERE, "cannot set timeout"); @@ -70,7 +70,7 @@ class SocketsDataServer { * begin listening for connections. 
*/ socketMappedBuffer.putChar(Helper.MAPPED_INDEX, SocketHelper.STATE_LISTEN); - socket = s.accept(); + socket = ser.accept(); BufferedReader br = new BufferedReader(new InputStreamReader(socket.getInputStream())); PrintStream outstream = new PrintStream(socket.getOutputStream()); diff --git a/test/javaTests/src/org/criu/java/tests/SocketsListen.java b/test/javaTests/src/org/criu/java/tests/SocketsListen.java index 550d0b6a8..3fad38549 100644 --- a/test/javaTests/src/org/criu/java/tests/SocketsListen.java +++ b/test/javaTests/src/org/criu/java/tests/SocketsListen.java @@ -81,7 +81,7 @@ class SocketsListen { } if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL) { logger.log(Level.SEVERE, "Killing the server process and client process"); - logger.log(Level.SEVERE, "Exception occurred in the client or server process: check their log for details"); + logger.log(Level.SEVERE, "Exception occured in the client or server process: check their log for details"); serverProcess.destroy(); clientProcess.destroy(); b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); @@ -141,7 +141,7 @@ class SocketsListen { StringWriter writer = new StringWriter(); PrintWriter printWriter = new PrintWriter(writer); e.printStackTrace(printWriter); - logger.log(Level.SEVERE, "Exception occurred:" + e); + logger.log(Level.SEVERE, "Exception occured:" + e); logger.log(Level.FINE, writer.toString()); } if (b != null) { diff --git a/test/javaTests/src/org/criu/java/tests/SocketsListenClient.java b/test/javaTests/src/org/criu/java/tests/SocketsListenClient.java index b615c67c8..efcb3d545 100644 --- a/test/javaTests/src/org/criu/java/tests/SocketsListenClient.java +++ b/test/javaTests/src/org/criu/java/tests/SocketsListenClient.java @@ -75,7 +75,7 @@ class SocketsListenClient { try { socket = new Socket(SocketHelper.IP_ADDRESS, port); } catch (Exception e) { - logger.log(Level.SEVERE, "Exception occurred when connecting to port: " + e); + logger.log(Level.SEVERE, "Exception occured 
when connecting to port: " + e); socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); } PrintStream out = new PrintStream(socket.getOutputStream()); @@ -123,7 +123,7 @@ class SocketsListenClient { StringWriter writer = new StringWriter(); PrintWriter printWriter = new PrintWriter(writer); exception.printStackTrace(printWriter); - logger.log(Level.SEVERE, "Exception occurred:" + exception); + logger.log(Level.SEVERE, "Exception occured:" + exception); logger.log(Level.FINE, writer.toString()); } diff --git a/test/javaTests/src/org/criu/java/tests/SocketsListenServer.java b/test/javaTests/src/org/criu/java/tests/SocketsListenServer.java index 4384f8b05..46fef40ec 100644 --- a/test/javaTests/src/org/criu/java/tests/SocketsListenServer.java +++ b/test/javaTests/src/org/criu/java/tests/SocketsListenServer.java @@ -58,13 +58,13 @@ class SocketsListenServer { } logger.log(Level.INFO, "Server will be listening on Port " + port); - ServerSocket s = new ServerSocket(port); + ServerSocket ser = new ServerSocket(port); /* * Server has bound to a port but is not listening yet! 
*/ logger.log(Level.INFO, "Going to checkpoint"); if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { - s.close(); + ser.close(); System.exit(1); } /* @@ -73,13 +73,13 @@ class SocketsListenServer { socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_CHECKPOINT); SocketHelper.socketWaitForRestore(socketMappedBuffer, logger); - if (!s.isBound()) { + if (!ser.isBound()) { logger.log(Level.SEVERE, "Server is not bound to a port"); socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); System.exit(1); } - if (s.getLocalPort() != port) { + if (ser.getLocalPort() != port) { logger.log(Level.SEVERE, "SServer is not listening on correct port"); socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); System.exit(1); @@ -88,7 +88,7 @@ class SocketsListenServer { * Timeout after 5 sec if client does not connect */ try { - s.setSoTimeout(5 * 1000); + ser.setSoTimeout(5 * 1000); } catch (SocketException e) { logger.log(Level.SEVERE, "cannot set timeout"); @@ -102,7 +102,7 @@ class SocketsListenServer { * will begin listening for connections. 
*/ socketMappedBuffer.putChar(Helper.MAPPED_INDEX, SocketHelper.STATE_LISTEN); - socket = s.accept(); + socket = ser.accept(); } catch (Exception e) { logger.log(Level.SEVERE, "Timed out while waiting for client to connect\n" + e); diff --git a/test/javaTests/src/org/criu/java/tests/SocketsMultiple.java b/test/javaTests/src/org/criu/java/tests/SocketsMultiple.java index 41423b2cb..5e55c4274 100644 --- a/test/javaTests/src/org/criu/java/tests/SocketsMultiple.java +++ b/test/javaTests/src/org/criu/java/tests/SocketsMultiple.java @@ -80,7 +80,7 @@ class SocketsMultiple { } if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL) { logger.log(Level.SEVERE, "Killing the server process and client process"); - logger.log(Level.SEVERE, "Exception occurred in the client or server process: check their log for details"); + logger.log(Level.SEVERE, "Exception occured in the client or server process: check their log for details"); serverProcess.destroy(); clientProcess.destroy(); b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); @@ -140,7 +140,7 @@ class SocketsMultiple { StringWriter writer = new StringWriter(); PrintWriter printWriter = new PrintWriter(writer); e.printStackTrace(printWriter); - logger.log(Level.SEVERE, "Exception occurred:" + e); + logger.log(Level.SEVERE, "Exception occured:" + e); logger.log(Level.FINE, writer.toString()); } if (b != null) { diff --git a/test/javaTests/src/org/criu/java/tests/SocketsMultipleClient.java b/test/javaTests/src/org/criu/java/tests/SocketsMultipleClient.java index b1c98b33f..d97a946fd 100644 --- a/test/javaTests/src/org/criu/java/tests/SocketsMultipleClient.java +++ b/test/javaTests/src/org/criu/java/tests/SocketsMultipleClient.java @@ -161,7 +161,7 @@ class SocketsMultipleClient { StringWriter writer = new StringWriter(); PrintWriter printWriter = new PrintWriter(writer); exception.printStackTrace(printWriter); - logger.log(Level.SEVERE, "Exception occurred:" + exception); + logger.log(Level.SEVERE, "Exception 
occured:" + exception); logger.log(Level.FINE, writer.toString()); } diff --git a/test/javaTests/src/org/criu/java/tests/SocketsMultipleServer.java b/test/javaTests/src/org/criu/java/tests/SocketsMultipleServer.java index ad54e250b..a7e4d3b9e 100644 --- a/test/javaTests/src/org/criu/java/tests/SocketsMultipleServer.java +++ b/test/javaTests/src/org/criu/java/tests/SocketsMultipleServer.java @@ -62,7 +62,7 @@ class SocketsMultipleServer { socketMappedBuffer.putChar(7, Helper.STATE_INIT); socketMappedBuffer.putChar(9, Helper.STATE_INIT); - ServerSocket s = new ServerSocket(port); + ServerSocket ser = new ServerSocket(port); logger.log(Level.INFO, "Server will be listening on Port " + port); Socket[] sockets = new Socket[4]; @@ -73,8 +73,8 @@ class SocketsMultipleServer { socketMappedBuffer.putChar(Helper.MAPPED_INDEX, SocketHelper.STATE_LISTEN); for (int i = 1; i <= 4; i++) { - sockets[i - 1] = s.accept(); - ServerThread serverThread = new ServerThread(sockets[i - 1], "s-socket " + i, 2 * i + 1, logger, socketMappedBuffer); + sockets[i - 1] = ser.accept(); + ServerThread serverThread = new ServerThread(sockets[i - 1], "ser-socket " + i, 2 * i + 1, logger, socketMappedBuffer); serverThread.start(); if (i == 3) { logger.log(Level.INFO, "Connected to client: 3"); diff --git a/test/javaTests/src/org/criu/java/tests/SocketsServer.java b/test/javaTests/src/org/criu/java/tests/SocketsServer.java index dc162cb7d..051233443 100644 --- a/test/javaTests/src/org/criu/java/tests/SocketsServer.java +++ b/test/javaTests/src/org/criu/java/tests/SocketsServer.java @@ -54,18 +54,18 @@ class SocketsServer { socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); } - ServerSocket s = new ServerSocket(port); + ServerSocket ser = new ServerSocket(port); logger.log(Level.INFO, "Server will be listening on Port " + port); /* * Timeout after 5 second if client does not connect */ - s.setSoTimeout(5 * 1000); + ser.setSoTimeout(5 * 1000); logger.log(Level.INFO, "Waiting for client 
to connect"); Socket socket = null; try { socketMappedBuffer.putChar(Helper.MAPPED_INDEX, SocketHelper.STATE_LISTEN); - socket = s.accept(); + socket = ser.accept(); } catch (Exception e) { logger.log(Level.SEVERE, "Timed out while waiting for client to connect"); socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); @@ -90,13 +90,13 @@ class SocketsServer { socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_CHECKPOINT); SocketHelper.socketWaitForRestore(socketMappedBuffer, logger); - if (!s.isBound()) { + if (!ser.isBound()) { logger.log(Level.SEVERE, "Server is not bound to a port"); socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); System.exit(1); } - if (s.getLocalPort() != port) { + if (ser.getLocalPort() != port) { logger.log(Level.SEVERE, "Server is not listening on correct port"); socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); System.exit(1); diff --git a/test/jenkins/actions.sh b/test/jenkins/actions.sh new file mode 100755 index 000000000..801904500 --- /dev/null +++ b/test/jenkins/actions.sh @@ -0,0 +1,8 @@ +# Check how crit de/encodes images +set -e +source `dirname $0`/criu-lib.sh +# prep +rm -f actions_called.txt +./test/zdtm.py run -t zdtm/static/env00 --script "$(pwd)/test/show_action.sh" || fail +./test/check_actions.py || fail +exit 0 diff --git a/test/jenkins/crit.sh b/test/jenkins/crit.sh index cec26c2b4..fcf1c58d4 100755 --- a/test/jenkins/crit.sh +++ b/test/jenkins/crit.sh @@ -2,6 +2,6 @@ set -e source `dirname $0`/criu-lib.sh prep -./test/zdtm.py run --all -f best -x maps04 -x cgroup02 -x cgroup_ignore --norst --keep-img always || fail +./test/zdtm.py run --all -f best -x maps04 -x cgroup02 --norst --keep-img always || fail PYTHONPATH="$(pwd)/lib/" ./test/crit-recode.py || fail exit 0 diff --git a/test/jenkins/criu-compat-mounts.sh b/test/jenkins/criu-compat-mounts.sh deleted file mode 100755 index cc68035f3..000000000 --- a/test/jenkins/criu-compat-mounts.sh +++ /dev/null @@ -1,15 
+0,0 @@ -#!/bin/bash - -# Make one regular C/R cycle with mount-v2 disabled -set -e -source `dirname $0`/criu-lib.sh -prep -FAIL=0 -./test/zdtm.py run --all --mntns-compat-mode --keep-going --report report --parallel 4 || FAIL=$? - -# Make device-external mounts test -EXTRA_OPTS=--mntns-compat-mode make -C test/others/mnt-ext-dev/ run || FAIL=$? - -if [ $FAIL -ne 0 ]; then - fail -fi diff --git a/test/jenkins/criu-dedup.sh b/test/jenkins/criu-dedup.sh index edb1b653d..0041496d8 100755 --- a/test/jenkins/criu-dedup.sh +++ b/test/jenkins/criu-dedup.sh @@ -9,8 +9,8 @@ prep # Additionally run these tests as they touch a lot of # memory and it makes sense to additionally check it # with delays between iterations -./test/zdtm.py run -t zdtm/transition/maps007 --report report -f h --pre 8:.1 --dedup || fail -./test/zdtm.py run -t zdtm/static/mem-touch --report report -f h --pre 8:.1 --dedup || fail -./test/zdtm.py run -t zdtm/transition/maps008 --report report -f h --pre 8:.1 --dedup || fail -./test/zdtm.py run -t zdtm/transition/maps007 --report report -f h --pre 8:.1 --noauto-dedup || fail -./test/zdtm.py run -t zdtm/static/mem-touch --report report -f h --pre 8:.1 --noauto-dedup || fail +./test/zdtm.py run -t zdtm/transition/maps007 --keep-going --report report -f h --pre 8:.1 --dedup || fail +./test/zdtm.py run -t zdtm/static/mem-touch --keep-going --report report -f h --pre 8:.1 --dedup || fail +./test/zdtm.py run -t zdtm/transition/maps008 --keep-going --report report -f h --pre 8:.1 --dedup || fail +./test/zdtm.py run -t zdtm/transition/maps007 --keep-going --report report -f h --pre 8:.1 --noauto-dedup || fail +./test/zdtm.py run -t zdtm/static/mem-touch --keep-going --report report -f h --pre 8:.1 --noauto-dedup || fail diff --git a/test/jenkins/criu-dump.sh b/test/jenkins/criu-dump.sh index 761b3de0d..4c49532b2 100755 --- a/test/jenkins/criu-dump.sh +++ b/test/jenkins/criu-dump.sh @@ -5,4 +5,4 @@ set -e source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump 
-./test/zdtm.py run --all --keep-going --report report --parallel 4 --norst -x 'maps04' -x 'cgroup02' -x 'cgroup_ignore' || fail +./test/zdtm.py run --all --keep-going --report report --parallel 4 --norst -x 'maps04' -x 'cgroup02' || fail diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index 6ee7ce33a..a8c3a5cf7 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -4,16 +4,10 @@ set -e source `dirname $0`/criu-lib.sh prep -./test/zdtm.py run -t zdtm/static/env00 --fault 1 --report report -f h || fail -./test/zdtm.py run -t zdtm/static/unlink_fstat00 --fault 2 --report report -f h || fail -./test/zdtm.py run -t zdtm/static/maps00 --fault 3 --report report -f h || fail - -# FIXME: fhandles looks broken on btrfs -findmnt --noheadings --target . | grep -q btrfs || NOBTRFS=$? -if [ $NOBTRFS -eq 1 ] ; then - ./test/zdtm.py run -t zdtm/static/inotify_irmap --fault 128 --pre 2 -f uns || fail -fi - +./test/zdtm.py run -t zdtm/static/env00 --fault 1 --keep-going --report report -f h || fail +./test/zdtm.py run -t zdtm/static/unlink_fstat00 --fault 2 --keep-going --report report -f h || fail +./test/zdtm.py run -t zdtm/static/maps00 --fault 3 --keep-going --report report -f h || fail +./test/zdtm.py run -t zdtm/static/inotify_irmap --fault 128 --keep-going --pre 2 -f uns || fail ./test/zdtm.py run -t zdtm/static/env00 --fault 129 -f uns || fail ./test/zdtm.py run -t zdtm/transition/fork --fault 130 -f h || fail ./test/zdtm.py run -t zdtm/static/vdso01 --fault 127 || fail @@ -23,29 +17,19 @@ if [ "${COMPAT_TEST}" != "y" ] ; then ./test/zdtm.py run -t zdtm/static/vdso01 --fault 133 -f h || fail fi -./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 2 --report report || fail -./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 4 --report report || fail +./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 2 --keep-going --report report || fail +./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 4 --keep-going --report report 
|| fail ./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 6 --report report || fail ./test/zdtm.py run -t zdtm/static/mntns_link_remap --fault 6 --report report || fail ./test/zdtm.py run -t zdtm/static/unlink_fstat03 --fault 6 --report report || fail -./test/zdtm.py run -t zdtm/static/env00 --fault 5 --report report || fail -./test/zdtm.py run -t zdtm/static/maps04 --fault 131 --report report --pre 2:1 || fail -./test/zdtm.py run -t zdtm/transition/maps008 --fault 131 --report report --pre 2:1 || fail +./test/zdtm.py run -t zdtm/static/env00 --fault 5 --keep-going --report report || fail +./test/zdtm.py run -t zdtm/static/maps04 --fault 131 --keep-going --report report --pre 2:1 || fail +./test/zdtm.py run -t zdtm/transition/maps008 --fault 131 --keep-going --report report --pre 2:1 || fail ./test/zdtm.py run -t zdtm/static/maps01 --fault 132 -f h || fail # 134 is corrupting extended registers set, should run in a sub-thread (fpu03) # without restore (that will check if parasite corrupts extended registers) ./test/zdtm.py run -t zdtm/static/fpu03 --fault 134 -f h --norst || fail # also check for the main thread corruption ./test/zdtm.py run -t zdtm/static/fpu00 --fault 134 -f h --norst || fail - -# check set_compel_interrupt_only_mode -./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 -./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:t --fault 137 --norst -# check set_compel_interrupt_only_mode when test cgroup is frozen -./test/zdtm.py run -t zdtm/static/env00 --freezecg zdtm:f --fault 137 - -if ./test/zdtm.py run -t zdtm/static/vfork00 --fault 136 --report report -f h ; then - fail -fi diff --git a/test/jenkins/criu-fcg.sh b/test/jenkins/criu-fcg.sh index 81395b7ba..ca5054f5e 100755 --- a/test/jenkins/criu-fcg.sh +++ b/test/jenkins/criu-fcg.sh @@ -6,10 +6,10 @@ source `dirname $0`/criu-lib.sh prep mount_tmpfs_to_dump -./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --report report --freezecg zdtm:f || fail -./test/zdtm.py 
run -t zdtm/transition/thread-bomb -f h --report report --freezecg zdtm:f --pre 3 || fail -./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --report report --freezecg zdtm:f --norst || fail +./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:f || fail +./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:f --pre 3 || fail +./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:f --norst || fail -./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --report report --freezecg zdtm:t || fail -./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --report report --freezecg zdtm:t --pre 3 || fail -./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --report report --freezecg zdtm:t --norst || fail +./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:t || fail +./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:t --pre 3 || fail +./test/zdtm.py run -t zdtm/transition/thread-bomb -f h --keep-going --report report --freezecg zdtm:t --norst || fail diff --git a/test/jenkins/criu-lazy-migration.pipeline b/test/jenkins/criu-lazy-migration.pipeline index 45dc2c776..2c863f170 100644 --- a/test/jenkins/criu-lazy-migration.pipeline +++ b/test/jenkins/criu-lazy-migration.pipeline @@ -21,6 +21,7 @@ pipeline { stage('Test'){ steps { sh './test/jenkins/run_ct sh -c "mount --make-rprivate / && mount --rbind . 
/mnt && cd /mnt && ./test/jenkins/criu-lazy-migration.sh"' + junit 'test/report/criu-testreport*.xml' } } } diff --git a/test/libfault/Makefile b/test/libfault/Makefile deleted file mode 100644 index cbe47fdf2..000000000 --- a/test/libfault/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -CC = gcc -CFLAGS = -c -fPIC -ldl - -SRC = libfault.c -OBJ = $(SRC:.c=.o) - -LIB = libfault.so - -.PHONY: all clean run - -all: $(LIB) - -$(LIB): $(OBJ) - $(CC) -shared -o $(LIB) $(OBJ) - -$(OBJ): $(SRC) - $(CC) $(CFLAGS) $< - -clean: - rm -f $(OBJ) $(LIB) - diff --git a/test/libfault/libfault.c b/test/libfault/libfault.c deleted file mode 100644 index 650bf08ca..000000000 --- a/test/libfault/libfault.c +++ /dev/null @@ -1,31 +0,0 @@ -#define _GNU_SOURCE -#include -#include -#include - -ssize_t (*original_pread)(int fd, void *buf, size_t count, off_t offset) = NULL; - -/** - * This function is a wrapper around pread() that is used for testing CRIU's - * handling of cases where pread() returns less data than requested. - * - * pmc_fill() in criu/pagemap.c is a good example of where this can happen. - */ -ssize_t pread64(int fd, void *buf, size_t count, off_t offset) -{ - if (!original_pread) { - original_pread = dlsym(RTLD_NEXT, "pread"); - if (!original_pread) { - errno = EIO; - return -1; - } - } - - /* The following aims to simulate the case when pread() returns less - * data than requested. We need to ensure that CRIU handles such cases. 
*/ - if (count > 2048) { - count -= 1024; - } - - return original_pread(fd, buf, count, offset); -} diff --git a/test/others/action-script/.gitignore b/test/others/action-script/.gitignore deleted file mode 100644 index ca9a0b541..000000000 --- a/test/others/action-script/.gitignore +++ /dev/null @@ -1 +0,0 @@ -actions_called.txt diff --git a/test/others/action-script/Makefile b/test/others/action-script/Makefile deleted file mode 100644 index 594edc070..000000000 --- a/test/others/action-script/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -run: - ./run.sh -.PHONY: run diff --git a/test/others/action-script/check_actions.py b/test/others/action-script/check_actions.py deleted file mode 100755 index 0140d8762..000000000 --- a/test/others/action-script/check_actions.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 - -import os -import sys - -EXPECTED_ACTIONS = [ - 'pre-dump', - 'network-lock', - 'post-dump', - 'pre-restore', - 'setup-namespaces', - 'post-setup-namespaces', - 'post-restore', - 'network-unlock', - 'pre-resume', - 'post-resume', -] - -errors = [] -actions_called = [] -actions_called_file = os.path.join(os.path.dirname(__file__), 'actions_called.txt') - -with open(actions_called_file) as f: - for index, line in enumerate(f): - parts = line.strip().split() - parts += ['EMPTY'] * (3 - len(parts)) - action_hook, image_dir, pid = parts - - if action_hook == 'EMPTY': - raise ValueError("Error in test: bogus actions line") - - expected_action = EXPECTED_ACTIONS[index] if index < len(EXPECTED_ACTIONS) else None - if action_hook != expected_action: - raise ValueError(f"Invalid action: {action_hook} != {expected_action}") - - if image_dir == 'EMPTY': - errors.append(f'Action {action_hook} misses CRTOOLS_IMAGE_DIR') - - if action_hook != 'pre-restore': - if pid == 'EMPTY': - errors.append(f'Action {action_hook} misses CRTOOLS_INIT_PID') - elif not pid.isdigit() or int(pid) == 0: - errors.append(f'Action {action_hook} PID is not a valid number ({pid})') - - 
actions_called.append(action_hook) - -if actions_called != EXPECTED_ACTIONS: - errors.append(f'Not all actions called: {actions_called!r}') - -if errors: - print('\n'.join(errors)) - sys.exit(1) - -print('Check Actions PASS') diff --git a/test/others/action-script/run.sh b/test/others/action-script/run.sh deleted file mode 100755 index 574f6fc86..000000000 --- a/test/others/action-script/run.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -set -e - -SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" - -rm -f "${SCRIPT_DIR}"/actions_called.txt -"${SCRIPT_DIR}"/../../zdtm.py run -t zdtm/static/env00 -f ns --script "$SCRIPT_DIR/show_action.sh" || exit 1 -"${SCRIPT_DIR}"/check_actions.py || exit 1 - -exit 0 diff --git a/test/others/app-emu/java/HelloWorld/run.sh b/test/others/app-emu/java/HelloWorld/run.sh index e6dcbd9fc..0ed6afd14 100644 --- a/test/others/app-emu/java/HelloWorld/run.sh +++ b/test/others/app-emu/java/HelloWorld/run.sh @@ -18,7 +18,7 @@ setsid java HelloWorld & pid=${!} -echo Launched java application with pid $pid in background +echo Lanuched java application with pid $pid in background ${criu} dump -D dump -o dump.log -v4 --shell-job -t ${pid} || { echo "Dump failed" diff --git a/test/others/app-emu/job/job.c b/test/others/app-emu/job/job.c index 4477bb996..a3c6af74c 100644 --- a/test/others/app-emu/job/job.c +++ b/test/others/app-emu/job/job.c @@ -50,7 +50,7 @@ int main(int argc, char *argv[]) snprintf(buf, sizeof(buf), "/proc/%d/fd", pid); fd_dir = opendir(buf); if (!fd_dir) { - printf("can't open %s\n", buf); + printf("cant open %s\n", buf); return -1; } @@ -70,13 +70,13 @@ int main(int argc, char *argv[]) closedir(fd_dir); if (ioctl(fd, TIOCGSID, &tty_sid) < 0) { - printf("can't obtain sid on stdout\n"); + printf("cant obtain sid on stdout\n"); return -1; } printf("stdout sid = %d\n", tty_sid); if (ioctl(fd, TIOCGPGRP, &tty_gid) < 0) { - printf("can't obtain gid on stdout\n"); + printf("cant obtain gid on stdout\n"); return -1; } printf("stdout gid = 
%d\n", tty_gid); diff --git a/test/others/app-emu/make/run.sh b/test/others/app-emu/make/run.sh index d871b7d9c..7cb44c770 100644 --- a/test/others/app-emu/make/run.sh +++ b/test/others/app-emu/make/run.sh @@ -28,7 +28,7 @@ setsid make -j4 & pid=${!} -echo Launched make in $pid background +echo Lanuched make in $pid background sleep 2 ${criu} dump --shell-job -D dump -o dump.log -v4 -t ${pid} || { diff --git a/test/others/bers/bers.c b/test/others/bers/bers.c index b291e3bcb..37cf84dd3 100644 --- a/test/others/bers/bers.c +++ b/test/others/bers/bers.c @@ -391,7 +391,7 @@ usage: pr_msg(" -f|--files create files for each task\n"); pr_msg(" -m|--memory allocate megabytes for each task\n"); pr_msg(" --memory-chunks split memory to equal parts\n"); - pr_msg(" --mem-fill fill memory with data depending on :\n"); + pr_msg(" --mem-fill fill memory with data dependin on :\n"); pr_msg(" all fill every byte of memory\n"); pr_msg(" light fill first bytes of every page\n"); pr_msg(" dirtify fill every page\n"); diff --git a/test/others/bers/bers.txt b/test/others/bers/bers.txt index 30ee49580..17c0c0800 100644 --- a/test/others/bers/bers.txt +++ b/test/others/bers/bers.txt @@ -61,7 +61,7 @@ EXAMPLE bers -d test/bers/dump -t 256 -m 54 -c 4 -f 200 --mem-fill dirtify --mem-cycle dirtify -We generate 256 tasks with each allocating 54 megabytes of memory split +We generate 256 tasks wit each allocating 54 megabytes of memory splitted equally into 4 memory areas. Each task opens 200 files. On creation and cycling we touch every page of every memory area. diff --git a/test/others/config-file/run.sh b/test/others/config-file/run.sh index 26b835b45..92195883e 100755 --- a/test/others/config-file/run.sh +++ b/test/others/config-file/run.sh @@ -11,7 +11,7 @@ set -xbm -# shellcheck source=test/others/env.sh +#shellcheck disable=SC1091 source ../env.sh if [ ! 
-d /etc/criu ]; then diff --git a/test/others/crit/test.sh b/test/others/crit/test.sh index 2698bbd3c..0d38043d7 100755 --- a/test/others/crit/test.sh +++ b/test/others/crit/test.sh @@ -1,12 +1,11 @@ #!/bin/bash -# shellcheck disable=SC2002 +# shellcheck disable=SC1091,SC2002 set -x -# shellcheck source=test/others/env.sh source ../env.sh -images_list=() +images_list="" function gen_imgs { PID=$(../loop) @@ -17,15 +16,15 @@ function gen_imgs { exit 1 fi - images_list=(./*.img) - if [ "${#images_list[@]}" -eq 0 ]; then + images_list=$(ls -1 ./*.img) + if [ -z "$images_list" ]; then echo "Failed to generate images" exit 1 fi } function run_test1 { - for x in "${images_list[@]}" + for x in $images_list do echo "=== $x" if [[ $x == *pages* ]]; then @@ -46,16 +45,15 @@ function run_test1 { function run_test2 { - PROTO_IN="${images_list[0]}" + mapfile -t array <<< "$images_list" + + PROTO_IN=${array[0]} JSON_IN=$(mktemp -p ./ tmp.XXXXXXXXXX.json) OUT=$(mktemp -p ./ tmp.XXXXXXXXXX.log) # prepare ${CRIT} decode -i "${PROTO_IN}" -o "${JSON_IN}" - # show info about image - ${CRIT} info "${PROTO_IN}" - # proto in - json out decode cat "${PROTO_IN}" | ${CRIT} decode || exit 1 cat "${PROTO_IN}" | ${CRIT} decode -o "${OUT}" || exit 1 @@ -101,8 +99,6 @@ function run_test2 { ${CRIT} x ./ rss || exit 1 } -${CRIT} --version - gen_imgs run_test1 run_test2 diff --git a/test/others/criu-coredump/test.sh b/test/others/criu-coredump/test.sh index 2be82e64c..62d9f7edc 100755 --- a/test/others/criu-coredump/test.sh +++ b/test/others/criu-coredump/test.sh @@ -1,11 +1,7 @@ -#!/bin/bash - -set -x -# shellcheck source=test/others/env.sh -source ../env.sh || exit 1 +source ../env.sh function gen_imgs { - PID=$(../loop with a very very very very very very very very very very very very long cmdline) + PID=$(../loop) if ! 
$CRIU dump -v4 -o dump.log -D ./ -t "$PID"; then echo "Failed to checkpoint process $PID" cat dump.log @@ -13,7 +9,7 @@ function gen_imgs { exit 1 fi - images_list=$(ls -1 ./*.img) + images_list=$(ls -1 *.img) if [ -z "$images_list" ]; then echo "Failed to generate images" exit 1 @@ -36,19 +32,12 @@ function run_test { for x in $cores do echo "=== try readelf $x" - readelf -a "$x" || exit $? + readelf -a $x || exit $? echo "=== done" done echo "= done" } -UNAME_M=$(uname -m) - -if [[ "$UNAME_M" != "aarch64" && "$UNAME_M" != "armv7l" &&"$UNAME_M" != "x86_64" ]]; then - echo "criu-coredump only supports aarch64 armv7l, and x86_64. skipping." - exit 0 -fi - gen_imgs run_test diff --git a/test/others/criu-ns/Makefile b/test/others/criu-ns/Makefile deleted file mode 100644 index 4d901a111..000000000 --- a/test/others/criu-ns/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -run: - @make -C ../.. zdtm_ct - ../../zdtm_ct run.py diff --git a/test/others/criu-ns/run.py b/test/others/criu-ns/run.py deleted file mode 100755 index 0a36438e8..000000000 --- a/test/others/criu-ns/run.py +++ /dev/null @@ -1,245 +0,0 @@ -#!/usr/bin/env python3 - -import fcntl -import os -import pathlib -import pty -import shutil -import subprocess -import sys -import termios -import time - - -CRIU_BIN = "../../../criu/criu" -CRIU_NS = "../../../scripts/criu-ns" -IMG_DIR = "dumpdir" -DUMP_LOG = "dump.log" -RESTORE_LOG = "restore.log" -PIDFILE = "pidfile" - - -def check_dumpdir(path=IMG_DIR): - if os.path.isdir(path): - shutil.rmtree(path) - os.mkdir(path, 0o755) - - -def run_task_with_own_pty(task): - fd_m, fd_s = pty.openpty() - - pid = os.fork() - if pid == 0: - os.close(fd_m) - os.setsid() - os.dup2(fd_s, 0) - os.dup2(fd_s, 1) - os.dup2(fd_s, 2) - fcntl.ioctl(fd_s, termios.TIOCSCTTY, 1) - os.close(fd_s) - task() - exit(0) - - os.close(fd_s) - fd_m = os.fdopen(fd_m, "rb") - os.set_blocking(fd_m.fileno(), False) - - while True: - try: - data = fd_m.read() - except IOError: - break - if data is not None: - 
print(data.decode("utf-8")) - - _, status = os.waitpid(pid, 0) - - try: - data = fd_m.read() - except IOError as err: - print(err) - - if data is not None: - print(data.decode("utf-8")) - fd_m.close() - - if status != 0: - print("task %s exited badly: %d" % (task.__name__, status)) - exit(1) - - return 0 - - -def create_pty(): - fd_m, fd_s = pty.openpty() - return (os.fdopen(fd_m, "wb"), os.fdopen(fd_s, "wb")) - - -def create_isolated_dumpee(): - pathlib.Path("running").touch() - fd_m, fd_s = create_pty() - pid = os.fork() - if pid == 0: - os.setsid() - os.dup2(fd_s.fileno(), 0) - os.dup2(fd_s.fileno(), 1) - os.dup2(fd_s.fileno(), 2) - fcntl.ioctl(fd_s.fileno(), termios.TIOCSCTTY, 1) - while True: - if not os.access("running", os.F_OK): - sys.exit(0) - time.sleep(1) - fd_m.close() - fd_s.close() - return pid - - -def criu_ns_dump(pid, shell_job=False): - cmd = [CRIU_NS, "dump", "-D", IMG_DIR, "-v4", "-t", str(pid), - "--log-file", DUMP_LOG, "--criu-binary", CRIU_BIN] - if shell_job: - cmd.append("--shell-job") - ret = subprocess.Popen(cmd).wait() - return ret - - -def criu_ns_restore(shell_job=False, restore_detached=False): - cmd = [CRIU_NS, "restore", "-D", IMG_DIR, "-v4", "--log-file", - RESTORE_LOG, "--criu-binary", CRIU_BIN] - if shell_job: - cmd.append("--shell-job") - if restore_detached: - cmd += ["--restore-detached", "--pidfile", PIDFILE] - ret = subprocess.Popen(cmd).wait() - return ret - - -def read_log_file(filename): - logfile_path = os.path.join(IMG_DIR, filename) - with open(logfile_path) as logfile: - print(logfile.read()) - - -def test_dump_and_restore_with_shell_job(): - print("Test criu-ns dump and restore with --shell-job option") - check_dumpdir() - pathlib.Path("running").touch() - pid = os.fork() - if pid == 0: - while True: - if not os.access("running", os.F_OK): - sys.exit(0) - time.sleep(1) - - ret = criu_ns_dump(pid, shell_job=True) - if ret != 0: - read_log_file(DUMP_LOG) - sys.exit(ret) - - os.unlink("running") - fd_m, fd_s = 
create_pty() - pid = os.fork() - if pid == 0: - os.setsid() - fd_m.close() - # since criu-ns takes control of the tty stdin - os.dup2(fd_s.fileno(), 0) - ret = criu_ns_restore(shell_job=True) - if ret != 0: - read_log_file(RESTORE_LOG) - sys.exit(ret) - os._exit(0) - - fd_s.close() - os.waitpid(pid, 0) - - -def test_dump_and_restore_without_shell_job(restore_detached=False): - print("Test criu-ns dump and restore with an isolated process" - "(%d)" % restore_detached) - check_dumpdir() - pid = create_isolated_dumpee() - ret = criu_ns_dump(pid) - if ret != 0: - read_log_file(DUMP_LOG) - sys.exit(ret) - - if not restore_detached: - os.unlink("running") - - pid = os.fork() - if pid == 0: - os.setsid() - ret = criu_ns_restore(restore_detached=restore_detached) - if ret != 0: - read_log_file(RESTORE_LOG) - sys.exit(ret) - os._exit(0) - - os.waitpid(pid, 0) - - -def test_dump_and_restore_in_pidns(): - if os.system("grep NSpid /proc/self/status"): - return - - print("Test criu-ns dump and restore in namespaces") - - def _dump(): - pid = create_isolated_dumpee() - ret = criu_ns_dump(pid) - if ret != 0: - read_log_file(DUMP_LOG) - sys.exit(ret) - - def _restore(): - ret = criu_ns_restore(restore_detached=True) - if ret != 0: - read_log_file(RESTORE_LOG) - sys.exit(ret) - - def _get_restored_pid(): - restored_pid = 0 - pidfile_path = os.path.join(IMG_DIR, PIDFILE) - if not os.path.exists(pidfile_path): - raise FileNotFoundError("pidfile not found") - with open(pidfile_path, "r") as pidfile: - restored_pid = pidfile.read().strip() - return int(restored_pid) - - def _redump(): - global IMG_DIR - try: - restored_pid = _get_restored_pid() - except FileNotFoundError: - sys.exit(1) - IMG_DIR = "dumpdir2" - check_dumpdir(IMG_DIR) - ret = criu_ns_dump(restored_pid) - if ret != 0: - read_log_file(DUMP_LOG) - sys.exit(ret) - - def _re_restore(): - os.unlink("running") - ret = criu_ns_restore() - if ret != 0: - read_log_file(RESTORE_LOG) - sys.exit(ret) - - check_dumpdir() - _dump() - 
_restore() - _redump() - _re_restore() - - -def main(): - test_dump_and_restore_with_shell_job() - test_dump_and_restore_without_shell_job() - test_dump_and_restore_without_shell_job(restore_detached=True) - test_dump_and_restore_in_pidns() - - -if __name__ == "__main__": - run_task_with_own_pty(main) diff --git a/test/others/env.sh b/test/others/env.sh index 6fa2c9691..b514e87d9 100755 --- a/test/others/env.sh +++ b/test/others/env.sh @@ -1,13 +1,17 @@ #!/bin/sh -BASE_DIR="$(readlink -f "$(dirname "${BASH_SOURCE[0]}")/../../")" - -CRIU="${BASE_DIR}/criu/criu" +CRIU=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../criu/criu) criu=$CRIU - -export PYTHONPATH="${BASE_DIR}/lib:${BASE_DIR}/crit:${PYTHONPATH-}" -CRIT="python3 -m crit" +if [ $(which python3) ]; then + PYTHON=python3 +elif [ $(which python2) ]; then + PYTHON=python2 +else + echo "FAIL: Neither python3 nor python2" + exit 1 +fi +#export PYTHON +CRIT=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../crit/crit-"${PYTHON}") crit=$CRIT - -CRIU_COREDUMP="${BASE_DIR}/coredump/coredump" +CRIU_COREDUMP=$(readlink -f `dirname ${BASH_SOURCE[0]}`/../../criu-coredump/criu-coredump) criu_coredump=$CRIU_COREDUMP diff --git a/test/others/ext-tty/run.py b/test/others/ext-tty/run.py index 2c268a2c8..8109033cb 100755 --- a/test/others/ext-tty/run.py +++ b/test/others/ext-tty/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python import subprocess import os, sys, time, signal, pty diff --git a/test/others/libcriu/.gitignore b/test/others/libcriu/.gitignore index 30a56999c..6424681ab 100644 --- a/test/others/libcriu/.gitignore +++ b/test/others/libcriu/.gitignore @@ -3,9 +3,4 @@ test_iters test_notify test_self test_sub -test_join_ns -test_pre_dump -test_feature_check -output/ -libcriu.so.* -test_rpc_config +wdir diff --git a/test/others/libcriu/Makefile b/test/others/libcriu/Makefile index 927f17c23..226396e6a 100644 --- a/test/others/libcriu/Makefile +++ b/test/others/libcriu/Makefile @@ -1,15 +1,10 @@ include 
../../../../criu/Makefile.versions -TESTS += test_sub +TESTS += test_sub TESTS += test_self TESTS += test_notify -TESTS += test_rpc_config TESTS += test_iters TESTS += test_errno -TESTS += test_join_ns -TESTS += test_pre_dump -TESTS += test_check -TESTS += test_feature_check all: $(TESTS) .PHONY: all diff --git a/test/others/libcriu/lib.h b/test/others/libcriu/lib.h index 59372fca5..6fdf8aef2 100644 --- a/test/others/libcriu/lib.h +++ b/test/others/libcriu/lib.h @@ -1,5 +1,3 @@ void what_err_ret_mean(int ret); int chk_exit(int status, int want); int get_version(void); - -#define SUCC_ECODE 42 diff --git a/test/others/libcriu/run.sh b/test/others/libcriu/run.sh index 6b36d4496..5a0dca46b 100755 --- a/test/others/libcriu/run.sh +++ b/test/others/libcriu/run.sh @@ -1,52 +1,45 @@ #!/bin/bash set -x - -MAIN_DIR=$(dirname "$0") -OUTPUT_DIR="${MAIN_DIR}/output" -TEST_DIR="${OUTPUT_DIR}/$1" -TEST_LOG="${TEST_DIR}/test.log" -DUMP_LOG="${TEST_DIR}/dump.log" -RESTORE_LOG="${TEST_DIR}/restore.log" - -# shellcheck source=test/others/env.sh -source "${MAIN_DIR}/../env.sh" || exit 1 +source ../env.sh || exit 1 echo "== Clean" make clean make libcriu +rm -rf wdir -rm -rf "${OUTPUT_DIR}" +echo "== Prepare" +mkdir -p wdir/i/ echo "== Run tests" export LD_LIBRARY_PATH=. -export PATH="${MAIN_DIR}/../../../criu:${PATH}" +export PATH="`dirname ${BASH_SOURCE[0]}`/../../../criu:$PATH" RESULT=0 -run_test() { +function run_test { echo "== Build $1" - if ! make "$1"; then + if ! make $1; then echo "FAIL build $1" + echo "** Output of $1/test.log" + cat wdir/i/$1/test.log + echo "---------------" + if [ -f wdir/i/$1/dump.log ]; then + echo "** Contents of dump.log" + cat wdir/i/$1/dump.log + echo "---------------" + fi + if [ -f wdir/i/$1/restore.log ]; then + echo "** Contents of restore.log" + cat wdir/i/$1/restore.log + echo "---------------" + fi RESULT=1; else echo "== Test $1" - mkdir -p "${TEST_DIR}" - if ! 
setsid ./"$1" "${CRIU}" "${TEST_DIR}" < /dev/null &>> "${TEST_LOG}"; then + mkdir wdir/i/$1/ + if ! setsid ./$1 ${CRIU} wdir/i/$1/ < /dev/null &>> wdir/i/$1/test.log; then echo "$1: FAIL" - echo "** Output of ${TEST_LOG}" - cat "${TEST_LOG}" - echo "---------------" - if [ -f "${DUMP_LOG}" ]; then - echo "** Contents of dump.log" - cat "${DUMP_LOG}" - echo "---------------" - fi - if [ -f "${RESTORE_LOG}" ]; then - echo "** Contents of restore.log" - cat "${RESTORE_LOG}" - echo "---------------" - fi RESULT=1 fi fi @@ -55,27 +48,13 @@ run_test() { run_test test_sub run_test test_self run_test test_notify -run_test test_rpc_config -if [ "$(uname -m)" = "x86_64" ]; then +if [ "$(uname -m)" == "x86_64" ]; then # Skip this on aarch64 as aarch64 has no dirty page tracking run_test test_iters - run_test test_pre_dump fi run_test test_errno -run_test test_join_ns -run_test test_check -if criu check --feature mem_dirty_track > /dev/null; then - export CRIU_FEATURE_MEM_TRACK=1 -fi -if criu check --feature uffd-noncoop > /dev/null; then - export CRIU_FEATURE_LAZY_PAGES=1 -fi -if criu check --feature pidfd_store > /dev/null; then - export CRIU_FEATURE_PIDFD_STORE=1 -fi -run_test test_feature_check echo "== Tests done" make libcriu_clean -[ "${RESULT}" -eq 0 ] && echo "Success" || echo "FAIL" -exit "${RESULT}" +[ $RESULT -eq 0 ] && echo "Success" || echo "FAIL" +exit $RESULT diff --git a/test/others/libcriu/test_check.c b/test/others/libcriu/test_check.c deleted file mode 100644 index 4af3b3630..000000000 --- a/test/others/libcriu/test_check.c +++ /dev/null @@ -1,17 +0,0 @@ -#include -#include "criu.h" -#include "lib.h" - -int main(int argc, char **argv) -{ - int ret; - - printf("--- Start check ---\n"); - criu_init_opts(); - criu_set_service_binary(argv[1]); - - if (criu_check()) - return -1; - - return 0; -} diff --git a/test/others/libcriu/test_feature_check.c b/test/others/libcriu/test_feature_check.c deleted file mode 100644 index d88e0de23..000000000 --- 
a/test/others/libcriu/test_feature_check.c +++ /dev/null @@ -1,65 +0,0 @@ -#include "criu.h" -#include -#include -#include -#include -#include -#include -#include -#include - -#include "lib.h" - -int main(int argc, char **argv) -{ - int ret; - char *env; - bool mem_track = 0; - bool lazy_pages = 0; - bool pidfd_store = 0; - struct criu_feature_check features = { - .mem_track = true, - .lazy_pages = true, - .pidfd_store = true, - }; - - printf("--- Start feature check ---\n"); - criu_init_opts(); - criu_set_service_binary(argv[1]); - - env = getenv("CRIU_FEATURE_MEM_TRACK"); - if (env) { - mem_track = true; - } - env = getenv("CRIU_FEATURE_LAZY_PAGES"); - if (env) { - lazy_pages = true; - } - env = getenv("CRIU_FEATURE_PIDFD_STORE"); - if (env) { - pidfd_store = true; - } - - ret = criu_feature_check(&features, sizeof(features) + 1); - printf(" `- passing too large structure to libcriu should return -1: %d\n", ret); - if (ret != -1) - return -1; - - ret = criu_feature_check(&features, sizeof(features)); - if (ret < 0) { - what_err_ret_mean(ret); - return ret; - } - - printf(" `- mem_track : %d - expected : %d\n", features.mem_track, mem_track); - if (features.mem_track != mem_track) - return -1; - printf(" `- lazy_pages : %d - expected : %d\n", features.lazy_pages, lazy_pages); - if (features.lazy_pages != lazy_pages) - return -1; - printf(" `- pidfd_store: %d - expected : %d\n", features.pidfd_store, pidfd_store); - if (features.pidfd_store != pidfd_store) - return -1; - - return 0; -} diff --git a/test/others/libcriu/test_iters.c b/test/others/libcriu/test_iters.c index edbaf87f6..2a5ca6def 100644 --- a/test/others/libcriu/test_iters.c +++ b/test/others/libcriu/test_iters.c @@ -46,6 +46,8 @@ static int next_iter(criu_predump_info pi) return cur_iter < MAX_ITERS; } +#define SUCC_ECODE 42 + int main(int argc, char **argv) { int pid, ret, p[2]; @@ -104,7 +106,7 @@ int main(int argc, char **argv) criu_set_service_binary(argv[1]); criu_set_pid(pid); 
criu_set_log_file("dump.log"); - criu_set_log_level(CRIU_LOG_DEBUG); + criu_set_log_level(4); open_imgdir(); ret = criu_dump_iters(next_iter); @@ -119,7 +121,7 @@ int main(int argc, char **argv) printf("--- Restore loop ---\n"); criu_init_opts(); - criu_set_log_level(CRIU_LOG_DEBUG); + criu_set_log_level(4); criu_set_log_file("restore.log"); criu_set_images_dir_fd(cur_imgdir); diff --git a/test/others/libcriu/test_join_ns.c b/test/others/libcriu/test_join_ns.c deleted file mode 100644 index 50a4cf295..000000000 --- a/test/others/libcriu/test_join_ns.c +++ /dev/null @@ -1,243 +0,0 @@ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "criu.h" -#include "lib.h" - -#ifndef CLONE_NEWTIME -#define CLONE_NEWTIME 0x00000080 /* New time namespace */ -#endif - -static pid_t child_pid; -static int pipefd[2]; - -static int dir_fd; -static char *criu_bin; - -static bool timens_support(void) -{ - return access("/proc/self/ns/time", F_OK) == 0; -} - -static void create_child_process(void) -{ - pid_t pid = fork(); - if (pid < 0) { - perror("fork failed"); - exit(1); - } - - if (pid == 0) { - if (setsid() < 0) - exit(1); - pid = getpid(); - write(pipefd[1], &pid, sizeof(pid)); - while (1) - sleep(1); - } -} - -static void unshare_namespaces(void) -{ - int flags = CLONE_NEWIPC | CLONE_NEWUTS; - if (unshare(flags)) { - perror("Can't unshare namespaces"); - exit(1); - } - - if (timens_support() && unshare(CLONE_NEWTIME)) { - perror("unshare(CLONE_NEWTIME) failed"); - exit(1); - } -} - -static void init_criu_request(void) -{ - if (criu_init_opts()) { - fprintf(stderr, "failed to initialise request options\n"); - exit(1); - } - criu_set_service_binary(criu_bin); - criu_set_images_dir_fd(dir_fd); - criu_set_log_level(CRIU_LOG_DEBUG); -} - -static void checkpoint_test(void) -{ - int pid, ret; - - pipe(pipefd); - - pid = fork(); - if (pid < 0) { - perror("fork failed"); - 
exit(1); - } - - if (pid == 0) { - unshare_namespaces(); - /* Close unused read end */ - close(pipefd[0]); - create_child_process(); - exit(0); - } - - /* Close unused write end */ - close(pipefd[1]); - /* Read child PID */ - read(pipefd[0], &child_pid, sizeof(child_pid)); - - init_criu_request(); - criu_set_log_file("dump.log"); - criu_set_pid(child_pid); - - ret = criu_dump(); - if (ret < 0) { - what_err_ret_mean(ret); - exit(1); - } - - kill(pid, SIGKILL); - if (waitpid(pid, NULL, 0) < 0) { - perror("Can't wait pid"); - exit(1); - } -} - -static void join_ns(const char *ns, pid_t pid) -{ - char ns_file[PATH_MAX]; - snprintf(ns_file, sizeof(ns_file), "/proc/%d/ns/%s", pid, ns); - criu_join_ns_add(ns, ns_file, NULL); -} - -static pid_t create_namespaces(void) -{ - pid_t pid = fork(); - if (pid < 0) { - perror("fork failed"); - exit(1); - } - - if (pid == 0) { - unshare_namespaces(); - while (1) - sleep(1); - } - - return pid; -} - -static int get_ns_ino(pid_t pid, const char *nsname, ino_t *ino) -{ - struct stat st; - char path[PATH_MAX]; - - snprintf(path, sizeof(path), "/proc/%d/ns/%s", pid, nsname); - printf("Stat %s\n", path); - if (stat(path, &st)) - return -errno; - *ino = st.st_ino; - - return 0; -} - -static int compare_namespace(const char *nsname, pid_t parent_pid) -{ - ino_t child_ns_ino, parent_ns_ino; - - printf("Compare %s ns for %d and %d\n", nsname, child_pid, parent_pid); - - if (get_ns_ino(child_pid, nsname, &child_ns_ino)) { - perror("Failed to get child ns inode"); - return -1; - } - - if (get_ns_ino(parent_pid, nsname, &parent_ns_ino)) { - perror("Failed to get parent ns inode"); - return -1; - } - - return child_ns_ino != parent_ns_ino; -} - -static int restore_test(void) -{ - int ret; - pid_t parent_pid = create_namespaces(); - - init_criu_request(); - criu_set_log_file("restore.log"); - - join_ns("ipc", parent_pid); - join_ns("uts", parent_pid); - if (timens_support()) - join_ns("time", parent_pid); - - ret = criu_restore_child(); - if (ret 
< 0) { - what_err_ret_mean(ret); - exit(1); - } - - /* Verify that the child process has joined correct namespaces */ - - if (compare_namespace("ipc", parent_pid)) { - fprintf(stderr, "Error: IPC ns doesn't match\n"); - exit(1); - } - - if (compare_namespace("uts", parent_pid)) { - fprintf(stderr, "Error: UTS ns doesn't match\n"); - exit(1); - } - - if (timens_support() && compare_namespace("time", parent_pid)) { - fprintf(stderr, "Error: Time ns doesn't match\n"); - exit(1); - } - - kill(child_pid, SIGKILL); - if (waitpid(child_pid, NULL, 0) < 0) { - perror("Can't wait child pid"); - exit(1); - } - - kill(parent_pid, SIGKILL); - if (waitpid(parent_pid, NULL, 0) < 0) { - perror("Can't wait parent pid"); - exit(1); - } - - return 0; -} - -int main(int argc, char **argv) -{ - int exit_code; - - criu_bin = argv[1]; - dir_fd = open(argv[2], O_DIRECTORY); - if (dir_fd < 0) { - perror("Can't open images dir"); - return -1; - } - - checkpoint_test(); - exit_code = restore_test(); - - close(dir_fd); - return exit_code; -} diff --git a/test/others/libcriu/test_notify.c b/test/others/libcriu/test_notify.c index 80ad3ffdc..66fe75338 100644 --- a/test/others/libcriu/test_notify.c +++ b/test/others/libcriu/test_notify.c @@ -10,6 +10,8 @@ #include "lib.h" +#define SUCC_ECODE 42 + static int actions_called = 0; static int notify(char *action, criu_notify_arg_t na) { @@ -69,7 +71,7 @@ int main(int argc, char **argv) criu_set_service_binary(argv[1]); criu_set_pid(pid); criu_set_log_file("dump.log"); - criu_set_log_level(CRIU_LOG_DEBUG); + criu_set_log_level(4); criu_set_notify_cb(notify); fd = open(argv[2], O_DIRECTORY); criu_set_images_dir_fd(fd); diff --git a/test/others/libcriu/test_pre_dump.c b/test/others/libcriu/test_pre_dump.c deleted file mode 100644 index ed9cd2125..000000000 --- a/test/others/libcriu/test_pre_dump.c +++ /dev/null @@ -1,151 +0,0 @@ -#include "criu.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "lib.h" 
- -static int wdir_fd, cur_imgdir = -1; - -static int stop = 0; -static void sh(int sig) -{ - stop = 1; -} - -static void open_imgdir(void) -{ - char p[10]; - static int id = 0; - - if (id > 0) { - sprintf(p, "../dir-%d", id); - criu_set_parent_images(p); - } - if (cur_imgdir != -1) - close(cur_imgdir); - sprintf(p, "dir-%d", ++id); - mkdirat(wdir_fd, p, 0700); - cur_imgdir = openat(wdir_fd, p, O_DIRECTORY); - criu_set_images_dir_fd(cur_imgdir); -} - -int main(int argc, char **argv) -{ - int pid, ret, p[2]; - - wdir_fd = open(argv[2], O_DIRECTORY); - if (wdir_fd < 0) { - perror("Can't open wdir"); - return 1; - } - - printf("--- Start loop ---\n"); - pipe(p); - pid = fork(); - if (pid < 0) { - perror("Can't"); - return -1; - } - - if (!pid) { - printf(" `- loop: initializing\n"); - if (setsid() < 0) - exit(1); - if (signal(SIGUSR1, sh) == SIG_ERR) - exit(1); - - close(0); - close(1); - close(2); - close(p[0]); - - ret = SUCC_ECODE; - write(p[1], &ret, sizeof(ret)); - close(p[1]); - - while (!stop) - sleep(1); - exit(SUCC_ECODE); - } - - close(p[1]); - - /* Wait for kid to start */ - ret = -1; - read(p[0], &ret, sizeof(ret)); - if (ret != SUCC_ECODE) { - printf("Error starting loop\n"); - goto err; - } - - /* Wait for pipe to get closed, then dump */ - read(p[0], &ret, 1); - close(p[0]); - - printf("--- Dump loop ---\n"); - criu_init_opts(); - criu_set_service_binary(argv[1]); - criu_set_pid(pid); - criu_set_log_file("dump.log"); - criu_set_log_level(CRIU_LOG_DEBUG); - criu_set_track_mem(true); - - open_imgdir(); - ret = criu_pre_dump(); - if (ret < 0) { - what_err_ret_mean(ret); - kill(pid, SIGKILL); - goto err; - } - - printf(" `- Pre Dump 1 succeeded\n"); - - open_imgdir(); - ret = criu_pre_dump(); - if (ret < 0) { - what_err_ret_mean(ret); - kill(pid, SIGKILL); - goto err; - } - - printf(" `- Pre Dump 2 succeeded\n"); - - open_imgdir(); - ret = criu_dump(); - if (ret < 0) { - what_err_ret_mean(ret); - kill(pid, SIGKILL); - goto err; - } - - printf(" `- Final 
Dump succeeded\n"); - waitpid(pid, NULL, 0); - - printf("--- Restore ---\n"); - criu_init_opts(); - criu_set_log_level(CRIU_LOG_DEBUG); - criu_set_log_file("restore.log"); - criu_set_images_dir_fd(cur_imgdir); - - pid = criu_restore_child(); - if (pid <= 0) { - what_err_ret_mean(pid); - return -1; - } - - printf(" `- Restore returned pid %d\n", pid); - kill(pid, SIGUSR1); -err: - if (waitpid(pid, &ret, 0) < 0) { - perror(" Can't wait kid"); - return -1; - } - - return chk_exit(ret, SUCC_ECODE); -} diff --git a/test/others/libcriu/test_rpc_config.c b/test/others/libcriu/test_rpc_config.c deleted file mode 100644 index 529f13637..000000000 --- a/test/others/libcriu/test_rpc_config.c +++ /dev/null @@ -1,223 +0,0 @@ -#include "criu.h" -#include "lib.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define RANDOM_NAME_LEN 6 -#define PATH_BUF_SIZE 128 - -static volatile sig_atomic_t stop = 0; -static char base_name[RANDOM_NAME_LEN + 1]; -static char log_file[PATH_BUF_SIZE]; -static char conf_file[PATH_BUF_SIZE]; - -static void handle_signal(int sig) -{ - (void)sig; - stop = 1; -} - -static void generate_random_base_name(void) -{ - const char charset[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; - size_t charset_len; - int i; - - charset_len = sizeof(charset) - 1; - - for (i = 0; i < RANDOM_NAME_LEN; i++) { - base_name[i] = charset[rand() % charset_len]; - } - base_name[i] = '\0'; - - snprintf(log_file, sizeof(log_file), "/tmp/criu-%s.log", base_name); - snprintf(conf_file, sizeof(conf_file), "/tmp/criu-%s.conf", base_name); -} - -static int create_criu_config_file(void) -{ - int fd; - FILE *fp; - - srand(time(NULL)); - generate_random_base_name(); - - fd = open(conf_file, O_CREAT | O_EXCL | O_WRONLY, 0600); - if (fd < 0) { - perror("Failed to create config file"); - return -1; - } - - fp = fdopen(fd, "w"); - if (!fp) { - perror("fdopen failed"); - close(fd); - unlink(conf_file); 
- return -1; - } - - fprintf(fp, "log-file=%s\n", log_file); - fflush(fp); - fclose(fp); - - return 0; -} - -static int check_log_file(void) -{ - struct stat st; - - if (stat(log_file, &st) < 0) { - perror("Config file does not exist"); - return -1; - } - - if (st.st_size == 0) { - fprintf(stderr, "Config file is empty\n"); - return -1; - } - - unlink(log_file); - return 0; -} - -int main(int argc, char **argv) -{ - int pipe_fd[2]; - pid_t pid; - int ret; - int child_ret; - - int img_fd = open(argv[2], O_DIRECTORY); - if (img_fd < 0) { - perror("Failed to open images directory"); - goto cleanup; - } - - if (create_criu_config_file() < 0) { - printf("Failed to create config file\n"); - return EXIT_FAILURE; - } - - if (pipe(pipe_fd) < 0) { - perror("pipe"); - return EXIT_FAILURE; - } - - pid = fork(); - if (pid < 0) { - perror("fork failed"); - return EXIT_FAILURE; - } - - if (pid == 0) { - /** child process **/ - printf(" `- loop: initializing\n"); - - if (setsid() < 0 || signal(SIGUSR1, handle_signal) == SIG_ERR) { - _exit(EXIT_FAILURE); - } - - close(STDIN_FILENO); - close(STDOUT_FILENO); - close(STDERR_FILENO); - close(pipe_fd[0]); - - child_ret = SUCC_ECODE; - write(pipe_fd[1], &child_ret, sizeof(child_ret)); - close(pipe_fd[1]); - - while (!stop) { - sleep(1); - } - - _exit(SUCC_ECODE); - } - - /** parent process **/ - close(pipe_fd[1]); - - ret = -1; - if (read(pipe_fd[0], &ret, sizeof(ret)) != sizeof(ret) || ret != SUCC_ECODE) { - printf("Error starting loop\n"); - goto cleanup; - } - - read(pipe_fd[0], &ret, 1); - close(pipe_fd[0]); - - printf("--- Loop process started (pid: %d) ---\n", pid); - - printf("--- Checkpoint ---\n"); - criu_init_opts(); - criu_set_service_binary(argv[1]); - criu_set_images_dir_fd(img_fd); - criu_set_pid(pid); - criu_set_log_level(CRIU_LOG_DEBUG); - - /* The RPC config file should overwrite the log-file set below */ - printf("Setting dump RPC config file: %s\n", conf_file); - criu_set_config_file(conf_file); - 
criu_set_log_file("dump.log"); - - ret = criu_dump(); - if (ret < 0) { - what_err_ret_mean(ret); - kill(pid, SIGKILL); - printf("criu dump failed\n"); - goto cleanup; - } - - printf(" `- Dump succeeded\n"); - waitpid(pid, NULL, 0); - - if (check_log_file()) { - printf("Error: log file not overwritten by RPC config file\n"); - goto cleanup; - } - - printf("--- Restore loop ---\n"); - criu_init_opts(); - criu_set_images_dir_fd(img_fd); - criu_set_log_level(CRIU_LOG_DEBUG); - - /* The RPC config file should overwrite the log-file set below */ - printf("Setting restore RPC config file: %s\n", conf_file); - criu_set_config_file(conf_file); - criu_set_log_file("restore.log"); - - pid = criu_restore_child(); - if (pid <= 0) { - what_err_ret_mean(pid); - ret = EXIT_FAILURE; - goto cleanup; - } - - printf(" `- Restore returned pid %d\n", pid); - kill(pid, SIGUSR1); - - if (check_log_file()) { - printf("Error: log file not overwritten by RPC config file\n"); - goto cleanup; - } - -cleanup: - if (waitpid(pid, &ret, 0) < 0) { - perror("waitpid failed"); - return EXIT_FAILURE; - } - - printf("Remove RPC config file: %s\n", conf_file); - unlink(conf_file); - return chk_exit(ret, SUCC_ECODE); -} diff --git a/test/others/libcriu/test_self.c b/test/others/libcriu/test_self.c index 0a7e63a58..468edc0f7 100644 --- a/test/others/libcriu/test_self.c +++ b/test/others/libcriu/test_self.c @@ -37,7 +37,7 @@ int main(int argc, char *argv[]) } criu_set_images_dir_fd(fd); - criu_set_log_level(CRIU_LOG_DEBUG); + criu_set_log_level(4); printf("--- Start child ---\n"); pid = fork(); diff --git a/test/others/libcriu/test_sub.c b/test/others/libcriu/test_sub.c index af1e09408..41b685847 100644 --- a/test/others/libcriu/test_sub.c +++ b/test/others/libcriu/test_sub.c @@ -15,6 +15,8 @@ static void sh(int sig) stop = 1; } +#define SUCC_ECODE 42 + int main(int argc, char **argv) { int pid, ret, fd, p[2]; @@ -67,7 +69,7 @@ int main(int argc, char **argv) criu_set_service_binary(argv[1]); 
criu_set_pid(pid); criu_set_log_file("dump.log"); - criu_set_log_level(CRIU_LOG_DEBUG); + criu_set_log_level(4); fd = open(argv[2], O_DIRECTORY); criu_set_images_dir_fd(fd); @@ -83,7 +85,7 @@ int main(int argc, char **argv) printf("--- Restore loop ---\n"); criu_init_opts(); - criu_set_log_level(CRIU_LOG_DEBUG); + criu_set_log_level(4); criu_set_log_file("restore.log"); criu_set_images_dir_fd(fd); diff --git a/test/others/mem-snap/run-predump-2.sh b/test/others/mem-snap/run-predump-2.sh index 5ef1422b4..46af8063b 100755 --- a/test/others/mem-snap/run-predump-2.sh +++ b/test/others/mem-snap/run-predump-2.sh @@ -28,7 +28,7 @@ function stop_test { wtime=1 cd ../../zdtm/static/ make maps04.stop - fgrep PASS maps04.out || fail "Test failed" + cat maps04.out | fgrep PASS || fail "Test failed" echo "OK" } diff --git a/test/others/mem-snap/run-predump.sh b/test/others/mem-snap/run-predump.sh index 06ba74737..d06d2d8fc 100755 --- a/test/others/mem-snap/run-predump.sh +++ b/test/others/mem-snap/run-predump.sh @@ -72,6 +72,6 @@ ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to r cd ../../zdtm/static/ make mem-touch.stop -fgrep PASS mem-touch.out || fail "Test failed" +cat mem-touch.out | fgrep PASS || fail "Test failed" echo "Test PASSED" diff --git a/test/others/mem-snap/run-snap-auto-dedup.sh b/test/others/mem-snap/run-snap-auto-dedup.sh index a3801f5b4..f77aa1fcb 100755 --- a/test/others/mem-snap/run-snap-auto-dedup.sh +++ b/test/others/mem-snap/run-snap-auto-dedup.sh @@ -84,7 +84,7 @@ ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to r cd ../../zdtm/static/ make mem-touch.stop -fgrep PASS mem-touch.out || fail "Test failed" +cat mem-touch.out | fgrep PASS || fail "Test failed" if [[ $dedup_ok_2 -ne 0 || $dedup_ok_1 -ne 0 ]]; then fail "Dedup test failed" diff --git a/test/others/mem-snap/run-snap-dedup-on-restore.sh b/test/others/mem-snap/run-snap-dedup-on-restore.sh index 5dbb5bf44..6ae050bc7 100755 --- 
a/test/others/mem-snap/run-snap-dedup-on-restore.sh +++ b/test/others/mem-snap/run-snap-dedup-on-restore.sh @@ -78,7 +78,7 @@ fi cd ../../zdtm/static/ make mem-touch.stop -fgrep PASS mem-touch.out || fail "Test failed" +cat mem-touch.out | fgrep PASS || fail "Test failed" if [ $restore_dedup_ok -ne 0 ]; then fail "Dedup test failed" diff --git a/test/others/mem-snap/run-snap-dedup.sh b/test/others/mem-snap/run-snap-dedup.sh index 40db95325..27fcd55a9 100755 --- a/test/others/mem-snap/run-snap-dedup.sh +++ b/test/others/mem-snap/run-snap-dedup.sh @@ -90,7 +90,7 @@ ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to r cd ../../zdtm/static/ make mem-touch.stop -fgrep PASS mem-touch.out || fail "Test failed" +cat mem-touch.out | fgrep PASS || fail "Test failed" if [[ $dedup_ok_2 -ne 0 || $dedup_ok_1 -ne 0 ]]; then fail "Dedup test failed" diff --git a/test/others/mem-snap/run-snap-maps04.sh b/test/others/mem-snap/run-snap-maps04.sh index 267d51deb..2def909d9 100755 --- a/test/others/mem-snap/run-snap-maps04.sh +++ b/test/others/mem-snap/run-snap-maps04.sh @@ -58,7 +58,7 @@ ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log --auto-dedup -d -v4 || fa make -C ../../zdtm/static/ maps04.stop sleep 1 -fgrep PASS "../zdtm/static/maps04.out" || fail "Test failed" +cat "../zdtm/static/maps04.out" | fgrep PASS || fail "Test failed" size=$(du -sh -BK dump/1/pages-*.img | grep -Eo '[0-9]+' | head -1) if [ $size -ne 0 ] ; then diff --git a/test/others/mem-snap/run-snap.sh b/test/others/mem-snap/run-snap.sh index c91cd0098..b97bd295e 100755 --- a/test/others/mem-snap/run-snap.sh +++ b/test/others/mem-snap/run-snap.sh @@ -69,6 +69,6 @@ ${CRIU} restore -D "${IMGDIR}/$NRSNAP/" -o restore.log -d -v4 || fail "Fail to r cd ../../zdtm/static/ make mem-touch.stop -fgrep PASS mem-touch.out || fail "Test failed" +cat mem-touch.out | fgrep PASS || fail "Test failed" echo "Test PASSED" diff --git a/test/others/mnt-ext-dev/run.sh b/test/others/mnt-ext-dev/run.sh 
index 5cdbc45a8..9803a8f77 100755 --- a/test/others/mnt-ext-dev/run.sh +++ b/test/others/mnt-ext-dev/run.sh @@ -2,14 +2,16 @@ set -e -x # construct root -python3 ../../zdtm.py run -t zdtm/static/env00 --iter 0 -f ns +python ../../zdtm.py run -t zdtm/static/env00 --iter 0 -f ns truncate -s 0 zdtm.loop truncate -s 50M zdtm.loop mkfs.ext4 -F zdtm.loop dev=`losetup --find --show zdtm.loop` +mkdir -p ../../dev +cp -ap $dev ../../dev export ZDTM_MNT_EXT_DEV=$dev -python3 ../../zdtm.py run $EXTRA_OPTS -t zdtm/static/mnt_ext_dev || ret=$? +python ../../zdtm.py run -t zdtm/static/mnt_ext_dev || ret=$? losetup -d $dev unlink zdtm.loop exit $ret diff --git a/test/others/mounts/mounts.sh b/test/others/mounts/mounts.sh index bed156a50..19116d0cf 100755 --- a/test/others/mounts/mounts.sh +++ b/test/others/mounts/mounts.sh @@ -12,7 +12,7 @@ cd $INMNTNS mount --make-rprivate / -for i in `awk '{ print $2 }' < /proc/self/mounts`; do +for i in `cat /proc/self/mounts | awk '{ print $2 }'`; do [ '/' = "$i" ] && continue [ '/proc' = "$i" ] && continue [ '/dev' = "$i" ] && continue @@ -20,7 +20,7 @@ for i in `awk '{ print $2 }' < /proc/self/mounts`; do umount -l $i done -python3 mounts.py +python mounts.py kill $INMNTNS_PID while :; do sleep 10 diff --git a/test/others/mounts/run.sh b/test/others/mounts/run.sh index d665a726a..35927fb5e 100755 --- a/test/others/mounts/run.sh +++ b/test/others/mounts/run.sh @@ -12,12 +12,12 @@ kill -0 $pid || exit cat /proc/$pid/mountinfo | sort -k 4 echo "Suspend server" ${CRIU} dump -D dump -o dump.log -t $pid -v4 || { - grep Error dump/dump.log + cat dump/dump.log | grep Error exit 1 } echo "Resume server" ${CRIU} restore -d -D dump -o restore.log -v4 || { - grep Error dump/dump.log + cat dump/dump.log | grep Error exit 1 } cat /proc/$pid/mountinfo | sort -k 4 diff --git a/test/others/ns_ext/run.sh b/test/others/ns_ext/run.sh index 5d1e139d7..2e9a6fe86 100755 --- a/test/others/ns_ext/run.sh +++ b/test/others/ns_ext/run.sh @@ -2,13 +2,10 @@ set -x -if ! 
../../zdtm/static/macvlan.checkskip; then - echo "No macvlan support. Skipping" - exit 0 -fi - if [[ "$1" == "pid" ]]; then NS=pid + # CentOS 7 kernels do not have NSpid -> skip this test + grep NSpid /proc/self/status || exit 0 else NS=net fi @@ -64,7 +61,7 @@ exec 33< $MNT1 exec 34< $MNT2 $CRIU dump -v4 -t $pid -o dump.log -D images --external $NS[$ino]:test_ns --external $NS[$ino2]:test_ns2 RESULT=$? -grep -B 5 Error images/dump.log || echo ok +cat images/dump.log | grep -B 5 Error || echo ok [ "$RESULT" != "0" ] && { echo "CRIU dump failed" echo FAIL @@ -73,7 +70,7 @@ grep -B 5 Error images/dump.log || echo ok $CRIU restore -v4 -o restore.log -D images --inherit-fd fd[33]:test_ns --inherit-fd fd[34]:test_ns2 -d RESULT=$? -grep -B 5 Error images/restore.log || echo ok +cat images/restore.log | grep -B 5 Error || echo ok [ "$RESULT" != "0" ] && { echo "CRIU restore failed" echo FAIL diff --git a/test/others/ns_ext/run_pidns.sh b/test/others/ns_ext/run_pidns.sh index db12106e0..19fb97b95 100755 --- a/test/others/ns_ext/run_pidns.sh +++ b/test/others/ns_ext/run_pidns.sh @@ -2,10 +2,13 @@ set -e +# CentOS 7 kernels do not have NSpid -> skip this test +grep NSpid /proc/self/status || exit 0 + # This test creates a process in non-host pidns and then dumps it and restores # it into host pidns. We use pid >100000 in non-host pidns to make sure it does # not intersect with some host pid on restore but it is potentially racy so -# please run this test only in manually. +# please run this test only in manualy. CRIU=../../../criu/criu @@ -33,7 +36,7 @@ mkdir -p images_pidns echo "$CRIU dump -v4 -o dump.log -t $PID -D images_pidns --external $PIDNS:exti" $CRIU dump -v4 -o dump.log -t $PID -D images_pidns --external $PIDNS:exti RESULT=$? 
-grep -B 5 Error images_pidns/dump.log || echo ok +cat images_pidns/dump.log | grep -B 5 Error || echo ok [ "$RESULT" != "0" ] && { echo "CRIU dump failed" echo FAIL @@ -45,7 +48,7 @@ exec {pidns_fd}< /proc/self/ns/pid echo "$CRIU restore -v4 -o restore.log -D images_pidns --restore-detached --inherit-fd fd[$pidns_fd]:exti" $CRIU restore -v4 -o restore.log -D images_pidns --restore-detached --inherit-fd fd[$pidns_fd]:exti --pidfile test.pidfile RESULT=$? -grep -B 5 Error images_pidns/restore.log || echo ok +cat images_pidns/restore.log | grep -B 5 Error || echo ok [ "$RESULT" != "0" ] && { echo "CRIU restore failed" echo FAIL diff --git a/test/others/pycriu/.gitignore b/test/others/pycriu/.gitignore deleted file mode 100644 index 567609b12..000000000 --- a/test/others/pycriu/.gitignore +++ /dev/null @@ -1 +0,0 @@ -build/ diff --git a/test/others/pycriu/Makefile b/test/others/pycriu/Makefile deleted file mode 100644 index b6e3b4814..000000000 --- a/test/others/pycriu/Makefile +++ /dev/null @@ -1,63 +0,0 @@ -.SHELLFLAGS := -eu -o pipefail -c -.ONESHELL: - -CRIU ?= ../../../criu/criu -BUILD_DIR ?= build -SOCKET_NAME ?= criu_service.socket -PIDFILE_NAME ?= pidfile -SERVICE_LOG ?= service.log -PYTHON ?= python3 - -PIDFILE := $(BUILD_DIR)/$(PIDFILE_NAME) -CRIU_SOCKET := $(BUILD_DIR)/$(SOCKET_NAME) -STATUS_FIFO := $(BUILD_DIR)/startup.status -STATUS_FD := 200 - -run: start - cleanup() { $(MAKE) --no-print-directory stop || true; } - trap cleanup EXIT INT TERM - "$(PYTHON)" test_check.py - "$(PYTHON)" test_check_fail.py - "$(PYTHON)" test_check_images_dir.py - "$(PYTHON)" test_check_work_dir_fd.py - -start: - mkdir -p "$(BUILD_DIR)" - if [ -s "$(PIDFILE)" ] && kill -0 "$$(cat "$(PIDFILE)")" 2>/dev/null; then - echo "Service running (PID $$(cat "$(PIDFILE)"))." - exit 0 - fi - if ! 
command -v "$(CRIU)" >/dev/null 2>&1; then - echo "CRIU not found at $(CRIU)" - exit 1 - fi - mkfifo "$(STATUS_FIFO)" - exec $(STATUS_FD)<>"$(STATUS_FIFO)" - "$(CRIU)" service \ - -v4 \ - -W "$(BUILD_DIR)" \ - --address "$(SOCKET_NAME)" \ - -d \ - --pidfile "$(PIDFILE_NAME)" \ - -o "$(SERVICE_LOG)" \ - --status-fd "$(STATUS_FD)" - "$(PYTHON)" read.py "$(STATUS_FIFO)" - -stop: - if [ ! -s "$(PIDFILE)" ]; then - echo "pidfile missing or empty" - exit 1 - fi - pid=$$(cat "$(PIDFILE)") - if kill -0 "$$pid" 2>/dev/null; then - kill -9 "$$pid" || true - fi - rm -f "$(PIDFILE)" "$(CRIU_SOCKET)" "$(STATUS_FIFO)" - -clean: - if [ -s "$(PIDFILE)" ] && kill -0 "$$(cat "$(PIDFILE)")" 2>/dev/null; then - kill -9 "$$(cat "$(PIDFILE)")" || true - fi - rm -rf "$(BUILD_DIR)" - -.PHONY: start stop clean run \ No newline at end of file diff --git a/test/others/pycriu/read.py b/test/others/pycriu/read.py deleted file mode 120000 index c2c1e1365..000000000 --- a/test/others/pycriu/read.py +++ /dev/null @@ -1 +0,0 @@ -../rpc/read.py \ No newline at end of file diff --git a/test/others/pycriu/test_check.py b/test/others/pycriu/test_check.py deleted file mode 100755 index 9888158db..000000000 --- a/test/others/pycriu/test_check.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 -import os -import sys - -# Add ../../../lib so we can import pycriu -SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) -if LIB_DIR not in sys.path: - sys.path.insert(0, LIB_DIR) - -import pycriu # noqa: E402 - -def main(): - socket_path = os.path.join(SCRIPT_DIR, "build", "criu_service.socket") - - criu = pycriu.criu() - criu.use_sk(socket_path) - - try: - criu.check() - except Exception as e: - print(f"FAIL: {e}") - return 1 - - print("PASS") - return 0 - -if __name__ == "__main__": - sys.exit(main()) diff --git a/test/others/pycriu/test_check_fail.py b/test/others/pycriu/test_check_fail.py deleted file mode 100755 index 
b5634c60b..000000000 --- a/test/others/pycriu/test_check_fail.py +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env python3 -import os -import sys - -# Add ../../../lib so we can import pycriu -SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) -if LIB_DIR not in sys.path: - sys.path.insert(0, LIB_DIR) - -import pycriu # noqa: E402 - -def main(): - socket_path = os.path.join(SCRIPT_DIR, "build", "criu_service.socket") - - criu = pycriu.criu() - criu.use_sk(socket_path) - - # Intentionally set only log_file (no images/work dir) to ensure check() fails - criu.opts.log_file = "check.log" - - try: - criu.check() - except Exception: - print("PASS") - return 0 - - print("FAIL: check() did not fail when log_file is set without images/work dir") - return 1 - -if __name__ == "__main__": - sys.exit(main()) diff --git a/test/others/pycriu/test_check_images_dir.py b/test/others/pycriu/test_check_images_dir.py deleted file mode 100755 index f479c2a88..000000000 --- a/test/others/pycriu/test_check_images_dir.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -import os -import sys - -# Add ../../../lib so we can import pycriu -SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) -if LIB_DIR not in sys.path: - sys.path.insert(0, LIB_DIR) - -import pycriu # noqa: E402 - -def _log_path(images_dir, log_file): - return log_file if os.path.isabs(log_file) else os.path.join(images_dir, log_file) - -def main(): - build_dir = os.path.join(SCRIPT_DIR, "build") - socket_path = os.path.join(build_dir, "criu_service.socket") - - criu = pycriu.criu() - criu.use_sk(socket_path) - - criu.opts.images_dir = build_dir - criu.opts.log_file = "check.log" - criu.opts.log_level = 4 - - try: - criu.check() - except Exception as e: - lp = _log_path(build_dir, criu.opts.log_file) - msg = f"FAIL: {e} ({'see log: ' + lp if os.path.exists(lp) else 'no log 
found'})" - print(msg) - return 1 - - lp = _log_path(build_dir, criu.opts.log_file) - if not (os.path.isfile(lp) and os.path.getsize(lp) > 0): - print(f"FAIL: log file missing or empty: {lp}") - return 1 - - print("PASS") - return 0 - -if __name__ == "__main__": - sys.exit(main()) diff --git a/test/others/pycriu/test_check_work_dir_fd.py b/test/others/pycriu/test_check_work_dir_fd.py deleted file mode 100755 index e20a83097..000000000 --- a/test/others/pycriu/test_check_work_dir_fd.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -import os -import sys - -# Add ../../../lib so we can import pycriu -SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -LIB_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "../../../lib")) -if LIB_DIR not in sys.path: - sys.path.insert(0, LIB_DIR) - -import pycriu # noqa: E402 - -def main(): - build_dir = os.path.join(SCRIPT_DIR, "build") - socket_path = os.path.join(build_dir, "criu_service.socket") - os.makedirs(build_dir, exist_ok=True) - - # Open a directory FD to use as work_dir_fd (prefer O_PATH if available) - flags = getattr(os, "O_PATH", 0) or os.O_RDONLY - fd = os.open(build_dir, flags) - - criu = pycriu.criu() - criu.use_sk(socket_path) - - criu.opts.work_dir_fd = fd - criu.opts.log_file = "check.log" - criu.opts.log_level = 4 - - try: - criu.check() - except Exception as e: - print(f"FAIL: {e}") - return 1 - finally: - try: - os.close(fd) - except Exception: - pass - - print("PASS") - return 0 - -if __name__ == "__main__": - sys.exit(main()) diff --git a/test/others/rpc/Makefile b/test/others/rpc/Makefile index c0e56d528..fc64f0c97 100644 --- a/test/others/rpc/Makefile +++ b/test/others/rpc/Makefile @@ -4,22 +4,13 @@ all: test-c rpc_pb2.py criu CFLAGS += -g -Werror -Wall -I. LDLIBS += -lprotobuf-c -PYTHON ?= python3 +PYTHON ?= python run: all @make -C .. 
loop - mkdir -p build/{imgs_errno,imgs_ps,imgs_c,imgs_loop,imgs_py} + mkdir -p build chmod a+rwx build - chmod a+rwx build/{imgs_errno,imgs_ps,imgs_c,imgs_loop,imgs_py} rm -f build/status - rm -f build/_marker_* - @# Create all log files to be accessible for anybody - @# so that they can be displayed by any user. - for i in imgs_errno/criu.log imgs_ps/page-server.log imgs_ps/dump.log \ - imgs_c/restore-c.log imgs_loop/criu.log imgs_loop/dump-loop.log \ - imgs_py/criu.log imgs_py/restore-py.log imgs_c/criu.log service.log; do \ - touch build/$$i; chmod 666 build/$$i; \ - done sudo -g '#1000' -u '#1000' mkfifo build/status @# Need to start the criu daemon here to access the pidfile. @# The script read.py is used to wait until 'criu service' @@ -48,7 +39,7 @@ rpc_pb2.py: rpc.proto protoc --proto_path=. --python_out=. rpc.proto rpc.pb-c.c: rpc.proto - protoc --proto_path=. --c_out=. rpc.proto + protoc-c --proto_path=. --c_out=. rpc.proto clean: rm -rf build rpc.pb-c.o test-c.o test-c rpc.pb-c.c rpc.pb-c.h rpc_pb2.py rpc_pb2.pyc criu diff --git a/test/others/rpc/action-script.sh b/test/others/rpc/action-script.sh deleted file mode 100755 index 991e315de..000000000 --- a/test/others/rpc/action-script.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash - -MARKER_FILE="_marker_${CRTOOLS_SCRIPT_ACTION}" - -if [ -z "$CRTOOLS_SCRIPT_ACTION" ]; then - echo "Error: CRTOOLS_SCRIPT_ACTION is not set." - exit 2 -fi - -if [ ! 
-f "$MARKER_FILE" ]; then - touch "$MARKER_FILE" -else - echo "Error: Running the same action hook for the second time" - exit 1 -fi - -exit 0 diff --git a/test/others/rpc/config_file.py b/test/others/rpc/config_file.py index c1a8276d8..90c80fcae 100755 --- a/test/others/rpc/config_file.py +++ b/test/others/rpc/config_file.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/python import argparse import os @@ -13,9 +13,6 @@ from setup_swrk import setup_swrk log_file = 'config_file_test.log' does_not_exist = 'does-not.exist' -script_path = os.path.dirname(os.path.abspath(__file__)) -action_script_file = os.path.join(script_path, 'action-script.sh') - def setup_config_file(content): # Creating a temporary file which will be used as configuration file. @@ -92,37 +89,29 @@ def test_broken_configuration_file(): sys.exit(-1) -def search_in_log_file(log_path, message): - with open(log_path) as f: +def search_in_log_file(log, message): + with open(os.path.join(args['dir'], log)) as f: if message not in f.read(): - print('FAIL: Missing the expected error message (%s) in the log file' % message) + print( + 'FAIL: Missing the expected error message (%s) in the log file' + % message) sys.exit(-1) -def print_log_file(log_path): - print("\n--- Begin log file: %s ---" % log_path) - with open(log_path, 'r') as f: - print(f.read()) - print("--- End log file ---\n") - - def check_results(resp, log): # Check if the specified log file exists - log_path = os.path.join(args['dir'], log) - if not os.path.isfile(log_path): + if not os.path.isfile(os.path.join(args['dir'], log)): print('FAIL: Expected log file %s does not exist' % log) sys.exit(-1) # Dump should have failed with: 'The criu itself is within dumped tree' if resp.type != rpc.DUMP: print('FAIL: Unexpected msg type %r' % resp.type) - print_log_file(log_path) sys.exit(-1) if 'The criu itself is within dumped tree' not in resp.cr_errmsg: print('FAIL: Missing the expected error message in RPC response') - print_log_file(log_path) 
sys.exit(-1) # Look into the log file for the same message - search_in_log_file(log_path, 'The criu itself is within dumped tree') + search_in_log_file(log, 'The criu itself is within dumped tree') def test_rpc_without_configuration_file(): @@ -167,7 +156,6 @@ def test_rpc_with_configuration_file_overwriting_rpc(): # file settings in the default configuration. log = does_not_exist content = 'log-file ' + log + '\n' - content += 'action-script ' + action_script_file + '\n' content += 'no-tcp-established\nno-shell-job' path = setup_config_file(content) # Only set the configuration file via RPC; @@ -192,18 +180,11 @@ args = vars(parser.parse_args()) cleanup_output(args['dir']) -print("*** Test broken config file ***") test_broken_configuration_file() cleanup_output(args['dir']) - -print("*** Test RPC without config file ***") test_rpc_without_configuration_file() cleanup_output(args['dir']) - -print("*** Test RPC with config file ***") test_rpc_with_configuration_file() cleanup_output(args['dir']) - -print("*** Test configuration file overwriting RPC ***") test_rpc_with_configuration_file_overwriting_rpc() cleanup_output(args['dir']) diff --git a/test/others/rpc/errno.py b/test/others/rpc/errno.py index ea841199f..01a6eee7b 100755 --- a/test/others/rpc/errno.py +++ b/test/others/rpc/errno.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/python # Test criu errno import socket, os, errno @@ -40,9 +40,9 @@ class test: resp.ParseFromString(self.s.recv(self._MAX_MSG_SIZE)) return resp - def check_resp(self, resp, typ, err, errmsg = None): + def check_resp(self, resp, typ, err): if resp.type != typ: - raise Exception('Unexpected response type ' + str(resp.type)) + raise Exception('Unexpected responce type ' + str(resp.type)) if resp.success: raise Exception('Unexpected success = True') @@ -50,9 +50,6 @@ class test: if err and resp.cr_errno != err: raise Exception('Unexpected cr_errno ' + str(resp.cr_errno)) - if errmsg and errmsg not in str(resp.cr_errmsg): - raise 
Exception('Unexpected cr_msg \'' + str(resp.cr_errmsg) + '\'') - def no_process(self): print('Try to dump unexisting process') # Get pid of non-existing process. @@ -70,7 +67,6 @@ class test: req = self.get_base_req() req.type = rpc.DUMP req.opts.pid = pid - req.opts.network_lock = rpc.SKIP self.send_req(req) resp = self.recv_resp() @@ -88,7 +84,6 @@ class test: req = self.get_base_req() req.type = rpc.DUMP req.opts.leave_running = True - req.opts.network_lock = rpc.SKIP self.send_req(req) resp = self.recv_resp() @@ -135,27 +130,11 @@ class test: print('Success') - def child_first_err(self): - print('Receive correct first error message') - - req = self.get_base_req() - req.type = rpc.CHECK - # Log file must not have subdirectory - req.opts.log_file = "/foo/bar.log" - - self.send_req(req) - resp = self.recv_resp() - - self.check_resp(resp, rpc.CHECK, None, "No subdirs are allowed in log_file name") - - print('Success') - def run(self): self.no_process() self.process_exists() self.bad_options() self.bad_request() - self.child_first_err() t = test() diff --git a/test/others/rpc/ps_test.py b/test/others/rpc/ps_test.py index 259f22e77..b51357d42 100755 --- a/test/others/rpc/ps_test.py +++ b/test/others/rpc/ps_test.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/python import socket, os, sys, errno import rpc_pb2 as rpc @@ -23,7 +23,6 @@ req.type = rpc.PAGE_SERVER req.opts.log_file = 'page-server.log' req.opts.log_level = 4 req.opts.images_dir_fd = os.open(args['dir'], os.O_DIRECTORY) -req.opts.network_lock = rpc.SKIP s.send(req.SerializeToString()) diff --git a/test/others/rpc/read.py b/test/others/rpc/read.py old mode 100755 new mode 100644 diff --git a/test/others/rpc/restore-loop.py b/test/others/rpc/restore-loop.py index 67110c2cf..84a2ce56d 100755 --- a/test/others/rpc/restore-loop.py +++ b/test/others/rpc/restore-loop.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/python import socket, os, sys import rpc_pb2 as rpc diff --git a/test/others/rpc/run.sh 
b/test/others/rpc/run.sh index b6158dfea..9be577587 100755 --- a/test/others/rpc/run.sh +++ b/test/others/rpc/run.sh @@ -1,9 +1,16 @@ #!/bin/bash -set -e +set -ex + +if [ -e /etc/os-release ]; then + . /etc/os-release + if [ "$ID" == "centos" ] && [[ "$VERSION_ID" == "7"* ]];then + echo "Skipping tests on CentOS 7 because they do not work in CI" + exit 0 + fi +fi CRIU=./criu -FAIL=1 export PROTODIR=`readlink -f "${PWD}/../../protobuf"` @@ -20,13 +27,6 @@ function stop_server { title_print "Shutdown service server" kill -SIGTERM $(cat build/pidfile) unlink build/pidfile - if [ "${FAIL}" == "1" ]; then - for i in build/output*; do - echo "File: $i" - cat $i - done - find . -name "*.log" -print -exec cat {} \; || true - fi } function test_c { @@ -59,7 +59,7 @@ function test_restore_loop { title_print "Dump loop process" # So theoretically '-j' (--shell-job) should not be necessary, but on alpine # this test fails without it. - ${CRIU} dump -j -v4 -o dump-loop.log --network-lock skip -D build/imgs_loop -t ${P} + ${CRIU} dump -j -v4 -o dump-loop.log -D build/imgs_loop -t ${P} title_print "Run restore-loop" ./restore-loop.py build/criu_service.socket build/imgs_loop @@ -88,8 +88,6 @@ test_restore_loop test_ps test_errno -FAIL=0 - stop_server trap 'echo "Success"' EXIT diff --git a/test/others/rpc/setup_swrk.py b/test/others/rpc/setup_swrk.py index ffaa01de4..c7f84f952 100644 --- a/test/others/rpc/setup_swrk.py +++ b/test/others/rpc/setup_swrk.py @@ -5,6 +5,12 @@ import subprocess def setup_swrk(): print('Connecting to CRIU in swrk mode.') s1, s2 = socket.socketpair(socket.AF_UNIX, socket.SOCK_SEQPACKET) - swrk = subprocess.Popen(['./criu', "swrk", "%d" % s1.fileno()], pass_fds=[s1.fileno()]) + + kwargs = {} + if sys.version_info.major == 3: + kwargs["pass_fds"] = [s1.fileno()] + + swrk = subprocess.Popen(['./criu', "swrk", "%d" % s1.fileno()], **kwargs) s1.close() return swrk, s2 + diff --git a/test/others/rpc/test-c.c b/test/others/rpc/test-c.c index 
b3507975f..06f13b20f 100644 --- a/test/others/rpc/test-c.c +++ b/test/others/rpc/test-c.c @@ -99,8 +99,6 @@ int main(int argc, char *argv[]) req.opts->images_dir_fd = dir_fd; req.opts->has_log_level = true; req.opts->log_level = 4; - req.opts->has_network_lock = true; - req.opts->network_lock = CRIU_NETWORK_LOCK_METHOD__SKIP; /* * Connect to service socket @@ -120,7 +118,7 @@ int main(int argc, char *argv[]) ret = connect(fd, (struct sockaddr *)&addr, addr_len); if (ret == -1) { - perror("Can't connect to socket"); + perror("Cant connect to socket"); goto exit; } diff --git a/test/others/rpc/test.py b/test/others/rpc/test.py index 6f692f755..80f6338f4 100755 --- a/test/others/rpc/test.py +++ b/test/others/rpc/test.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/python import socket, os, sys import rpc_pb2 as rpc @@ -24,7 +24,6 @@ req.type = rpc.DUMP req.opts.leave_running = True req.opts.log_level = 4 req.opts.images_dir_fd = os.open(args['dir'], os.O_DIRECTORY) -req.opts.network_lock = rpc.SKIP # Send request s.send(req.SerializeToString()) diff --git a/test/others/rpc/version.py b/test/others/rpc/version.py index a18cd5b7b..9d7fa745b 100755 --- a/test/others/rpc/version.py +++ b/test/others/rpc/version.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/python import sys import rpc_pb2 as rpc diff --git a/test/others/shell-job/run.py b/test/others/shell-job/run.py index 969965f00..a59945d6a 100755 --- a/test/others/shell-job/run.py +++ b/test/others/shell-job/run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python import os, pty, sys, subprocess import termios, fcntl, time diff --git a/test/others/skip-file-rwx-check/Makefile b/test/others/skip-file-rwx-check/Makefile deleted file mode 100644 index 419d592b7..000000000 --- a/test/others/skip-file-rwx-check/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -.PHONY: run clean - -run: - ./run.sh - -clean: - rm -rf testfile *.img dump.log restore-expected-fail.log restore.log stats-dump stats-restore diff --git 
a/test/others/skip-file-rwx-check/run.sh b/test/others/skip-file-rwx-check/run.sh deleted file mode 100755 index 0776ebf61..000000000 --- a/test/others/skip-file-rwx-check/run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env bash - -set -o errexit -set -o nounset -set -o pipefail -set -o xtrace - -source ../env.sh - -make clean -touch testfile -chmod +w testfile -bash -c 'exec 3> "$(dirname "$0")/actions_called.txt" + >> "$(dirname $0)/actions_called.txt" diff --git a/test/zdtm.py b/test/zdtm.py index e21356c30..0a52e1b96 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python +from __future__ import absolute_import, division, print_function, unicode_literals import argparse import atexit @@ -13,7 +14,6 @@ import random import re import shutil import signal -import socket import stat import string import struct @@ -21,41 +21,28 @@ import subprocess import sys import tempfile import time -import uuid -import site -from builtins import input, int, open, range, str, zip +import socket +from builtins import (input, int, open, range, str, zip) + +import pycriu as crpc import yaml -from zdtm.criu_config import criu_config +os.chdir(os.path.dirname(os.path.abspath(__file__))) # File to store content of streamed images STREAMED_IMG_FILE_NAME = "img.criu" -# A library used to preload C functions to simulate -# cases such as partial read with pread(). -LIBFAULT_PATH = os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "libfault", - "libfault.so" -) - -# A directory that contains the CRIU plugins. 
-PLUGINS_DIR = os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "plugins" -) - prev_line = None -uuid = uuid.uuid4() - -NON_ROOT_UID = 65534 def alarm(*args): print("==== ALARM ====") +signal.signal(signal.SIGALRM, alarm) + + def traceit(f, e, a): if e == "line": lineno = f.f_lineno @@ -80,32 +67,18 @@ tests_root = None def clean_tests_root(): global tests_root if tests_root and tests_root[0] == os.getpid(): - subprocess.call(["./umount2", os.path.join(tests_root[1], "dev")]) - os.rmdir(os.path.join(tests_root[1], "root/root")) os.rmdir(os.path.join(tests_root[1], "root")) - os.rmdir(os.path.join(tests_root[1], "dev")) os.rmdir(tests_root[1]) def make_tests_root(): global tests_root if not tests_root: - tmpdir = os.environ.get("TMPDIR", "/tmp") - tests_root = (os.getpid(), tempfile.mkdtemp("", "criu-root-", tmpdir)) + tests_root = (os.getpid(), tempfile.mkdtemp("", "criu-root-", "/tmp")) atexit.register(clean_tests_root) os.mkdir(os.path.join(tests_root[1], "root")) - os.mkdir(os.path.join(tests_root[1], "root", "root")) - # The current file system can be mounted with nodev, so let's create a - # new tmpfs mount for /dev. - devpath = os.path.join(tests_root[1], "dev") - os.mkdir(devpath) - # zdtm wants to create files on this mount. User namespace tests are - # running with custom user and group mappings. 
- subprocess.check_call(["mount", "-t", "tmpfs", "criu-test-dev", devpath]) - os.chmod(devpath, 0o777) - os.chmod(tests_root[1], 0o755) - os.chmod(os.path.join(tests_root[1], "root"), 0o755) - return os.path.join(tests_root[1], "root", "root"), os.path.join(tests_root[1], "dev") + os.chmod(tests_root[1], 0o777) + return os.path.join(tests_root[1], "root") # Report generation @@ -201,16 +174,15 @@ class host_flavor: class ns_flavor: __root_dirs = [ - "/bin", "/sbin", "/etc", "/lib", "/lib64", "/dev", - "/tmp", "/usr", "/proc", "/run" + "/bin", "/sbin", "/etc", "/lib", "/lib64", "/dev", "/dev/pts", + "/dev/net", "/tmp", "/usr", "/proc", "/run" ] - __dev_dirs = ["pts", "net"] def __init__(self, opts): self.name = "ns" self.ns = True self.uns = False - self.root, self.devpath = make_tests_root() + self.root = make_tests_root() self.root_mounted = False def __copy_one(self, fname): @@ -230,8 +202,6 @@ class ns_flavor: def __copy_libs(self, binary): ldd = subprocess.Popen(["ldd", binary], stdout=subprocess.PIPE) - stdout, _ = ldd.communicate() - xl = re.compile( r'^(linux-gate.so|linux-vdso(64)?.so|not a dynamic|.*\s*ldd\s)') @@ -246,9 +216,11 @@ class ns_flavor: map( lambda x: str(x).strip(), filter(lambda x: str(x).startswith('\t'), - stdout.decode( + ldd.stdout.read().decode( 'ascii').splitlines()))))) + ldd.wait() + for lib in libs: if not os.access(lib, os.F_OK): raise test_fail_exc("Can't find lib %s required by %s" % @@ -256,19 +228,16 @@ class ns_flavor: self.__copy_one(lib) def __mknod(self, name, rdev=None): - tdev = stat.S_IFCHR + name = "/dev/" + name if not rdev: - if not os.access(os.path.join("/dev", name), os.F_OK): + if not os.access(name, os.F_OK): print("Skipping %s at root" % name) return else: - s = os.stat(os.path.join("/dev", name)) - rdev = s.st_rdev - if stat.S_ISBLK(s.st_mode): - tdev = stat.S_IFBLK + rdev = os.stat(name).st_rdev - name = os.path.join(self.devpath, name) - os.mknod(name, tdev, rdev) + name = self.root + name + os.mknod(name, 
stat.S_IFCHR, rdev) os.chmod(name, 0o666) def __construct_root(self): @@ -279,18 +248,11 @@ class ns_flavor: for ldir in ["/bin", "/sbin", "/lib", "/lib64"]: os.symlink(".." + ldir, self.root + "/usr" + ldir) - def __construct_dev(self): - for dir in self.__dev_dirs: - os.mkdir(os.path.join(self.devpath, dir)) - os.chmod(os.path.join(self.devpath, dir), 0o755) self.__mknod("tty", os.makedev(5, 0)) self.__mknod("null", os.makedev(1, 3)) self.__mknod("net/tun") self.__mknod("rtc") self.__mknod("autofs", os.makedev(10, 235)) - ext_dev = os.getenv("ZDTM_MNT_EXT_DEV") - if ext_dev: - self.__mknod(os.path.basename(ext_dev)) def __copy_deps(self, deps): for d in deps.split('|'): @@ -302,7 +264,7 @@ class ns_flavor: def init(self, l_bins, x_bins): subprocess.check_call( - ["mount", "--make-private", "--bind", ".", self.root]) + ["mount", "--make-slave", "--bind", ".", self.root]) self.root_mounted = True if not os.access(self.root + "/.constructed", os.F_OK): @@ -313,9 +275,6 @@ class ns_flavor: self.__construct_root() os.mknod(self.root + "/.constructed", stat.S_IFREG | 0o600) - if not os.access(self.devpath + "/.constructed", os.F_OK): - self.__construct_dev() - os.mknod(self.devpath + "/.constructed", stat.S_IFREG | 0o600) for b in l_bins: self.__copy_libs(b) for b in x_bins: @@ -372,7 +331,8 @@ def decode_flav(i): def tail(path): p = subprocess.Popen(['tail', '-n1', path], stdout=subprocess.PIPE) - out, _ = p.communicate() + out = p.stdout.readline() + p.wait() return out.decode() @@ -430,21 +390,19 @@ class test_fail_expected_exc(Exception): class zdtm_test: - def __init__(self, name, desc, flavor, freezer, rootless): + def __init__(self, name, desc, flavor, freezer): self.__name = name self.__desc = desc self.__freezer = None - self.__timeout = int(self.__desc.get('timeout') or 30) - self.__rootless = rootless self.__make_action('cleanout') self.__pid = 0 self.__flavor = flavor self.__freezer = freezer self._bins = [name] - self._env = {'TMPDIR': 
os.environ.get('TMPDIR', '/tmp')} + self._env = {} self._deps = desc.get('deps', []) - self._bind = desc.get('bind') self.auto_reap = True + self.__timeout = int(self.__desc.get('timeout') or 30) def __make_action(self, act, env=None, root=None): sys.stdout.flush() # Not to let make's messages appear before ours @@ -466,7 +424,7 @@ class zdtm_test: preexec_fn=self.__freezer and self.__freezer.attach or None) if act == "pid": try_run_hook(self, ["--post-start"]) - if s.wait(timeout=self.__timeout): + if s.wait(): raise test_fail_exc(str(s_args)) if self.__freezer: @@ -479,8 +437,6 @@ class zdtm_test: wait_pid_die(int(self.__pid), self.__name, self.__timeout) def __add_wperms(self): - if os.getuid() != 0: - return # Add write perms for .out and .pid files for b in self._bins: p = os.path.dirname(b) @@ -499,9 +455,6 @@ class zdtm_test: env['ZDTM_NOTIFY_FDIN'] = "100" env['ZDTM_NOTIFY_FDOUT'] = "101" - if self.__rootless: - env['ZDTM_ROOTLESS'] = "1" - if not test_flag(self.__desc, 'suid'): # Numbers should match those in criu env['ZDTM_UID'] = "18943" @@ -514,9 +467,6 @@ class zdtm_test: if self.__flavor.ns: env['ZDTM_NEWNS'] = "1" env['ZDTM_ROOT'] = self.__flavor.root - if self._bind: - env['ZDTM_BIND'] = self._bind - env['ZDTM_DEV'] = self.__flavor.devpath env['PATH'] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" if self.__flavor.uns: @@ -552,15 +502,8 @@ class zdtm_test: self.__freezer.thaw() if self.__pid: print("Send the %d signal to %s" % (sig, self.__pid)) - try: - os.kill(int(self.__pid), sig) - except ProcessLookupError: - if sig != signal.SIGKILL: - raise - print("The process %s doesn't exist" % self.__pid) - self.gone(True) - else: - self.gone(sig == signal.SIGKILL) + os.kill(int(self.__pid), sig) + self.gone(sig == signal.SIGKILL) self.__flavor.fini() @@ -618,24 +561,18 @@ class zdtm_test: opts += ["--root", self.__flavor.root] if test_flag(self.__desc, 'crlib'): opts += [ - "--libdir", + "-L", 
os.path.dirname(os.path.realpath(self.__name)) + '/lib' ] return opts def getdopts(self): - opts = self.__getcropts() + self.__freezer.getdopts() + \ - self.__desc.get('dopts', '').split() - if self.__flavor.ns: - opts += ["--external", "mnt[/dev]:ZDTM_DEV"] - return opts + return self.__getcropts() + self.__freezer.getdopts( + ) + self.__desc.get('dopts', '').split() def getropts(self): - opts = self.__getcropts() + self.__freezer.getropts() + \ - self.__desc.get('ropts', '').split() - if self.__flavor.ns: - opts += ["--external", "mnt[ZDTM_DEV]:%s" % self.__flavor.devpath] - return opts + return self.__getcropts() + self.__freezer.getropts( + ) + self.__desc.get('ropts', '').split() def unlink_pidfile(self): self.__pid = 0 @@ -653,12 +590,11 @@ class zdtm_test: os.unlink(self.__pidfile()) def print_output(self): - for postfix in ['.out', '.out.inprogress']: - if os.access(self.__name + postfix, os.R_OK): - print("Test output: " + "=" * 32) - with open(self.__name + postfix) as output: - print(output.read()) - print(" <<< " + "=" * 32) + if os.access(self.__name + '.out', os.R_OK): + print("Test output: " + "=" * 32) + with open(self.__name + '.out') as output: + print(output.read()) + print(" <<< " + "=" * 32) def static(self): return self.__name.split('/')[1] == 'static' @@ -672,48 +608,34 @@ class zdtm_test: @staticmethod def available(): if not os.access("umount2", os.X_OK): - subprocess.check_call( - ["make", "umount2"], env=dict(os.environ, MAKEFLAGS="")) + subprocess.check_call(["make", "umount2"]) if not os.access("zdtm_ct", os.X_OK): - subprocess.check_call( - ["make", "zdtm_ct"], env=dict(os.environ, MAKEFLAGS="")) + subprocess.check_call(["make", "zdtm_ct"]) if not os.access("zdtm/lib/libzdtmtst.a", os.F_OK): subprocess.check_call(["make", "-C", "zdtm/"]) - if 'preload_libfault' in opts and opts['preload_libfault']: - subprocess.check_call(["make", "-C", "libfault/"]) - - subprocess.check_call(["make", '--no-print-directory', "-C", "plugins/", "clean"]) 
- if 'criu_plugin' in opts and opts['criu_plugin']: - for name in opts['criu_plugin']: - subprocess.check_call(["make", '--no-print-directory', "-C", "plugins/", f"{name}_plugin.so"]) - - if 'mocked_cuda_checkpoint' in opts and opts['mocked_cuda_checkpoint']: - subprocess.check_call(["make", "-C", "cuda-checkpoint/"]) - if 'rootless' in opts and opts['rootless']: - return subprocess.check_call( - ["flock", "zdtm_mount_cgroups.lock", "./zdtm_mount_cgroups", str(uuid)]) + ["flock", "zdtm_mount_cgroups.lock", "./zdtm_mount_cgroups"]) @staticmethod def cleanup(): - if 'rootless' in opts and opts['rootless']: - return subprocess.check_call( - ["flock", "zdtm_mount_cgroups.lock", "./zdtm_umount_cgroups", str(uuid)]) + ["flock", "zdtm_mount_cgroups.lock", "./zdtm_umount_cgroups"]) def load_module_from_file(name, path): - import importlib.util - spec = importlib.util.spec_from_file_location(name, path) - mod = importlib.util.module_from_spec(spec) - spec.loader.exec_module(mod) + if sys.version_info[0] == 3 and sys.version_info[1] >= 5: + import importlib.util + spec = importlib.util.spec_from_file_location(name, path) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + else: + import imp + mod = imp.load_source(name, path) return mod class inhfd_test: - def __init__(self, name, desc, flavor, freezer, rootless): - if rootless: - raise test_fail_exc("This kind of test does not currently support rootless mode") + def __init__(self, name, desc, flavor, freezer): self.__name = os.path.basename(name) print("Load %s" % name) self.__fdtyp = load_module_from_file(self.__name, name) @@ -874,12 +796,12 @@ class inhfd_test: class groups_test(zdtm_test): - def __init__(self, name, desc, flavor, freezer, rootless): - zdtm_test.__init__(self, 'zdtm/lib/groups', desc, flavor, freezer, rootless) + def __init__(self, name, desc, flavor, freezer): + zdtm_test.__init__(self, 'zdtm/lib/groups', desc, flavor, freezer) if flavor.ns: self.__real_name = name with 
open(name) as fd: - self.__subs = list(map(lambda x: x.strip(), fd.readlines())) + self.__subs = map(lambda x: x.strip(), fd.readlines()) print("Subs:\n%s" % '\n'.join(self.__subs)) else: self.__real_name = '' @@ -887,7 +809,7 @@ class groups_test(zdtm_test): self._bins += self.__subs self._deps += get_test_desc('zdtm/lib/groups')['deps'] - self._env['ZDTM_TESTS'] = self.__real_name + self._env = {'ZDTM_TESTS': self.__real_name} def __get_start_cmd(self, name): tdir = os.path.dirname(name) @@ -897,8 +819,8 @@ class groups_test(zdtm_test): subprocess.check_call(s_args + [tname + '.cleanout']) s = subprocess.Popen(s_args + ['--dry-run', tname + '.pid'], stdout=subprocess.PIPE) - out, _ = s.communicate(timeout=self.__timeout) - cmd = out.decode().splitlines()[-1].strip() + cmd = s.stdout.readlines().pop().strip() + s.wait() return 'cd /' + tdir + ' && ' + cmd @@ -941,22 +863,15 @@ class criu_cli: fault=None, strace=[], preexec=None, - preload_libfault=False, - nowait=False, - timeout=60): + nowait=False): env = dict( os.environ, - ASAN_OPTIONS="log_path=asan.log:disable_coredump=0:detect_leaks=0", - CRIU_LIBS_DIR=PLUGINS_DIR - ) + ASAN_OPTIONS="log_path=asan.log:disable_coredump=0:detect_leaks=0") if fault: print("Forcing %s fault" % fault) env['CRIU_FAULT'] = fault - if preload_libfault: - env['LD_PRELOAD'] = LIBFAULT_PATH - cr = subprocess.Popen(strace + [criu_bin, action, "--no-default-config"] + args, env=env, @@ -964,11 +879,7 @@ class criu_cli: preexec_fn=preexec) if nowait: return cr - return cr.wait(timeout=timeout) - - @staticmethod - def exit_signal(ret): - return ret < 0 + return cr.wait() class criu_rpc_process: @@ -986,14 +897,14 @@ class criu_rpc: def __set_opts(criu, args, ctx): while len(args) != 0: arg = args.pop(0) - if "--verbosity=4" == arg: + if "-v4" == arg: criu.opts.log_level = 4 - elif "--log-file" == arg: + elif "-o" == arg: criu.opts.log_file = args.pop(0) - elif "--images-dir" == arg: + elif "-D" == arg: criu.opts.images_dir_fd = 
os.open(args.pop(0), os.O_DIRECTORY) ctx['imgd'] = criu.opts.images_dir_fd - elif "--tree" == arg: + elif "-t" == arg: criu.opts.pid = int(args.pop(0)) elif "--pidfile" == arg: ctx['pidf'] = args.pop(0) @@ -1039,8 +950,6 @@ class criu_rpc: if criu_rpc.pidfd_store_socket is None: criu_rpc.pidfd_store_socket = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM) criu.opts.pidfd_store_sk = criu_rpc.pidfd_store_socket.fileno() - elif "--mntns-compat-mode" == arg: - criu.opts.mntns_compat_mode = True else: raise test_fail_exc('RPC for %s(%s) required' % (arg, args.pop(0))) @@ -1051,9 +960,7 @@ class criu_rpc: fault=None, strace=[], preexec=None, - preload_libfault=False, - nowait=False, - timeout=None): + nowait=False): if fault: raise test_fail_exc('RPC and FAULT not supported') if strace: @@ -1061,7 +968,7 @@ class criu_rpc: if preexec: raise test_fail_exc('RPC and PREEXEC not supported') - ctx = {} # Object used to keep info until action is done + ctx = {} # Object used to keep info untill action is done criu = crpc.criu() criu.use_binary(criu_bin) criu_rpc.__set_opts(criu, args, ctx) @@ -1090,11 +997,8 @@ class criu_rpc: else: raise test_fail_exc('RPC for %s required' % action) except crpc.CRIUExceptionExternal as e: - if e.typ != e.resp_typ: - ret = -2 - else: - print("Fail", e) - ret = -1 + print("Fail", e) + ret = -1 else: ret = 0 @@ -1107,10 +1011,6 @@ class criu_rpc: return ret - @staticmethod - def exit_signal(ret): - return ret == -2 - class criu: def __init__(self, opts): @@ -1132,46 +1032,17 @@ class criu: self.__dedup = bool(opts['dedup']) self.__mdedup = bool(opts['noauto_dedup']) self.__user = bool(opts['user']) - self.__rootless = bool(opts['rootless']) self.__leave_stopped = bool(opts['stop']) self.__stream = bool(opts['stream']) + self.__criu = (opts['rpc'] and criu_rpc or criu_cli) self.__show_stats = bool(opts['show_stats']) self.__lazy_pages_p = None self.__page_server_p = None self.__dump_process = None - self.__img_streamer_process = None self.__tls 
= self.__tls_options() if opts['tls'] else [] self.__criu_bin = opts['criu_bin'] - - global crpc - pycriu_search_path = opts.get('pycriu_search_path') - if pycriu_search_path: - sys.path.insert(0, pycriu_search_path) - - try: - import pycriu as crpc - if pycriu_search_path: - print(f"pycriu loaded from: {crpc.__file__}") - except ImportError: - if not pycriu_search_path: - print("Consider building CRIU or using '--pycriu-search-path' option.") - raise - finally: - if pycriu_search_path: - sys.path.pop(0) - self.__crit_bin = opts['crit_bin'] self.__pre_dump_mode = opts['pre_dump_mode'] - self.__preload_libfault = bool(opts['preload_libfault']) - self.__mntns_compat_mode = bool(opts['mntns_compat_mode']) - self.__cuda_checkpoint = bool(opts['mocked_cuda_checkpoint']) - - if opts['rpc']: - self.__criu = criu_rpc - elif opts['criu_config']: - self.__criu = criu_config - else: - self.__criu = criu_cli def fini(self): if self.__lazy_migrate: @@ -1194,11 +1065,6 @@ class criu: self.__dump_process = None if ret: raise test_fail_exc("criu dump exited with %s" % ret) - if self.__img_streamer_process: - ret = self.wait_for_criu_image_streamer() - if ret: - raise test_fail_exc("criu-image-streamer exited with %s" % ret) - return def logs(self): @@ -1244,20 +1110,13 @@ class criu: if not log: log = action + ".log" - s_args = ["--log-file", log, "--images-dir", self.__ddir(), - "--verbosity=4"] + opts - - if self.__cuda_checkpoint: - s_args += [ "--libdir" , os.path.join(os.getcwd(), "..", "plugins", "cuda") ] + s_args = ["-o", log, "-D", self.__ddir(), "-v4"] + opts with open(os.path.join(self.__ddir(), action + '.cropt'), 'w') as f: f.write(' '.join(s_args) + '\n') print("Run criu " + action) - if self.__rootless: - s_args += ["--unprivileged"] - strace = [] if self.__sat: fname = os.path.join(self.__ddir(), action + '.strace') @@ -1276,10 +1135,7 @@ class criu: if action == "restore": preexec = None else: - if os.getuid(): - preexec = None - else: - preexec = self.__user and 
self.set_user_id or None + preexec = self.__user and self.set_user_id or None __ddir = self.__ddir() @@ -1294,10 +1150,8 @@ class criu: with open("/proc/sys/kernel/ns_last_pid") as ns_last_pid_fd: ns_last_pid = ns_last_pid_fd.read() - preload_libfault = self.__preload_libfault and action in ['dump', 'pre-dump', 'restore'] - ret = self.__criu.run(action, s_args, self.__criu_bin, self.__fault, - strace, preexec, preload_libfault, nowait) + strace, preexec, nowait) if nowait: os.close(status_fds[1]) @@ -1337,8 +1191,8 @@ class criu: return rst_succeeded = os.access( os.path.join(__ddir, "restore-succeeded"), os.F_OK) - if (self.__test.blocking() and not self.__criu.exit_signal(ret)) or \ - (self.__sat and action == 'restore' and rst_succeeded): + if self.__test.blocking() or (self.__sat and action == 'restore' and + rst_succeeded): raise test_fail_expected_exc(action) else: raise test_fail_exc("CRIU %s" % action) @@ -1365,10 +1219,8 @@ class criu: stent['pages_written']) if self.__stream: - self.spawn_criu_image_streamer("extract") - ret = self.wait_for_criu_image_streamer() - if ret: - raise test_fail_exc("criu-image-streamer (extract) exited with %s" % ret) + p = self.spawn_criu_image_streamer("extract") + p.wait() real_written = 0 for f in os.listdir(self.__ddir()): @@ -1410,8 +1262,6 @@ class criu: "--progress-fd {progress_fd}", action] - log = open(os.path.join(self.__ddir(), "img-streamer.log"), "w") - # * As we are using a shell pipe command, we want to use pipefail. # Otherwise, failures stay unnoticed. For this, we use bash as sh # doesn't support that feature. 
@@ -1420,9 +1270,7 @@ class criu: progress_fd=progress_w, images_dir=self.__ddir(), img_file=os.path.join(self.__ddir(), STREAMED_IMG_FILE_NAME) - )], stderr=log, close_fds=False) - - log.close() + )], close_fds=False) os.close(progress_w) progress = os.fdopen(progress_r, "r") @@ -1439,22 +1287,14 @@ class criu: raise test_fail_exc( "criu-image-streamer is not starting (exit_code=%d)" % p.wait()) - progress.close() - - self.__img_streamer_process = p - - def wait_for_criu_image_streamer(self): - ret = self.__img_streamer_process.wait() - grep_errors(os.path.join(self.__ddir(), "img-streamer.log")) - self.__img_streamer_process = None - return ret + return p def dump(self, action, opts=[]): self.__iter += 1 os.mkdir(self.__ddir()) os.chmod(self.__ddir(), 0o777) - a_opts = ["--tree", self.__test.getpid()] + a_opts = ["-t", self.__test.getpid()] if self.__prev_dump_iter: a_opts += [ "--prev-images-dir", @@ -1479,7 +1319,7 @@ class criu: a_opts += self.__test.getdopts() if self.__stream: - self.spawn_criu_image_streamer("capture") + streamer_p = self.spawn_criu_image_streamer("capture") a_opts += ["--stream"] if self.__dedup: @@ -1507,9 +1347,9 @@ class criu: opts=a_opts + opts, nowait=nowait) if self.__stream: - ret = self.wait_for_criu_image_streamer() + ret = streamer_p.wait() if ret: - raise test_fail_exc("criu-image-streamer (capture) exited with %d" % ret) + raise test_fail_exc("criu-image-streamer exited with %d" % ret) if self.__mdedup and self.__iter > 1: self.__criu_act("dedup", opts=[]) @@ -1542,7 +1382,7 @@ class criu: r_opts += ['--action-script', os.getcwd() + '/empty-netns-prep.sh'] if self.__stream: - self.spawn_criu_image_streamer("serve") + streamer_p = self.spawn_criu_image_streamer("serve") r_opts += ["--stream"] if self.__dedup: @@ -1574,17 +1414,14 @@ class criu: nowait=True) r_opts += ["--lazy-pages"] - if self.__mntns_compat_mode: - r_opts = ['--mntns-compat-mode'] + r_opts - if self.__leave_stopped: r_opts += ['--leave-stopped'] 
self.__criu_act("restore", opts=r_opts + ["--restore-detached"]) if self.__stream: - ret = self.wait_for_criu_image_streamer() + ret = streamer_p.wait() if ret: - raise test_fail_exc("criu-image-streamer (serve) exited with %d" % ret) + raise test_fail_exc("criu-image-streamer exited with %d" % ret) self.show_stats("restore") @@ -1601,17 +1438,14 @@ class criu: except Exception: return False - args = ["--no-default-config", "-verbosity=0", "--feature", feature] - if opts['rootless']: - args += ["--unprivileged"] - - return criu_cli.run("check", args, opts['criu_bin']) == 0 + return criu_cli.run( + "check", ["--no-default-config", "-v0", "--feature", feature], + opts['criu_bin']) == 0 @staticmethod def available(): if not os.access(opts['criu_bin'], os.X_OK): print("CRIU binary not found at %s" % opts['criu_bin']) - print("Consider building CRIU or using '--criu-bin' option.") sys.exit(1) def kill(self): @@ -1619,23 +1453,19 @@ class criu: self.__lazy_pages_p.terminate() print("criu lazy-pages exited with %s" % self.__lazy_pages_p.wait()) - grep_errors(os.path.join(self.__ddir(), "lazy-pages.log"), err=True) + grep_errors(os.path.join(self.__ddir(), "lazy-pages.log")) self.__lazy_pages_p = None if self.__page_server_p: self.__page_server_p.terminate() print("criu page-server exited with %s" % self.__page_server_p.wait()) - grep_errors(os.path.join(self.__ddir(), "page-server.log"), err=True) + grep_errors(os.path.join(self.__ddir(), "page-server.log")) self.__page_server_p = None if self.__dump_process: self.__dump_process.terminate() print("criu dump exited with %s" % self.__dump_process.wait()) - grep_errors(os.path.join(self.__ddir(), "dump.log"), err=True) + grep_errors(os.path.join(self.__ddir(), "dump.log")) self.__dump_process = None - if self.__img_streamer_process: - self.__img_streamer_process.terminate() - ret = self.wait_for_criu_image_streamer() - print("criu-image-streamer exited with %s" % ret) def try_run_hook(test, args): @@ -1683,8 +1513,8 @@ def 
cr(cr_api, test, opts): iters = iter_parm(opts['iters'], 1) for i in iters[0]: - pre = iter_parm(opts['pre'], 0) - for p in pre[0]: + pres = iter_parm(opts['pre'], 0) + for p in pres[0]: if opts['snaps']: sbs('before snap %d' % p) cr_api.dump("dump", opts=["--leave-running", "--track-mem"]) @@ -1693,7 +1523,7 @@ def cr(cr_api, test, opts): cr_api.dump("pre-dump") try_run_hook(test, ["--post-pre-dump"]) test.pre_dump_notify() - time.sleep(pre[1]) + time.sleep(pres[1]) sbs('before dump') @@ -1778,15 +1608,6 @@ def get_visible_state(test): return files, maps, mounts -def has_vsyscall(maps): - vsyscall = u"ffffffffff600000-ffffffffff601000" - for i in maps: - if vsyscall in i: - return i - - return None - - def check_visible_state(test, state, opts): new = get_visible_state(test) @@ -1802,9 +1623,9 @@ def check_visible_state(test, state, opts): new_maps = new[1][pid] if os.getenv("COMPAT_TEST"): # the vsyscall vma isn't unmapped from x32 processes - entry = has_vsyscall(new_maps) - if entry and has_vsyscall(old_maps) is None: - new_maps.remove(entry) + vsyscall = u"ffffffffff600000-ffffffffff601000 r-xp" + if vsyscall in new_maps and vsyscall not in old_maps: + new_maps.remove(vsyscall) if old_maps != new_maps: print("%s: Old maps lost: %s" % (pid, old_maps - new_maps)) print("%s: New maps appeared: %s" % (pid, new_maps - old_maps)) @@ -2027,7 +1848,7 @@ def do_run_test(tname, tdesc, flavs, opts): if opts['dry_run']: continue flav = flavors[f](opts) - t = tclass(tname, tdesc, flav, fcg, opts['rootless']) + t = tclass(tname, tdesc, flav, fcg) cr_api = criu(opts) try: @@ -2078,6 +1899,8 @@ class Launcher: self.__subs = {} self.__fail = False self.__file_report = None + self.__junit_file = None + self.__junit_test_cases = None self.__failed = [] self.__nr_skip = 0 if self.__max > 1 and self.__total > 1: @@ -2089,14 +1912,22 @@ class Launcher: if opts['report'] and (opts['keep_going'] or self.__total == 1): global TestSuite, TestCase + from junit_xml import TestSuite, 
TestCase now = datetime.datetime.now() att = 0 reportname = os.path.join(report_dir, "criu-testreport.tap") - while os.access(reportname, os.F_OK): + junitreport = os.path.join(report_dir, "criu-testreport.xml") + while os.access(reportname, os.F_OK) or os.access( + junitreport, os.F_OK): reportname = os.path.join(report_dir, "criu-testreport" + ".%d.tap" % att) + junitreport = os.path.join(report_dir, + "criu-testreport" + ".%d.xml" % att) att += 1 + self.__junit_file = open(junitreport, 'a') + self.__junit_test_cases = [] + self.__file_report = open(reportname, 'a') print(u"TAP version 13", file=self.__file_report) print(u"# Hardware architecture: " + arch, file=self.__file_report) @@ -2105,20 +1936,12 @@ class Launcher: file=self.__file_report) print(u"# ", file=self.__file_report) print(u"1.." + str(nr_tests), file=self.__file_report) - self.__taint = self.__read_kernel_tainted() - if int(self.__taint, 0) != 0: - self.__report_kernel_taint("The kernel is tainted: %r" % self.__taint) - - @staticmethod - def __read_kernel_tainted(): with open("/proc/sys/kernel/tainted") as taintfd: - return taintfd.read().strip() - - @staticmethod - def __report_kernel_taint(msg): - print(msg) - if not opts["ignore_taint"] and os.getenv("ZDTM_IGNORE_TAINT") != "1": - raise Exception(msg) + self.__taint = taintfd.read() + if int(self.__taint, 0) != 0: + print("The kernel is tainted: %r" % self.__taint) + if not opts["ignore_taint"] and os.getenv("ZDTM_IGNORE_TAINT") != '1': + raise Exception("The kernel is tainted: %r" % self.__taint) def __show_progress(self, msg): perc = int(self.__nr * 16 / self.__total) @@ -2131,6 +1954,10 @@ class Launcher: self.__runtest += 1 self.__nr_skip += 1 + if self.__junit_test_cases is not None: + tc = TestCase(name) + tc.add_skipped_info(reason) + self.__junit_test_cases.append(tc) if self.__file_report: testline = u"ok %d - %s # SKIP %s" % (self.__runtest, name, reason) print(testline, file=self.__file_report) @@ -2140,29 +1967,13 @@ class 
Launcher: if len(self.__subs) >= self.__max: self.wait() - taint = self.__read_kernel_tainted() + with open("/proc/sys/kernel/tainted") as taintfd: + taint = taintfd.read() if self.__taint != taint: - prev_taint = self.__taint - self.__taint = taint - self.__report_kernel_taint( - "The kernel is tainted: %r (was %r)" % (taint, prev_taint)) + raise Exception("The kernel is tainted: %r (%r)" % + (taint, self.__taint)) - ''' - The option --link-remap allows criu to hardlink open files back to the - file-system on dump (should be removed on restore) and we have a sanity - check in check_visible_state that they were actually removed at least - from the root test directory after restore. - - As zdtm runs all tests from the same cwd (e.g.: test/zdtm/static) in - parallel, hardlinks from one test can mess up with sanity checks of - another test or even one test can by mistake use hardlinks created by - another test which is even worse. - - So let's make all tests using --link-remap option non parallel. 
- ''' - link_remap_excl = '--link-remap' in desc.get('opts', '').split() + desc.get('dopts', '').split() + desc.get('ropts', '').split() - - if test_flag(desc, 'excl') or link_remap_excl: + if test_flag(desc, 'excl'): self.wait_all() self.__nr += 1 @@ -2170,12 +1981,10 @@ class Launcher: nd = ('nocr', 'norst', 'pre', 'iters', 'page_server', 'sibling', 'stop', 'empty_ns', 'fault', 'keep_img', 'report', 'snaps', - 'sat', 'script', 'rpc', 'criu_config', 'lazy_pages', 'join_ns', - 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', + 'sat', 'script', 'rpc', 'lazy_pages', 'join_ns', 'dedup', 'sbs', + 'freezecg', 'user', 'dry_run', 'noauto_dedup', 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'stream', - 'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode', 'mntns_compat_mode', - 'rootless', 'preload_libfault', 'mocked_cuda_checkpoint', - 'pycriu_search_path') + 'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2185,14 +1994,8 @@ class Launcher: logf = None log = None - if opts['rootless'] and os.getuid() == 0: - os.setgid(NON_ROOT_UID) - os.setuid(NON_ROOT_UID) - env = dict(os.environ, CR_CT_TEST_INFO=arg) - if opts['mocked_cuda_checkpoint']: - env['PATH'] = os.path.join(os.getcwd(), "cuda-checkpoint") + ":" + env["PATH"] sub = subprocess.Popen(["./zdtm_ct", "zdtm.py"], - env=env, + env=dict(os.environ, CR_CT_TEST_INFO=arg), stdout=log, stderr=subprocess.STDOUT, close_fds=True) @@ -2203,10 +2006,7 @@ class Launcher: "start": time.time() } - if log: - log.close() - - if test_flag(desc, 'excl') or link_remap_excl: + if test_flag(desc, 'excl'): self.wait() def __wait_one(self, flags): @@ -2229,9 +2029,11 @@ class Launcher: self.__runtest += 1 if pid != 0: sub = self.__subs.pop(pid) - # The following wait() is not useful for our domain logic. 
- # It's useful for taming warnings in subprocess.Popen.__del__() - sub['sub'].wait() + tc = None + if self.__junit_test_cases is not None: + tc = TestCase(sub['name'], + elapsed_sec=time.time() - sub['start']) + self.__junit_test_cases.append(tc) if status != 0: self.__fail = True failed_flavor = decode_flav(os.WEXITSTATUS(status)) @@ -2242,6 +2044,7 @@ class Launcher: with open(sub['log']) as sublog: output = sublog.read() details = {'output': output} + tc.add_error_info(output=output) print(testline, file=self.__file_report) print("%s" % yaml.safe_dump(details, explicit_start=True, @@ -2287,6 +2090,10 @@ class Launcher: if not opts['fault'] and check_core_files(): self.__fail = True if self.__file_report: + ts = TestSuite(opts['title'], self.__junit_test_cases, + os.getenv("NODE_NAME")) + self.__junit_file.write(TestSuite.to_xml_string([ts])) + self.__junit_file.close() self.__file_report.close() if opts['keep_going']: @@ -2322,21 +2129,9 @@ def all_tests(opts): continue files.append(fp) excl = list(map(lambda x: os.path.join(desc['dir'], x), desc['exclude'])) - tlist = list(sorted(filter( + tlist = filter( lambda x: not x.endswith('.checkskip') and not x.endswith('.hook') and - x not in excl, map(lambda x: x.strip(), files)))) - - if opts.get('test_shard_count'): - if opts.get('test_shard_index') is None: - raise KeyError('--test_shard_count > 0 must come with --test_shard_index') - slice_idx = opts['test_shard_index'] - slices = opts['test_shard_count'] - if slice_idx >= slices: - raise IndexError('--test_shard_index not less than --test_shard_count ({} >= {})'.format(slice_idx, slices)) - slist = list(tlist[slice_idx::slices]) - print("We're shard #{} of {}. 
Running {} of {} tests.\n".format(slice_idx, slices, len(slist), len(tlist))) - tlist = slist - + x not in excl, map(lambda x: x.strip(), files)) return tlist @@ -2447,6 +2242,11 @@ def run_tests(opts): return torun = list(torun) + if opts['keep_going'] and len(torun) < 2: + print( + "[WARNING] Option --keep-going is more useful when running multiple tests" + ) + opts['keep_going'] = False if opts['exclude']: excl = re.compile(".*(" + "|".join(opts['exclude']) + ")") @@ -2489,7 +2289,6 @@ def run_tests(opts): "Specify --criu-image-streamer-dir or modify PATH to provide an alternate location") .format(streamer_dir)) - usernsIsSupported = criu.check("userns") launcher = Launcher(opts, len(torun)) try: for t in torun: @@ -2559,7 +2358,7 @@ def run_tests(opts): run_flavs = set(test_flavs) & set(opts_flavs) else: run_flavs = set([test_flavs.pop()]) - if not usernsIsSupported: + if not criu.check("userns"): run_flavs -= set(['uns']) if opts['user']: # FIXME -- probably uns will make sense @@ -2665,19 +2464,19 @@ class group: self.__dump_meta(fname, '.hook') -def group_tests(cli_opts): +def group_tests(opts): excl = None groups = [] pend_groups = [] - maxs = int(cli_opts['max_size']) + maxs = int(opts['max_size']) if not os.access("groups", os.F_OK): os.mkdir("groups") - tlist = all_tests(cli_opts) + tlist = all_tests(opts) random.shuffle(tlist) - if cli_opts['exclude']: - excl = re.compile(".*(" + "|".join(cli_opts['exclude']) + ")") + if opts['exclude']: + excl = re.compile(".*(" + "|".join(opts['exclude']) + ")") print("Compiled exclusion list") for t in tlist: @@ -2699,7 +2498,7 @@ def group_tests(cli_opts): groups += pend_groups nr = 0 - suf = cli_opts['name'] or 'group' + suf = opts['name'] or 'group' for g in groups: if maxs > 1 and g.size() == 1: # Not much point in group test for this @@ -2720,243 +2519,179 @@ def clean_stuff(opts): f.clean() -def set_nr_hugepages(nr): - try: - orig_hugepages = 0 - with open("/proc/sys/vm/nr_hugepages", "r") as f: - 
orig_hugepages = int(f.read()) - with open("/proc/sys/vm/nr_hugepages", "w") as f: - f.write("{}\n".format(nr)) - return orig_hugepages - except PermissionError as err: - # EACCES is expected when running as non-root, otherwise re-raise the exception. - if err.errno != errno.EACCES or os.getuid() == 0: - raise - except OSError as err: - if err.errno != errno.EOPNOTSUPP: - raise +# +# main() starts here +# - return 0 +if 'CR_CT_TEST_INFO' in os.environ: + # Fork here, since we're new pidns init and are supposed to + # collect this namespace's zombies + status = 0 + pid = os.fork() + if pid == 0: + tinfo = eval(os.environ['CR_CT_TEST_INFO']) + do_run_test(tinfo[0], tinfo[1], tinfo[2], tinfo[3]) + else: + while True: + wpid, status = os.wait() + if wpid == pid: + if os.WIFEXITED(status): + status = os.WEXITSTATUS(status) + else: + status = 1 + break + sys.exit(status) -def get_cli_args(): - """ - Parse command-line arguments - """ - p = argparse.ArgumentParser("CRIU test suite") - p.add_argument("--debug", - help="Print what's being executed", - action='store_true') - p.add_argument("--set", help="Which set of tests to use", default='zdtm') +p = argparse.ArgumentParser("CRIU test suite") +p.add_argument("--debug", + help="Print what's being executed", + action='store_true') +p.add_argument("--set", help="Which set of tests to use", default='zdtm') - sp = p.add_subparsers(help="Use --help for list of actions") +sp = p.add_subparsers(help="Use --help for list of actions") - rp = sp.add_parser("run", help="Run test(s)") - rp.set_defaults(action=run_tests) - rp.add_argument("-a", "--all", action='store_true') - rp.add_argument("-t", "--test", help="Test name", action='append') - rp.add_argument("-T", "--tests", help="Regexp") - rp.add_argument("-F", "--from", help="From file") - rp.add_argument("-f", "--flavor", help="Flavor to run") - rp.add_argument("-x", - "--exclude", - help="Exclude tests from --all run", - action='append') +rp = sp.add_parser("run", help="Run 
test(s)") +rp.set_defaults(action=run_tests) +rp.add_argument("-a", "--all", action='store_true') +rp.add_argument("-t", "--test", help="Test name", action='append') +rp.add_argument("-T", "--tests", help="Regexp") +rp.add_argument("-F", "--from", help="From file") +rp.add_argument("-f", "--flavor", help="Flavor to run") +rp.add_argument("-x", + "--exclude", + help="Exclude tests from --all run", + action='append') - rp.add_argument("--sibling", - help="Restore tests as siblings", - action='store_true') - rp.add_argument("--join-ns", - help="Restore tests and join existing namespace", - action='store_true') - rp.add_argument("--empty-ns", - help="Restore tests in empty net namespace", - action='store_true') - rp.add_argument("--pre", help="Do some pre-dumps before dump (n[:pause])") - rp.add_argument("--snaps", - help="Instead of pre-dumps do full dumps", - action='store_true') - rp.add_argument("--dedup", - help="Auto-deduplicate images on iterations", - action='store_true') - rp.add_argument("--noauto-dedup", - help="Manual deduplicate images on iterations", - action='store_true') - rp.add_argument("--nocr", - help="Do not CR anything, just check test works", - action='store_true') - rp.add_argument("--norst", - help="Don't restore tasks, leave them running after dump", - action='store_true') - rp.add_argument("--stop", - help="Check that --leave-stopped option stops ps tree.", - action='store_true') - rp.add_argument("--iters", - help="Do CR cycle several times before check (n[:pause])") - rp.add_argument("--fault", help="Test fault injection") - rp.add_argument( - "--sat", - help="Generate criu strace-s for sat tool (restore is fake, images are kept)", - action='store_true') - rp.add_argument( - "--sbs", - help="Do step-by-step execution, asking user for keypress to continue", - action='store_true') - rp.add_argument("--freezecg", help="Use freeze cgroup (path:state)") - rp.add_argument("--user", help="Run CRIU as regular user", - action='store_true') - 
rp.add_argument( - "--rootless", - help="Run CRIU rootless (uid!=0) (needs CAP_CHECKPOINT_RESTORE)", - action='store_true') - rp.add_argument("--rpc", - help="Run CRIU via RPC rather than CLI", - action='store_true') +rp.add_argument("--sibling", + help="Restore tests as siblings", + action='store_true') +rp.add_argument("--join-ns", + help="Restore tests and join existing namespace", + action='store_true') +rp.add_argument("--empty-ns", + help="Restore tests in empty net namespace", + action='store_true') +rp.add_argument("--pre", help="Do some pre-dumps before dump (n[:pause])") +rp.add_argument("--snaps", + help="Instead of pre-dumps do full dumps", + action='store_true') +rp.add_argument("--dedup", + help="Auto-deduplicate images on iterations", + action='store_true') +rp.add_argument("--noauto-dedup", + help="Manual deduplicate images on iterations", + action='store_true') +rp.add_argument("--nocr", + help="Do not CR anything, just check test works", + action='store_true') +rp.add_argument("--norst", + help="Don't restore tasks, leave them running after dump", + action='store_true') +rp.add_argument("--stop", + help="Check that --leave-stopped option stops ps tree.", + action='store_true') +rp.add_argument("--iters", + help="Do CR cycle several times before check (n[:pause])") +rp.add_argument("--fault", help="Test fault injection") +rp.add_argument( + "--sat", + help="Generate criu strace-s for sat tool (restore is fake, images are kept)", + action='store_true') +rp.add_argument( + "--sbs", + help="Do step-by-step execution, asking user for keypress to continue", + action='store_true') +rp.add_argument("--freezecg", help="Use freeze cgroup (path:state)") +rp.add_argument("--user", help="Run CRIU as regular user", action='store_true') +rp.add_argument("--rpc", + help="Run CRIU via RPC rather than CLI", + action='store_true') - rp.add_argument("--criu-config", - help="Use config file to set CRIU options", - action='store_true') - 
rp.add_argument("--page-server", - help="Use page server dump", - action='store_true') - rp.add_argument("--stream", - help="Use criu-image-streamer", - action='store_true') - rp.add_argument("-p", "--parallel", help="Run test in parallel") - rp.add_argument("--dry-run", - help="Don't run tests, just pretend to", - action='store_true') - rp.add_argument("--script", help="Add script to get notified by criu") - rp.add_argument("-k", - "--keep-img", - help="Whether or not to keep images after test", - choices=['always', 'never', 'failed'], - default='failed') - rp.add_argument("--report", help="Generate summary report in directory") - rp.add_argument("--keep-going", - help="Keep running tests in spite of failures", - action='store_true') - rp.add_argument("--ignore-taint", - help="Don't care about a non-zero kernel taint flag", - action='store_true') - rp.add_argument("--lazy-pages", - help="restore pages on demand", - action='store_true') - rp.add_argument("--lazy-migrate", - help="restore pages on demand", - action='store_true') - rp.add_argument("--remote-lazy-pages", - help="simulate lazy migration", - action='store_true') - rp.add_argument("--tls", help="use TLS for migration", action='store_true') - rp.add_argument("--title", help="A test suite title", default="criu") - rp.add_argument("--show-stats", - help="Show criu statistics", - action='store_true') - rp.add_argument("--criu-bin", - help="Path to criu binary", - default='../criu/criu') - rp.add_argument("--pycriu-search-path", - help=f"Path to search for pycriu module first (e.g., {site.getsitepackages()[0]})", - default=None) - rp.add_argument("--crit-bin", - help="Path to crit binary", - default='../crit/crit') - rp.add_argument("--criu-image-streamer-dir", - help="Directory where the criu-image-streamer binary is located", - default="../../criu-image-streamer") - rp.add_argument("--pre-dump-mode", - help="Use splice or read mode of pre-dumping", - choices=['splice', 'read'], - default='splice') - 
rp.add_argument("--mntns-compat-mode", - help="Use old compat mounts restore engine", - action='store_true') - rp.add_argument("--test-shard-index", type=int, default=None, - help="Select tests for a shard (0-based)") - rp.add_argument("--test-shard-count", type=int, default=0, - help="Specify how many shards are being run (0=sharding disabled; must be the same for all shards)") - rp.add_argument("--preload-libfault", action="store_true", help="Run criu with library preload to simulate special cases") - rp.add_argument("--criu-plugin", - help="Run tests with CRIU plugin", - choices=['amdgpu', 'cuda', 'inventory_test_enabled', 'inventory_test_disabled'], - nargs='+', - default=None) - rp.add_argument("--mocked-cuda-checkpoint", - action="store_true", - help="Run criu with the cuda plugin and the mocked cuda-checkpoint tool") +rp.add_argument("--page-server", + help="Use page server dump", + action='store_true') +rp.add_argument("--stream", + help="Use criu-image-streamer", + action='store_true') +rp.add_argument("-p", "--parallel", help="Run test in parallel") +rp.add_argument("--dry-run", + help="Don't run tests, just pretend to", + action='store_true') +rp.add_argument("--script", help="Add script to get notified by criu") +rp.add_argument("-k", + "--keep-img", + help="Whether or not to keep images after test", + choices=['always', 'never', 'failed'], + default='failed') +rp.add_argument("--report", help="Generate summary report in directory") +rp.add_argument("--keep-going", + help="Keep running tests in spite of failures", + action='store_true') +rp.add_argument("--ignore-taint", + help="Don't care about a non-zero kernel taint flag", + action='store_true') +rp.add_argument("--lazy-pages", + help="restore pages on demand", + action='store_true') +rp.add_argument("--lazy-migrate", + help="restore pages on demand", + action='store_true') +rp.add_argument("--remote-lazy-pages", + help="simulate lazy migration", + action='store_true') +rp.add_argument("--tls", 
help="use TLS for migration", action='store_true') +rp.add_argument("--title", help="A test suite title", default="criu") +rp.add_argument("--show-stats", + help="Show criu statistics", + action='store_true') +rp.add_argument("--criu-bin", + help="Path to criu binary", + default='../criu/criu') +rp.add_argument("--crit-bin", + help="Path to crit binary", + default='../crit/crit') +rp.add_argument("--criu-image-streamer-dir", + help="Directory where the criu-image-streamer binary is located", + default="../../criu-image-streamer") +rp.add_argument("--pre-dump-mode", + help="Use splice or read mode of pre-dumping", + choices=['splice', 'read'], + default='splice') - lp = sp.add_parser("list", help="List tests") - lp.set_defaults(action=list_tests) - lp.add_argument('-i', - '--info', - help="Show more info about tests", - action='store_true') +lp = sp.add_parser("list", help="List tests") +lp.set_defaults(action=list_tests) +lp.add_argument('-i', + '--info', + help="Show more info about tests", + action='store_true') - gp = sp.add_parser("group", help="Generate groups") - gp.set_defaults(action=group_tests) - gp.add_argument("-m", "--max-size", - help="Maximum number of tests in group") - gp.add_argument("-n", "--name", help="Common name for group tests") - gp.add_argument("-x", - "--exclude", - help="Exclude tests from --all run", - action='append') +gp = sp.add_parser("group", help="Generate groups") +gp.set_defaults(action=group_tests) +gp.add_argument("-m", "--max-size", help="Maximum number of tests in group") +gp.add_argument("-n", "--name", help="Common name for group tests") +gp.add_argument("-x", + "--exclude", + help="Exclude tests from --all run", + action='append') - cp = sp.add_parser("clean", help="Clean something") - cp.set_defaults(action=clean_stuff) - cp.add_argument("what", choices=['nsroot']) +cp = sp.add_parser("clean", help="Clean something") +cp.set_defaults(action=clean_stuff) +cp.add_argument("what", choices=['nsroot']) - return 
vars(p.parse_args()) +opts = vars(p.parse_args()) +if opts.get('sat', False): + opts['keep_img'] = 'always' +if opts['debug']: + sys.settrace(traceit) -def waitpid_and_rip_zombies(pid): - """ - Collect this namespace's zombies - """ - while True: - wpid, status = os.wait() - if wpid == pid: - if os.WIFEXITED(status): - return os.WEXITSTATUS(status) - return 1 +if opts['action'] == 'run': + criu.available() +for tst in test_classes.values(): + tst.available() +opts['action'](opts) -def fork_zdtm(): - """ - Fork here, since we're new pidns init and are supposed to - collect this namespace's zombies - """ - if 'CR_CT_TEST_INFO' in os.environ: - status = 0 - pid = os.fork() - if pid == 0: - tinfo = eval(os.environ['CR_CT_TEST_INFO']) - do_run_test(tinfo[0], tinfo[1], tinfo[2], tinfo[3]) - else: - status = waitpid_and_rip_zombies(pid) - sys.exit(status) - - -if __name__ == '__main__': - os.chdir(os.path.dirname(os.path.abspath(__file__))) - signal.signal(signal.SIGALRM, alarm) - fork_zdtm() - opts = get_cli_args() - if opts.get('sat', False): - opts['keep_img'] = 'always' - - if opts['debug']: - sys.settrace(traceit) - - if opts['action'] == run_tests: - criu.available() - for tst in test_classes.values(): - tst.available() - - orig_hugepages = set_nr_hugepages(20) - opts['action'](opts) - set_nr_hugepages(orig_hugepages) - - for tst in test_classes.values(): - tst.cleanup() +for tst in test_classes.values(): + tst.cleanup() diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index c95b4ef6a..69154fdc9 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -23,12 +23,12 @@ ifeq ($(ARCH),arm) ARMV := $(shell echo $(SUBARCH) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') ifeq ($(ARMV),6) - ARCHCFLAGS += -march=armv6 + USERCFLAGS += -march=armv6 else ifeq ($(ARMV),7) - ARCHCFLAGS += -march=armv7-a+fp + USERCFLAGS += -march=armv7-a else ifeq ($(ARMV),8) - # To build aarch32 on armv8 (see criu Makefile) - ARCHCFLAGS += -march=armv7-a + # To build 
aarch32 on armv8 Travis-CI (see criu Makefile) + USERCFLAGS += -march=armv7-a ARMV := 7 endif endif @@ -40,8 +40,8 @@ endif PKG_CONFIG ?= pkg-config CFLAGS += -g -O2 -Wall -Werror -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 CFLAGS += -Wdeclaration-after-statement -Wstrict-prototypes -CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) -CFLAGS += -D_GNU_SOURCE -D_LARGEFILE64_SOURCE +CFLAGS += $(USERCFLAGS) +CFLAGS += -D_GNU_SOURCE CPPFLAGS += -iquote $(LIBDIR)/arch/$(ARCH)/include ifeq ($(strip $(V)),) @@ -66,11 +66,6 @@ endif export PKG_CONFIG_PATH endif -ifeq ($(SHSTK_ENABLE),1) - CFLAGS += -mshstk - LDFLAGS += -Wl,-z,shstk -endif - define pkg-libs $(shell PKG_CONFIG_PATH="$(PKG_CONFIG_PATH)" $(PKG_CONFIG) --libs $(1)) endef @@ -79,17 +74,9 @@ define pkg-cflags $(shell PKG_CONFIG_PATH="$(PKG_CONFIG_PATH)" $(PKG_CONFIG) --cflags $(1)) endef -ifeq ($(GCS_ENABLE),1) - CFLAGS += -mbranch-protection=standard - LDFLAGS += -z experimental-gcs=check - TEST_ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=1:glibc.cpu.aarch64_gcs_policy=2 -else - TEST_ENV = -endif - %.d: %.c $(E) " DEP " $@ - $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -MM -MP $< -o $@ + $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -MM -MP -c $< -o $@ %.o: %.c | %.d $(E) " CC " $@ diff --git a/test/zdtm/__init__.py b/test/zdtm/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/test/zdtm/criu_config.py b/test/zdtm/criu_config.py deleted file mode 100644 index 9fd292747..000000000 --- a/test/zdtm/criu_config.py +++ /dev/null @@ -1,47 +0,0 @@ -import os -import tempfile -import subprocess - - -class criu_config: - @staticmethod - def run(action, - args, - criu_bin, - fault=None, - strace=[], - preexec=None, - preload=False, - nowait=False): - - config_path = tempfile.mktemp(".conf", "criu-%s-" % action) - with open(config_path, "w") as config_fd: - for arg in args: - if arg.startswith("--"): - config_fd.write("\n") - arg = arg.strip("-") - config_fd.write("%s " % arg) - - env = dict( - os.environ, - 
ASAN_OPTIONS="log_path=asan.log:disable_coredump=0:detect_leaks=0" - ) - - if fault: - print("Forcing %s fault" % fault) - env['CRIU_FAULT'] = fault - - cr = subprocess.Popen( - strace + - [criu_bin, action, "--no-default-config", "--config", config_path], - env=env, - close_fds=False, - preexec_fn=preexec - ) - if nowait: - return cr - return cr.wait() - - @staticmethod - def exit_signal(ret): - return ret < 0 diff --git a/test/zdtm/lib/Makefile b/test/zdtm/lib/Makefile index 428d726d6..ceec2b878 100644 --- a/test/zdtm/lib/Makefile +++ b/test/zdtm/lib/Makefile @@ -1,10 +1,10 @@ LIBDIR := . -CFLAGS += $(USERCFLAGS) $(ARCHCFLAGS) +CFLAGS += $(USERCFLAGS) LIB := libzdtmtst.a -LIBSRC := datagen.c msg.c parseargs.c test.c streamutil.c lock.c ns.c tcp.c unix.c fs.c sysctl.c mem.c file.c mountinfo.c +LIBSRC := datagen.c msg.c parseargs.c test.c streamutil.c lock.c ns.c tcp.c unix.c fs.c sysctl.c PKG_CONFIG ?= pkg-config pkg-config-check = $(shell sh -c '$(PKG_CONFIG) $(1) && echo y') @@ -34,4 +34,4 @@ clean: clean-more $(LIB): $(LIBOBJ) $(E) " AR " $@ - $(Q)$(AR) rcs $@ $^ + $(Q)ar rcs $@ $^ diff --git a/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h b/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h deleted file mode 100644 index 1803aaeb4..000000000 --- a/test/zdtm/lib/arch/loongarch64/include/asm/atomic.h +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef __CR_ATOMIC_H__ -#define __CR_ATOMIC_H__ - -typedef uint32_t atomic_t; - -#define atomic_get(v) (*(volatile int *)v) -#define atomic_set(v, i) (*(v) = (i)) - -static inline int __atomic_add(int i, atomic_t *v) -{ - int result; - asm volatile("amadd_db.w %1, %2, %0" : "+ZB"(*v), "=&r"(result) : "r"(i) : "memory"); - return result + i; -} - -static inline void atomic_add(int i, atomic_t *v) -{ - __atomic_add(i, v); -} - -static inline int atomic_add_return(int i, atomic_t *v) -{ - return __atomic_add(i, v); -} - -#define atomic_sub(i, v) atomic_add(-(int)i, v) -#define atomic_sub_return(i, v) atomic_add_return(-(int)i, v) 
-#define atomic_inc(v) atomic_add_return(1, v) -#define atomic_dec(v) atomic_sub_return(1, v) -#define atomic_dec_return(v) atomic_sub_return(1, v) - -static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) -{ - int ret; - asm volatile("1: \n" - " ll.w %0, %1 \n" - " bne %0, %2, 2f \n" - " or $t0, %3, $zero \n" - " sc.w $t0, %1 \n" - " beqz $t0, 1b \n" - "2: \n" - " dbar 0 \n" - : "=&r"(ret), "+ZB"(*ptr) - : "r"(old), "r"(new) - : "t0", "memory"); - return ret; -} - -#endif /* __CR_ATOMIC_H__ */ diff --git a/test/zdtm/lib/arch/riscv64/include/asm/atomic.h b/test/zdtm/lib/arch/riscv64/include/asm/atomic.h deleted file mode 100644 index a4faf1322..000000000 --- a/test/zdtm/lib/arch/riscv64/include/asm/atomic.h +++ /dev/null @@ -1,107 +0,0 @@ -#ifndef __CR_ATOMIC_H__ -#define __CR_ATOMIC_H__ - -typedef uint32_t atomic_t; - -/* Copied from the Linux header arch/riscv/include/asm/barrier.h */ - -#define nop() __asm__ __volatile__("nop") - -#define RISCV_FENCE(p, s) __asm__ __volatile__("fence " #p "," #s : : : "memory") - -/* These barriers need to enforce ordering on both devices or memory. */ -#define mb() RISCV_FENCE(iorw, iorw) -#define rmb() RISCV_FENCE(ir, ir) -#define wmb() RISCV_FENCE(ow, ow) - -/* These barriers do not need to enforce ordering on devices, just memory. 
*/ -#define __smp_mb() RISCV_FENCE(rw, rw) -#define __smp_rmb() RISCV_FENCE(r, r) -#define __smp_wmb() RISCV_FENCE(w, w) - -#define __smp_store_release(p, v) \ - do { \ - compiletime_assert_atomic_type(*p); \ - RISCV_FENCE(rw, w); \ - WRITE_ONCE(*p, v); \ - } while (0) - -#define __smp_load_acquire(p) \ - ({ \ - typeof(*p) ___p1 = READ_ONCE(*p); \ - compiletime_assert_atomic_type(*p); \ - RISCV_FENCE(r, rw); \ - ___p1; \ - }) - -/* Copied from the Linux kernel header arch/riscv/include/asm/atomic.h */ - -static inline int atomic_read(const atomic_t *v) -{ - return (*(volatile int *)v); -} - -static inline void atomic_set(atomic_t *v, int i) -{ - *v = i; -} - -#define atomic_get atomic_read - -static inline int atomic_add_return(int i, atomic_t *v) -{ - int result; - - asm volatile("amoadd.w.aqrl %1, %2, %0" : "+A"(*v), "=r"(result) : "r"(i) : "memory"); - __smp_mb(); - return result + i; -} - -static inline int atomic_sub_return(int i, atomic_t *v) -{ - return atomic_add_return(-i, v); -} - -static inline int atomic_inc(atomic_t *v) -{ - return atomic_add_return(1, v) - 1; -} - -static inline int atomic_add(int val, atomic_t *v) -{ - return atomic_add_return(val, v) - val; -} - -static inline int atomic_dec(atomic_t *v) -{ - return atomic_sub_return(1, v) + 1; -} - -/* true if the result is 0, or false for all other cases. 
*/ -#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0) -#define atomic_dec_return(v) (atomic_sub_return(1, v)) - -#define atomic_inc_return(v) (atomic_add_return(1, v)) - -static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new) -{ - unsigned long tmp; - int oldval; - - __smp_mb(); - - asm volatile("1:\n" - " lr.w %1, %2\n" - " bne %1, %3, 2f\n" - " sc.w %0, %4, %2\n" - " bnez %0, 1b\n" - "2:" - : "=&r"(tmp), "=&r"(oldval), "+A"(*ptr) - : "r"(old), "r"(new) - : "memory"); - - __smp_mb(); - return oldval; -} - -#endif /* __CR_ATOMIC_H__ */ diff --git a/test/zdtm/lib/file.c b/test/zdtm/lib/file.c deleted file mode 100644 index 57d85421d..000000000 --- a/test/zdtm/lib/file.c +++ /dev/null @@ -1,46 +0,0 @@ -#include -#include -#include "zdtmtst.h" - -int write_value(const char *path, const char *value) -{ - int fd, l; - - fd = open(path, O_WRONLY); - if (fd < 0) { - pr_perror("open %s", path); - return -1; - } - - l = write(fd, value, strlen(value)); - if (l < 0) { - pr_perror("failed to write %s to %s", value, path); - close(fd); - return -1; - } - - close(fd); - return 0; -} - -int read_value(const char *path, char *value, int size) -{ - int fd, ret; - - fd = open(path, O_RDONLY); - if (fd < 0) { - pr_perror("open %s", path); - return -1; - } - - ret = read(fd, (void *)value, size); - if (ret < 0) { - pr_perror("read %s", path); - close(fd); - return -1; - } - - value[ret] = '\0'; - close(fd); - return 0; -} diff --git a/test/zdtm/lib/fs.c b/test/zdtm/lib/fs.c index efcc7a1d0..7b8be5f9f 100644 --- a/test/zdtm/lib/fs.c +++ b/test/zdtm/lib/fs.c @@ -54,7 +54,7 @@ mnt_info_t *get_cwd_mnt_info(void) while (fgets(str, sizeof(str), f)) { char *hyphen = strchr(str, '-'); - ret = sscanf(str, "%i %i %u:%u %4095s %4095s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); + ret = sscanf(str, "%i %i %u:%u %s %s", &mnt_id, &parent_mnt_id, &kmaj, &kmin, root, mountpoint); if (ret != 6 || !hyphen) goto err; ret = sscanf(hyphen + 1, " %ms", &fsname); @@ 
-108,7 +108,6 @@ int get_cwd_check_perm(char **result) "Bit 'x' should be set in all path components of " "this directory\n", cwd, getuid(), getgid(), errno, strerror(errno)); - free(cwd); return -1; } diff --git a/test/zdtm/lib/list.h b/test/zdtm/lib/list.h deleted file mode 100644 index 97d0f1e06..000000000 --- a/test/zdtm/lib/list.h +++ /dev/null @@ -1,389 +0,0 @@ -#ifndef __ZDTM_LIST_H__ -#define __ZDTM_LIST_H__ - -/* - * Double linked lists. - */ - -#include -#include "zdtmtst.h" - -#define POISON_POINTER_DELTA 0 -#define LIST_POISON1 ((void *)0x00100100 + POISON_POINTER_DELTA) -#define LIST_POISON2 ((void *)0x00200200 + POISON_POINTER_DELTA) - -struct list_head { - struct list_head *prev, *next; -}; - -#define LIST_HEAD_INIT(name) \ - { \ - &(name), &(name) \ - } -#define LIST_HEAD(name) struct list_head name = LIST_HEAD_INIT(name) - -static inline void INIT_LIST_HEAD(struct list_head *list) -{ - list->next = list; - list->prev = list; -} - -static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) -{ - next->prev = new; - new->next = next; - new->prev = prev; - prev->next = new; -} - -static inline void list_add(struct list_head *new, struct list_head *head) -{ - __list_add(new, head, head->next); -} - -static inline void list_add_tail(struct list_head *new, struct list_head *head) -{ - __list_add(new, head->prev, head); -} - -static inline void __list_del(struct list_head *prev, struct list_head *next) -{ - next->prev = prev; - prev->next = next; -} - -static inline void __list_del_entry(struct list_head *entry) -{ - __list_del(entry->prev, entry->next); -} - -static inline void list_del(struct list_head *entry) -{ - __list_del(entry->prev, entry->next); - entry->next = LIST_POISON1; - entry->prev = LIST_POISON2; -} - -static inline void list_replace(struct list_head *old, struct list_head *new) -{ - new->next = old->next; - new->next->prev = new; - new->prev = old->prev; - new->prev->next = new; -} - -static 
inline void list_replace_init(struct list_head *old, struct list_head *new) -{ - list_replace(old, new); - INIT_LIST_HEAD(old); -} - -static inline void list_del_init(struct list_head *entry) -{ - __list_del_entry(entry); - INIT_LIST_HEAD(entry); -} - -static inline void list_move(struct list_head *list, struct list_head *head) -{ - __list_del_entry(list); - list_add(list, head); -} - -static inline void list_move_tail(struct list_head *list, struct list_head *head) -{ - __list_del_entry(list); - list_add_tail(list, head); -} - -static inline int list_is_last(const struct list_head *list, const struct list_head *head) -{ - return list->next == head; -} - -static inline int list_is_first(const struct list_head *list, const struct list_head *head) -{ - return list->prev == head; -} - -static inline int list_empty(const struct list_head *head) -{ - return head->next == head; -} - -static inline int list_empty_careful(const struct list_head *head) -{ - struct list_head *next = head->next; - return (next == head) && (next == head->prev); -} -static inline void list_rotate_left(struct list_head *head) -{ - struct list_head *first; - - if (!list_empty(head)) { - first = head->next; - list_move_tail(first, head); - } -} - -static inline int list_is_singular(const struct list_head *head) -{ - return !list_empty(head) && (head->next == head->prev); -} - -static inline void __list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) -{ - struct list_head *new_first = entry->next; - list->next = head->next; - list->next->prev = list; - list->prev = entry; - entry->next = list; - head->next = new_first; - new_first->prev = head; -} - -static inline void list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) -{ - if (list_empty(head)) - return; - if (list_is_singular(head) && (head->next != entry && head != entry)) - return; - if (entry == head) - INIT_LIST_HEAD(list); - else - __list_cut_position(list, head, 
entry); -} - -static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) -{ - struct list_head *first = list->next; - struct list_head *last = list->prev; - - first->prev = prev; - prev->next = first; - - last->next = next; - next->prev = last; -} - -static inline void list_splice(const struct list_head *list, struct list_head *head) -{ - if (!list_empty(list)) - __list_splice(list, head, head->next); -} - -static inline void list_splice_tail(struct list_head *list, struct list_head *head) -{ - if (!list_empty(list)) - __list_splice(list, head->prev, head); -} - -static inline void list_splice_init(struct list_head *list, struct list_head *head) -{ - if (!list_empty(list)) { - __list_splice(list, head, head->next); - INIT_LIST_HEAD(list); - } -} - -static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) -{ - if (!list_empty(list)) { - __list_splice(list, head->prev, head); - INIT_LIST_HEAD(list); - } -} - -#define list_entry(ptr, type, member) container_of(ptr, type, member) - -#define list_first_entry(ptr, type, member) list_entry((ptr)->next, type, member) - -#define list_for_each(pos, head) for (pos = (head)->next; pos != (head); pos = pos->next) - -#define list_for_each_prev(pos, head) for (pos = (head)->prev; pos != (head); pos = pos->prev) - -#define list_for_each_safe(pos, n, head) for (pos = (head)->next, n = pos->next; pos != (head); pos = n, n = pos->next) - -#define list_for_each_prev_safe(pos, n, head) \ - for (pos = (head)->prev, n = pos->prev; pos != (head); pos = n, n = pos->prev) - -#define list_for_each_entry(pos, head, member) \ - for (pos = list_entry((head)->next, typeof(*pos), member); &pos->member != (head); \ - pos = list_entry(pos->member.next, typeof(*pos), member)) - -#define list_for_each_entry_reverse(pos, head, member) \ - for (pos = list_entry((head)->prev, typeof(*pos), member); &pos->member != (head); \ - pos = list_entry(pos->member.prev, 
typeof(*pos), member)) - -#define list_prepare_entry(pos, head, member) ((pos) ?: list_entry(head, typeof(*pos), member)) - -#define list_for_each_entry_continue(pos, head, member) \ - for (pos = list_entry(pos->member.next, typeof(*pos), member); &pos->member != (head); \ - pos = list_entry(pos->member.next, typeof(*pos), member)) - -#define list_for_each_entry_continue_reverse(pos, head, member) \ - for (pos = list_entry(pos->member.prev, typeof(*pos), member); &pos->member != (head); \ - pos = list_entry(pos->member.prev, typeof(*pos), member)) - -#define list_for_each_entry_from(pos, head, member) \ - for (; &pos->member != (head); pos = list_entry(pos->member.next, typeof(*pos), member)) - -#define list_for_each_entry_safe(pos, n, head, member) \ - for (pos = list_entry((head)->next, typeof(*pos), member), \ - n = list_entry(pos->member.next, typeof(*pos), member); \ - &pos->member != (head); pos = n, n = list_entry(n->member.next, typeof(*n), member)) - -#define list_for_each_entry_safe_continue(pos, n, head, member) \ - for (pos = list_entry(pos->member.next, typeof(*pos), member), \ - n = list_entry(pos->member.next, typeof(*pos), member); \ - &pos->member != (head); pos = n, n = list_entry(n->member.next, typeof(*n), member)) - -#define list_for_each_entry_safe_from(pos, n, head, member) \ - for (n = list_entry(pos->member.next, typeof(*pos), member); &pos->member != (head); \ - pos = n, n = list_entry(n->member.next, typeof(*n), member)) - -#define list_for_each_entry_safe_reverse(pos, n, head, member) \ - for (pos = list_entry((head)->prev, typeof(*pos), member), \ - n = list_entry(pos->member.prev, typeof(*pos), member); \ - &pos->member != (head); pos = n, n = list_entry(n->member.prev, typeof(*n), member)) - -#define list_safe_reset_next(pos, n, member) n = list_entry(pos->member.next, typeof(*pos), member) - -/* - * Double linked lists with a single pointer list head. 
- */ - -struct hlist_head { - struct hlist_node *first; -}; - -struct hlist_node { - struct hlist_node *next, **pprev; -}; - -#define HLIST_HEAD_INIT \ - { \ - .first = NULL \ - } -#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } -#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) - -static inline void INIT_HLIST_NODE(struct hlist_node *h) -{ - h->next = NULL; - h->pprev = NULL; -} - -static inline int hlist_unhashed(const struct hlist_node *h) -{ - return !h->pprev; -} - -static inline int hlist_empty(const struct hlist_head *h) -{ - return !h->first; -} - -static inline void __hlist_del(struct hlist_node *n) -{ - struct hlist_node *next = n->next; - struct hlist_node **pprev = n->pprev; - *pprev = next; - if (next) - next->pprev = pprev; -} - -static inline void hlist_del(struct hlist_node *n) -{ - __hlist_del(n); - n->next = LIST_POISON1; - n->pprev = LIST_POISON2; -} - -static inline void hlist_del_init(struct hlist_node *n) -{ - if (!hlist_unhashed(n)) { - __hlist_del(n); - INIT_HLIST_NODE(n); - } -} - -static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) -{ - struct hlist_node *first = h->first; - n->next = first; - if (first) - first->pprev = &n->next; - h->first = n; - n->pprev = &h->first; -} - -/* next must be != NULL */ -static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) -{ - n->pprev = next->pprev; - n->next = next; - next->pprev = &n->next; - *(n->pprev) = n; -} - -static inline void hlist_add_after(struct hlist_node *n, struct hlist_node *next) -{ - next->next = n->next; - n->next = next; - next->pprev = &n->next; - - if (next->next) - next->next->pprev = &next->next; -} - -/* after that we'll appear to be on some hlist and hlist_del will work */ -static inline void hlist_add_fake(struct hlist_node *n) -{ - n->pprev = &n->next; -} - -/* - * Move a list from one list head to another. Fixup the pprev - * reference of the first entry if it exists. 
- */ -static inline void hlist_move_list(struct hlist_head *old, struct hlist_head *new) -{ - new->first = old->first; - if (new->first) - new->first->pprev = &new->first; - old->first = NULL; -} - -#define hlist_entry(ptr, type, member) container_of(ptr, type, member) - -#define hlist_for_each(pos, head) for (pos = (head)->first; pos; pos = pos->next) - -#define hlist_for_each_safe(pos, n, head) \ - for (pos = (head)->first; pos && ({ \ - n = pos->next; \ - 1; \ - }); \ - pos = n) - -#define hlist_entry_safe(ptr, type, member) (ptr) ? hlist_entry(ptr, type, member) : NULL - -#define hlist_for_each_entry(pos, head, member) \ - for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member); pos; \ - pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) - -#define hlist_for_each_entry_continue(pos, member) \ - for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member); pos; \ - pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) - -#define hlist_for_each_entry_from(pos, member) \ - for (; pos; pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) - -#define hlist_for_each_entry_safe(pos, n, head, member) \ - for (pos = hlist_entry_safe((head)->first, typeof(*pos), member); pos && ({ \ - n = pos->member.next; \ - 1; \ - }); \ - pos = hlist_entry_safe(n, typeof(*pos), member)) - -#endif /* __ZDTM_LIST_H__ */ diff --git a/test/zdtm/lib/lock.h b/test/zdtm/lib/lock.h index cc5306e06..2b23550be 100644 --- a/test/zdtm/lib/lock.h +++ b/test/zdtm/lib/lock.h @@ -7,7 +7,6 @@ #include #include #include -#include #include "asm/atomic.h" #define BUG_ON(condition) \ diff --git a/test/zdtm/lib/mem.c b/test/zdtm/lib/mem.c deleted file mode 100644 index f612e7a15..000000000 --- a/test/zdtm/lib/mem.c +++ /dev/null @@ -1,32 +0,0 @@ -#include -#include -#include - -#include "zdtmtst.h" - -dev_t get_mapping_dev(void *addr) -{ - char buf[1024]; - FILE *f; - unsigned int major, minor; - int ret; - - f = fopen("/proc/self/maps", 
"r"); - if (f == NULL) { - pr_perror("Failed to open maps file"); - return (dev_t)-1; - } - - while (fgets(buf, sizeof(buf), f)) { - if ((unsigned long)addr == strtoul(buf, NULL, 16)) { - ret = sscanf(buf, "%*x-%*x %*c%*c%*c%*c %*x %x:%x", &major, &minor); - if (ret != 2) { - pr_err("Can't parse /proc/self/maps\n"); - return (dev_t)-1; - } - return makedev(major, minor); - } - } - - return (dev_t)-1; -} diff --git a/test/zdtm/lib/mountinfo.c b/test/zdtm/lib/mountinfo.c deleted file mode 100644 index d6ab67a3f..000000000 --- a/test/zdtm/lib/mountinfo.c +++ /dev/null @@ -1,490 +0,0 @@ -#include -#include - -#include "mountinfo.h" -#include "fs.h" -#include "xmalloc.h" - -/* - * mountinfo contains mangled paths. space, tab and back slash were replaced - * with usual octal escape. This function replaces these symbols back. - */ -static void cure_path(char *path) -{ - int i, len, off = 0; - - if (strchr(path, '\\') == NULL) /* fast path */ - return; - - len = strlen(path); - for (i = 0; i < len; i++) { - if (!strncmp(path + i, "\\040", 4)) { - path[i - off] = ' '; - goto replace; - } else if (!strncmp(path + i, "\\011", 4)) { - path[i - off] = '\t'; - goto replace; - } else if (!strncmp(path + i, "\\134", 4)) { - path[i - off] = '\\'; - goto replace; - } - if (off) - path[i - off] = path[i]; - continue; - replace: - off += 3; - i += 3; - } - path[len - off] = 0; -} - -static struct mountinfo_zdtm *mountinfo_zdtm_alloc(struct mntns_zdtm *mntns) -{ - struct mountinfo_zdtm *new; - - new = xzalloc(sizeof(struct mountinfo_zdtm)); - if (new) - list_add_tail(&new->list, &mntns->mountinfo_list); - return new; -} - -static void mountinfo_zdtm_free(struct mountinfo_zdtm *mountinfo) -{ - list_del(&mountinfo->list); - xfree(mountinfo->mountpoint); - xfree(mountinfo->root); - xfree(mountinfo->fstype); - xfree(mountinfo); -} - -static void mountinfo_zdtm_free_all(struct mntns_zdtm *mntns) -{ - struct mountinfo_zdtm *mountinfo, *tmp; - - list_for_each_entry_safe(mountinfo, tmp, 
&mntns->mountinfo_list, list) - mountinfo_zdtm_free(mountinfo); -} - -#define BUF_SIZE 4096 -char buf[BUF_SIZE]; - -int mntns_parse_mountinfo(struct mntns_zdtm *mntns) -{ - FILE *f; - int ret; - - INIT_LIST_HEAD(&mntns->mountinfo_list); - - f = fopen("/proc/self/mountinfo", "r"); - if (!f) { - pr_perror("Failed to open mountinfo"); - return -1; - } - - while (fgets(buf, BUF_SIZE, f)) { - struct mountinfo_zdtm *new; - unsigned int kmaj, kmin; - char *str, *hyphen, *shared, *master; - int n; - - new = mountinfo_zdtm_alloc(mntns); - if (!new) { - pr_perror("Failed to alloc mountinfo_zdtm"); - goto free; - } - - ret = sscanf(buf, "%i %i %u:%u %ms %ms %*s %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, - &new->root, &new->mountpoint, &n); - if (ret != 6) { - pr_perror("Failed to parse mountinfo line \"%s\"", buf); - goto free; - } - cure_path(new->root); - cure_path(new->mountpoint); - new->s_dev = MKKDEV(kmaj, kmin); - - str = buf + n; - hyphen = strstr(buf, " - "); - if (!hyphen) { - pr_perror("Failed to find \" - \" in mountinfo line \"%s\"", buf); - goto free; - } - *hyphen++ = '\0'; - - shared = strstr(str, "shared:"); - if (shared) - new->shared_id = atoi(shared + 7); - master = strstr(str, "master:"); - if (master) - new->master_id = atoi(master + 7); - - ret = sscanf(hyphen, "- %ms", &new->fstype); - if (ret != 1) { - pr_perror("Failed to parse fstype in mountinfo tail \"%s\"", hyphen); - goto free; - } - } - - fclose(f); - return 0; -free: - mountinfo_zdtm_free_all(mntns); - fclose(f); - return -1; -} - -static struct mountinfo_topology *mountinfo_topology_alloc(struct mntns_zdtm *mntns, struct mountinfo_zdtm *mountinfo) -{ - struct mountinfo_topology *new; - - new = xzalloc(sizeof(struct mountinfo_topology)); - if (new) { - new->mountinfo = mountinfo; - new->topology_id = -1; - INIT_LIST_HEAD(&new->children); - INIT_LIST_HEAD(&new->siblings); - list_add_tail(&new->list, &mntns->topology_list); - INIT_LIST_HEAD(&new->sharing_list); - } - return new; -} - 
-static void mountinfo_topology_free(struct mountinfo_topology *topology) -{ - list_del(&topology->list); - xfree(topology); -} - -static void mountinfo_topology_free_all(struct mntns_zdtm *mntns) -{ - struct mountinfo_topology *topology, *tmp; - - list_for_each_entry_safe(topology, tmp, &mntns->topology_list, list) - mountinfo_topology_free(topology); -} - -static struct mountinfo_topology *mountinfo_topology_lookup_parent(struct mntns_zdtm *mntns, - struct mountinfo_topology *topology) -{ - struct mountinfo_topology *parent; - - list_for_each_entry(parent, &mntns->topology_list, list) { - if (parent->mountinfo->mnt_id == topology->mountinfo->parent_mnt_id) - return parent; - } - - return NULL; -} - -static struct mountinfo_topology *mt_subtree_next(struct mountinfo_topology *mt, struct mountinfo_topology *root) -{ - if (!list_empty(&mt->children)) - return list_entry(mt->children.next, struct mountinfo_topology, siblings); - - while (mt->parent && mt != root) { - if (mt->siblings.next == &mt->parent->children) - mt = mt->parent; - else - return list_entry(mt->siblings.next, struct mountinfo_topology, siblings); - } - - return NULL; -} - -static void __mt_resort_siblings(struct mountinfo_topology *parent) -{ - LIST_HEAD(list); - - while (!list_empty(&parent->children)) { - struct mountinfo_topology *m, *p; - - m = list_first_entry(&parent->children, struct mountinfo_topology, siblings); - list_del(&m->siblings); - - list_for_each_entry(p, &list, siblings) - if (strcmp(p->mountinfo->mountpoint, m->mountinfo->mountpoint) < 0) - break; - - list_add_tail(&m->siblings, &p->siblings); - } - - list_splice(&list, &parent->children); -} - -static void mntns_mt_resort_siblings(struct mntns_zdtm *mntns) -{ - struct mountinfo_topology *mt = mntns->tree; - LIST_HEAD(mtlist); - int i = 0; - - while (1) { - /* Assign topology id to mt in dfs order */ - mt->topology_id = i++; - list_move_tail(&mt->list, &mtlist); - __mt_resort_siblings(mt); - mt = mt_subtree_next(mt, 
mntns->tree); - if (!mt) - break; - } - - /* Update mntns->topology_list in dfs order */ - list_splice(&mtlist, &mntns->topology_list); -} - -static struct sharing_group *sharing_group_find_or_alloc(struct mntns_zdtm *mntns, int shared_id, int master_id, - unsigned int s_dev) -{ - struct sharing_group *sg; - - list_for_each_entry(sg, &mntns->sharing_groups_list, list) { - if ((sg->shared_id == shared_id) && (sg->master_id == master_id)) { - if (sg->s_dev != s_dev) { - pr_err("Sharing/devid inconsistency\n"); - return NULL; - } - return sg; - } - } - - sg = xzalloc(sizeof(struct sharing_group)); - if (!sg) - return NULL; - - sg->shared_id = shared_id; - sg->master_id = master_id; - sg->s_dev = s_dev; - sg->topology_id = -1; - - INIT_LIST_HEAD(&sg->children); - INIT_LIST_HEAD(&sg->siblings); - INIT_LIST_HEAD(&sg->mounts_list); - - list_add_tail(&sg->list, &mntns->sharing_groups_list); - - return sg; -} - -static void sharing_group_free(struct sharing_group *sg) -{ - list_del(&sg->list); - xfree(sg); -} - -static void sharing_group_free_all(struct mntns_zdtm *mntns) -{ - struct sharing_group *sg, *tmp; - - list_for_each_entry_safe(sg, tmp, &mntns->sharing_groups_list, list) - sharing_group_free(sg); -} - -static struct sharing_group *sharing_group_lookup_parent(struct mntns_zdtm *mntns, struct sharing_group *sg) -{ - struct sharing_group *parent; - - list_for_each_entry(parent, &mntns->sharing_groups_list, list) { - if (parent->shared_id == sg->master_id) - return parent; - } - - /* Create "external" sharing */ - parent = sharing_group_find_or_alloc(mntns, sg->master_id, 0, sg->s_dev); - if (parent) - return parent; - - return NULL; -} - -static int mntns_build_tree(struct mntns_zdtm *mntns) -{ - struct mountinfo_topology *topology, *parent, *tree = NULL; - struct mountinfo_zdtm *mountinfo; - struct sharing_group *sg, *sg_parent; - - INIT_LIST_HEAD(&mntns->topology_list); - - /* Prealloc mount tree */ - list_for_each_entry(mountinfo, &mntns->mountinfo_list, list) { - 
topology = mountinfo_topology_alloc(mntns, mountinfo); - if (!topology) - goto err; - } - - /* Build mount tree */ - list_for_each_entry(topology, &mntns->topology_list, list) { - parent = mountinfo_topology_lookup_parent(mntns, topology); - if (!parent) { - if (tree) { - pr_err("Bad mount tree with too roots %d and %d\n", tree->mountinfo->mnt_id, - parent->mountinfo->mnt_id); - goto err; - } - tree = topology; - } else { - topology->parent = parent; - list_add_tail(&topology->siblings, &parent->children); - } - } - mntns->tree = tree; - - /* Sort mounts by mountpoint */ - mntns_mt_resort_siblings(mntns); - - INIT_LIST_HEAD(&mntns->sharing_groups_list); - - /* Prealloc sharing groups */ - list_for_each_entry(topology, &mntns->topology_list, list) { - if (!topology->mountinfo->shared_id && !topology->mountinfo->master_id) - continue; - - /* - * Due to mntns->topology_list is sorted in dfs order - * sharing groups are also sorted the same - */ - sg = sharing_group_find_or_alloc(mntns, topology->mountinfo->shared_id, topology->mountinfo->master_id, - topology->mountinfo->s_dev); - if (!sg) - goto err; - - list_add_tail(&topology->sharing_list, &sg->mounts_list); - topology->sharing = sg; - - /* Set sharing group topology id to minimal topology id of it's mounts */ - if (sg->topology_id == -1 || topology->topology_id < sg->topology_id) - sg->topology_id = topology->topology_id; - } - - /* Build sharing group trees */ - list_for_each_entry(sg, &mntns->sharing_groups_list, list) { - if (sg->master_id) { - sg_parent = sharing_group_lookup_parent(mntns, sg); - sg->parent = sg_parent; - list_add(&sg->siblings, &sg_parent->children); - } - } - - return 0; -err: - mountinfo_topology_free_all(mntns); - sharing_group_free_all(mntns); - return -1; -} - -static int mountinfo_topology_list_compare(struct mntns_zdtm *mntns_a, struct mntns_zdtm *mntns_b) -{ - struct mountinfo_topology *topology_a, *topology_b; - - topology_a = list_first_entry(&mntns_a->topology_list, struct 
mountinfo_topology, list); - topology_b = list_first_entry(&mntns_b->topology_list, struct mountinfo_topology, list); - - while (&topology_a->list != &mntns_a->topology_list && &topology_b->list != &mntns_b->topology_list) { - if (topology_a->topology_id != topology_b->topology_id) { - pr_err("Mounts %d and %d have different topology id %d and %d\n", topology_a->mountinfo->mnt_id, - topology_b->mountinfo->mnt_id, topology_a->topology_id, topology_b->topology_id); - return -1; - } - - if (topology_a->parent && topology_b->parent) { - if (topology_a->parent->topology_id != topology_b->parent->topology_id) { - pr_err("Mounts %d and %d have different parent topology id %d and %d\n", - topology_a->mountinfo->mnt_id, topology_b->mountinfo->mnt_id, - topology_a->parent->topology_id, topology_b->parent->topology_id); - return -1; - } - } else if (topology_a->parent || topology_b->parent) { - pr_err("One of mounts %d and %d has parent and other doesn't\n", topology_a->mountinfo->mnt_id, - topology_b->mountinfo->mnt_id); - return -1; - } - - if (topology_a->sharing && topology_b->sharing) { - if (topology_a->sharing->topology_id != topology_b->sharing->topology_id) { - pr_err("Mounts %d and %d have different sharing topology id %d and %d\n", - topology_a->mountinfo->mnt_id, topology_b->mountinfo->mnt_id, - topology_a->sharing->topology_id, topology_b->sharing->topology_id); - return -1; - } - } else if (topology_a->sharing || topology_b->sharing) { - pr_err("One of mounts %d and %d has sharing and other doesn't\n", topology_a->mountinfo->mnt_id, - topology_b->mountinfo->mnt_id); - return -1; - } - - topology_a = list_entry(topology_a->list.next, struct mountinfo_topology, list); - topology_b = list_entry(topology_b->list.next, struct mountinfo_topology, list); - } - if (&topology_a->list != &mntns_a->topology_list || &topology_b->list != &mntns_b->topology_list) { - pr_err("Mount tree topology length mismatch\n"); - return -1; - } - - return 0; -} - -static int 
sharing_group_list_compare(struct mntns_zdtm *mntns_a, struct mntns_zdtm *mntns_b) -{ - struct sharing_group *sg_a, *sg_b; - - sg_a = list_first_entry(&mntns_a->sharing_groups_list, struct sharing_group, list); - sg_b = list_first_entry(&mntns_b->sharing_groups_list, struct sharing_group, list); - - while (&sg_a->list != &mntns_a->sharing_groups_list && &sg_b->list != &mntns_b->sharing_groups_list) { - if (sg_a->topology_id != sg_b->topology_id) { - pr_err("Sharings (%d,%d) and (%d,%d) have different sharing topology id %d and %d\n", - sg_a->shared_id, sg_a->master_id, sg_b->shared_id, sg_b->master_id, sg_a->topology_id, - sg_b->topology_id); - return -1; - } - - if (sg_a->parent && sg_b->parent) { - if (sg_a->parent->topology_id != sg_b->parent->topology_id) { - pr_err("Sharings (%d,%d) and (%d,%d) have different parent topology id %d and %d\n", - sg_a->shared_id, sg_a->master_id, sg_b->shared_id, sg_b->master_id, - sg_a->parent->topology_id, sg_b->parent->topology_id); - return -1; - } - } else if (sg_a->parent || sg_b->parent) { - pr_err("One of sharings (%d,%d) and (%d,%d) has parent and other doesn't\n", sg_a->shared_id, - sg_a->master_id, sg_b->shared_id, sg_b->master_id); - return -1; - } - - sg_a = list_entry(sg_a->list.next, struct sharing_group, list); - sg_b = list_entry(sg_b->list.next, struct sharing_group, list); - } - - if (&sg_a->list != &mntns_a->sharing_groups_list || &sg_b->list != &mntns_b->sharing_groups_list) { - pr_err("Mount tree sharing topology length mismatch\n"); - return -1; - } - - return 0; -} - -int mntns_compare(struct mntns_zdtm *mntns_a, struct mntns_zdtm *mntns_b) -{ - if (mntns_build_tree(mntns_a)) { - pr_err("Failed to build first mountinfo topology tree\n"); - return -1; - } - - if (mntns_build_tree(mntns_b)) { - pr_err("Failed to build second mountinfo topology tree\n"); - return -1; - } - - if (mountinfo_topology_list_compare(mntns_a, mntns_b)) - return -1; - - if (sharing_group_list_compare(mntns_a, mntns_b)) - return -1; - 
- return 0; -} - -void mntns_free_all(struct mntns_zdtm *mntns) -{ - mountinfo_zdtm_free_all(mntns); - mountinfo_topology_free_all(mntns); - sharing_group_free_all(mntns); -} diff --git a/test/zdtm/lib/mountinfo.h b/test/zdtm/lib/mountinfo.h deleted file mode 100644 index 6d90e2c10..000000000 --- a/test/zdtm/lib/mountinfo.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef __ZDTM_MOUNTINFO__ -#define __ZDTM_MOUNTINFO__ - -#include "list.h" - -struct mountinfo_zdtm { - int mnt_id; - int parent_mnt_id; - char *mountpoint; - char *root; - unsigned int s_dev; - int shared_id; - int master_id; - char *fstype; - - /* list of all mounts */ - struct list_head list; -}; - -struct mntns_zdtm { - struct list_head mountinfo_list; - struct list_head topology_list; - struct mountinfo_topology *tree; - struct list_head sharing_groups_list; -}; - -#define MNTNS_ZDTM_INIT(name) \ - { \ - .mountinfo_list = LIST_HEAD_INIT(name.mountinfo_list), \ - .topology_list = LIST_HEAD_INIT(name.topology_list), \ - .sharing_groups_list = LIST_HEAD_INIT(name.sharing_groups_list), \ - } -#define MNTNS_ZDTM(name) struct mntns_zdtm name = MNTNS_ZDTM_INIT(name) - -struct sharing_group { - int shared_id; - int master_id; - unsigned int s_dev; - - struct sharing_group *parent; - struct list_head children; - struct list_head siblings; - - int topology_id; - - struct list_head mounts_list; - - struct list_head list; -}; - -struct mountinfo_topology { - struct mountinfo_zdtm *mountinfo; - - struct mountinfo_topology *parent; - struct list_head children; - struct list_head siblings; - - int topology_id; - - struct sharing_group *sharing; - struct list_head sharing_list; - - struct list_head list; -}; - -extern int mntns_parse_mountinfo(struct mntns_zdtm *mntns); -extern void mntns_free_all(struct mntns_zdtm *mntns); -extern int mntns_compare(struct mntns_zdtm *mntns_a, struct mntns_zdtm *mntns_b); - -#endif diff --git a/test/zdtm/lib/msg.c b/test/zdtm/lib/msg.c index 9ba1c47a4..1cf92e3e0 100644 --- 
a/test/zdtm/lib/msg.c +++ b/test/zdtm/lib/msg.c @@ -1,5 +1,4 @@ #include -#include #include #include #include @@ -56,7 +55,7 @@ void test_msg(const char *format, ...) off += strftime(buf, sizeof(buf), "%H:%M:%S", tm); } - off += sprintf(buf + off, ".%.3" PRId64 ": ", (int64_t)(tv.tv_usec / 1000)); + off += sprintf(buf + off, ".%.3ld: ", tv.tv_usec / 1000); off += sprintf(buf + off, "%5d: ", getpid()); skip: diff --git a/test/zdtm/lib/ns.c b/test/zdtm/lib/ns.c index 822e09c92..6f6cccc99 100644 --- a/test/zdtm/lib/ns.c +++ b/test/zdtm/lib/ns.c @@ -1,7 +1,6 @@ #include #include #include -#include #include #include #include @@ -28,9 +27,8 @@ extern int pivot_root(const char *new_root, const char *put_old); static int prepare_mntns(void) { int dfd, ret; - char *root, *criu_path, *dev_path, *zdtm_bind; + char *root, *criu_path; char path[PATH_MAX]; - char bind_path[PATH_MAX]; root = getenv("ZDTM_ROOT"); if (!root) { @@ -53,34 +51,6 @@ static int prepare_mntns(void) return -1; } - zdtm_bind = getenv("ZDTM_BIND"); - if (zdtm_bind) { - /* - * Bindmount the directory to itself. - * e.g.: The mnt_ro_root test makes "/" mount readonly, but we - * still want to write logs to /zdtm/static/ so let's make it - * separate writable bind mount. 
- */ - snprintf(bind_path, sizeof(bind_path), "%s/%s", root, zdtm_bind); - if (mount(bind_path, bind_path, NULL, MS_BIND, NULL)) { - fprintf(stderr, "Can't bind-mount ZDTM_BIND: %m\n"); - return -1; - } - } - - dev_path = getenv("ZDTM_DEV"); - if (dev_path) { - snprintf(path, sizeof(path), "%s/dev", root); - if (mount(dev_path, path, NULL, MS_BIND, NULL)) { - pr_perror("Unable to mount %s", path); - return -1; - } - if (mount(NULL, path, NULL, MS_PRIVATE, NULL)) { - pr_perror("Unable to mount %s", path); - return -1; - } - } - criu_path = getenv("ZDTM_CRIU"); if (criu_path) { snprintf(path, sizeof(path), "%s%s", root, criu_path); @@ -248,7 +218,7 @@ static inline int _settime(clockid_t clk_id, time_t offset) if (clk_id == CLOCK_MONOTONIC_COARSE || clk_id == CLOCK_MONOTONIC_RAW) clk_id = CLOCK_MONOTONIC; - len = snprintf(buf, sizeof(buf), "%d %" PRId64 " 0", clk_id, (int64_t)offset); + len = snprintf(buf, sizeof(buf), "%d %ld 0", clk_id, offset); fd = open("/proc/self/timens_offsets", O_WRONLY); if (fd < 0) { diff --git a/test/zdtm/lib/sysctl.c b/test/zdtm/lib/sysctl.c index 3b1ebc168..9583ec3df 100644 --- a/test/zdtm/lib/sysctl.c +++ b/test/zdtm/lib/sysctl.c @@ -3,49 +3,6 @@ #include "zdtmtst.h" #include "sysctl.h" -int sysctl_read_str(const char *name, char *data, size_t size) -{ - int fd, ret; - - fd = open(name, O_RDONLY); - if (fd < 0) { - pr_perror("Can't open %s", name); - return -1; - } - - ret = read(fd, data, size - 1); - if (ret < 0) { - pr_perror("Can't read %s", name); - close(fd); - return -1; - } - data[ret] = '\0'; - close(fd); - - return 0; -} - -int sysctl_write_str(const char *name, char *data) -{ - int fd, ret; - - fd = open(name, O_WRONLY); - if (fd < 0) { - pr_perror("Can't open %s", name); - return -1; - } - - ret = write(fd, data, strlen(data)); - if (ret < 0) { - pr_perror("Can't write %s into %s", data, name); - close(fd); - return -1; - } - close(fd); - - return 0; -} - int sysctl_read_int(const char *name, int *data) { int fd; diff --git 
a/test/zdtm/lib/sysctl.h b/test/zdtm/lib/sysctl.h index d435bd7e9..67129102f 100644 --- a/test/zdtm/lib/sysctl.h +++ b/test/zdtm/lib/sysctl.h @@ -3,7 +3,5 @@ extern int sysctl_read_int(const char *name, int *data); extern int sysctl_write_int(const char *name, int val); -extern int sysctl_read_str(const char *name, char *data, size_t size); -extern int sysctl_write_str(const char *name, char *data); #endif diff --git a/test/zdtm/lib/test.c b/test/zdtm/lib/test.c index 95017e42e..81da81eba 100644 --- a/test/zdtm/lib/test.c +++ b/test/zdtm/lib/test.c @@ -20,11 +20,9 @@ #include "ns.h" futex_t sig_received; -/* clang-format off */ static struct { futex_t stage; -} *test_shared_state; -/* clang-format on */ +} * test_shared_state; enum { TEST_INIT_STAGE = 0, @@ -239,37 +237,34 @@ void test_init(int argc, char **argv) exit(1); } - val = getenv("ZDTM_ROOTLESS"); - if (!val) { - val = getenv("ZDTM_GROUPS"); - if (val) { - char *tok = NULL; - unsigned int size = 0, groups[NGROUPS_MAX]; + val = getenv("ZDTM_GROUPS"); + if (val) { + char *tok = NULL; + unsigned int size = 0, groups[NGROUPS_MAX]; - tok = strtok(val, " "); - while (tok) { - size++; - groups[size - 1] = atoi(tok); - tok = strtok(NULL, " "); - } - - if (setgroups(size, groups)) { - fprintf(stderr, "Can't set groups: %m"); - exit(1); - } + tok = strtok(val, " "); + while (tok) { + size++; + groups[size - 1] = atoi(tok); + tok = strtok(NULL, " "); } - val = getenv("ZDTM_GID"); - if (val && (setgid(atoi(val)) == -1)) { - fprintf(stderr, "Can't set gid: %m"); + if (setgroups(size, groups)) { + fprintf(stderr, "Can't set groups: %m"); exit(1); } + } - val = getenv("ZDTM_UID"); - if (val && (setuid(atoi(val)) == -1)) { - fprintf(stderr, "Can't set gid: %m"); - exit(1); - } + val = getenv("ZDTM_GID"); + if (val && (setgid(atoi(val)) == -1)) { + fprintf(stderr, "Can't set gid: %m"); + exit(1); + } + + val = getenv("ZDTM_UID"); + if (val && (setuid(atoi(val)) == -1)) { + fprintf(stderr, "Can't set gid: %m"); + exit(1); } 
if (prctl(PR_SET_DUMPABLE, 1)) { @@ -406,7 +401,7 @@ pid_t sys_clone_unified(unsigned long flags, void *child_stack, void *parent_tid { #ifdef __x86_64__ return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, child_tid, newtls); -#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__ || __loongarch64 || __riscv) +#elif (__i386__ || __arm__ || __aarch64__ || __powerpc64__ || __mips__) return (pid_t)syscall(__NR_clone, flags, child_stack, parent_tid, newtls, child_tid); #elif __s390x__ return (pid_t)syscall(__NR_clone, child_stack, flags, parent_tid, child_tid, newtls); diff --git a/test/zdtm/lib/unix.c b/test/zdtm/lib/unix.c index 288f1df24..49773dedd 100644 --- a/test/zdtm/lib/unix.c +++ b/test/zdtm/lib/unix.c @@ -5,7 +5,7 @@ int unix_fill_sock_name(struct sockaddr_un *name, char *relFilename) { - cleanup_free char *cwd = NULL; + char *cwd; if (get_cwd_check_perm(&cwd)) { pr_err("failed to get current working directory with valid permissions.\n"); diff --git a/test/zdtm/lib/xmalloc.h b/test/zdtm/lib/xmalloc.h deleted file mode 100644 index 95e0d4043..000000000 --- a/test/zdtm/lib/xmalloc.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef __ZDTM_XMALLOC_H__ -#define __ZDTM_XMALLOC_H__ - -#include -#include - -#ifndef pr_err -#error "Macro pr_err is needed." -#endif - -#define __xalloc(op, size, ...) 
\ - ({ \ - void *___p = op(__VA_ARGS__); \ - if (!___p) \ - pr_err("%s: Can't allocate %li bytes\n", __func__, (long)(size)); \ - ___p; \ - }) - -#define xstrdup(str) __xalloc(strdup, strlen(str) + 1, str) -#define xmalloc(size) __xalloc(malloc, size, size) -#define xzalloc(size) __xalloc(calloc, size, 1, size) -#define xrealloc(p, size) __xalloc(realloc, size, p, size) - -#define xfree(p) free(p) - -#define xrealloc_safe(pptr, size) \ - ({ \ - int __ret = -1; \ - void *new = xrealloc(*pptr, size); \ - if (new) { \ - *pptr = new; \ - __ret = 0; \ - } \ - __ret; \ - }) - -#define xmemdup(ptr, size) \ - ({ \ - void *new = xmalloc(size); \ - if (new) \ - memcpy(new, ptr, size); \ - new; \ - }) - -#define memzero_p(p) memset(p, 0, sizeof(*p)) -#define memzero(p, size) memset(p, 0, size) - -/* - * Helper for allocating trees with single xmalloc. - * This one advances the void *pointer on s bytes and - * returns the previous value. Use like this - * - * m = xmalloc(total_size); - * a = xptr_pull(&m, tree_root_t); - * a->b = xptr_pull(&m, leaf_a_t); - * a->c = xptr_pull(&m, leaf_c_t); - * ... - */ -static inline void *xptr_pull_s(void **m, size_t s) -{ - void *ret = (*m); - (*m) += s; - return ret; -} - -#define xptr_pull(m, type) xptr_pull_s(m, sizeof(type)) - -#endif /* __CR_XMALLOC_H__ */ diff --git a/test/zdtm/lib/zdtmtst.h b/test/zdtm/lib/zdtmtst.h index b0e25702e..c6d77011d 100644 --- a/test/zdtm/lib/zdtmtst.h +++ b/test/zdtm/lib/zdtmtst.h @@ -126,25 +126,11 @@ extern int write_pidfile(int pid); /* message helpers */ extern int test_log_init(const char *outfile, const char *suffix); extern int zdtm_seccomp; -#define pr_err(format, arg...) \ - ({ \ - test_msg("ERR: %s:%d: " format, __FILE__, __LINE__, ##arg); \ - 1; \ - }) - -#define pr_perror(format, arg...) \ - ({ \ - test_msg("ERR: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, \ - strerror(errno)); \ - 1; \ - }) - -#define fail(format, arg...) 
\ - ({ \ - test_msg("FAIL: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, \ - strerror(errno)); \ - 1; \ - }) +#define pr_err(format, arg...) test_msg("ERR: %s:%d: " format, __FILE__, __LINE__, ##arg) +#define pr_perror(format, arg...) \ + test_msg("ERR: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) +#define fail(format, arg...) \ + test_msg("FAIL: %s:%d: " format " (errno = %d (%s))\n", __FILE__, __LINE__, ##arg, errno, strerror(errno)) #define skip(format, arg...) test_msg("SKIP: %s:%d: " format "\n", __FILE__, __LINE__, ##arg) #define pass() test_msg("PASS\n") @@ -178,7 +164,6 @@ extern const char *test_doc; extern int tcp_init_server_with_opts(int family, int *port, struct zdtm_tcp_opts *opts); extern pid_t sys_clone_unified(unsigned long flags, void *child_stack, void *parent_tid, void *child_tid, unsigned long newtls); -extern dev_t get_mapping_dev(void *addr); #define ssprintf(s, fmt, ...) \ ({ \ @@ -190,39 +175,4 @@ extern dev_t get_mapping_dev(void *addr); ___ret; \ }) -#ifndef TEMP_FAILURE_RETRY -#define TEMP_FAILURE_RETRY(expression) \ - (__extension__({ \ - long int __result; \ - do \ - __result = (long int)(expression); \ - while (__result < 0 && errno == EINTR); \ - __result; \ - })) -#endif - -#define cleanup_close __attribute__((cleanup(cleanup_closep))) -#define cleanup_free __attribute__((cleanup(cleanup_freep))) -static inline void cleanup_freep(void *p) -{ - void **pp = (void **)p; - free(*pp); -} - -static inline void cleanup_closep(void *p) -{ - int *pp = (int *)p; - if (*pp >= 0) - TEMP_FAILURE_RETRY(close(*pp)); -} - -extern int write_value(const char *path, const char *value); -extern int read_value(const char *path, char *value, int size); - -#define container_of(ptr, type, member) \ - ({ \ - const typeof(((type *)0)->member) *__mptr = (ptr); \ - (type *)((char *)__mptr - offsetof(type, member)); \ - }) - #endif /* _VIMITESU_H_ */ diff --git a/test/zdtm/static/Makefile 
b/test/zdtm/static/Makefile index e1df2e5fa..c9e6589f0 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -8,7 +8,6 @@ TST_NOFILE := \ sleeping00 \ pid00 \ caps00 \ - caps01 \ wait00 \ zombie00 \ zombie01 \ @@ -24,7 +23,6 @@ TST_NOFILE := \ sse20 \ mprotect00 \ timers \ - timers01 \ timerfd \ unbound_sock \ sched_prio00 \ @@ -37,8 +35,6 @@ TST_NOFILE := \ socket_udp-corked \ socket6_udp \ socket_udp_shutdown \ - socket_icmp \ - socket6_icmp \ sk-freebind \ sk-freebind-false \ socket_udplite \ @@ -57,24 +53,14 @@ TST_NOFILE := \ shm \ shm-mp \ ptrace_sig \ - pidfd_self \ - pidfd_of_thread \ - pidfd_dead \ - pidfd_diffdead \ - pidfd_child \ - pidfd_kill \ - fd_from_pidfd \ pipe00 \ pipe01 \ pipe02 \ pthread00 \ - pthread00-pac \ pthread01 \ pthread02 \ pthread_timers \ pthread_timers_h \ - rseq00 \ - membarrier \ vdso00 \ vdso01 \ vdso02 \ @@ -82,13 +68,9 @@ TST_NOFILE := \ utsname \ pstree \ sockets01 \ - sockets01-seqpacket \ sockets02 \ - sockets02-seqpacket \ sockets_spair \ - sockets_spair_seqpacket \ socket_queues \ - socket_queues_seqpacket \ socket-raw \ socket-tcp \ socket-tcp-listen \ @@ -97,8 +79,7 @@ TST_NOFILE := \ socket-tcp4v6 \ socket-tcp-local \ socket-tcp-reuseport \ - socket-tcp-ipt-nfconntrack \ - socket-tcp-nft-nfconntrack \ + socket-tcp-nfconntrack \ socket-tcp6-local \ socket-tcp4v6-local \ socket-tcpbuf \ @@ -126,7 +107,6 @@ TST_NOFILE := \ socket-tcp4v6-closed \ socket-tcp-close0 \ socket-tcp-close1 \ - socket-tcp-close2 \ socket-dump-tcp-close \ socket-tcp-unconn \ socket-tcp6-unconn \ @@ -136,13 +116,7 @@ TST_NOFILE := \ socket-linger \ sock_opts00 \ sock_opts01 \ - sock_opts02 \ - sock_ip_opts00 \ - sock_ip_opts01 \ - sock_tcp_opts00 \ - sock_tcp_opts01 \ sk-unix-unconn \ - sk-unix-unconn-seqpacket \ ipc_namespace \ selfexe00 \ sem \ @@ -150,9 +124,6 @@ TST_NOFILE := \ maps02 \ maps04 \ maps05 \ - maps09 \ - maps10 \ - maps11 \ mlock_setuid \ xids00 \ groups \ @@ -203,8 +174,6 @@ TST_NOFILE := \ stopped01 \ stopped02 
\ stopped12 \ - stopped03 \ - stopped04 \ rtc \ clean_mntns \ mntns_rw_ro_rw \ @@ -216,11 +185,9 @@ TST_NOFILE := \ scm01 \ scm02 \ scm03 \ - scm03-seqpacket \ scm04 \ scm05 \ scm06 \ - scm09 \ aio00 \ aio01 \ fd \ @@ -232,7 +199,6 @@ TST_NOFILE := \ seccomp_filter_tsync \ seccomp_filter_threads \ seccomp_filter_inheritance \ - seccomp_no_new_privs \ different_creds \ vsx \ bridge \ @@ -261,6 +227,7 @@ TST_NOFILE := \ netns_sub_veth \ netns_sub_sysctl \ unlink_multiple_largefiles \ + config_inotify_irmap \ thp_disable \ pid_file \ selinux00 \ @@ -272,10 +239,7 @@ TST_NOFILE := \ memfd00 \ memfd01 \ memfd02 \ - memfd02-hugetlb \ memfd03 \ - memfd04 \ - memfd05 \ shmemfd \ shmemfd-priv \ time \ @@ -285,24 +249,19 @@ TST_NOFILE := \ sigtrap \ sigtrap01 \ change_mnt_context \ - fd_offset \ # jobctl00 \ PKG_CONFIG ?= pkg-config pkg-config-check = $(shell sh -c '$(PKG_CONFIG) $(1) && echo y') -pkg-config-atleast-version = $(shell sh -c '$(PKG_CONFIG) --atleast-version=$(2) $(1) && echo y') ifeq ($(call pkg-config-check,libbpf),y) TST_NOFILE += \ bpf_hash \ - bpf_array + bpf_array endif ifneq ($(ARCH),arm) ifneq ($(COMPAT_TEST),y) - TST_NOFILE += maps03 -ifeq ($(call pkg-config-atleast-version,libtracefs,1.7),y) - TST_NOFILE += uprobes -endif + TST_NOFILE += maps03 endif endif @@ -319,12 +278,10 @@ TST_FILE = \ write_read02 \ write_read10 \ maps00 \ - maps12 \ link10 \ file_attr \ deleted_unix_sock \ sk-unix-rel \ - sk-unix-rel-seqpacket \ deleted_dev \ unlink_fstat00 \ unlink_fstat01 \ @@ -334,10 +291,6 @@ TST_FILE = \ ghost_holes00 \ ghost_holes01 \ ghost_holes02 \ - ghost_holes_large00 \ - ghost_holes_large01 \ - ghost_multi_hole00 \ - ghost_multi_hole01 \ unlink_largefile \ mtime_mmap \ fifo \ @@ -356,9 +309,7 @@ TST_FILE = \ cow01 \ fdt_shared \ sockets00 \ - sockets00-seqpacket \ sockets03 \ - sockets03-seqpacket \ sockets_dgram \ file_lease00 \ file_lease01 \ @@ -382,12 +333,6 @@ TST_FILE = \ socket_close_data01 \ fifo_upon_unix_socket00 \ fifo_upon_unix_socket01 \ 
- sk-unix-listen01 \ - sk-unix-listen02 \ - sk-unix-listen03 \ - sk-unix-listen04 \ - sk-unix-restore-fs-share \ - mnt_ext_file_bind_auto \ TST_DIR = \ cwd00 \ @@ -418,13 +363,9 @@ TST_DIR = \ cgroup02 \ cgroup03 \ cgroup04 \ - cgroupv2_00 \ - cgroupv2_01 \ cgroup_ifpriomap \ - cgroup_ignore \ cgroup_stray \ cgroup_yard \ - cgroup_threads \ unlink_fstat04 \ unlink_fstat041 \ mntns_remap \ @@ -433,7 +374,6 @@ TST_DIR = \ mntns_ghost \ mntns_ghost01 \ mntns_ro_root \ - mnt_ro_root \ mntns_link_ghost \ mntns_shared_bind \ mntns_shared_bind02 \ @@ -446,24 +386,14 @@ TST_DIR = \ mnt_ext_auto \ mnt_ext_master \ mnt_ext_dev \ - mnt_ext_root \ - mnt_root_ext \ - mnt_ext_collision \ - mntns_pivot_root \ - mntns_pivot_root_ro \ - mnt_ext_sharing \ - mnt_ext_multiple \ - mount_complex_sharing \ mnt_tracefs \ mntns_deleted \ unlink_regular00 \ mnt_enablefs \ autofs \ del_standalone_un \ - del_standalone_un_seqpacket \ sk-unix-mntns \ sk-unix01 \ - sk-unix01-seqpacket \ sk-unix-dgram-ghost \ unsupported_children_collision \ shared_slave_mount_children \ @@ -491,7 +421,6 @@ TST = \ umask00 \ cmdlinenv00 \ shm-unaligned \ - shm-hugetlb \ TST_STATE = \ conntracks \ @@ -509,41 +438,32 @@ STATE_OUT = $(TST_STATE:%=%.out) include ../Makefile.inc -ifeq ($(ARCH),aarch64) - PAC_CFLAGS := -mbranch-protection=standard -else - PAC_CFLAGS := -endif - all: $(TST) criu-rtc.so install: all .PHONY: all install $(TST_NOFILE:%=%.pid): %.pid: % - $(TEST_ENV) $(> .gitignore $(Q)echo $(@:%.c=%.h) >> .gitignore $(E) " PBCC " $@ - $(Q)protoc --proto_path=. --c_out=. criu-rtc.proto + $(Q)protoc-c --proto_path=. --c_out=. 
criu-rtc.proto criu-rtc.so: criu-rtc.c criu-rtc.pb-c.c $(E) " LD " $@ diff --git a/test/zdtm/static/aio01.c b/test/zdtm/static/aio01.c index 100069b03..ed45192b9 100644 --- a/test/zdtm/static/aio01.c +++ b/test/zdtm/static/aio01.c @@ -14,8 +14,8 @@ const char *test_doc = "Check head and tail restore correct"; const char *test_author = "Kirill Tkhai "; struct aio_ring { - unsigned id; /* kernel internal index number */ - unsigned nr; /* number of io_events */ + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ unsigned head; /* Written to by userland or under ring_lock * mutex by aio_read_events_ring(). */ unsigned tail; diff --git a/test/zdtm/static/apparmor.c b/test/zdtm/static/apparmor.c index dc1636821..713ffaa46 100644 --- a/test/zdtm/static/apparmor.c +++ b/test/zdtm/static/apparmor.c @@ -59,7 +59,7 @@ int checkprofile(void) return -1; } - len = fscanf(f, "%1023[^ \n]s", profile); + len = fscanf(f, "%[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/apparmor_stacking.c b/test/zdtm/static/apparmor_stacking.c index 0bc36048c..76de8b8b4 100644 --- a/test/zdtm/static/apparmor_stacking.c +++ b/test/zdtm/static/apparmor_stacking.c @@ -56,7 +56,7 @@ static int checkprofile(pid_t pid, char *expected) return -1; } - len = fscanf(f, "%1023[^ \n]s", profile); + len = fscanf(f, "%[^ \n]s", profile); fclose(f); if (len != 1) { fail("wrong number of items scanned %d", len); diff --git a/test/zdtm/static/auto_dev-ioctl.h b/test/zdtm/static/auto_dev-ioctl.h index 1b35fe2f7..e65259b30 100644 --- a/test/zdtm/static/auto_dev-ioctl.h +++ b/test/zdtm/static/auto_dev-ioctl.h @@ -95,7 +95,7 @@ struct args_ismountpoint { struct autofs_dev_ioctl { __u32 ver_major; __u32 ver_minor; - __u32 size; /* total size of data passed in + __u32 size; /* total size of data passed in * including this struct */ __s32 ioctlfd; /* automount command fd */ diff --git 
a/test/zdtm/static/autofs.c b/test/zdtm/static/autofs.c index ad1795842..2d6078627 100644 --- a/test/zdtm/static/autofs.c +++ b/test/zdtm/static/autofs.c @@ -47,7 +47,6 @@ static char *xvstrcat(char *str, const char *fmt, va_list args) ret = -ENOMEM; new = realloc(str, offset + delta); if (new) { - str = new; va_copy(tmp, args); ret = vsnprintf(new + offset, delta, fmt, tmp); va_end(tmp); @@ -55,6 +54,7 @@ static char *xvstrcat(char *str, const char *fmt, va_list args) /* NOTE: vsnprintf returns the amount of bytes * * to allocate. */ delta = ret + 1; + str = new; ret = 0; } } @@ -266,7 +266,6 @@ static int check_automount(struct autofs_params *p) return err; free(mountpoint); - mountpoint = NULL; err = p->setup(p); if (err) { @@ -275,7 +274,7 @@ static int check_automount(struct autofs_params *p) } if (close(p->fd)) { - pr_perror("mountpoint failed to close fd %d", p->fd); + pr_perror("%s: failed to close fd %d", mountpoint, p->fd); return -errno; } diff --git a/test/zdtm/static/bpf_array.c b/test/zdtm/static/bpf_array.c index febe3e190..ec74a3e3b 100644 --- a/test/zdtm/static/bpf_array.c +++ b/test/zdtm/static/bpf_array.c @@ -1,14 +1,10 @@ +#include #include #include #include "zdtmtst.h" #include "bpfmap_zdtm.h" -#ifndef LIBBPF_OPTS -#define LIBBPF_OPTS DECLARE_LIBBPF_OPTS -#define LEGACY_LIBBPF /* Using libbpf < 0.7 */ -#endif - const char *test_doc = "Check that data and meta-data for BPF_MAP_TYPE_ARRAY" "is correctly restored"; const char *test_author = "Abhishek Vijeev "; @@ -16,7 +12,7 @@ const char *test_author = "Abhishek Vijeev "; static int map_batch_update(int map_fd, uint32_t max_entries, int *keys, int *values) { int i, ret; - LIBBPF_OPTS(bpf_map_batch_opts, opts); + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, .elem_flags = 0, .flags = 0, ); for (i = 0; i < max_entries; i++) { keys[i] = i; @@ -65,8 +61,6 @@ int main(int argc, char **argv) struct bpfmap_fdinfo_obj old_fdinfo = {}; struct bpfmap_fdinfo_obj new_fdinfo = {}; uint32_t info_len = 
sizeof(struct bpf_map_info); - -#ifdef LEGACY_LIBBPF struct bpf_create_map_attr xattr = { .name = "array_test_map", .map_type = BPF_MAP_TYPE_ARRAY, @@ -75,10 +69,7 @@ int main(int argc, char **argv) .max_entries = max_entries, .map_flags = BPF_F_NUMA_NODE, }; -#else - LIBBPF_OPTS(bpf_map_create_opts, bpf_mapfd_opts, .map_flags = BPF_F_NUMA_NODE); -#endif - LIBBPF_OPTS(bpf_map_batch_opts, opts); + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, .elem_flags = 0, .flags = 0, ); keys = mmap(NULL, max_entries * sizeof(int), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); values = mmap(NULL, max_entries * sizeof(int), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); @@ -91,13 +82,7 @@ int main(int argc, char **argv) test_init(argc, argv); -#ifdef LEGACY_LIBBPF map_fd = bpf_create_map_xattr(&xattr); -#else - map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "array_test_map", sizeof(int), sizeof(int), max_entries, - &bpf_mapfd_opts); -#endif - if (map_fd == -1) { pr_perror("Can't create BPF map"); goto err; diff --git a/test/zdtm/static/bpf_hash.c b/test/zdtm/static/bpf_hash.c index 296a87325..612f69665 100644 --- a/test/zdtm/static/bpf_hash.c +++ b/test/zdtm/static/bpf_hash.c @@ -1,14 +1,10 @@ -#include +#include #include +#include #include "zdtmtst.h" #include "bpfmap_zdtm.h" -#ifndef LIBBPF_OPTS -#define LIBBPF_OPTS DECLARE_LIBBPF_OPTS -#define LEGACY_LIBBPF /* Using libbpf < 0.7 */ -#endif - const char *test_doc = "Check that data and meta-data for BPF_MAP_TYPE_HASH" "is correctly restored"; const char *test_author = "Abhishek Vijeev "; @@ -16,7 +12,7 @@ const char *test_author = "Abhishek Vijeev "; static int map_batch_update(int map_fd, uint32_t max_entries, int *keys, int *values) { int ret; - LIBBPF_OPTS(bpf_map_batch_opts, opts); + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, .elem_flags = 0, .flags = 0, ); for (int i = 0; i < max_entries; i++) { keys[i] = i + 1; @@ -63,8 +59,6 @@ int main(int argc, char **argv) struct bpfmap_fdinfo_obj old_fdinfo = 
{}; struct bpfmap_fdinfo_obj new_fdinfo = {}; uint32_t info_len = sizeof(struct bpf_map_info); - -#ifdef LEGACY_LIBBPF struct bpf_create_map_attr xattr = { .name = "hash_test_map", .map_type = BPF_MAP_TYPE_HASH, @@ -73,10 +67,7 @@ int main(int argc, char **argv) .max_entries = max_entries, .map_flags = BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE, }; -#else - LIBBPF_OPTS(bpf_map_create_opts, bpf_mapfd_opts, .map_flags = BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE); -#endif - LIBBPF_OPTS(bpf_map_batch_opts, opts); + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, .elem_flags = 0, .flags = 0, ); keys = mmap(NULL, max_entries * sizeof(int), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); values = mmap(NULL, max_entries * sizeof(int), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0, 0); @@ -89,12 +80,7 @@ int main(int argc, char **argv) test_init(argc, argv); -#ifdef LEGACY_LIBBPF map_fd = bpf_create_map_xattr(&xattr); -#else - map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, "hash_test_map", sizeof(int), sizeof(int), max_entries, - &bpf_mapfd_opts); -#endif if (!map_fd) { pr_perror("Can't create BPF map"); goto err; diff --git a/test/zdtm/static/caps01.c b/test/zdtm/static/caps01.c deleted file mode 100644 index 0f8a7101e..000000000 --- a/test/zdtm/static/caps01.c +++ /dev/null @@ -1,168 +0,0 @@ -#include -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Check that CapAmb are preserved"; -const char *test_author = "Liu Chao "; - -struct cap_hdr { - unsigned int version; - int pid; -}; - -struct cap_data { - unsigned int eff; - unsigned int prm; - unsigned int inh; -}; - -#define _LINUX_CAPABILITY_VERSION_3 0x20080522 -#define _LINUX_CAPABILITY_U32S_3 2 -#define CAP_DAC_OVERRIDE 1 -#define PR_CAP_AMBIENT 47 -#define PR_CAP_AMBIENT_IS_SET 1 -#define PR_CAP_AMBIENT_RAISE 2 -#define PR_CAP_AMBIENT_LOWER 3 - -int capget(struct cap_hdr *hdrp, struct cap_data *datap); -int capset(struct cap_hdr *hdrp, const struct cap_data *datap); - -static 
int cap_last_cap = 63; - -int main(int argc, char **argv) -{ - task_waiter_t t; - int pid, result_pipe[2]; - unsigned int amb[_LINUX_CAPABILITY_U32S_3]; - unsigned int amb_2[_LINUX_CAPABILITY_U32S_3]; - char res = 'x'; - FILE *f; - - test_init(argc, argv); - task_waiter_init(&t); - - f = fopen("/proc/sys/kernel/cap_last_cap", "r"); - if (f) { - if (fscanf(f, "%d", &cap_last_cap) != 1) { - pr_perror("Unable to read cal_last_cap"); - fclose(f); - return 1; - } - fclose(f); - } else - test_msg("/proc/sys/kernel/cap_last_cap is not available\n"); - - if (pipe(result_pipe)) { - pr_perror("Can't create pipe"); - return 1; - } - - pid = test_fork(); - if (pid == 0) { - int b, i, ret; - struct cap_hdr hdr; - struct cap_data data[_LINUX_CAPABILITY_U32S_3]; - - hdr.version = _LINUX_CAPABILITY_VERSION_3; - hdr.pid = 0; - - if (capget(&hdr, data) < 0) { - pr_perror("capget"); - return -1; - } - - hdr.version = _LINUX_CAPABILITY_VERSION_3; - hdr.pid = 0; - - data[0].eff &= ~((1 << CAP_CHOWN) | (1 << CAP_DAC_OVERRIDE)); - data[0].prm &= ~(1 << CAP_DAC_OVERRIDE); - data[0].inh = data[0].prm; - data[1].inh = data[1].prm; - - if (capset(&hdr, data) < 0) { - pr_perror("capset"); - return -1; - } - - for (b = 0; b < _LINUX_CAPABILITY_U32S_3; b++) { - amb[b] = data[b].prm; - for (i = 0; i < 32; i++) { - if (b * 32 + i > cap_last_cap) - break; - if ((amb[b] & (1 << i)) > 0) - ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i + b * 32, 0, 0); - else - ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_LOWER, i + b * 32, 0, 0); - if (ret) { - pr_perror("Unable to set ambient capability %d to %d: %d", i + b * 32, amb[b] & (1 << i), ret); - return -1; - } - } - } - - task_waiter_complete_current(&t); - task_waiter_wait4(&t, getppid()); - - for (b = 0; b < _LINUX_CAPABILITY_U32S_3; b++) { - amb_2[b] = 0; - for (i = 0; i < 32; i++) { - if (b * 32 + i > cap_last_cap) - break; - ret = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, i + b * 32, 0, 0); - if (ret < 0) { - pr_perror("Unable to read ambient 
capability %d: %d", i + b * 32, ret); - goto bad; - } - - amb_2[b] |= (ret << i); - } - } - - for (b = 0; b < _LINUX_CAPABILITY_U32S_3; b++) { - if (amb[b] != amb_2[b]) { - res = '1'; - goto bad; - } - } - - res = '0'; - bad: - write(result_pipe[1], &res, 1); - - if (res != '0') { - write(result_pipe[1], amb, sizeof(amb)); - write(result_pipe[1], amb_2, sizeof(amb_2)); - } - - close(result_pipe[0]); - close(result_pipe[1]); - _exit(0); - } - - task_waiter_wait4(&t, pid); - - test_daemon(); - test_waitsig(); - - task_waiter_complete_current(&t); - - read(result_pipe[0], &res, 1); - - if (res == '0') - pass(); - else { - read(result_pipe[0], amb, sizeof(amb)); - read(result_pipe[0], amb_2, sizeof(amb_2)); - test_msg("amb[]=%08x, %08x\n", amb[0], amb[1]); - test_msg("amb[]=%08x, %08x\n", amb_2[0], amb_2[1]); - fail("Fail: %c", res); - } - close(result_pipe[0]); - close(result_pipe[1]); - - return 0; -} diff --git a/test/zdtm/static/caps01.desc b/test/zdtm/static/caps01.desc deleted file mode 100644 index 2eac7e654..000000000 --- a/test/zdtm/static/caps01.desc +++ /dev/null @@ -1 +0,0 @@ -{'flags': 'suid'} diff --git a/test/zdtm/static/cgroup00.desc b/test/zdtm/static/cgroup00.desc index 42a3f2b73..3c6c4a7e2 100644 --- a/test/zdtm/static/cgroup00.desc +++ b/test/zdtm/static/cgroup00.desc @@ -1 +1 @@ -{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups'} +{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroup01.c b/test/zdtm/static/cgroup01.c index 7bfb67762..bc8515264 100644 --- a/test/zdtm/static/cgroup01.c +++ b/test/zdtm/static/cgroup01.c @@ -79,7 +79,7 @@ int main(int argc, char **argv) if (!s) continue; - sscanf(paux, "%*d %*d %*d:%*d %*s %1023s", aux); + sscanf(paux, "%*d %*d %*d:%*d %*s %s", aux); test_msg("found cgroup at %s\n", aux); for (i = 0; i < 2; i++) { diff --git a/test/zdtm/static/cgroup01.desc b/test/zdtm/static/cgroup01.desc index 42a3f2b73..3c6c4a7e2 100644 --- a/test/zdtm/static/cgroup01.desc 
+++ b/test/zdtm/static/cgroup01.desc @@ -1 +1 @@ -{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups'} +{'flavor': 'h', 'flags': 'suid', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroup02.c b/test/zdtm/static/cgroup02.c index 8a925c0a4..6229a8a08 100644 --- a/test/zdtm/static/cgroup02.c +++ b/test/zdtm/static/cgroup02.c @@ -75,7 +75,7 @@ bool test_exists(char *mountinfo_line, char *path) char aux[1024], paux[1024]; struct stat st; - sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %1023s", aux); + sscanf(mountinfo_line, "%*d %*d %*d:%*d %*s %s", aux); test_msg("found cgroup at %s\n", aux); ssprintf(paux, "%s/%s", aux, path); diff --git a/test/zdtm/static/cgroup02.desc b/test/zdtm/static/cgroup02.desc index eb5a9dd37..df17a5789 100644 --- a/test/zdtm/static/cgroup02.desc +++ b/test/zdtm/static/cgroup02.desc @@ -1,4 +1,4 @@ { 'dopts': '--manage-cgroups --cgroup-root name=zdtmtst:/prefix', - 'flags': 'suid excl', + 'flags': 'suid', 'flavor': 'h', 'ropts': '--manage-cgroups --cgroup-root /newroot --cgroup-root name=zdtmtst:/prefix'} diff --git a/test/zdtm/static/cgroup04.c b/test/zdtm/static/cgroup04.c index f586a0628..5a424be12 100644 --- a/test/zdtm/static/cgroup04.c +++ b/test/zdtm/static/cgroup04.c @@ -17,25 +17,45 @@ const char *test_author = "Tycho Andersen "; char *dirname; TEST_OPTION(dirname, string, "cgroup directory name", 1); -static const char *const cgname = "zdtmtst"; +static const char *cgname = "zdtmtst"; + +int write_value(const char *path, const char *value) +{ + int fd, l; + + fd = open(path, O_WRONLY); + if (fd < 0) { + pr_perror("open %s", path); + return -1; + } + + l = write(fd, value, strlen(value)); + close(fd); + if (l < 0) { + pr_perror("failed to write %s to %s", value, path); + return -1; + } + + return 0; +} int mount_and_add(const char *controller, const char *path, const char *prop, const char *value) { char aux[1024], paux[1024], subdir[1024]; if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { - pr_perror("Can't 
make dir %s", dirname); + pr_perror("Can't make dir"); return -1; } sprintf(subdir, "%s/%s", dirname, controller); if (mkdir(subdir, 0700) < 0) { - pr_perror("Can't make dir %s", subdir); + pr_perror("Can't make dir"); return -1; } if (mount("none", subdir, "cgroup", 0, controller)) { - pr_perror("Can't mount cgroup controller %s at %s", controller, subdir); + pr_perror("Can't mount cgroups"); goto err_rd; } @@ -52,8 +72,7 @@ int mount_and_add(const char *controller, const char *path, const char *prop, co goto err_rs; ssprintf(paux, "%s/%s/special_prop_check", subdir, path); - if (mkdir(paux, 0600) < 0) - pr_perror("Can't make dir %s", paux); + mkdir(paux, 0600); return 0; err_rs: @@ -75,11 +94,11 @@ bool checkval(char *path, char *val) } n = read(fd, buf, sizeof(buf) - 1); - if (n < 0) - pr_perror("read %s", path); close(fd); - if (n < 0) + if (n < 0) { + pr_perror("read"); return false; + } buf[n] = 0; if (strcmp(val, buf)) { @@ -96,7 +115,7 @@ int main(int argc, char **argv) char buf[1024], path[PATH_MAX]; struct stat sb; - const char *const dev_allow[] = { + char *dev_allow[] = { "c *:* m", "b *:* m", "c 1:3 rwm", "c 1:5 rwm", "c 1:7 rwm", "c 5:0 rwm", "c 5:2 rwm", "c 1:8 rwm", "c 1:9 rwm", "c 136:* rwm", "c 10:229 rwm", }; @@ -127,14 +146,12 @@ int main(int argc, char **argv) sprintf(path, "%s/devices/%s/devices.list", dirname, cgname); if (!checkval(path, buf)) { - errno = 0; fail(); goto out; } sprintf(path, "%s/memory/%s/memory.limit_in_bytes", dirname, cgname); if (!checkval(path, "268435456\n")) { - errno = 0; fail(); goto out; } @@ -146,7 +163,6 @@ int main(int argc, char **argv) } if (!S_ISDIR(sb.st_mode)) { - errno = 0; fail("special_prop_check not a directory?"); goto out; } diff --git a/test/zdtm/static/cgroup04.checkskip b/test/zdtm/static/cgroup04.checkskip index 1ccbada4d..205f8fc53 100755 --- a/test/zdtm/static/cgroup04.checkskip +++ b/test/zdtm/static/cgroup04.checkskip @@ -1,20 +1,3 @@ #!/bin/bash -set -e -test ! 
-f /sys/fs/cgroup/cgroup.controllers - -for ctl in devices memory; do - # Check that the controller is available. - - grep -q "^${ctl}\\s" /proc/cgroups - - # Check that the controller is not co-mounted with any other. - - # /proc/self/cgroup may have: - # "1:devices:/sys" - if ! grep -q "^[0-9]*:${ctl}:" /proc/self/cgroup; then - # but not eg: - # "1:devices,job:/sys" - grep -qE "^[0-9]*:([^:]*,)?${ctl}(,[^:]*)?:" /proc/self/cgroup && exit 1 - fi -done +! test -f /sys/fs/cgroup/cgroup.controllers diff --git a/test/zdtm/static/cgroup_ifpriomap.checkskip b/test/zdtm/static/cgroup_ifpriomap.checkskip index f401ad1b2..205f8fc53 100755 --- a/test/zdtm/static/cgroup_ifpriomap.checkskip +++ b/test/zdtm/static/cgroup_ifpriomap.checkskip @@ -1,6 +1,3 @@ #!/bin/bash -set -e -test ! -f /sys/fs/cgroup/cgroup.controllers - -grep -q '^net_prio\s' /proc/cgroups +! test -f /sys/fs/cgroup/cgroup.controllers diff --git a/test/zdtm/static/cgroup_ignore.c b/test/zdtm/static/cgroup_ignore.c deleted file mode 100644 index ca2b30316..000000000 --- a/test/zdtm/static/cgroup_ignore.c +++ /dev/null @@ -1,161 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include "zdtmtst.h" - -const char *test_doc = "Check that cgroups are correctly ignored"; -const char *test_author = "Adrian Reber "; - -char *dirname; -TEST_OPTION(dirname, string, "cgroup directory name", 1); -static const char *cgname = "zdtmtst"; - -static size_t read_all(int fd, char *buf, size_t size) -{ - ssize_t r = 0, ret; - - while (r < size) { - ret = read(fd, buf + r, size - r); - if (ret < 0) { - pr_perror("Read failed"); - return -1; - } else if (ret == 0) { - return 0; - } - r += ret; - } - - return 0; -} - -int main(int argc, char **argv) -{ - cleanup_free char *cgroup_procs = NULL; - cleanup_close int cgroup_procs_fd = -1; - cleanup_free char *destination = NULL; - cleanup_free char *buffer_old = NULL; - cleanup_free char *buffer_new = NULL; - cleanup_close int fd = -1; - int ret = 1; - - 
test_init(argc, argv); - - buffer_old = malloc(PAGE_SIZE); - if (!buffer_old) { - pr_err("Could not allocate memory\n"); - return 1; - } - memset(buffer_old, 0, PAGE_SIZE); - buffer_new = malloc(PAGE_SIZE); - if (!buffer_new) { - pr_err("Could not allocate memory\n"); - return 1; - } - memset(buffer_new, 0, PAGE_SIZE); - - // Read /proc/self/cgroup to later compare against it - fd = open("/proc/self/cgroup", O_RDONLY); - if (fd < 0) { - pr_err("Could not open /proc/self/cgroup\n"); - return 1; - } - - if (read_all(fd, buffer_old, PAGE_SIZE)) { - pr_err("Could not read data from /proc/self/cgroup\n"); - return 1; - } - - // Create the cgroup root directory - if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { - pr_err("Cannot make directory %s\n", dirname); - return 1; - } - - // Mount cgroup2, skip if cgroup2 is not available - if (mount("none", dirname, "cgroup2", 0, 0)) { - if (errno == ENODEV) { - skip("Test relies on cgroup2 semantics which this system does not support. Skipping"); - test_daemon(); - test_waitsig(); - pass(); - return 0; - } else { - pr_err("Could not mount cgroup2 at %s\n", dirname); - } - return 1; - } - - // Create the cgroup cgname (if it does not already exist) - if (asprintf(&destination, "%s/%s", dirname, cgname) == -1) { - pr_err("Could not allocate memory\n"); - goto err; - } - if (mkdir(destination, 0700) < 0 && errno != EEXIST) { - pr_err("Failed to create temporary cgroup directory %s\n", destination); - goto err; - } - - // Move this process to the newly created cgroup - if (asprintf(&cgroup_procs, "%s/cgroup.procs", destination) == -1) { - pr_err("Could not allocate memory\n"); - goto err; - } - cgroup_procs_fd = open(cgroup_procs, O_RDWR); - if (cgroup_procs_fd < 0) { - pr_err("Could not open %s\n", cgroup_procs); - goto err; - } - if (write(cgroup_procs_fd, "0", 1) != 1) { - pr_err("Writing to %s failed\n", cgroup_procs); - goto err; - } - - // Read /proc/self/cgroup (should have changed) - lseek(fd, 0, SEEK_SET); - if 
(read_all(fd, buffer_new, PAGE_SIZE)) { - pr_err("Could not read data from /proc/self/cgroup\n"); - goto err; - } - - // Test if /proc/self/cgroup has changed - if (!memcmp(buffer_new, buffer_old, PAGE_SIZE)) { - fail("/proc/self/cgroup should differ after move to another cgroup"); - pr_err("original /proc/self/cgroup content %s\n", buffer_old); - pr_err("new /proc/self/cgroup content %s\n", buffer_new); - goto err; - } - - test_daemon(); - test_waitsig(); - - // Read /proc/self/cgroup. It should not be the same as after - // moving this process to another cgroup because of restore - // with '--manage-cgroups=ignore'. The process should be - // now in cgroup of the current session. - lseek(fd, 0, SEEK_SET); - memset(buffer_old, 0, PAGE_SIZE); - if (read_all(fd, buffer_old, PAGE_SIZE)) { - pr_err("Could not read data from /proc/self/cgroup\n"); - goto err; - } - - // Test if /proc/self/cgroup has changed again - if (!memcmp(buffer_new, buffer_old, PAGE_SIZE)) { - fail("/proc/self/cgroup should differ after restore"); - pr_err("original /proc/self/cgroup content %s\n", buffer_new); - pr_err("new /proc/self/cgroup content %s\n", buffer_old); - goto err; - } - - ret = 0; - pass(); -err: - rmdir(destination); - umount(dirname); - - return ret; -} diff --git a/test/zdtm/static/cgroup_ignore.checkskip b/test/zdtm/static/cgroup_ignore.checkskip deleted file mode 100755 index 7b1d7ced1..000000000 --- a/test/zdtm/static/cgroup_ignore.checkskip +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -dest=$(mktemp -d cg_ignore.XXXXXX) - -trap 'rmdir "$dest"' EXIT - -if ! 
mount -t cgroup2 none "$dest"; then - exit 1 -fi - -umount "$dest" - -exit 0 diff --git a/test/zdtm/static/cgroup_ignore.desc b/test/zdtm/static/cgroup_ignore.desc deleted file mode 100644 index 3fa9ceb76..000000000 --- a/test/zdtm/static/cgroup_ignore.desc +++ /dev/null @@ -1 +0,0 @@ -{'flavor': 'h', 'flags': 'suid excl', 'ropts': '--manage-cgroups=ignore'} diff --git a/test/zdtm/static/cgroup_stray.c b/test/zdtm/static/cgroup_stray.c index f5754410f..0c0ed93cf 100644 --- a/test/zdtm/static/cgroup_stray.c +++ b/test/zdtm/static/cgroup_stray.c @@ -135,7 +135,7 @@ out: int main(int argc, char **argv) { int ret = -1, sk_pair[2], sk, status; - char path[PATH_MAX], c = 0; + char path[PATH_MAX], c; pid_t pid = 0; test_init(argc, argv); diff --git a/test/zdtm/static/cgroup_threads.c b/test/zdtm/static/cgroup_threads.c deleted file mode 100644 index 2c17e13a7..000000000 --- a/test/zdtm/static/cgroup_threads.c +++ /dev/null @@ -1,184 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "zdtmtst.h" - -const char *test_doc = "Check that cgroup layout of threads is preserved"; -const char *test_author = "Michał Cłapiński "; - -char *dirname; -TEST_OPTION(dirname, string, "cgroup directory name", 1); -static const char *cgname = "zdtmtst"; -#define SUBNAME "subcg_threads" -#define SUBNAME2 SUBNAME "/subsubcg" - -#define exit_group(code) syscall(__NR_exit_group, code) - -static int cg_move(char *name) -{ - int cgfd, l; - char paux[256]; - - sprintf(paux, "%s/%s", dirname, name); - if (mkdir(paux, 0600)) { - pr_perror("Can't create %s", paux); - return -1; - } - - sprintf(paux, "%s/%s/tasks", dirname, name); - - cgfd = open(paux, O_WRONLY); - if (cgfd < 0) { - pr_perror("Can't open tasks"); - return -1; - } - - l = write(cgfd, "0", 2); - close(cgfd); - - if (l < 0) { - pr_perror("Can't move self to subcg"); - return -1; - } - - return 0; -} - -static int cg_check(char *name) -{ - int found = 0; - FILE *cgf; - char paux[256], 
aux[128]; - - cgf = fopen("/proc/thread-self/cgroup", "r"); - if (cgf == NULL) - return -1; - - sprintf(aux, "name=%s:/%s", cgname, name); - while (fgets(paux, sizeof(paux), cgf)) { - char *s; - - s = strchr(paux, ':') + 1; - s[strlen(s) - 1] = '\0'; - test_msg("CMP [%s] vs [%s]\n", s, aux); - if (!strcmp(s, aux)) { - found = 1; - break; - } - } - - fclose(cgf); - - return found ? 0 : -1; -} - -int th_sync[2], rst_sync[2]; - -void *thread_fn(void *args) -{ - int status = cg_move(SUBNAME2); - - if (write(th_sync[1], &status, sizeof(status)) != sizeof(status)) { - pr_perror("write"); - exit_group(1); - } - - if (status == 0) { - if (read(rst_sync[0], &status, sizeof(status)) < 0) { - pr_perror("read"); - exit_group(1); - } - - status = cg_check(SUBNAME2); - if (write(th_sync[1], &status, sizeof(status)) != sizeof(status)) { - pr_perror("write"); - exit_group(1); - } - } - - pthread_exit(0); -} - -int main(int argc, char **argv) -{ - int status, exit_code = 1; - pthread_t thread; - char aux[64]; - - test_init(argc, argv); - - /* - * Pipe to talk to the kid. - * First, it reports that it's ready (int), - * then it reports the restore status (int). 
- */ - - if (pipe(th_sync)) { - pr_perror("pipe"); - return 1; - } - - /* "Restore happened" pipe */ - if (pipe(rst_sync)) { - pr_perror("pipe"); - return 1; - } - - if (mkdir(dirname, 0700) < 0) { - pr_perror("Can't make dir"); - goto out; - } - - sprintf(aux, "none,name=%s", cgname); - if (mount("none", dirname, "cgroup", 0, aux)) { - pr_perror("Can't mount cgroups"); - goto out_rd; - } - - if (cg_move(SUBNAME)) - goto out_rs; - - if (pthread_create(&thread, NULL, thread_fn, NULL)) { - pr_perror("Can't create a new thread"); - goto out_rs; - } - - status = -1; - read(th_sync[0], &status, sizeof(status)); - if (status != 0) { - pr_perror("Error moving into cgroups"); - close(rst_sync[0]); - goto out_rs; - } - - test_daemon(); - test_waitsig(); - - close(rst_sync[1]); - - status = -1; - if (read(th_sync[0], &status, sizeof(status)) < 0) { - pr_perror("read"); - goto out_rs; - } - if (status != 0) { - fail("child cg changed"); - goto out_rs; - } - - pass(); - exit_code = 0; - -out_rs: - umount(dirname); -out_rd: - rmdir(dirname); -out: - return exit_code; -} diff --git a/test/zdtm/static/cgroup_threads.desc b/test/zdtm/static/cgroup_threads.desc deleted file mode 100644 index 42a3f2b73..000000000 --- a/test/zdtm/static/cgroup_threads.desc +++ /dev/null @@ -1 +0,0 @@ -{'flavor': 'h', 'flags': 'suid excl', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroup_threads.hook b/test/zdtm/static/cgroup_threads.hook deleted file mode 100755 index f4b553d34..000000000 --- a/test/zdtm/static/cgroup_threads.hook +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -set -e - -[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 - -tname=$(mktemp -d cgclean.XXXXXX) -trap 'rmdir "${tname}"' EXIT - -mount -t cgroup none $tname -o "none,name=zdtmtst" -trap 'umount "${tname}"; rmdir "${tname}"' EXIT - -echo "Cleaning $tname" - -rmdir "$tname/subcg_threads/subsubcg/" || true -rmdir "$tname/subcg_threads/" || true - -echo "Left there is:" -ls "$tname" diff --git 
a/test/zdtm/static/cgroup_yard.desc b/test/zdtm/static/cgroup_yard.desc index 9ad4a9b57..8736d6780 100644 --- a/test/zdtm/static/cgroup_yard.desc +++ b/test/zdtm/static/cgroup_yard.desc @@ -1,6 +1,6 @@ { 'flavor': 'h', -'flags': 'suid excl', +'flags': 'suid', # We create the external cgroup yard in working directory during --pre-dump # hook. We have to go up a few directories to find the yard. 'opts': '--manage-cgroups --cgroup-yard ../../../../../../external_yard' diff --git a/test/zdtm/static/cgroup_yard.hook b/test/zdtm/static/cgroup_yard.hook index b70bd59e9..d06bc45fd 100755 --- a/test/zdtm/static/cgroup_yard.hook +++ b/test/zdtm/static/cgroup_yard.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python import sys import os diff --git a/test/zdtm/static/cgroupns.desc b/test/zdtm/static/cgroupns.desc index dc61e36cf..80dd710e1 100644 --- a/test/zdtm/static/cgroupns.desc +++ b/test/zdtm/static/cgroupns.desc @@ -1,4 +1,4 @@ { 'feature': 'cgroupns', - 'flags': 'suid excl', + 'flags': 'suid', 'flavor': 'h', 'opts': '--manage-cgroups'} diff --git a/test/zdtm/static/cgroupv2_00.c b/test/zdtm/static/cgroupv2_00.c deleted file mode 100644 index 2c6780e0c..000000000 --- a/test/zdtm/static/cgroupv2_00.c +++ /dev/null @@ -1,86 +0,0 @@ -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Check that some cgroup-v2 properties in kernel controllers are preserved"; -const char *test_author = "Bui Quang Minh "; - -char *dirname; -TEST_OPTION(dirname, string, "cgroup-v2 directory name", 1); -const char *cgname = "subcg00"; - -int main(int argc, char **argv) -{ - char path[1024], aux[1024]; - int ret = -1; - - test_init(argc, argv); - - if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { - pr_perror("Can't make dir"); - return -1; - } - - if (mount("cgroup2", dirname, "cgroup2", 0, NULL)) { - pr_perror("Can't mount cgroup-v2"); - return -1; - } - - sprintf(path, "%s/%s", dirname, cgname); - if (mkdir(path, 0700) < 0 && errno != EEXIST) { - 
pr_perror("Can't make dir"); - goto out; - } - - /* Make cpuset controllers available in children directory */ - sprintf(path, "%s/%s", dirname, "cgroup.subtree_control"); - sprintf(aux, "%s", "+cpuset"); - if (write_value(path, aux)) - goto out; - - sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.subtree_control"); - sprintf(aux, "%s", "+cpuset"); - if (write_value(path, aux)) - goto out; - - sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.type"); - sprintf(aux, "%s", "threaded"); - if (write_value(path, aux)) - goto out; - - sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.procs"); - sprintf(aux, "%d", getpid()); - if (write_value(path, aux)) - goto out; - - test_daemon(); - test_waitsig(); - - sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.subtree_control"); - if (read_value(path, aux, sizeof(aux))) - goto out; - - if (strcmp(aux, "cpuset\n")) { - fail("cgroup.subtree_control mismatches"); - goto out; - } - - sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.type"); - if (read_value(path, aux, sizeof(aux))) - goto out; - - if (strcmp(aux, "threaded\n")) { - fail("cgroup.type mismatches"); - goto out; - } - - pass(); - - ret = 0; - -out: - sprintf(path, "%s", dirname); - umount(path); - return ret; -} diff --git a/test/zdtm/static/cgroupv2_00.checkskip b/test/zdtm/static/cgroupv2_00.checkskip deleted file mode 100755 index 375ed3564..000000000 --- a/test/zdtm/static/cgroupv2_00.checkskip +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -if [ -f /sys/fs/cgroup/cgroup.controllers ]; then - grep -q "cpuset" /sys/fs/cgroup/cgroup.controllers && exit 0 -fi - -if [ -d /sys/fs/cgroup/unified ]; then - grep -q "cpuset" /sys/fs/cgroup/unified/cgroup.controllers && exit 0 -fi - -exit 1 diff --git a/test/zdtm/static/cgroupv2_00.desc b/test/zdtm/static/cgroupv2_00.desc deleted file mode 100644 index e70c84df8..000000000 --- a/test/zdtm/static/cgroupv2_00.desc +++ /dev/null @@ -1 +0,0 @@ -{'flavor': 'h ns', 'flags': 'suid excl', 'opts': '--manage-cgroups=full'} diff 
--git a/test/zdtm/static/cgroupv2_00.hook b/test/zdtm/static/cgroupv2_00.hook deleted file mode 100755 index 1002b1ec5..000000000 --- a/test/zdtm/static/cgroupv2_00.hook +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 - -set -e -cgname="subcg00" -tname=$(mktemp -d cgclean.XXXXXX) -mount -t cgroup2 cgroup2 $tname - -echo "Cleaning $tname" -echo "-cpuset" > "$tname/$cgname/cgroup.subtree_control" - -set +e -rmdir "$tname/$cgname" -umount "$tname" -rmdir "$tname" diff --git a/test/zdtm/static/cgroupv2_01.c b/test/zdtm/static/cgroupv2_01.c deleted file mode 100644 index f3a6d18ba..000000000 --- a/test/zdtm/static/cgroupv2_01.c +++ /dev/null @@ -1,180 +0,0 @@ -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Check that cgroup-v2 threaded controllers"; -const char *test_author = "Bui Quang Minh "; - -char *dirname; -TEST_OPTION(dirname, string, "cgroup-v2 directory name", 1); -const char *cgname = "subcg01"; - -task_waiter_t t; - -#define gettid(code) syscall(__NR_gettid) - -void cleanup(void) -{ - char path[1024]; - - sprintf(path, "%s/%s/%s", dirname, cgname, "thread2"); - rmdir(path); - sprintf(path, "%s/%s/%s", dirname, cgname, "thread1"); - rmdir(path); - sprintf(path, "%s/%s", dirname, cgname); - rmdir(path); - sprintf(path, "%s", dirname); - umount(path); -} - -int is_in_cgroup(char *cgname) -{ - FILE *cgf; - char buffer[1024]; - - sprintf(buffer, "/proc/self/task/%ld/cgroup", gettid()); - cgf = fopen(buffer, "r"); - if (cgf == NULL) { - pr_err("Fail to open thread's cgroup procfs\n"); - return 0; - } - - while (fgets(buffer, sizeof(buffer), cgf)) { - if (strstr(buffer, cgname)) { - fclose(cgf); - return 1; - } - } - - fclose(cgf); - return 0; -} - -void *thread_func(void *arg) -{ - char path[1024], aux[1024]; - - sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread2", "cgroup.threads"); - sprintf(aux, "%ld", gettid()); - if (write_value(path, aux)) { - cleanup(); - 
exit(1); - } - - read_value(path, aux, sizeof(aux)); - - task_waiter_complete(&t, 1); - - /* Wait for restore */ - task_waiter_wait4(&t, 2); - - sprintf(path, "/%s/%s", cgname, "thread2"); - if (!is_in_cgroup(path)) { - fail("Thread2's cgroup is not restored"); - cleanup(); - exit(1); - } - - return NULL; -} - -int main(int argc, char **argv) -{ - char path[1024], aux[1024]; - pthread_t thread2; - int ret = 1; - - test_init(argc, argv); - task_waiter_init(&t); - - if (mkdir(dirname, 0700) < 0 && errno != EEXIST) { - pr_perror("Can't make dir"); - return -1; - } - - if (mount("cgroup2", dirname, "cgroup2", 0, NULL)) { - pr_perror("Can't mount cgroup-v2"); - return -1; - } - - sprintf(path, "%s/%s", dirname, cgname); - if (mkdir(path, 0700) < 0 && errno != EEXIST) { - pr_perror("Can't make dir"); - goto out; - } - - /* Make cpuset controllers available in children directory */ - sprintf(path, "%s/%s", dirname, "cgroup.subtree_control"); - sprintf(aux, "%s", "+cpuset"); - if (write_value(path, aux)) - goto out; - - sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.subtree_control"); - sprintf(aux, "%s", "+cpuset"); - if (write_value(path, aux)) - goto out; - - sprintf(path, "%s/%s/%s", dirname, cgname, "cgroup.procs"); - sprintf(aux, "%d", getpid()); - if (write_value(path, aux)) - goto out; - - sprintf(path, "%s/%s/%s", dirname, cgname, "thread1"); - if (mkdir(path, 0700) < 0 && errno != EEXIST) { - pr_perror("Can't make dir"); - goto out; - } - - sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread1", "cgroup.type"); - sprintf(aux, "%s", "threaded"); - if (write_value(path, aux)) - goto out; - - sprintf(path, "%s/%s/%s", dirname, cgname, "thread2"); - if (mkdir(path, 0700) < 0 && errno != EEXIST) { - pr_perror("Can't make dir"); - goto out; - } - - sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread2", "cgroup.type"); - sprintf(aux, "%s", "threaded"); - if (write_value(path, aux)) - goto out; - - ret = pthread_create(&thread2, NULL, thread_func, NULL); - if (ret 
< 0) { - pr_err("pthread_create %s\n", strerror(ret)); - ret = 1; - goto out; - } - - sprintf(path, "%s/%s/%s/%s", dirname, cgname, "thread1", "cgroup.threads"); - sprintf(aux, "%ld", gettid()); - if (write_value(path, aux)) - goto out; - - task_waiter_wait4(&t, 1); - - test_daemon(); - test_waitsig(); - - task_waiter_complete(&t, 2); - - sprintf(path, "/%s/%s", cgname, "thread1"); - if (!is_in_cgroup(path)) { - fail("Main thread's cgroup is not restored"); - cleanup(); - exit(1); - } - pthread_join(thread2, NULL); - pass(); - - ret = 0; - -out: - cleanup(); - return ret; -} diff --git a/test/zdtm/static/cgroupv2_01.checkskip b/test/zdtm/static/cgroupv2_01.checkskip deleted file mode 100755 index 375ed3564..000000000 --- a/test/zdtm/static/cgroupv2_01.checkskip +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -if [ -f /sys/fs/cgroup/cgroup.controllers ]; then - grep -q "cpuset" /sys/fs/cgroup/cgroup.controllers && exit 0 -fi - -if [ -d /sys/fs/cgroup/unified ]; then - grep -q "cpuset" /sys/fs/cgroup/unified/cgroup.controllers && exit 0 -fi - -exit 1 diff --git a/test/zdtm/static/cgroupv2_01.desc b/test/zdtm/static/cgroupv2_01.desc deleted file mode 100644 index e70c84df8..000000000 --- a/test/zdtm/static/cgroupv2_01.desc +++ /dev/null @@ -1 +0,0 @@ -{'flavor': 'h ns', 'flags': 'suid excl', 'opts': '--manage-cgroups=full'} diff --git a/test/zdtm/static/cgroupv2_01.hook b/test/zdtm/static/cgroupv2_01.hook deleted file mode 100755 index 2263fd014..000000000 --- a/test/zdtm/static/cgroupv2_01.hook +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -[ "$1" == "--clean" -o "$1" == "--pre-restore" ] || exit 0 - -set -e -cgname="subcg01" -tname=$(mktemp -d cgclean.XXXXXX) -mount -t cgroup2 cgroup2 $tname - -echo "Cleaning $tname" - -set +e -rmdir "$tname/$cgname/thread1" - -# When the test finishes, the cleanup() function removes this directory -# successfully because the thread in this controller exit and no other -# threads belong to this controller -if [ "$1" == "--pre-restore" 
]; then - rmdir "$tname/$cgname/thread2" -fi - -rmdir "$tname/$cgname" -umount "$tname" -rmdir "$tname" diff --git a/test/zdtm/static/change_mnt_context.c b/test/zdtm/static/change_mnt_context.c index 8787ae5cf..6d436014b 100644 --- a/test/zdtm/static/change_mnt_context.c +++ b/test/zdtm/static/change_mnt_context.c @@ -46,7 +46,7 @@ int main(int argc, char **argv) if (!pos) continue; - result = sscanf(pos, " - %*s %*s %1023s", opts); + result = sscanf(pos, " - %*s %*s %s", opts); if (result != 1) { fail("Not able to sscanf line from mountinfo"); goto out; diff --git a/test/zdtm/static/child_opened_proc.c b/test/zdtm/static/child_opened_proc.c index cfe04fa4b..2125cd264 100644 --- a/test/zdtm/static/child_opened_proc.c +++ b/test/zdtm/static/child_opened_proc.c @@ -10,7 +10,7 @@ #include "zdtmtst.h" const char *test_doc = "Check that tree prior to files opening"; -const char *test_author = "Stanislav Kinsbursky "; +const char *test_author = "Stanislav Kinsbursky +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +/* + * This test reuses inotify_irmap test for testing configuration files + * functionality. For parts not related to configuration files, please + * refer to the original test case and it's author. 
+ */ + +const char *test_doc = "Default configuration files usage"; +const char *test_author = "Veronika Kabatova "; + +#define TDIR "/etc" +char test_files[2][128] = { + TDIR "/zdtm-test", + TDIR "/zdtm-test1", +}; +#define CONFIG_PATH "../../zdtm_test_config.conf" + +#define BUFF_SIZE ((sizeof(struct inotify_event) + PATH_MAX)) + +int main(int argc, char *argv[]) +{ + FILE *configfile; + char buf[BUFF_SIZE]; + int fd, wd, i; + + test_init(argc, argv); + + for (i = 0; i < 2; i++) { + unlink(test_files[i]); + if (creat(test_files[i], 0600) < 0) { + pr_perror("Can't make test file"); + exit(1); + } + } + fd = inotify_init1(IN_NONBLOCK); + if (fd < 0) { + pr_perror("inotify_init failed"); + goto err; + } + for (i = 0; i < 2; i++) { + wd = inotify_add_watch(fd, test_files[i], IN_OPEN); + if (wd < 0) { + pr_perror("inotify_add_watch failed"); + goto err; + } + } + + configfile = fopen(CONFIG_PATH, "w"); + if (configfile == NULL) { + pr_perror("Unable to create configuration file %s", CONFIG_PATH); + goto err; + } + fprintf(configfile, "force-irmap\t\nirmap-scan-path /zdtm/static\n"); + fclose(configfile); + + test_daemon(); + test_waitsig(); + + for (i = 0; i < 2; i++) { + memset(buf, 0, sizeof(buf)); + wd = open(test_files[i], O_RDONLY); + if (read(fd, buf, sizeof(buf)) <= 0) { + fail("No events in queue"); + unlink(CONFIG_PATH); + goto err; + } + } + + close(wd); + close(fd); + for (i = 0; i < 2; i++) + unlink(test_files[i]); + unlink(CONFIG_PATH); + pass(); + return 0; +err: + for (i = 0; i < 2; i++) + unlink(test_files[i]); + return 1; +} diff --git a/test/zdtm/static/config_inotify_irmap.desc b/test/zdtm/static/config_inotify_irmap.desc new file mode 100644 index 000000000..591ae7191 --- /dev/null +++ b/test/zdtm/static/config_inotify_irmap.desc @@ -0,0 +1,3 @@ +(lambda confpath: +{'flags': 'suid', 'opts': '--config %s' % (confpath) +}) (os.path.abspath('./zdtm_test_config.conf')) diff --git a/test/zdtm/static/conntracks b/test/zdtm/static/conntracks index 
ecd97356f..26220f97c 100755 --- a/test/zdtm/static/conntracks +++ b/test/zdtm/static/conntracks @@ -25,7 +25,7 @@ do_or_fail() do_start_ipt() { - [ -f "$statefile" ] && die "state file $statefile already exists" + [ -f "$statefile" ] && die "state file $statefile aleady exists" do_or_fail "can't install a state match" \ iptables -A INPUT \ @@ -47,7 +47,7 @@ do_stop_ipt() do_start_nft() { - [ -f "$statefile" ] && die "state file $statefile already exists" + [ -f "$statefile" ] && die "state file $statefile aleady exists" do_or_fail "can't install a state match" \ nft add rule filter INPUT \ @@ -83,7 +83,7 @@ tmpargs="$(../lib/parseargs.sh --name=$0 \ die "can't parse command line" eval "$tmpargs" -[ -f "$outfile" ] && die "out file $outfile already exists" +[ -f "$outfile" ] && die "out file $outfile aleady exists" # expect "start" or "stop" do_$1 diff --git a/test/zdtm/static/cow00.c b/test/zdtm/static/cow00.c index 456b6a7b4..cb0c6733e 100644 --- a/test/zdtm/static/cow00.c +++ b/test/zdtm/static/cow00.c @@ -29,7 +29,7 @@ static int is_cow(void *addr, pid_t p1, pid_t p2) snprintf(buf, sizeof(buf), "/proc/%d/pagemap", p2); fd2 = open(buf, O_RDONLY); - if (fd2 < 0) { + if (fd1 < 0) { pr_perror("Unable to open file %s", buf); return -1; } diff --git a/test/zdtm/static/del_standalone_un.c b/test/zdtm/static/del_standalone_un.c index b4f99e260..c9fa84870 100644 --- a/test/zdtm/static/del_standalone_un.c +++ b/test/zdtm/static/del_standalone_un.c @@ -16,17 +16,11 @@ const char *test_author = "Tycho Andersen "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); -#ifdef ZDTM_UNIX_SEQPACKET -#define SOCK_TYPE SOCK_SEQPACKET -#else -#define SOCK_TYPE SOCK_STREAM -#endif - static int bind_and_listen(struct sockaddr_un *addr) { int sk; - sk = socket(PF_UNIX, SOCK_TYPE, 0); + sk = socket(PF_UNIX, SOCK_STREAM, 0); if (sk < 0) { fail("socket"); return -1; diff --git a/test/zdtm/static/del_standalone_un_seqpacket.c b/test/zdtm/static/del_standalone_un_seqpacket.c 
deleted file mode 120000 index d88fcbad8..000000000 --- a/test/zdtm/static/del_standalone_un_seqpacket.c +++ /dev/null @@ -1 +0,0 @@ -del_standalone_un.c \ No newline at end of file diff --git a/test/zdtm/static/fanotify00.c b/test/zdtm/static/fanotify00.c index 0400cc74b..69ead43e7 100644 --- a/test/zdtm/static/fanotify00.c +++ b/test/zdtm/static/fanotify00.c @@ -22,7 +22,7 @@ #elif defined(__PPC64__) #define __NR_fanotify_init 323 #define __NR_fanotify_mark 324 -#elif (__aarch64__ || __riscv) +#elif __aarch64__ #define __NR_fanotify_init 262 #define __NR_fanotify_mark 263 #elif __s390x__ diff --git a/test/zdtm/static/fd_from_pidfd.c b/test/zdtm/static/fd_from_pidfd.c deleted file mode 100644 index 1f863d6c0..000000000 --- a/test/zdtm/static/fd_from_pidfd.c +++ /dev/null @@ -1,108 +0,0 @@ -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Check if fd obtained from pidfd_get_fd is C/R correctly\n"; -const char *test_author = "Bhavik Sachdev "; - -static int pidfd_open(pid_t pid, unsigned int flags) -{ - return syscall(__NR_pidfd_open, pid, flags); -} - -static int pidfd_getfd(int pidfd, int targetfd, unsigned int flags) -{ - return syscall(__NR_pidfd_getfd, pidfd, targetfd, flags); -} - -static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) -{ - return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); -} - -int main(int argc, char* argv[]) -{ - #define READ 0 - #define WRITE 1 - - int pidfd, child, p[2], child_read, read_data, status; - int data = 42; - - test_init(argc, argv); - - if (pipe(p)) { - pr_perror("pipe"); - return 1; - } - - child = fork(); - if (child < 0) { - pr_perror("fork"); - return 1; - } - - if (child == 0) { - close(p[WRITE]); - test_waitsig(); - return 0; - } - - pidfd = pidfd_open(child, 0); - if (pidfd < 0) { - pr_perror("pidfd_open failed"); - return 1; - } - - close(p[READ]); - if (write(p[WRITE], &data, sizeof(data)) != sizeof(data)) { - pr_perror("write"); - 
return 1; - } - close(p[WRITE]); - - child_read = pidfd_getfd(pidfd, p[READ], 0); - if (child_read < 0) { - pr_perror("pidfd_getfd"); - return 1; - } - - test_daemon(); - test_waitsig(); - - if (read(child_read, &read_data, sizeof(read_data)) != sizeof(read_data)) { - pr_perror("read"); - goto err_close; - } - - if (read_data != data) { - fail("data from fd obtained using pidfd_getfd incorrect"); - goto err_close; - } - - if (pidfd_send_signal(pidfd, SIGTERM, NULL, 0)) { - pr_perror("Could not send signal"); - goto err_close; - } - - if (waitpid(child, &status, 0) != child) { - pr_perror("waitpid()"); - return 1; - } - - if (status != 0) { - fail("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); - return 1; - } - - pass(); - close(child_read); - close(pidfd); - return 0; -err_close: - close(child_read); - close(pidfd); - return 1; -} diff --git a/test/zdtm/static/fd_offset.c b/test/zdtm/static/fd_offset.c deleted file mode 100644 index 96255a4a1..000000000 --- a/test/zdtm/static/fd_offset.c +++ /dev/null @@ -1,42 +0,0 @@ -#include - -#include "zdtmtst.h" -#include "lock.h" - -const char *test_doc = "Check that criu properly restores offsets on ELF files"; -const char *test_author = "Michal Clapinski "; - -void check_offset(int fd) -{ - int offset = lseek(fd, 0, SEEK_CUR); - if (offset < 0) { - fail("lseek"); - exit(1); - } - if (offset != 0) { - fail("wrong offset; expected: 0, got: %d", offset); - exit(1); - } -} - -int main(int argc, char **argv) -{ - int fd; - - test_init(argc, argv); - - fd = open("/proc/self/exe", O_RDONLY); - if (fd < 0) { - fail("open"); - exit(1); - } - check_offset(fd); - - test_daemon(); - test_waitsig(); - - check_offset(fd); - - pass(); - return 0; -} diff --git a/test/zdtm/static/file_append.c b/test/zdtm/static/file_append.c index fff89a54b..aa93ae186 100644 --- a/test/zdtm/static/file_append.c +++ b/test/zdtm/static/file_append.c @@ -51,7 +51,7 @@ int main(int argc, char **argv) } tmp[2] = 
'\0'; if (strcmp(tmp, "xy")) { - fail("Smth's wrong with file contents (%s)", tmp); + fail("Smth's wron with file contents (%s)", tmp); return 1; } diff --git a/test/zdtm/static/file_fown.c b/test/zdtm/static/file_fown.c index 2c5ba82c2..eb42a826e 100644 --- a/test/zdtm/static/file_fown.c +++ b/test/zdtm/static/file_fown.c @@ -22,14 +22,12 @@ const char *test_doc = "Check for signal delivery on file owners"; const char *test_author = "Cyrill Gorcunov "; -/* clang-format off */ struct params { int sigio; int pipe_flags[2]; int pipe_pid[2]; int pipe_sig[2]; -} *shared; -/* clang-format on */ +} * shared; static void signal_handler_io(int status) { diff --git a/test/zdtm/static/file_locks00.c b/test/zdtm/static/file_locks00.c index 01782fa7a..0b5d1313b 100644 --- a/test/zdtm/static/file_locks00.c +++ b/test/zdtm/static/file_locks00.c @@ -23,10 +23,10 @@ static int lock_reg(int fd, int cmd, int type, int whence, off_t offset, off_t l { struct flock lock; - lock.l_type = type; /* F_RDLCK, F_WRLCK, F_UNLCK */ + lock.l_type = type; /* F_RDLCK, F_WRLCK, F_UNLCK */ lock.l_whence = whence; /* SEEK_SET, SEEK_CUR, SEEK_END */ - lock.l_start = offset; /* byte offset, relative to l_whence */ - lock.l_len = len; /* #bytes (0 means to EOF) */ + lock.l_start = offset; /* byte offset, relative to l_whence */ + lock.l_len = len; /* #bytes (0 means to EOF) */ errno = 0; return fcntl(fd, cmd, &lock); @@ -40,10 +40,10 @@ static int check_read_lock(int fd, int whence, off_t offset, off_t len) struct flock lock; int ret; - lock.l_type = F_RDLCK; /* F_RDLCK, F_WRLCK, F_UNLCK */ + lock.l_type = F_RDLCK; /* F_RDLCK, F_WRLCK, F_UNLCK */ lock.l_whence = whence; /* SEEK_SET, SEEK_CUR, SEEK_END */ - lock.l_start = offset; /* byte offset, relative to l_whence */ - lock.l_len = len; /* #bytes (0 means to EOF) */ + lock.l_start = offset; /* byte offset, relative to l_whence */ + lock.l_len = len; /* #bytes (0 means to EOF) */ lock.l_pid = -1; errno = 0; @@ -69,10 +69,10 @@ static int 
check_write_lock(int fd, int whence, off_t offset, off_t len) int ret; pid_t ppid = getppid(); - lock.l_type = F_WRLCK; /* F_RDLCK, F_WRLCK, F_UNLCK */ + lock.l_type = F_WRLCK; /* F_RDLCK, F_WRLCK, F_UNLCK */ lock.l_whence = whence; /* SEEK_SET, SEEK_CUR, SEEK_END */ - lock.l_start = offset; /* byte offset, relative to l_whence */ - lock.l_len = len; /* #bytes (0 means to EOF) */ + lock.l_start = offset; /* byte offset, relative to l_whence */ + lock.l_len = len; /* #bytes (0 means to EOF) */ lock.l_pid = -1; errno = 0; diff --git a/test/zdtm/static/file_locks01.c b/test/zdtm/static/file_locks01.c index bfdca51d9..6c2e54ff4 100644 --- a/test/zdtm/static/file_locks01.c +++ b/test/zdtm/static/file_locks01.c @@ -107,7 +107,7 @@ static int check_file_lock(int fd, char *expected_type, char *expected_option, u memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %x:%x:%ld %*d %*s", fl_flag, fl_type, fl_option, &fl_owner, &maj, + num = sscanf(buf, "%*s %*d:%s %s %s %d %x:%x:%ld %*d %*s", fl_flag, fl_type, fl_option, &fl_owner, &maj, &min, &i_no); if (num < 7) { pr_err("Invalid lock info\n"); @@ -159,6 +159,7 @@ int main(int argc, char **argv) flock(fd_0, LOCK_SH); flock(fd_1, LOCK_EX); + flock(fd_2, LOCK_MAND | LOCK_READ); test_daemon(); test_waitsig(); @@ -171,6 +172,11 @@ int main(int argc, char **argv) fail("Failed on fd %d", fd_1); ret |= 1; } + if (check_file_lock(fd_2, "MSNFS", "READ", dev, inodes[2])) { + fail("Failed on fd %d", fd_2); + ret |= 1; + } + if (!ret) pass(); diff --git a/test/zdtm/static/file_locks02.c b/test/zdtm/static/file_locks02.c index ae4827de9..d2049ebaa 100644 --- a/test/zdtm/static/file_locks02.c +++ b/test/zdtm/static/file_locks02.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, 
fl_option, &fl_owner); + num = sscanf(buf, "%*s %*d:%s %s %s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks03.c b/test/zdtm/static/file_locks03.c index 228e66892..35ef41a21 100644 --- a/test/zdtm/static/file_locks03.c +++ b/test/zdtm/static/file_locks03.c @@ -41,7 +41,7 @@ static int check_file_lock(pid_t pid, pid_t child, int fd, char *expected_type, memset(fl_type, 0, sizeof(fl_type)); memset(fl_option, 0, sizeof(fl_option)); - num = sscanf(buf, "%*s %*d:%15s %15s %15s %d", fl_flag, fl_type, fl_option, &fl_owner); + num = sscanf(buf, "%*s %*d:%s %s %s %d", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { pr_perror("Invalid lock info."); break; diff --git a/test/zdtm/static/file_locks04.c b/test/zdtm/static/file_locks04.c index 7e0d2654e..11d224fa7 100644 --- a/test/zdtm/static/file_locks04.c +++ b/test/zdtm/static/file_locks04.c @@ -34,7 +34,7 @@ static int check_file_locks(pid_t child_pid, int fd, int child_fd) continue; test_msg("c: %s", buf); - num = sscanf(buf, "%*s %*d:%15s %15s %15s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, + num = sscanf(buf, "%*s %*d:%s %s %s %d %*02x:%*02x:%*d %*d %*s", fl_flag, fl_type, fl_option, &fl_owner); if (num < 4) { diff --git a/test/zdtm/static/file_locks06.checkskip b/test/zdtm/static/file_locks06.checkskip index c5039a2d2..06ab58521 100755 --- a/test/zdtm/static/file_locks06.checkskip +++ b/test/zdtm/static/file_locks06.checkskip @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python import fcntl import tempfile import struct diff --git a/test/zdtm/static/get_smaps_bits.c b/test/zdtm/static/get_smaps_bits.c index 3d952ac95..31d0d92b2 100644 --- a/test/zdtm/static/get_smaps_bits.c +++ b/test/zdtm/static/get_smaps_bits.c @@ -6,10 +6,6 @@ #define MAP_HUGETLB 0x40000 #endif -#ifndef MAP_DROPPABLE -#define MAP_DROPPABLE 0x08 -#endif - #ifndef MADV_HUGEPAGE #define MADV_HUGEPAGE 14 #endif @@ -22,10 +18,6 @@ 
#define MADV_DONTDUMP 16 #endif -#ifndef MADV_WIPEONFORK -#define MADV_WIPEONFORK 18 -#endif - static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) { char *tok; @@ -49,8 +41,6 @@ static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) *flags |= MAP_NORESERVE; else if (_vmflag_match(tok, "ht")) *flags |= MAP_HUGETLB; - else if (_vmflag_match(tok, "dp")) - *flags |= MAP_DROPPABLE; /* madvise() block */ if (_vmflag_match(tok, "sr")) @@ -67,8 +57,6 @@ static void parse_vmflags(char *buf, unsigned long *flags, unsigned long *madv) *madv |= (1ul << MADV_HUGEPAGE); else if (_vmflag_match(tok, "nh")) *madv |= (1ul << MADV_NOHUGEPAGE); - else if (_vmflag_match(tok, "wf")) - *madv |= (1ul << MADV_WIPEONFORK); /* * Anything else is just ignored. diff --git a/test/zdtm/static/ghost_holes_large00.c b/test/zdtm/static/ghost_holes_large00.c deleted file mode 100644 index 1a9739f8e..000000000 --- a/test/zdtm/static/ghost_holes_large00.c +++ /dev/null @@ -1,152 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Test ghost with one large hole(1GiB) in the middle"; -const char *test_author = "Liang-Chun Chen "; - -char *filename; -TEST_OPTION(filename, string, "file name", 1); - -/* Buffer that is suitable for data size */ -#ifdef LIMIT -#define BUFSIZE 1024 * 1024 -#else -#define BUFSIZE 4096 -#endif -static unsigned char buf[BUFSIZE]; - -#ifndef SEEK_DATA -#define SEEK_DATA 3 -#define SEEK_HOLE 4 -#endif - -#define DATA1_OFF 0 -#define HOLE_SIZE (1LL * 1 * 1024 * 1024 * 1024) -#define DATA2_OFF (BUFSIZE + HOLE_SIZE) -#define FILE_SIZE (2 * BUFSIZE + HOLE_SIZE) -#define ST_UNIT 512 - -int main(int argc, char **argv) -{ - int fd; - struct stat st; - uint32_t crc; - bool chk_hole = true; - - test_init(argc, argv); - - fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); - if (fd < 0) { - pr_perror("can't open %s", filename); - exit(1); 
- } - - if (unlink(filename) < 0) { - pr_perror("can't unlink %s", filename); - goto failed; - } - - crc = ~0; - datagen(buf, BUFSIZE, &crc); - if (pwrite(fd, buf, BUFSIZE, DATA1_OFF) != BUFSIZE) { - pr_perror("can't write data1"); - goto failed; - } - - crc = ~0; - datagen(buf, BUFSIZE, &crc); - if (pwrite(fd, buf, BUFSIZE, DATA2_OFF) != BUFSIZE) { - pr_perror("can't write data2"); - goto failed; - } - - if (ftruncate(fd, FILE_SIZE)) { - pr_perror("Can't fixup file size"); - goto failed; - } - - if (lseek(fd, DATA1_OFF, SEEK_HOLE) != DATA1_OFF + BUFSIZE) { - test_msg("Won't check for hole\n"); - chk_hole = false; - } - - test_daemon(); - test_waitsig(); - - if (fstat(fd, &st) < 0) { - fail("can't stat after"); - goto failed; - } - - if (st.st_size != FILE_SIZE) { - fail("file size changed to %ld", (long)st.st_size); - goto failed; - } - - test_msg("file size OK\n"); - - if (st.st_blocks * ST_UNIT != 2 * BUFSIZE) { - fail("actual file size changed to %ld", (long)st.st_blocks * ST_UNIT); - goto failed; - } - - test_msg("actual file size OK\n"); - - /* Data 1 */ - if (pread(fd, buf, BUFSIZE, DATA1_OFF) != BUFSIZE) { - fail("pread1 fail"); - goto failed; - } - - crc = ~0; - if (datachk(buf, BUFSIZE, &crc)) { - fail("datachk1 fail"); - goto failed; - } - - test_msg("Data1 OK\n"); - - /* Data 2 */ - if (pread(fd, buf, BUFSIZE, DATA2_OFF) != BUFSIZE) { - fail("pread2 fail"); - goto failed; - } - - crc = ~0; - if (datachk(buf, BUFSIZE, &crc)) { - fail("datachk2 fail"); - goto failed; - } - - test_msg("Data2 OK\n"); - - /* Hole */ - if (chk_hole) { - if (lseek(fd, DATA1_OFF, SEEK_HOLE) != DATA1_OFF + BUFSIZE) { - fail("Begin of mid hole not found"); - goto failed; - } - if (lseek(fd, DATA1_OFF + BUFSIZE, SEEK_DATA) != DATA2_OFF) { - fail("End of mid hole not found"); - goto failed; - } - test_msg("Mid hole OK\n"); - } - - close(fd); - pass(); - return 0; - -failed: - close(fd); - return 1; -} diff --git a/test/zdtm/static/ghost_holes_large01.c 
b/test/zdtm/static/ghost_holes_large01.c deleted file mode 120000 index 1b90363d4..000000000 --- a/test/zdtm/static/ghost_holes_large01.c +++ /dev/null @@ -1 +0,0 @@ -ghost_holes_large00.c \ No newline at end of file diff --git a/test/zdtm/static/ghost_holes_large01.desc b/test/zdtm/static/ghost_holes_large01.desc deleted file mode 100644 index 8e6a476bd..000000000 --- a/test/zdtm/static/ghost_holes_large01.desc +++ /dev/null @@ -1 +0,0 @@ -{'flags': 'crfail'} \ No newline at end of file diff --git a/test/zdtm/static/ghost_multi_hole00.c b/test/zdtm/static/ghost_multi_hole00.c deleted file mode 100644 index 0f78d4f14..000000000 --- a/test/zdtm/static/ghost_multi_hole00.c +++ /dev/null @@ -1,122 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Test ghost with a lot of holes(every 8K length contains only 4K data)"; -const char *test_author = "Liang-Chun Chen "; - -char *filename; -TEST_OPTION(filename, string, "file name", 1); - -/* Buffer that is suitable for hole size */ -#define BUFSIZE 4096 -static unsigned char buf4k[BUFSIZE]; - -#ifndef SEEK_DATA -#define SEEK_DATA 3 -#define SEEK_HOLE 4 -#endif - -#define FILE_SIZE (1 << 23) /* 8Mb */ - -#define FILE_INTERVAL (1 << 13) /* 8Kb */ - -int main(int argc, char **argv) -{ - int fd, off; - struct stat st; - uint32_t crc; - - test_init(argc, argv); - - fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0644); - if (fd < 0) { - pr_perror("can't open %s", filename); - exit(1); - } - - if (unlink(filename) < 0) { - pr_perror("can't unlink %s", filename); - goto failed; - } - - for (off = 0; off < FILE_SIZE; off += FILE_INTERVAL) { - crc = ~0; - datagen(buf4k, BUFSIZE, &crc); - if (pwrite(fd, &buf4k, BUFSIZE, off) != BUFSIZE) { - perror("pwrite"); - goto failed; - } - - /* - * In some file system, such as xfs, - * only pwrite might not able to create highly sparse file, - * so we need to forcibly allocate hole inside the file. 
- */ - if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off + BUFSIZE, BUFSIZE)) { - perror("fallocate"); - goto failed; - } - } - - if (ftruncate(fd, FILE_SIZE)) { - pr_perror("Can't fixup file size"); - goto failed; - } - - test_daemon(); - test_waitsig(); - - if (fstat(fd, &st) < 0) { - fail("can't stat after"); - goto failed; - } - - if (st.st_size != FILE_SIZE) { - fail("file size changed to %ld", (long)st.st_size); - goto failed; - } - - test_msg("Size %u OK\n", FILE_SIZE); - - /* Data*/ - for (off = 0; off < FILE_SIZE; off += FILE_INTERVAL) { - if (pread(fd, buf4k, BUFSIZE, off) != BUFSIZE) { - fail("pread failed @ %u", off / FILE_INTERVAL); - goto failed; - } - - crc = ~0; - if (datachk(buf4k, BUFSIZE, &crc)) { - fail("datachk failed @ %u", off / FILE_INTERVAL); - goto failed; - } - - test_msg("Data @%du OK\n", off / FILE_INTERVAL); - } - - /* Hole */ - for (off = 0; off < FILE_SIZE; off += FILE_INTERVAL) { - if (lseek(fd, off, SEEK_HOLE) != off + BUFSIZE) { - fail("failed to find hole @ %u", off / FILE_SIZE); - goto failed; - } - test_msg("Hole @%du OK\n", off / FILE_INTERVAL); - } - - close(fd); - pass(); - return 0; - -failed: - close(fd); - return 1; -} diff --git a/test/zdtm/static/ghost_multi_hole00.desc b/test/zdtm/static/ghost_multi_hole00.desc deleted file mode 100644 index 3981e8180..000000000 --- a/test/zdtm/static/ghost_multi_hole00.desc +++ /dev/null @@ -1 +0,0 @@ -{'dopts': '--ghost-limit 8M --no-ghost-fiemap'} diff --git a/test/zdtm/static/ghost_multi_hole01.c b/test/zdtm/static/ghost_multi_hole01.c deleted file mode 120000 index c75006a6b..000000000 --- a/test/zdtm/static/ghost_multi_hole01.c +++ /dev/null @@ -1 +0,0 @@ -ghost_multi_hole00.c \ No newline at end of file diff --git a/test/zdtm/static/ghost_multi_hole01.desc b/test/zdtm/static/ghost_multi_hole01.desc deleted file mode 100644 index d1dc68a54..000000000 --- a/test/zdtm/static/ghost_multi_hole01.desc +++ /dev/null @@ -1 +0,0 @@ -{'dopts': '--ghost-limit 8M 
--ghost-fiemap'} diff --git a/test/zdtm/static/inotify_system.c b/test/zdtm/static/inotify_system.c index 079d4b161..487062cab 100644 --- a/test/zdtm/static/inotify_system.c +++ b/test/zdtm/static/inotify_system.c @@ -14,14 +14,14 @@ #include "zdtmtst.h" const char *test_doc = "Inotify on symlink should be checked"; -#ifndef NO_DEL +#ifndef NODEL char filename[] = "file"; char linkname[] = "file.lnk"; const char *inot_dir = "./inotify"; #else -char filename[] = "file.no_del"; -char linkname[] = "file.no_del.lnk"; -const char *inot_dir = "./inotify.no_del"; +char filename[] = "file.nodel"; +char linkname[] = "file.nodel.lnk"; +const char *inot_dir = "./inotify.nodel"; #endif #ifdef __NR_inotify_init @@ -57,13 +57,13 @@ const char *inot_dir = "./inotify.no_del"; (MASK == IN_UNMOUNT) ? "IN_UNMOUNT" : \ (MASK == IN_Q_OVERFLOW) ? "IN_Q_OVERFLOW" : \ (MASK == IN_IGNORED) ? "IN_IGNORED" : \ - "UNKNOWN" + "UNKNOWN" #include #include typedef struct { - int infd; + int inot; int file; int link; int dir; @@ -165,8 +165,8 @@ desc init_env(const char *dir, char *file_path, char *link_path) pr_perror("mkdir(%s)", dir); return in_desc; } - in_desc.infd = inotify_init(); - if (in_desc.infd < 0) { + in_desc.inot = inotify_init(); + if (in_desc.inot < 0) { pr_perror("inotify_init() failed"); rmdir(dir); return in_desc; @@ -184,12 +184,12 @@ desc init_env(const char *dir, char *file_path, char *link_path) return in_desc; } - in_desc.dir = addWatcher(in_desc.infd, dir); + in_desc.dir = addWatcher(in_desc.inot, dir); if (createFiles(file_path, filename, link_path)) { return in_desc; } - in_desc.link = addWatcher(in_desc.infd, link_path); - in_desc.file = addWatcher(in_desc.infd, file_path); + in_desc.link = addWatcher(in_desc.inot, link_path); + in_desc.file = addWatcher(in_desc.inot, file_path); return in_desc; } @@ -216,7 +216,7 @@ int test_actions(const char *dir, char *file_path, char *link_path) { if (fChmod(link_path) == 0 && fWriteClose(link_path) == 0 && 
fNoWriteClose(link_path) == 0 && fMove(file_path, filename) == 0 && fMove(filename, file_path) == 0 -#ifndef NO_DEL +#ifndef NODEL && fDelete(file_path) == 0 && fDelete(link_path) == 0 && fRemDir(dir) == 0 #endif ) { @@ -299,9 +299,9 @@ int read_set(int inot_fd, char *event_set) void common_close(desc *descr) { - if (descr->infd > 0) { - close(descr->infd); - descr->infd = -1; + if (descr->inot > 0) { + close(descr->inot); + descr->inot = -1; descr->file = -1; descr->dir = -1; descr->link = -1; @@ -316,7 +316,7 @@ int get_event_set(char *event_set, int wait) desc common_desc; common_desc = init_env(inot_dir, file_path, link_path); - if ((common_desc.infd < 0) || (common_desc.file < 0) || (common_desc.dir < 0) || (common_desc.link < 0)) { + if ((common_desc.inot < 0) || (common_desc.file < 0) || (common_desc.dir < 0) || (common_desc.link < 0)) { common_close(&common_desc); return -1; } @@ -327,9 +327,9 @@ int get_event_set(char *event_set, int wait) if (wait) { do_wait(); } - len = read_set(common_desc.infd, event_set); + len = read_set(common_desc.inot, event_set); common_close(&common_desc); -#ifdef NO_DEL +#ifdef NODEL if (!(fDelete(file_path) == 0 && fDelete(link_path) == 0 && fRemDir(inot_dir) == 0)) return -1; #endif diff --git a/test/zdtm/static/ipc_namespace.c b/test/zdtm/static/ipc_namespace.c index b13b357ba..98241d816 100644 --- a/test/zdtm/static/ipc_namespace.c +++ b/test/zdtm/static/ipc_namespace.c @@ -19,28 +19,27 @@ extern int shmctl(int __shmid, int __cmd, struct shmid_ds *__buf); struct ipc_ids { int in_use; /* TODO: Check for 0 */ - - // unsigned short seq; - // unsigned short seq_max; - // struct rw_semaphore rw_mutex; - // struct idr ipcs_idr; /* TODO */ + // unsigned short seq; + // unsigned short seq_max; + // struct rw_semaphore rw_mutex; + // struct idr ipcs_idr; /* TODO */ }; struct ipc_ns { struct ipc_ids ids[3]; - int sem_ctls[4]; - int used_sems; + int sem_ctls[4]; // + + int used_sems; // + - int msg_ctlmax; - int msg_ctlmnb; - int 
msg_ctlmni; - int msg_bytes; - int msg_hdrs; - int auto_msgmni; - int msg_next_id; - int sem_next_id; - int shm_next_id; + int msg_ctlmax; // + + int msg_ctlmnb; // + + int msg_ctlmni; // + + int msg_bytes; // + + int msg_hdrs; // + + int auto_msgmni; // + + int msg_next_id; // + + int sem_next_id; // + + int shm_next_id; // + size_t shm_ctlmax; size_t shm_ctlall; @@ -52,10 +51,10 @@ struct ipc_ns { // unsigned int mq_queues_count; - unsigned int mq_queues_max; /* initialized to DFLT_QUEUESMAX */ - unsigned int mq_msg_max; /* initialized to DFLT_MSGMAX */ - unsigned int mq_msgsize_max; /* initialized to DFLT_MSGSIZEMAX */ - unsigned int mq_msg_default; /* initialized to DFLT_MSG */ + unsigned int mq_queues_max; /* initialized to DFLT_QUEUESMAX */ + unsigned int mq_msg_max; /* initialized to DFLT_MSGMAX */ + unsigned int mq_msgsize_max; /* initialized to DFLT_MSGSIZEMAX */ + unsigned int mq_msg_default; /* initialized to DFLT_MSG */ unsigned int mq_msgsize_default; /* initialized to DFLT_MSGSIZE */ struct user_ns *user_ns; diff --git a/test/zdtm/static/macvlan.checkskip b/test/zdtm/static/macvlan.checkskip deleted file mode 100755 index f4e060953..000000000 --- a/test/zdtm/static/macvlan.checkskip +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -FAIL=0 - -create_macvlan_device() { - if ! ip link add test_mvlan1 type veth >/dev/null 2>&1; then - FAIL=1 - fi - if ! ip link add mymacvlan1 link test_mvlan1 type macvlan >/dev/null 2>&1; then - FAIL=1 - fi - - return "${FAIL}" -} - -cleanup() { - ip link del test_mvlan1 >/dev/null 2>&1 - ip link del mymacvlan1 >/dev/null 2>&1 -} - -trap "cleanup" QUIT TERM INT HUP EXIT - -# Test once without loading the module -if create_macvlan_device; then - exit 0 -fi - -# Test once more with explicitly loading the module -if ! 
modprobe macvlan >/dev/null 2>&1; then - exit 1 -fi -create_macvlan_device - -if [ "${FAIL}" == "1" ]; then - exit 1 -fi - -exit 0 diff --git a/test/zdtm/static/maps00.c b/test/zdtm/static/maps00.c index f6989f3af..10a4cac79 100644 --- a/test/zdtm/static/maps00.c +++ b/test/zdtm/static/maps00.c @@ -137,7 +137,7 @@ static int check_map(struct map *map) } /* prot |= PROT_READ// need barrier before this line, because compiler change order commands. - I found one method: look at next lines*/ + I finded one method: look at next lines*/ } else prot &= PROT_WRITE | !PROT_READ | PROT_EXEC; @@ -158,13 +158,7 @@ static int check_map(struct map *map) if (!sigsetjmp(segv_ret, 1)) { if (map->prot & PROT_WRITE) { - memcpy(map->ptr, test_func, ONE_MAP_SIZE); - /* The ARM ARM architecture does not require the - * hardware to ensure coherency between instruction - * caches and memory, flushing dcache and icache is - * necessory to prevent SIGILL signal. - */ - __builtin___clear_cache(map->ptr, map->ptr + ONE_MAP_SIZE); + memcpy(map->ptr, test_func, getpagesize()); } else { if (!(map->flag & MAP_ANONYMOUS)) { uint8_t funlen = (uint8_t *)check_map - (uint8_t *)test_func; diff --git a/test/zdtm/static/maps02.c b/test/zdtm/static/maps02.c index 38244f020..29f1372c9 100644 --- a/test/zdtm/static/maps02.c +++ b/test/zdtm/static/maps02.c @@ -2,19 +2,11 @@ #include "zdtmtst.h" #include "get_smaps_bits.h" -#ifndef MAP_DROPPABLE -#define MAP_DROPPABLE 0x08 -#endif - #ifndef MADV_DONTDUMP #define MADV_DONTDUMP 16 #endif -#ifndef MADV_WIPEONFORK -#define MADV_WIPEONFORK 18 -#endif - -const char *test_doc = "Test private memory with advises"; +const char *test_doc = "Test shared memory with advises"; const char *test_author = "Cyrill Gorcunov "; struct mmap_data { @@ -31,14 +23,8 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) { m->start = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, flags, -1, 0); if (m->start == MAP_FAILED) { - if (errno == EINVAL) { - test_msg("mmap 
failed, no kernel support\n"); - *m = (struct mmap_data){}; - return 0; - } else { - pr_perror("mmap failed"); - return -1; - } + pr_perror("mmap failed"); + return -1; } if (madvise(m->start, MEM_SIZE, adv)) { @@ -57,12 +43,12 @@ static int alloc_anon_mmap(struct mmap_data *m, int flags, int adv) int main(int argc, char **argv) { - struct mmap_data m[7] = {}; + struct mmap_data m[5] = {}; size_t i; test_init(argc, argv); - test_msg("Alloc dontfork\n"); + test_msg("Alloc growsdown\n"); if (alloc_anon_mmap(&m[0], MAP_PRIVATE | MAP_ANONYMOUS, MADV_DONTFORK)) return -1; @@ -78,18 +64,10 @@ int main(int argc, char **argv) if (alloc_anon_mmap(&m[3], MAP_PRIVATE | MAP_ANONYMOUS, MADV_HUGEPAGE)) return -1; - test_msg("Alloc mergeable\n"); + test_msg("Alloc dontfork/random|mergeable\n"); if (alloc_anon_mmap(&m[4], MAP_PRIVATE | MAP_ANONYMOUS, MADV_MERGEABLE)) return -1; - test_msg("Alloc wipeonfork\n"); - if (alloc_anon_mmap(&m[5], MAP_PRIVATE | MAP_ANONYMOUS, MADV_WIPEONFORK)) - return -1; - - test_msg("Alloc droppable\n"); - if (alloc_anon_mmap(&m[6], MAP_DROPPABLE | MAP_ANONYMOUS, MADV_NORMAL)) - return -1; - test_msg("Fetch existing flags/adv\n"); for (i = 0; i < sizeof(m) / sizeof(m[0]); i++) { if (get_smaps_bits((unsigned long)m[i].start, &m[i].orig_flags, &m[i].orig_madv)) diff --git a/test/zdtm/static/maps09.c b/test/zdtm/static/maps09.c deleted file mode 100644 index 216263b4c..000000000 --- a/test/zdtm/static/maps09.c +++ /dev/null @@ -1,89 +0,0 @@ -#include - -#include "zdtmtst.h" - -#define MEM_SIZE (4UL * (1UL << 20)) /* 4MB */ -#define MEM_OFFSET (MEM_SIZE - PAGE_SIZE) - -const char *test_doc = "Test MAP_HUGETLB mapping"; -const char *test_author = "Bui Quang Minh "; - -int main(int argc, char **argv) -{ - void *m1, *m2; - dev_t dev1, dev2; - uint32_t crc; - - test_init(argc, argv); - m1 = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, MAP_HUGETLB | MAP_SHARED | MAP_ANONYMOUS, 0, 0); - if (m1 == MAP_FAILED) { - pr_perror("Failed to mmap %lu Mb anonymous shared 
memory", MEM_SIZE >> 20); - return 1; - } - - dev1 = get_mapping_dev(m1); - if (dev1 == (dev_t)-1) { - fail("Can't get mapping dev"); - return 1; - } - - m2 = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); - if (m2 == MAP_FAILED) { - pr_perror("Failed to mmap %lu Mb anonymous private memory", MEM_SIZE >> 20); - return 1; - } - - dev2 = get_mapping_dev(m2); - if (dev2 == (dev_t)-1) { - fail("Can't get mapping dev"); - return 1; - } - - crc = ~0; - datagen(m1, PAGE_SIZE, &crc); - crc = ~0; - datagen(m1 + MEM_OFFSET, PAGE_SIZE, &crc); - crc = ~0; - datagen(m2, PAGE_SIZE, &crc); - crc = ~0; - datagen(m2 + MEM_OFFSET, PAGE_SIZE, &crc); - crc = ~0; - - test_daemon(); - test_waitsig(); - - crc = ~0; - if (datachk(m1, PAGE_SIZE, &crc)) { - fail("Data mismatch"); - return 1; - } - crc = ~0; - if (datachk(m1 + MEM_OFFSET, PAGE_SIZE, &crc)) { - fail("Data mismatch"); - return 1; - } - crc = ~0; - if (datachk(m2, PAGE_SIZE, &crc)) { - fail("Data mismatch"); - return 1; - } - crc = ~0; - if (datachk(m2 + MEM_OFFSET, PAGE_SIZE, &crc)) { - fail("Data mismatch"); - return 1; - } - - if (dev1 != get_mapping_dev(m1)) { - fail("Mapping dev mismatch"); - return 1; - } - - if (dev2 != get_mapping_dev(m2)) { - fail("Mapping dev mismatch"); - return 1; - } - - pass(); - - return 0; -} diff --git a/test/zdtm/static/maps09.checkskip b/test/zdtm/static/maps09.checkskip deleted file mode 100755 index df2370815..000000000 --- a/test/zdtm/static/maps09.checkskip +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -# will fail with EOPNOTSUPP -cat /proc/sys/vm/nr_hugepages &> /dev/null diff --git a/test/zdtm/static/maps10.c b/test/zdtm/static/maps10.c deleted file mode 100644 index 51e37863b..000000000 --- a/test/zdtm/static/maps10.c +++ /dev/null @@ -1,136 +0,0 @@ -#include -#include -#include -#include -#include -#include "zdtmtst.h" - -const char *test_doc = "Test MAP_HUGETLB mapping in parent-child relationship processes"; -const char *test_author = 
"Bui Quang Minh "; - -#define MEM_SIZE (2UL * (1UL << 20)) /* 2MB */ - -int main(int argc, char **argv) -{ - void *p1, *p2, *s1; - task_waiter_t t; - pid_t pid; - uint32_t crc, tmp_crc; - int status; - - test_init(argc, argv); - task_waiter_init(&t); - - p1 = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, 0, 0); - if (p1 == MAP_FAILED) { - pr_perror("Map failed"); - return 1; - } - - p2 = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, 0, 0); - if (p2 == MAP_FAILED) { - pr_perror("Map failed"); - return 1; - } - - s1 = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, 0, 0); - if (s1 == MAP_FAILED) { - pr_perror("Map failed"); - return 1; - } - - crc = ~0; - datagen(p1, MEM_SIZE, &crc); - crc = ~0; - datagen(p2, MEM_SIZE, &crc); - tmp_crc = crc; - - pid = test_fork(); - if (pid < 0) { - pr_perror("fork failed"); - return 1; - } - - if (pid == 0) { - crc = ~0; - datagen(p2, MEM_SIZE, &crc); - tmp_crc = crc; - crc = ~0; - datagen(s1, MEM_SIZE, &crc); - - task_waiter_complete(&t, 1); - test_waitsig(); - - crc = ~0; - if (datachk(p1, MEM_SIZE, &crc)) { - fail("Data mismatch"); - return 1; - } - - crc = ~0; - if (datachk(p2, MEM_SIZE, &crc)) { - fail("Data mismatch"); - return 1; - } - - if (crc != tmp_crc) { - fail("Data mismatch"); - return 1; - } - - crc = ~0; - if (datachk(s1, MEM_SIZE, &crc)) { - fail("Data mismatch"); - return 1; - } - - return 0; - } - - task_waiter_wait4(&t, 1); - - test_daemon(); - test_waitsig(); - - kill(pid, SIGTERM); - wait(&status); - if (WIFEXITED(status)) { - if (WEXITSTATUS(status)) - goto err; - } else { - goto err; - } - - crc = ~0; - if (datachk(p1, MEM_SIZE, &crc)) { - fail("Data mismatch"); - return 1; - } - - crc = ~0; - if (datachk(p2, MEM_SIZE, &crc)) { - fail("Data mismatch"); - return 1; - } - - if (crc != tmp_crc) { - fail("Data mismatch"); - return 1; - } - - crc = ~0; - if (datachk(s1, MEM_SIZE, &crc)) { - 
fail("Data mismatch"); - return 1; - } - - pass(); - - return 0; -err: - if (waitpid(-1, NULL, WNOHANG) == 0) { - kill(pid, SIGTERM); - wait(NULL); - } - return 1; -} \ No newline at end of file diff --git a/test/zdtm/static/maps10.checkskip b/test/zdtm/static/maps10.checkskip deleted file mode 120000 index fb42f0f44..000000000 --- a/test/zdtm/static/maps10.checkskip +++ /dev/null @@ -1 +0,0 @@ -maps09.checkskip \ No newline at end of file diff --git a/test/zdtm/static/maps11.c b/test/zdtm/static/maps11.c deleted file mode 100644 index df309714b..000000000 --- a/test/zdtm/static/maps11.c +++ /dev/null @@ -1,205 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "zdtmtst.h" - -#ifndef MAP_DROPPABLE -#define MAP_DROPPABLE 0x08 -#endif - -#ifndef MADV_WIPEONFORK -#define MADV_WIPEONFORK 18 -#endif - -const char *test_doc = "Test MAP_DROPPABLE/MADV_WIPEONFORK mappings with 2 processes"; -const char *test_author = "Alexander Mikhalitsyn "; - -bool mem_is_zero(const uint8_t *buffer, size_t length) -{ - size_t i; - - for (i = 0; i < length; i++) - if (buffer[i] != 0) - return false; - - return true; -} - -int main(int argc, char **argv) -{ - uint8_t *p1, *p2; - pid_t pid; - int status; - const char data[] = "MADV_WIPEONFORK vma data"; - bool criu_was_there = false; - struct stat st1, st2; - - test_init(argc, argv); - - p1 = mmap(NULL, sizeof(data), PROT_READ | PROT_WRITE, - MAP_DROPPABLE | MAP_ANONYMOUS, 0, 0); - if (p1 == MAP_FAILED) { - if (errno == EINVAL) { - skip("mmap failed, no kernel support for MAP_DROPPABLE\n"); - goto skip; - } else { - pr_perror("mmap failed"); - return -1; - } - } - - p2 = mmap(NULL, sizeof(data), PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); - if (p2 == MAP_FAILED) { - pr_perror("mmap failed"); - return 1; - } - - if (madvise(p2, sizeof(data), MADV_WIPEONFORK)) { - pr_perror("madvise failed"); - return -1; - } - - /* contents of this mapping is supposed to be dropped after C/R */ - memcpy(p1, data, 
sizeof(data)); - - /* contents of this mapping is supposed to be dropped after fork() */ - memcpy(p2, data, sizeof(data)); - - /* - * Let's spawn a process before C/R so our mappings get inherited - * then, after C/R we need to ensure that CRIU memory premapping - * machinery works properly. - * - * It is important, because we restore MADV_WIPEONFORK on a later - * stages (after vma premapping happens) and we need to ensure that - * CRIU handles everything in a right way. - */ - pid = test_fork(); - if (pid < 0) { - pr_perror("fork failed"); - return 1; - } - - if (pid == 0) { - test_waitsig(); - - /* - * Both mappings have VM_WIPEONFORK flag set, - * so we expect to have it null-ified after fork(). - */ - if (!mem_is_zero(p1, sizeof(data)) || - !mem_is_zero(p2, sizeof(data))) { - pr_err("1st child: memory check failed\n"); - return 1; - } - - return 0; - } - - /* - * A simple way to detect if C/R happened is to compare st_ino - * fields of stat() on the procfs files of the current task. - * - * Hopefully, this terrible hack is never used in real-world - * applications ;-) Here, we only need this to make test - * to pass with/without --nocr option. - */ - if (stat("/proc/self/status", &st1)) { - pr_perror("stat"); - return 1; - } - - test_daemon(); - test_waitsig(); - - /* signal a child process to continue */ - if (kill(pid, SIGTERM)) { - pr_perror("kill"); - goto err; - } - - if (waitpid(pid, &status, 0) != pid) { - pr_perror("1st waitpid"); - goto err; - } - - if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { - fail("1st process didn't exit cleanly: status=%d", status); - goto err; - } - - if (stat("/proc/self/status", &st2)) { - pr_perror("stat"); - return 1; - } - - /* detect CRIU */ - criu_was_there = st1.st_ino != st2.st_ino; - - /* - * We should mark failure if one of the following happens: - * 1. MAP_DROPPABLE memory is not zero after C/R - * 2. MAP_DROPPABLE memory somehow changed without C/R - * (kernel issue? memory pressure?) - * 3. 
MADV_WIPEONFORK memory is not preserved - * - * We care about 2nd case only because we would like test - * to pass even with --nocr zdtm.py option. - */ - if ((criu_was_there && !mem_is_zero(p1, sizeof(data))) || - (!criu_was_there && memcmp(p1, data, sizeof(data))) || - memcmp(p2, data, sizeof(data))) { - fail("Data mismatch"); - return 1; - } - - /* contents of these mappings is supposed to be dropped after fork() */ - memcpy(p1, data, sizeof(data)); - memcpy(p2, data, sizeof(data)); - - pid = test_fork(); - if (pid < 0) { - pr_perror("fork failed"); - return 1; - } - - if (pid == 0) { - if (!mem_is_zero(p1, sizeof(data)) || - !mem_is_zero(p2, sizeof(data))) { - pr_err("2nd child: memory check failed\n"); - return 1; - } - - return 0; - } - - if (waitpid(pid, &status, 0) != pid) { - pr_perror("2nd waitpid"); - goto err; - } - - if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { - fail("2nd process didn't exit cleanly: status=%d", status); - goto err; - } - - pass(); - - return 0; -err: - if (waitpid(-1, NULL, WNOHANG) == 0) { - kill(pid, SIGTERM); - wait(NULL); - } - return 1; - -skip: - test_daemon(); - test_waitsig(); - pass(); - return 0; -} diff --git a/test/zdtm/static/maps12.c b/test/zdtm/static/maps12.c deleted file mode 100644 index f0d6c2381..000000000 --- a/test/zdtm/static/maps12.c +++ /dev/null @@ -1,351 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "zdtmtst.h" - -const char *test_doc = "Test madvise(MADV_GUARD_INSTALL)"; -const char *test_author = "Alexander Mikhalitsyn "; -/* some parts of code were taken from Linux kernel's kselftest guard-pages.c - written by Lorenzo Stoakes */ - -char *filename; -int fd; -TEST_OPTION(filename, string, "file name", 1); - -#ifndef MADV_GUARD_INSTALL -#define MADV_GUARD_INSTALL 102 -#endif - -uint8_t *map_base; - -struct { - unsigned int pages_num; - bool filemap; -} vmas[] = { - { 2, false }, - { 2, false }, - { 2, false }, - { 2, true }, - 
{ 2, true }, - { 2, true }, -}; - -struct { - bool guarded; - bool wipeonfork; -} pages[] = { - { false, false }, /* vmas[0] */ - { true, false }, - { true, false }, /* vmas[1] */ - { false, false }, - { false, false }, /* vmas[2] */ - { true, true }, - { true, false }, /* vmas[3] */ - { false, false }, - { true, false }, /* vmas[4] */ - { true, false }, - { false, false }, /* vmas[5] */ - { true, false }, -}; - -static volatile sig_atomic_t signal_jump_set; -static sigjmp_buf signal_jmp_buf; - -static void handle_sigsegv(int signo) -{ - if (!signal_jump_set) - return; - - siglongjmp(signal_jmp_buf, 1); -} - -static bool try_write_to_addr(uint8_t *ptr) -{ - bool failed; - - /* Tell signal handler to jump back here on fatal signal. */ - signal_jump_set = true; - /* If a fatal signal arose, we will jump back here and failed is set. */ - failed = sigsetjmp(signal_jmp_buf, 1) != 0; - - if (!failed) - *ptr = 'x'; - - signal_jump_set = false; - return !failed; -} - -static int setup_sigsegv_handler(void) -{ - uint8_t write_me; - - if (signal(SIGSEGV, handle_sigsegv) == SIG_ERR) { - pr_perror("setting SIGSEGV handler failed"); - return 1; - } - - /* ensure that try_write_to_addr() works properly */ - if (!try_write_to_addr(&write_me)) { - pr_err("Failed to write at valid addr. Buggy try_write_to_addr()?\n"); - return 1; - } - - if (try_write_to_addr(NULL)) { - pr_err("Failed to detect an invalid write. Buggy try_write_to_addr()?\n"); - return 1; - } - - return 0; -} - -static inline void *mmap_pages(void *addr_hint, unsigned int count, bool filemap) -{ - char *map; - - map = mmap(addr_hint, count * PAGE_SIZE, PROT_WRITE | PROT_READ, - MAP_PRIVATE | (filemap ? 0 : MAP_ANONYMOUS) | (addr_hint ? MAP_FIXED : 0), - filemap ? fd : -1, - filemap ? 
(off_t)((intptr_t)addr_hint - (intptr_t)map_base) : 0); - if (map == MAP_FAILED || (addr_hint && (map != addr_hint))) - return MAP_FAILED; - - return map; -} - -static int __check_guards(const char *when, bool in_child) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(pages); i++) { - /* - * Skip pages that were never guarded, and also those - * that were, but have MADV_WIPEONFORK which means that - * guards were removed on fork. - */ - if (!pages[i].guarded || (in_child && pages[i].wipeonfork)) - continue; - - if (try_write_to_addr(&map_base[i * PAGE_SIZE])) { - pr_err("successful write to a guarded area %d %s C/R\n", - i, when); - return 1; - } - } - - return 0; -} - -static int check_guards(const char *when) -{ - int status; - pid_t pid; - - /* - * First of all, check that guards are on their places - * in a main test process. - */ - if (__check_guards(when, false)) { - return 1; - } - - /* - * Now, check that guards are on their places - * after fork(). This allows to ensure that - * combo MADV_WIPEONFORK + MADV_GUARD_INSTALL - * is restored properly too. 
- */ - - pid = test_fork(); - if (pid < 0) { - pr_perror("check_guards: fork failed"); - return 1; - } - - if (pid == 0) { - if (__check_guards(when, true)) { - pr_err("check_guards(\"%s\") failed in child\n", when); - exit(1); - } - - exit(0); - } - - if (waitpid(pid, &status, 0) != pid) { - pr_perror("check_guards: waitpid"); - return 1; - } - - if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { - pr_err("check_guards: process didn't exit cleanly: status=%d\n", status); - return 1; - } - - return 0; -} - -static void gen_pages_data(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(pages); i++) { - uint32_t crc; - - if (pages[i].guarded) - continue; - - crc = ~0; - datagen(&map_base[i * PAGE_SIZE], PAGE_SIZE, &crc); - } -} - -static int set_pages_madvs(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(pages); i++) { - if (pages[i].guarded) { - if (madvise(&map_base[i * PAGE_SIZE], PAGE_SIZE, - MADV_GUARD_INSTALL)) { - pr_perror("MADV_GUARD_INSTALL failed on page %d", i); - return 1; - } - } - - if (pages[i].wipeonfork) { - if (madvise(&map_base[i * PAGE_SIZE], PAGE_SIZE, - MADV_WIPEONFORK)) { - pr_perror("MADV_WIPEONFORK failed on page %d", i); - return 1; - } - } - } - - return 0; -} - -static int check_pages_data(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(pages); i++) { - uint32_t crc; - - if (pages[i].guarded) - continue; - - crc = ~0; - if (datachk(&map_base[i * PAGE_SIZE], PAGE_SIZE, &crc)) { - pr_err("Page %d is corrupted\n", i); - return 1; - } - } - - return 0; -} - -static int prepare_vmas(void) -{ - char *map; - int i, shift; - - shift = 0; - for (i = 0; i < ARRAY_SIZE(vmas); i++) { - map = mmap_pages(&map_base[shift * PAGE_SIZE], - vmas[i].pages_num, vmas[i].filemap); - if (map == MAP_FAILED) { - pr_err("mmap of [%d,%d] pages failed\n", - shift, shift + vmas[i].pages_num); - return 1; - } - - shift += vmas[i].pages_num; - } - - if (shift != ARRAY_SIZE(pages)) { - pr_err("Different number of pages in vmas and pages arrays.\n"); - return 1; - } - - 
return 0; -} - -int main(int argc, char **argv) -{ - unsigned int pages_num = ARRAY_SIZE(pages); - - test_init(argc, argv); - - fd = open(filename, O_TRUNC | O_CREAT | O_RDWR, 0600); - if (fd < 0) { - pr_perror("Unable to create a test file"); - return -1; - } - - if (ftruncate(fd, pages_num * PAGE_SIZE)) { - pr_perror("Unable to ftruncate a test file"); - return -1; - } - - if (setup_sigsegv_handler()) { - pr_err("setup_sigsegv_handler() failed\n"); - return 1; - } - - /* let's find a large enough area in address space */ - map_base = mmap_pages(NULL, pages_num, false); - if (map_base == MAP_FAILED) { - pr_err("mmap of %d pages failed\n", pages_num); - return 1; - } - - /* - * Now we know that we have a free vm address space area - * [map_base, map_base + pages_num * PAGE_SIZE). - * We can use (map_base) as a hint for our further mmaps. - */ - if (prepare_vmas()) { - pr_err("prepare_vmas() failed\n"); - return 1; - } - - /* fill non-guarded pages with data and preserve checksums */ - gen_pages_data(); - - if (set_pages_madvs()) { - pr_err("set_pages_madvs() failed\n"); - return 1; - } - - /* ensure that madvise(MADV_GUARD_INSTALL) works like expected */ - if (check_guards("before")) { - pr_err("check_guards(\"before\") failed\n"); - return 1; - } - - test_daemon(); - test_waitsig(); - - /* ensure that guards are at their places */ - if (check_guards("after")) { - fail("check_guards(\"after\") failed"); - return 1; - } - - /* check that non-guarded pages still contain original data */ - if (check_pages_data()) { - fail("check_pages_data() failed"); - return 1; - } - - pass(); - munmap(map_base, pages_num * PAGE_SIZE); - close(fd); - return 0; -} diff --git a/test/zdtm/static/maps12.desc b/test/zdtm/static/maps12.desc deleted file mode 100644 index 3f7627ff3..000000000 --- a/test/zdtm/static/maps12.desc +++ /dev/null @@ -1 +0,0 @@ -{'flavor': 'h', 'feature': 'pagemap_scan_guard_pages'} diff --git a/test/zdtm/static/membarrier.c b/test/zdtm/static/membarrier.c 
deleted file mode 100644 index 85d705ba7..000000000 --- a/test/zdtm/static/membarrier.c +++ /dev/null @@ -1,149 +0,0 @@ -#include -#include -#include -#include "zdtmtst.h" - -const char *test_doc = "Test membarrier() migration"; -const char *test_author = "Michał Mirosław "; - -/* - * Define membarrier() CMDs to avoid depending on exact kernel header version. - */ -#define MEMBARRIER_CMD_GLOBAL_EXPEDITED (1 << 1) -#define MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED (1 << 2) -#define MEMBARRIER_CMD_PRIVATE_EXPEDITED (1 << 3) -#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED (1 << 4) -#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE (1 << 5) -#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE (1 << 6) -#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ (1 << 7) -#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ (1 << 8) -#define MEMBARRIER_CMD_GET_REGISTRATIONS (1 << 9) - -static int membarrier(int cmd, unsigned int flags, int cpu_id) -{ - return syscall(__NR_membarrier, cmd, flags, cpu_id); -} - -static const struct { - const char *name_suffix; - int register_cmd; - int execute_cmd; -} membarrier_cmds[] = { - { "GLOBAL_EXPEDITED", MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, - MEMBARRIER_CMD_GLOBAL_EXPEDITED }, - { "PRIVATE_EXPEDITED", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, - MEMBARRIER_CMD_PRIVATE_EXPEDITED }, - { "PRIVATE_EXPEDITED_SYNC_CORE", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, - MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE }, - { "PRIVATE_EXPEDITED_RSEQ", MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, - MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ }, -}; -static const int n_membarrier_cmds = sizeof(membarrier_cmds) / sizeof(*membarrier_cmds); - -static int register_membarriers(void) -{ - int barriers_supported, barriers_registered; - bool all_ok = true; - - barriers_supported = membarrier(MEMBARRIER_CMD_QUERY, 0, 0); - if (barriers_supported < 0) { - fail("membarrier() not supported by running kernel"); - return -1; - } - - barriers_registered 
= 0; - for (int i = 0; i < n_membarrier_cmds; ++i) { - if (~barriers_supported & membarrier_cmds[i].register_cmd) - continue; - - barriers_registered |= membarrier_cmds[i].register_cmd; - - if (membarrier(membarrier_cmds[i].register_cmd, 0, 0) < 0) { - pr_perror("membarrier(REGISTER_%s)", membarrier_cmds[i].name_suffix); - all_ok = false; - } - } - - if (!all_ok) { - fail("can't register membarrier()s - tried %#x, kernel %#x", - barriers_registered, barriers_supported); - return -1; - } - - if (!barriers_registered) { - fail("no known membarrier() cmds are supported by the kernel"); - return -1; - } - - return barriers_registered; -} - -static bool check_membarriers_compat(int barriers_registered) -{ - bool all_ok = true; - - for (int i = 0; i < n_membarrier_cmds; ++i) { - if (~barriers_registered & membarrier_cmds[i].register_cmd) - continue; - if (membarrier(membarrier_cmds[i].execute_cmd, 0, 0) < 0) { - pr_perror("membarrier(%s)", membarrier_cmds[i].name_suffix); - all_ok = false; - } - } - - if (!all_ok) - fail("membarrier() check failed"); - - return all_ok; -} - -static bool check_membarriers_get_registrations(int barriers_registered) -{ - int ret = membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS, 0, 0); - if (ret < 0) { - if (errno == EINVAL) { - test_msg("membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS) not supported by running kernel"); - return true; - } - fail("membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS)"); - return false; - } - if (ret != barriers_registered) { - fail("MEMBARRIER_CMD_GET_REGISTRATIONS check failed, expected: %d, got: %d", - barriers_registered, ret); - return false; - } - - return true; -} - -static bool check_membarriers(int barriers_registered) -{ - return check_membarriers_compat(barriers_registered) && - check_membarriers_get_registrations(barriers_registered); -} - -int main(int argc, char **argv) -{ - int barriers_registered; - - test_init(argc, argv); - - barriers_registered = register_membarriers(); - if (barriers_registered < 0) - 
return 1; - - test_msg("Pre-migration membarriers check\n"); - if (!check_membarriers(barriers_registered)) - return 1; - - test_daemon(); - test_waitsig(); - - test_msg("Post-migration membarriers check\n"); - if (!check_membarriers(barriers_registered)) - return 1; - - pass(); - return 0; -} diff --git a/test/zdtm/static/memfd00.c b/test/zdtm/static/memfd00.c index 8d77ed06e..d037f6969 100644 --- a/test/zdtm/static/memfd00.c +++ b/test/zdtm/static/memfd00.c @@ -30,10 +30,8 @@ int main(int argc, char *argv[]) { int fd, fl_flags1, fl_flags2, fd_flags1, fd_flags2; struct statfs statfs1, statfs2; - struct stat stat; off_t pos1, pos2; char buf[5]; - int fmode1, fmode2; test_init(argc, argv); @@ -60,13 +58,6 @@ int main(int argc, char *argv[]) if (lseek(fd, pos1, SEEK_SET) < 0) err(1, "seek error"); - if (fchmod(fd, 0642)) - err(1, "Can't set permission bits"); - - if (fstat(fd, &stat) < 0) - err(1, "fstat() issue"); - fmode1 = stat.st_mode; - test_daemon(); test_waitsig(); @@ -94,15 +85,6 @@ int main(int argc, char *argv[]) return 1; } - if (fstat(fd, &stat) < 0) - err(1, "fstat() issue"); - fmode2 = stat.st_mode; - - if (fmode1 != fmode2) { - fail("stat.st_mode = %#o != %#o", fmode2, fmode1); - return 1; - } - pos2 = lseek(fd, 0, SEEK_CUR); if (pos1 != pos2) { fail("position differs"); diff --git a/test/zdtm/static/memfd02-hugetlb.c b/test/zdtm/static/memfd02-hugetlb.c deleted file mode 120000 index db0820633..000000000 --- a/test/zdtm/static/memfd02-hugetlb.c +++ /dev/null @@ -1 +0,0 @@ -memfd02.c \ No newline at end of file diff --git a/test/zdtm/static/memfd02-hugetlb.desc b/test/zdtm/static/memfd02-hugetlb.desc deleted file mode 100644 index f88ad828b..000000000 --- a/test/zdtm/static/memfd02-hugetlb.desc +++ /dev/null @@ -1 +0,0 @@ -{'feature': 'memfd_hugetlb'} diff --git a/test/zdtm/static/memfd02.c b/test/zdtm/static/memfd02.c index 8950e38e2..12e294921 100644 --- a/test/zdtm/static/memfd02.c +++ b/test/zdtm/static/memfd02.c @@ -13,10 +13,6 @@ #include 
"zdtmtst.h" -#ifndef MFD_HUGETLB -#define MFD_HUGETLB 4 -#endif - const char *test_doc = "memfd mmap"; const char *test_author = "Nicolas Viennot "; @@ -33,24 +29,14 @@ static int _memfd_create(const char *name, unsigned int flags) int main(int argc, char *argv[]) { -#ifdef ZDTM_HUGETLB -#define LEN (2 * (1 << 20)) /* 2MB */ -#else #define LEN 6 -#endif - - int fd, flag = 0; + int fd; void *addr_shared, *addr_private; char buf[LEN]; - dev_t dev1, dev2; test_init(argc, argv); -#ifdef ZDTM_HUGETLB - flag = MFD_HUGETLB; -#endif - - fd = _memfd_create("somename", MFD_CLOEXEC | flag); + fd = _memfd_create("somename", MFD_CLOEXEC); if (fd < 0) err(1, "Can't call memfd_create"); @@ -61,32 +47,16 @@ int main(int argc, char *argv[]) if (addr_shared == MAP_FAILED) err(1, "Can't mmap"); - dev1 = get_mapping_dev(addr_shared); - if (dev1 == (dev_t)-1) { - fail("Can't get mapping dev"); - return 1; - } - -#ifdef ZDTM_HUGETLB - strcpy(addr_shared, "write1"); -#else write(fd, "write1", LEN); -#endif addr_private = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); if (addr_private == MAP_FAILED) err(1, "Can't mmap"); - dev2 = get_mapping_dev(addr_private); - if (dev2 == (dev_t)-1) { - fail("Can't get mapping dev"); - return 1; - } - test_daemon(); test_waitsig(); - if (strncmp(addr_shared, "write1", LEN)) { + if (memcmp(addr_shared, "write1", LEN)) { fail("content mismatch (shared)"); return 1; } @@ -98,33 +68,23 @@ int main(int argc, char *argv[]) return 1; } - if (strncmp(buf, "write2", LEN)) { + if (memcmp(buf, "write2", LEN)) { fail("content mismatch (shared)"); return 1; } - if (strncmp(addr_private, "write2", LEN)) { + if (memcmp(addr_private, "write2", LEN)) { fail("content mismatch (private)"); return 1; } strcpy(addr_private, "write3"); - if (strncmp(addr_shared, "write2", LEN)) { + if (memcmp(addr_shared, "write2", LEN)) { fail("content mismatch (shared)"); return 1; } - if (dev1 != get_mapping_dev(addr_shared)) { - fail("Mapping dev mismatch"); - return 1; - } 
- - if (dev2 != get_mapping_dev(addr_private)) { - fail("Mapping dev mismatch"); - return 1; - } - pass(); return 0; diff --git a/test/zdtm/static/memfd04.c b/test/zdtm/static/memfd04.c deleted file mode 100644 index 215e949d1..000000000 --- a/test/zdtm/static/memfd04.c +++ /dev/null @@ -1,132 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "exec(memfd)"; -const char *test_author = "Michał Mirosław "; - -static int _memfd_create(const char *name, unsigned int flags) -{ - return syscall(SYS_memfd_create, name, flags); -} - -static int _execveat(int dirfd, const char *pathname, const char *const argv[], const char *const envp[], int flags) -{ - return syscall(SYS_execveat, dirfd, pathname, argv, envp, flags); -} - -static const char *const script_argv[] = { "true", NULL }; -static const char *const script_env[] = { NULL }; - -static bool test_exec_fd(int fd) -{ - int err, pid, status; - - err = fcntl(fd, F_GETFD); - if (err < 0) { - fail("fcntl(F_GETFD)"); - return false; - } - if (err) { - errno = 0; - fail("F_GETFD for the memfd returned %d but expected 0", err); - return false; - } - - pid = fork(); - if (!pid) { - _execveat(fd, "", script_argv, script_env, AT_EMPTY_PATH); - err = errno; - pr_perror("execveat()"); - _exit(err); - } - - if (pid < 0) { - fail("fork()"); - return false; - } - - while (waitpid(pid, &status, 0) != pid) { - if (errno == EINTR) - continue; - fail("waitpid(child=%d)", pid); - return false; - } - - if (status != 0) { - pr_err("child exited with status=%d\n", status); - return false; - } - - return true; -} - -static const char script[] = "#!/bin/true"; -static const size_t script_len = sizeof(script) - 1; - -int main(int argc, char *argv[]) -{ -#ifdef MEMFD05 - char path[PATH_MAX]; - char *addr_p, *addr_s; - int rofd; -#endif - int fd; - - test_init(argc, argv); - - fd = _memfd_create("somename", 0); - if (fd < 0) { - 
pr_perror("memfd_create()"); - return 1; - } - if (ftruncate(fd, script_len) == -1) { - pr_perror("ftruncate"); - return 1; - } - if (write(fd, script, script_len) != script_len) { - pr_perror("write(memfd)"); - return 1; - } -#ifdef MEMFD05 - snprintf(path, PATH_MAX - 1, "/proc/self/fd/%d", fd); - rofd = open(path, O_RDONLY); - if (rofd < 0) { - pr_perror("unable to open read-only memfd"); - return 1; - } - addr_p = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE, rofd, 0); - if (addr_p == MAP_FAILED) { - pr_perror("mmap"); - return 1; - } - addr_s = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); - if (addr_s == MAP_FAILED) { - pr_perror("mmap"); - return 1; - } -#endif - - if (!test_exec_fd(fd)) - return 1; - - test_msg("execveat(memfd) succeeded before C/R.\n"); - - test_daemon(); - test_waitsig(); - - if (!test_exec_fd(fd)) - return 1; - - pass(); - - return 0; -} diff --git a/test/zdtm/static/memfd04.desc b/test/zdtm/static/memfd04.desc deleted file mode 100644 index bbf136d14..000000000 --- a/test/zdtm/static/memfd04.desc +++ /dev/null @@ -1 +0,0 @@ -{'deps': ['/bin/true']} diff --git a/test/zdtm/static/memfd05.c b/test/zdtm/static/memfd05.c deleted file mode 120000 index 6caa9556f..000000000 --- a/test/zdtm/static/memfd05.c +++ /dev/null @@ -1 +0,0 @@ -memfd04.c \ No newline at end of file diff --git a/test/zdtm/static/memfd05.desc b/test/zdtm/static/memfd05.desc deleted file mode 120000 index 1b4963572..000000000 --- a/test/zdtm/static/memfd05.desc +++ /dev/null @@ -1 +0,0 @@ -memfd04.desc \ No newline at end of file diff --git a/test/zdtm/static/mnt_ext_collision.c b/test/zdtm/static/mnt_ext_collision.c deleted file mode 100644 index 8bd085c6a..000000000 --- a/test/zdtm/static/mnt_ext_collision.c +++ /dev/null @@ -1,194 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zdtmtst.h" -#include "lock.h" - -const char *test_doc = "Check external mount mountpoint 
collide with different mount in nested mntns"; -const char *test_author = "Pavel Tikhomirov "; - -char *dirname = "mnt_ext_collision.test"; -TEST_OPTION(dirname, string, "directory name", 1); - -char *source = "zdtm_ext_collision"; -char *source2 = "zdtm_ext_collision_2"; - -enum { - TEST_INIT = 0, - TEST_CHILD, - TEST_CHECK, - TEST_EXIT, - EMERGENCY_ABORT, -}; - -futex_t *futex; - -#define BUF_SIZE 4096 - -static int child(void) -{ - char dst[PATH_MAX], dst_file[PATH_MAX]; - int fd; - - if (unshare(CLONE_NEWNS)) { - pr_perror("unshare"); - goto err; - } - - /* - * Umount external mount copy - */ - sprintf(dst, "/%s/dst", dirname); - if (umount(dst)) { - pr_perror("umount"); - goto err; - } - - /* - * Mount tmpfs in its place - */ - if (mount(source2, dst, "tmpfs", 0, NULL)) { - pr_perror("mount tmpfs"); - goto err; - } - - sprintf(dst_file, "/%s/dst/file", dirname); - fd = open(dst_file, O_RDWR | O_CREAT | O_EXCL, 0666); - if (fd < 0) { - pr_perror("open"); - goto err; - } - close(fd); - - futex_set_and_wake(futex, TEST_CHILD); - futex_wait_while_lt(futex, TEST_CHECK); - - if (access(dst_file, F_OK)) { - pr_perror("access"); - goto err; - } - - futex_set_and_wake(futex, TEST_EXIT); - return 0; -err: - futex_set_and_wake(futex, EMERGENCY_ABORT); - return 1; -} - -int main(int argc, char **argv) -{ - char *root, testdir[PATH_MAX]; - char lckd[PATH_MAX], dst[PATH_MAX]; - char *tmp = "/tmp/zdtm_ext_collision.tmp"; - char *zdtm_newns = getenv("ZDTM_NEWNS"); - int pid; - - root = getenv("ZDTM_ROOT"); - if (root == NULL) { - pr_perror("root"); - return 1; - } - - if (!zdtm_newns) { - pr_perror("ZDTM_NEWNS is not set"); - return 1; - } else if (strcmp(zdtm_newns, "1")) { - goto test; - } - - /* Prepare directories in test root */ - sprintf(testdir, "%s/%s", root, dirname); - mkdir(testdir, 0755); - - sprintf(lckd, "%s/%s/lckd", root, dirname); - mkdir(lckd, 0755); - sprintf(dst, "%s/%s/dst", root, dirname); - mkdir(dst, 0755); - - /* Prepare mount in criu root */ - 
mkdir(tmp, 0755); - if (mount(source, tmp, "tmpfs", 0, NULL)) { - pr_perror("mount tmpfs"); - return 1; - } - if (mount(NULL, tmp, NULL, MS_PRIVATE, NULL)) { - pr_perror("make private"); - return 1; - } - - /* - * Create temporary mntns, next mounts will not show up in criu mntns - */ - if (unshare(CLONE_NEWNS)) { - pr_perror("unshare"); - return 1; - } - - /* - * Populate external mount to the tests mntns root - * (in uns flavour this would become locked) - */ - if (mount(tmp, lckd, NULL, MS_BIND, NULL)) { - pr_perror("bind"); - return 1; - } -test: - test_init(argc, argv); - - /* - * Hack to create unlocked external mount without pivot_root+bind thing - */ - sprintf(lckd, "/%s/lckd", dirname); - sprintf(dst, "/%s/dst", dirname); - if (mount(lckd, dst, NULL, MS_BIND, NULL)) { - pr_perror("bind"); - return 1; - } - - /* - * Setup futex for processes synchronization - */ - futex = mmap(NULL, sizeof(futex), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); - if (futex == MAP_FAILED) { - pr_perror("mmap"); - return 1; - } - futex_init(futex); - - /* - * Fork child which would have nested mntns - */ - pid = fork(); - if (pid < 0) { - pr_perror("fork"); - return 1; - } else if (pid == 0) { - exit(child()); - } - - futex_wait_while_lt(futex, TEST_CHILD); - if (futex_get(futex) == EMERGENCY_ABORT) { - pr_err("Fail in child\n"); - return 1; - } - - test_daemon(); - test_waitsig(); - - futex_set_and_wake(futex, TEST_CHECK); - futex_wait_while_lt(futex, TEST_EXIT); - if (futex_get(futex) == EMERGENCY_ABORT) { - fail("Fail in child on check stage"); - return 1; - } - - waitpid(pid, NULL, 0); - pass(); - return 0; -} diff --git a/test/zdtm/static/mnt_ext_collision.desc b/test/zdtm/static/mnt_ext_collision.desc deleted file mode 100644 index 9b68a4ae0..000000000 --- a/test/zdtm/static/mnt_ext_collision.desc +++ /dev/null @@ -1,5 +0,0 @@ -{ 'dopts': '--external mnt[/mnt_ext_collision.test/dst]:ZDTM', - 'feature': 'mnt_id', - 'flavor': 'ns uns', - 'flags': 'suid', - 
'ropts': '--external mnt[ZDTM]:/tmp/zdtm_ext_collision.tmp'} diff --git a/test/zdtm/static/mnt_ext_collision.hook b/test/zdtm/static/mnt_ext_collision.hook deleted file mode 100755 index 31c908d67..000000000 --- a/test/zdtm/static/mnt_ext_collision.hook +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -[ "$1" == "--clean" ] || exit 0 - -TMP="/tmp/zdtm_ext_collision.tmp" -echo "Cleanup mnt_ext_collision" -umount "$TMP" -rm -rf $TMP - -rm -rf "mnt_ext_collision.test" - -exit 0 diff --git a/test/zdtm/static/mnt_ext_file_bind_auto.c b/test/zdtm/static/mnt_ext_file_bind_auto.c deleted file mode 100644 index 0c3b9f5fb..000000000 --- a/test/zdtm/static/mnt_ext_file_bind_auto.c +++ /dev/null @@ -1,104 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Check if external file mount works"; -const char *test_author = "Pavel Tikhomirov "; - -char *filename = "mnt_ext_file_bind_auto_bind_auto.file"; -TEST_OPTION(filename, string, "file name", 1); - -char *source = "mnt_ext_file_bind_auto_bind_auto.source"; - -int create_file(const char *path) -{ - int fd; - - fd = open(path, O_CREAT | O_RDWR, 0644); - if (fd < 0) { - pr_perror("open"); - return -1; - } - - close(fd); - return 0; -} - -int main(int argc, char **argv) -{ - char *zdtm_newns = getenv("ZDTM_NEWNS"); - char *tmp = "/tmp/zdtm_ext_file_bind_auto.tmp"; - char *sourcefile = "/tmp/zdtm_ext_file_bind_auto.file"; - char *root, tmpfile[PATH_MAX], testfile[PATH_MAX]; - - root = getenv("ZDTM_ROOT"); - if (root == NULL) { - pr_perror("root"); - return 1; - } - - if (!zdtm_newns) { - pr_perror("ZDTM_NEWNS is not set"); - return 1; - } else if (strcmp(zdtm_newns, "1")) { - goto test; - } - - /* Prepare file bindmount in criu root (source for external file bindmount) */ - mkdir(tmp, 0755); - if (mount(source, tmp, "tmpfs", 0, NULL)) { - pr_perror("mount tmpfs"); - return 1; - } - if (mount(NULL, tmp, NULL, MS_PRIVATE, NULL)) { - pr_perror("make private"); - 
return 1; - } - - sprintf(tmpfile, "%s/%s", tmp, filename); - if (create_file(tmpfile)) - return 1; - - if (create_file(sourcefile)) - return 1; - - if (mount(tmpfile, sourcefile, NULL, MS_BIND, NULL)) { - pr_perror("bind"); - return 1; - } - - umount2(tmp, MNT_DETACH); - - /* Prepare file in test root (mount point for external file bindmount) */ - sprintf(testfile, "%s/%s", root, filename); - if (create_file(testfile)) - return 1; - - /* - * Create temporary mntns, next mounts will not show up in criu mntns - * and will be inherited into test mntns - */ - if (unshare(CLONE_NEWNS)) { - pr_perror("unshare"); - return 1; - } - - if (mount(sourcefile, testfile, NULL, MS_BIND, NULL)) { - pr_perror("bind"); - return 1; - } -test: - test_init(argc, argv); - - test_daemon(); - test_waitsig(); - - pass(); - return 0; -} diff --git a/test/zdtm/static/mnt_ext_file_bind_auto.desc b/test/zdtm/static/mnt_ext_file_bind_auto.desc deleted file mode 100644 index 825b08127..000000000 --- a/test/zdtm/static/mnt_ext_file_bind_auto.desc +++ /dev/null @@ -1,4 +0,0 @@ -{ 'opts': '--external mnt[]', - 'feature': 'mnt_id', - 'flavor': 'ns uns', - 'flags': 'suid'} diff --git a/test/zdtm/static/mnt_ext_master.c b/test/zdtm/static/mnt_ext_master.c index dbadef0b8..5fd8fa8b2 100644 --- a/test/zdtm/static/mnt_ext_master.c +++ b/test/zdtm/static/mnt_ext_master.c @@ -39,10 +39,6 @@ int main(int argc, char **argv) pr_perror("mount"); return 1; } - if (mount(NULL, dname, NULL, MS_SHARED, NULL)) { - pr_perror("shared"); - return 1; - } mkdir(src, 755); mkdir(dst, 755); diff --git a/test/zdtm/static/mnt_ext_multiple.c b/test/zdtm/static/mnt_ext_multiple.c deleted file mode 100644 index 7014927ac..000000000 --- a/test/zdtm/static/mnt_ext_multiple.c +++ /dev/null @@ -1,118 +0,0 @@ -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Check multiple non-common root external mounts with same external master"; -const char *test_author = "Pavel Tikhomirov "; - -char 
*dirname = "mnt_ext_multiple.test"; -char *source = "zdtm_ext_multiple"; -char *ext_source = "zdtm_ext_multiple.ext"; -TEST_OPTION(dirname, string, "directory name", 1); - -int main(int argc, char **argv) -{ - char *root, testdir[PATH_MAX]; - char dst_a[PATH_MAX], dst_b[PATH_MAX]; - char src[PATH_MAX], src_a[PATH_MAX], src_b[PATH_MAX]; - char nsdst_a[PATH_MAX], nsdst_b[PATH_MAX]; - char *tmp = "/tmp/zdtm_ext_multiple.tmp"; - char *zdtm_newns = getenv("ZDTM_NEWNS"); - - root = getenv("ZDTM_ROOT"); - if (root == NULL) { - pr_perror("root"); - return 1; - } - - if (!zdtm_newns) { - pr_perror("ZDTM_NEWNS is not set"); - return 1; - } else if (strcmp(zdtm_newns, "1")) { - goto test; - } - - /* Prepare directories in test root */ - sprintf(testdir, "%s/%s", root, dirname); - mkdir(testdir, 0755); - sprintf(dst_a, "%s/%s/dst_a", root, dirname); - mkdir(dst_a, 0755); - sprintf(dst_b, "%s/%s/dst_b", root, dirname); - mkdir(dst_b, 0755); - - /* Prepare directories in criu root */ - mkdir(tmp, 0755); - if (mount(source, tmp, "tmpfs", 0, NULL)) { - pr_perror("mount tmpfs"); - return 1; - } - if (mount(NULL, tmp, NULL, MS_PRIVATE, NULL)) { - pr_perror("make private"); - return 1; - } - sprintf(src, "%s/src", tmp); - mkdir(src, 0755); - - /* Create a shared mount in criu mntns */ - if (mount(ext_source, src, "tmpfs", 0, NULL)) { - pr_perror("mount tmpfs"); - return 1; - } - if (mount(NULL, src, NULL, MS_PRIVATE, NULL)) { - pr_perror("make private"); - return 1; - } - if (mount(NULL, src, NULL, MS_SHARED, NULL)) { - pr_perror("make shared"); - return 1; - } - - /* - * Create temporary mntns, next mounts will not show up in criu mntns - */ - if (unshare(CLONE_NEWNS)) { - pr_perror("unshare"); - return 1; - } - - /* - * Populate to the tests root subdirectories of the src mount - */ - sprintf(src_a, "%s/src/a", tmp); - mkdir(src_a, 0755); - if (mount(src_a, dst_a, NULL, MS_BIND, NULL)) { - pr_perror("bind"); - return 1; - } - sprintf(src_b, "%s/src/b", tmp); - mkdir(src_b, 0755); - 
if (mount(src_b, dst_b, NULL, MS_BIND, NULL)) { - pr_perror("bind"); - return 1; - } - -test: - test_init(argc, argv); - - /* Make "external" mounts to have external master */ - sprintf(nsdst_a, "/%s/dst_a", dirname); - if (mount(NULL, nsdst_a, NULL, MS_SLAVE, NULL)) { - pr_perror("make slave"); - return 1; - } - sprintf(nsdst_b, "/%s/dst_b", dirname); - if (mount(NULL, nsdst_b, NULL, MS_SLAVE, NULL)) { - pr_perror("make slave"); - return 1; - } - - test_daemon(); - test_waitsig(); - - pass(); - - return 0; -} diff --git a/test/zdtm/static/mnt_ext_multiple.desc b/test/zdtm/static/mnt_ext_multiple.desc deleted file mode 100644 index fd413ed15..000000000 --- a/test/zdtm/static/mnt_ext_multiple.desc +++ /dev/null @@ -1,5 +0,0 @@ -{ 'dopts': '--external mnt[/mnt_ext_multiple.test/dst_a]:MNT_A --external mnt[/mnt_ext_multiple.test/dst_b]:MNT_B', - 'feature': 'mnt_id move_mount_set_group', - 'flavor': 'ns uns', - 'flags': 'suid', - 'ropts': '--external mnt[MNT_A]:/tmp/zdtm_ext_multiple.tmp/src/a --external mnt[MNT_B]:/tmp/zdtm_ext_multiple.tmp/src/b --no-mntns-compat-mode'} diff --git a/test/zdtm/static/mnt_ext_root.c b/test/zdtm/static/mnt_ext_root.c deleted file mode 100644 index 9f9554d51..000000000 --- a/test/zdtm/static/mnt_ext_root.c +++ /dev/null @@ -1,88 +0,0 @@ -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Check root external mount with \"deepper\" bind"; -const char *test_author = "Pavel Tikhomirov "; - -char *source = "zdtm_ext_root"; -char *dirname = "mnt_ext_root.test"; -TEST_OPTION(dirname, string, "directory name", 1); - -#define BUF_SIZE 4096 - -int main(int argc, char **argv) -{ - char *root, testdir[PATH_MAX]; - char dst[PATH_MAX], deep_bind[PATH_MAX]; - char *tmp = "/tmp/zdtm_ext_root.tmp"; - char *zdtm_newns = getenv("ZDTM_NEWNS"); - - root = getenv("ZDTM_ROOT"); - if (root == NULL) { - pr_perror("root"); - return 1; - } - - if (!zdtm_newns) { - pr_perror("ZDTM_NEWNS is not set"); - return 1; - } else if 
(strcmp(zdtm_newns, "1")) { - goto test; - } - - /* Prepare directories in test root */ - sprintf(testdir, "%s/%s", root, dirname); - mkdir(testdir, 0755); - - sprintf(dst, "%s/%s/dst", root, dirname); - mkdir(dst, 0755); - sprintf(deep_bind, "%s/%s/deep", root, dirname); - mkdir(deep_bind, 0755); - sprintf(deep_bind, "%s/%s/deep/bind", root, dirname); - mkdir(deep_bind, 0755); - - /* Prepare mount in criu root */ - mkdir(tmp, 0755); - if (mount(source, tmp, "tmpfs", 0, NULL)) { - pr_perror("mount tmpfs"); - return 1; - } - if (mount(NULL, tmp, NULL, MS_PRIVATE, NULL)) { - pr_perror("make private"); - return 1; - } - - /* - * Create temporary mntns, next mounts will not show up in criu mntns - */ - if (unshare(CLONE_NEWNS)) { - pr_perror("unshare"); - return 1; - } - - /* - * Populate to the tests mntns root mounts - */ - if (mount(tmp, dst, NULL, MS_BIND, NULL)) { - pr_perror("bind"); - return 1; - } - - if (mount(tmp, deep_bind, NULL, MS_BIND, NULL)) { - pr_perror("bind"); - return 1; - } - -test: - test_init(argc, argv); - - test_daemon(); - test_waitsig(); - - pass(); - return 0; -} diff --git a/test/zdtm/static/mnt_ext_root.desc b/test/zdtm/static/mnt_ext_root.desc deleted file mode 100644 index 121dbb0a7..000000000 --- a/test/zdtm/static/mnt_ext_root.desc +++ /dev/null @@ -1,5 +0,0 @@ -{ 'dopts': '--external mnt[/mnt_ext_root.test/dst]:ZDTM', - 'feature': 'mnt_id', - 'flavor': 'ns uns', - 'flags': 'suid', - 'ropts': '--external mnt[ZDTM]:/tmp/zdtm_ext_root.tmp'} diff --git a/test/zdtm/static/mnt_ext_root.hook b/test/zdtm/static/mnt_ext_root.hook deleted file mode 100755 index c022f91ad..000000000 --- a/test/zdtm/static/mnt_ext_root.hook +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -[ "$1" == "--clean" ] || exit 0 - -TMP="/tmp/zdtm_ext_root.tmp" -echo "Cleanup mnt_ext_sharing" -umount "$TMP" -rm -rf $TMP - -rm -rf "mnt_ext_root.test" - -exit 0 diff --git a/test/zdtm/static/mnt_ext_sharing.c b/test/zdtm/static/mnt_ext_sharing.c deleted file mode 100644 index 
b562b5716..000000000 --- a/test/zdtm/static/mnt_ext_sharing.c +++ /dev/null @@ -1,236 +0,0 @@ -#include -#include -#include -#include -#include - -#include "zdtmtst.h" -#include "lock.h" - -const char *test_doc = "Check sharing vs external mounts vs mntns"; -const char *test_author = "Pavel Tikhomirov "; - -char *dirname = "mnt_ext_sharing.test"; -char *source = "zdtm_ext_sharing"; -char *internal_source = "zdtm_ext_sharing.internal"; -#define SUBDIR "subdir" -TEST_OPTION(dirname, string, "directory name", 1); - -enum { - TEST_START, - TEST_STARTED, - TEST_EXIT, - TEST_EXITED, -}; - -struct shared { - futex_t fstate; - int ret; -}; - -struct shared *sh; - -#define BUF_SIZE 4096 - -int pid_mntinfo_get_shid(char *pid, char *source) -{ - char path[PATH_MAX], line[BUF_SIZE]; - FILE *mountinfo; - char *hyphen, *shared; - int ret = -1; - - sprintf(path, "/proc/%s/mountinfo", pid); - mountinfo = fopen(path, "r"); - if (!mountinfo) { - pr_perror("fopen"); - return ret; - } - - while (fgets(line, sizeof(line), mountinfo)) { - hyphen = strchr(line, '-'); - if (!hyphen) { - pr_perror("no hyphen in mountinfo"); - break; - } - - if (!strstr(hyphen + 1, source)) - continue; - - shared = strstr(line, "shared:"); - if (!shared) { - pr_err("no shared id\n"); - break; - } - - ret = atoi(shared + 7); - break; - } - - fclose(mountinfo); - return ret; -} - -int secondary_mntns_child(void) -{ - if (unshare(CLONE_NEWNS)) { - pr_perror("unshare"); - sh->ret = 1; - futex_abort_and_wake(&sh->fstate); - return 1; - } - futex_set_and_wake(&sh->fstate, TEST_STARTED); - futex_wait_until(&sh->fstate, TEST_EXIT); - /* These task is just holding the reference to secondary mntns */ - futex_set_and_wake(&sh->fstate, TEST_EXITED); - return 0; -} - -int main(int argc, char **argv) -{ - char *root, testdir[PATH_MAX], spid[BUF_SIZE]; - char internal_dst[PATH_MAX], internal_src[PATH_MAX], internal_nsdst[PATH_MAX]; - int internal_shid_self = -1, internal_shid_pid = -1; - char *tmp = 
"/tmp/zdtm_ext_sharing.tmp"; - char *zdtm_newns = getenv("ZDTM_NEWNS"); - int pid, status; - - root = getenv("ZDTM_ROOT"); - if (root == NULL) { - pr_perror("root"); - return 1; - } - - if (!zdtm_newns) { - pr_perror("ZDTM_NEWNS is not set"); - return 1; - } else if (strcmp(zdtm_newns, "1")) { - goto test; - } - - /* Prepare directories in test root */ - sprintf(testdir, "%s/%s", root, dirname); - mkdir(testdir, 0755); - - sprintf(internal_dst, "%s/%s/internal", root, dirname); - mkdir(internal_dst, 0755); - - /* Prepare directories in criu root */ - mkdir(tmp, 0755); - if (mount(source, tmp, "tmpfs", 0, NULL)) { - pr_perror("mount tmpfs"); - return 1; - } - if (mount(NULL, tmp, NULL, MS_PRIVATE, NULL)) { - pr_perror("make private"); - return 1; - } - - sprintf(internal_src, "%s/internal", tmp); - mkdir(internal_src, 0755); - - /* Create a shared mount in criu mntns */ - if (mount(internal_source, internal_src, "tmpfs", 0, NULL)) { - pr_perror("mount tmpfs"); - return 1; - } - if (mount(NULL, internal_src, NULL, MS_PRIVATE, NULL)) { - pr_perror("make private"); - return 1; - } - - if (mount(NULL, internal_src, NULL, MS_SHARED, NULL)) { - pr_perror("make shared"); - return 1; - } - - /* - * Create temporary mntns, next mounts will not show up in criu mntns - */ - if (unshare(CLONE_NEWNS)) { - pr_perror("unshare"); - return 1; - } - - /* - * Populate to the tests root only a subdirectory of the internal_src - * mount to ensure that it will be restored as an external mount. 
- */ - sprintf(internal_src, "%s/internal/%s", tmp, SUBDIR); - mkdir(internal_src, 0755); - if (mount(internal_src, internal_dst, NULL, MS_BIND, NULL)) { - pr_perror("bind"); - return 1; - } - -test: - test_init(argc, argv); - - sh = mmap(NULL, sizeof(struct shared), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); - if (sh == MAP_FAILED) { - pr_perror("Failed to alloc shared region"); - exit(1); - } - - futex_set(&sh->fstate, TEST_START); - sh->ret = 0; - - sprintf(internal_nsdst, "/%s/internal", dirname); - /* Make "external" mount to have internal sharing */ - if (mount(NULL, internal_nsdst, NULL, MS_PRIVATE, NULL)) { - pr_perror("make shared"); - return 1; - } - - if (mount(NULL, internal_nsdst, NULL, MS_SHARED, NULL)) { - pr_perror("make shared"); - return 1; - } - - /* Create secondary mntns copying all mounts */ - pid = fork(); - if (pid < 0) { - pr_perror("fork"); - return 1; - } else if (pid == 0) { - exit(secondary_mntns_child()); - } - - futex_wait_until(&sh->fstate, TEST_STARTED); - if (sh->ret != 0) { - pr_err("error in child\n"); - return 1; - } - - test_daemon(); - test_waitsig(); - - /* - * Check mounts in primary and secondary - * mntnses are shared to each other. 
- */ - sprintf(spid, "%d", pid); - internal_shid_pid = pid_mntinfo_get_shid(spid, internal_source); - internal_shid_self = pid_mntinfo_get_shid("self", internal_source); - - /* Cleanup */ - futex_set_and_wake(&sh->fstate, TEST_EXIT); - futex_wait_until(&sh->fstate, TEST_EXITED); - - while (wait(&status) > 0) { - if (!WIFEXITED(status) || WEXITSTATUS(status)) { - fail("Wrong exit status: %d", status); - return 1; - } - } - - if (internal_shid_pid == -1 || internal_shid_self == -1 || internal_shid_pid != internal_shid_self) { - fail("Shared ids does not match (internal)"); - return 1; - } - - /* Print shared id so that it can be checked in cleanup hook */ - test_msg("internal_shared_id = %d\n", internal_shid_pid); - pass(); - - return 0; -} diff --git a/test/zdtm/static/mnt_ext_sharing.desc b/test/zdtm/static/mnt_ext_sharing.desc deleted file mode 100644 index d72505837..000000000 --- a/test/zdtm/static/mnt_ext_sharing.desc +++ /dev/null @@ -1,5 +0,0 @@ -{ 'dopts': '--external mnt[/mnt_ext_sharing.test/internal]:ZDTM', - 'feature': 'mnt_id move_mount_set_group', - 'flavor': 'ns uns', - 'flags': 'suid', - 'ropts': '--external mnt[ZDTM]:/tmp/zdtm_ext_sharing.tmp/internal/subdir --no-mntns-compat-mode'} diff --git a/test/zdtm/static/mnt_ext_sharing.hook b/test/zdtm/static/mnt_ext_sharing.hook deleted file mode 100755 index 82443b632..000000000 --- a/test/zdtm/static/mnt_ext_sharing.hook +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -[ "$1" == "--clean" ] || exit 0 - -function err { - echo "$1" - exit 1 -} - -# Check shared ids don't match in criu mntns and test mntns -OUT="zdtm/static/mnt_ext_sharing.out" -[ -f "$OUT" ] || err "No $OUT file" - -SHID=$(cat $OUT | grep internal_shared_id | awk '{print $5}') -[ -z "$SHID" ] && err "Failed to get shared id from file" -MATCH=$(cat /proc/self/mountinfo | grep "\.* - tmpfs zdtm_ext_sharing.internal") -[ -z "$MATCH" ] || err "Can lookup internal shared id $SHID in criu mntns" - -TMP="/tmp/zdtm_ext_sharing.tmp" -echo "Cleanup 
mnt_ext_sharing" -umount "$TMP/internal" -umount "$TMP" -rm -rf $TMP - -rm -rf "mnt_ext_sharing.test" - -exit 0 diff --git a/test/zdtm/static/mnt_ro_root.c b/test/zdtm/static/mnt_ro_root.c deleted file mode 100644 index 2d8370150..000000000 --- a/test/zdtm/static/mnt_ro_root.c +++ /dev/null @@ -1,32 +0,0 @@ -#include - -#include "zdtmtst.h" - -const char *test_doc = "Check if root mount remains read-only after c/r"; -const char *test_author = "Pavel Tikhomirov "; - -char *dirname; -TEST_OPTION(dirname, string, "directory name", 1); - -int main(int argc, char **argv) -{ - test_init(argc, argv); - - if (mount(NULL, "/", NULL, MS_REMOUNT | MS_RDONLY | MS_BIND, NULL)) { - pr_perror("mount"); - return 1; - } - - test_daemon(); - test_waitsig(); - - /* - * Note: In zdtm.py:check_visible_state() we already check for all - * tests, that all mounts in the test's mount namespace remain the - * same, by comparing mountinfo before and after c/r. So rw/ro mount - * option inconsistency will be detected there and we don't need to - * check it in the test itself. 
- */ - pass(); - return 0; -} diff --git a/test/zdtm/static/mnt_ro_root.desc b/test/zdtm/static/mnt_ro_root.desc deleted file mode 100644 index c9a8e4f18..000000000 --- a/test/zdtm/static/mnt_ro_root.desc +++ /dev/null @@ -1,6 +0,0 @@ -{ - 'flavor': 'ns uns', - 'flags': 'suid', - 'feature': 'mnt_id', - 'bind': 'zdtm/static', -} diff --git a/test/zdtm/static/mnt_root_ext.c b/test/zdtm/static/mnt_root_ext.c deleted file mode 100644 index 305e87262..000000000 --- a/test/zdtm/static/mnt_root_ext.c +++ /dev/null @@ -1,87 +0,0 @@ -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Check external mount from host's rootfs"; -const char *test_author = "Pavel Tikhomirov "; - -char *dirname = "mnt_root_ext.test"; -TEST_OPTION(dirname, string, "directory name", 1); - -int main(int argc, char **argv) -{ - char *root, testdir[PATH_MAX], nstestdir[PATH_MAX]; - char *zdtm_newns = getenv("ZDTM_NEWNS"); - char tmp[] = "/.zdtm_root_ext.tmp"; - - root = getenv("ZDTM_ROOT"); - if (root == NULL) { - pr_perror("root"); - return 1; - } - - if (!zdtm_newns) { - pr_perror("ZDTM_NEWNS is not set"); - return 1; - } else if (strcmp(zdtm_newns, "1")) { - goto test; - } - - /* Prepare directories in test root */ - sprintf(testdir, "%s/%s", root, dirname); - mkdir(testdir, 0755); - - /* Prepare directories in criu root */ - mkdir(tmp, 0755); - - /* Make criu's mntns root mount shared */ - if (mount(NULL, "/", NULL, MS_SHARED, NULL)) { - pr_perror("make shared"); - return 1; - } - - /* - * Create temporary mntns, next mounts will not show up in criu mntns - */ - if (unshare(CLONE_NEWNS)) { - pr_perror("unshare"); - return 1; - } - - /* - * Make mounts in temporary mntns slave, to prevent propagation to criu mntns - */ - if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL)) { - pr_perror("make rslave"); - return 1; - } - - /* - * Populate to the tests root host's rootfs subdir - */ - if (mount(tmp, testdir, NULL, MS_BIND, NULL)) { - pr_perror("bind"); - return 
1; - } -test: - test_init(argc, argv); - - /* - * Make "external" mount to be slave - */ - sprintf(nstestdir, "/%s", dirname); - if (mount(NULL, nstestdir, NULL, MS_SLAVE, NULL)) { - pr_perror("make slave"); - return 1; - } - - test_daemon(); - test_waitsig(); - - pass(); - - return 0; -} diff --git a/test/zdtm/static/mnt_root_ext.desc b/test/zdtm/static/mnt_root_ext.desc deleted file mode 100644 index fee7efbae..000000000 --- a/test/zdtm/static/mnt_root_ext.desc +++ /dev/null @@ -1,5 +0,0 @@ -{ 'dopts': '--external mnt[/mnt_root_ext.test]:MNT', - 'feature': 'mnt_id move_mount_set_group', - 'flavor': 'ns uns', - 'flags': 'suid', - 'ropts': '--external mnt[MNT]:.zdtm_root_ext.tmp --no-mntns-compat-mode'} diff --git a/test/zdtm/static/mnt_root_ext.hook b/test/zdtm/static/mnt_root_ext.hook deleted file mode 100755 index a5286f208..000000000 --- a/test/zdtm/static/mnt_root_ext.hook +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -[ "$1" == "--clean" ] || exit 0 - -rmdir /.zdtm_root_ext.tmp diff --git a/test/zdtm/static/mnt_tracefs.desc b/test/zdtm/static/mnt_tracefs.desc index 7cf98034a..e90ea9415 100644 --- a/test/zdtm/static/mnt_tracefs.desc +++ b/test/zdtm/static/mnt_tracefs.desc @@ -1,4 +1,3 @@ { 'feature': 'mnt_id', 'flavor': 'uns', - 'opts': '--ext-mount-map auto --enable-external-masters', - 'ropts': '--mntns-compat-mode'} + 'opts': '--ext-mount-map auto --enable-external-masters'} diff --git a/compel/arch/riscv64/src/lib/include/cpu.h b/test/zdtm/static/mntns-deleted-dst similarity index 100% rename from compel/arch/riscv64/src/lib/include/cpu.h rename to test/zdtm/static/mntns-deleted-dst diff --git a/test/zdtm/static/mntns_ghost01.c b/test/zdtm/static/mntns_ghost01.c index 2cc2270dd..20397d543 100644 --- a/test/zdtm/static/mntns_ghost01.c +++ b/test/zdtm/static/mntns_ghost01.c @@ -6,7 +6,6 @@ #include #include #include -#include #include "zdtmtst.h" @@ -90,13 +89,6 @@ int main(int argc, char **argv) return 1; } - fd = open(ghost_path, O_CREAT | O_WRONLY, 0600); 
- if (fd >= 0 || errno != EROFS) { - pr_perror("open for write on rofs -> %d", fd); - close(fd); - return 1; - } - return 0; } diff --git a/test/zdtm/static/mntns_open.c b/test/zdtm/static/mntns_open.c index 0430f5b99..7d8bbbaa4 100644 --- a/test/zdtm/static/mntns_open.c +++ b/test/zdtm/static/mntns_open.c @@ -17,7 +17,7 @@ #define CLONE_NEWNS 0x00020000 #endif -const char *test_doc = "Check that mnt_id is respected"; +const char *test_doc = "Check that mnt_id is repsected"; const char *test_author = "Pavel Emelianov "; #define MPTS_FILE "F" diff --git a/test/zdtm/static/mntns_pivot_root.c b/test/zdtm/static/mntns_pivot_root.c deleted file mode 100644 index 95af54cde..000000000 --- a/test/zdtm/static/mntns_pivot_root.c +++ /dev/null @@ -1,181 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zdtmtst.h" -#include "lock.h" - -const char *test_doc = "Check nested mntns with different root"; -const char *test_author = "Pavel Tikhomirov "; - -char *dirname = "mntns_pivot_root.test"; -TEST_OPTION(dirname, string, "directory name", 1); - -char *source = "mntns_pivot_root"; - -enum { - TEST_INIT = 0, - TEST_CHILD, - TEST_CHECK, - TEST_EXIT, - EMERGENCY_ABORT, -}; - -futex_t *futex; - -static int sys_pivot_root(const char *new_root, const char *put_old) -{ - return syscall(SYS_pivot_root, new_root, put_old); -} - -#define BUF_SIZE 4096 - -static int child(void) -{ - char *put_root = "put_root"; - char *testfile = "testfile"; - int fd; - - if (unshare(CLONE_NEWNS)) { - pr_perror("unshare"); - goto err; - } - /* - * Setup new root - */ - mkdir(dirname, 0755); - - if (mount(source, dirname, "tmpfs", 0, NULL)) { - pr_perror("mount tmpfs"); - goto err; - } - - if (mount(NULL, dirname, NULL, MS_PRIVATE, NULL)) { - pr_perror("make private"); - goto err; - } - - if (chdir(dirname)) { - pr_perror("chdir"); - goto err; - } - - mkdir(put_root, 0755); - - if (sys_pivot_root(".", put_root)) { - 
pr_perror("pivot_root"); - goto err; - } - - if (umount2(put_root, MNT_DETACH)) { - pr_perror("umount2"); - goto err; - } - - fd = open(testfile, O_RDWR | O_CREAT | O_EXCL, 0666); - if (fd < 0) { - pr_perror("open"); - goto err; - } - close(fd); - -#ifdef MNTNS_PIVOT_ROOT_RO - /* - * Hack to make cr_pivot_root work on readonly mntns root, - * normally nested containers have /tmp directory - */ - mkdir("tmp", 0755); - /* - * Make superblock readonly - */ - if (mount(NULL, "/", NULL, MS_REMOUNT | MS_RDONLY, NULL)) { - pr_perror("remount_ro"); - goto err; - } -#endif - - futex_set_and_wake(futex, TEST_CHILD); - futex_wait_while_lt(futex, TEST_CHECK); - - if (access(testfile, F_OK)) { - pr_perror("access"); - goto err; - } - -#ifdef MNTNS_PIVOT_ROOT_RO - /* - * Check superblock readonly - */ - fd = open(testfile, O_WRONLY); - if (fd >= 0) { - pr_err("Open on readonly superblock should fail\n"); - close(fd); - goto err; - } else if (errno != EROFS) { - pr_perror("open write"); - goto err; - } -#endif - - futex_set_and_wake(futex, TEST_EXIT); - return 0; -err: - futex_set_and_wake(futex, EMERGENCY_ABORT); - return 1; -} - -int main(int argc, char **argv) -{ - int pid; - - test_init(argc, argv); - - /* - * Setup futex for processes synchronization - */ - futex = mmap(NULL, sizeof(futex), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); - if (futex == MAP_FAILED) { - pr_perror("mmap"); - return 1; - } - futex_init(futex); - - /* - * Fork child which would have nested mntns - */ - pid = fork(); - if (pid < 0) { - pr_perror("fork"); - return 1; - } else if (pid == 0) { - exit(child()); - } - - futex_wait_while_lt(futex, TEST_CHILD); - if (futex_get(futex) == EMERGENCY_ABORT) { - pr_err("Fail in child\n"); - return 1; - } - - test_daemon(); - test_waitsig(); - - futex_set_and_wake(futex, TEST_CHECK); - futex_wait_while_lt(futex, TEST_EXIT); - if (futex_get(futex) == EMERGENCY_ABORT) { - fail("Fail in child on check stage"); - return 1; - } - - waitpid(pid, NULL, 
0); - pass(); - return 0; -} diff --git a/test/zdtm/static/mntns_pivot_root.desc b/test/zdtm/static/mntns_pivot_root.desc deleted file mode 100644 index 7657ba45c..000000000 --- a/test/zdtm/static/mntns_pivot_root.desc +++ /dev/null @@ -1 +0,0 @@ -{'flavor': 'ns uns', 'flags': 'suid'} diff --git a/test/zdtm/static/mntns_pivot_root_ro.c b/test/zdtm/static/mntns_pivot_root_ro.c deleted file mode 120000 index d352a6369..000000000 --- a/test/zdtm/static/mntns_pivot_root_ro.c +++ /dev/null @@ -1 +0,0 @@ -mntns_pivot_root.c \ No newline at end of file diff --git a/test/zdtm/static/mntns_pivot_root_ro.desc b/test/zdtm/static/mntns_pivot_root_ro.desc deleted file mode 120000 index 8708421ed..000000000 --- a/test/zdtm/static/mntns_pivot_root_ro.desc +++ /dev/null @@ -1 +0,0 @@ -mntns_pivot_root.desc \ No newline at end of file diff --git a/test/zdtm/static/mntns_root_bind.c b/test/zdtm/static/mntns_root_bind.c index 4c0347cb2..9e1ba06e6 100644 --- a/test/zdtm/static/mntns_root_bind.c +++ b/test/zdtm/static/mntns_root_bind.c @@ -71,7 +71,7 @@ int main(int argc, char **argv) task_waiter_wait4(&t, 2); if (access(bspath, F_OK)) { - fail("%s isn't accessible", bspath); + fail("%s isn't accessiable", bspath); return 1; } diff --git a/test/zdtm/static/mount_complex_sharing.c b/test/zdtm/static/mount_complex_sharing.c deleted file mode 100644 index 5f247a8e4..000000000 --- a/test/zdtm/static/mount_complex_sharing.c +++ /dev/null @@ -1,249 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "mountinfo.h" -#include "zdtmtst.h" - -const char *test_doc = "Check complex sharing options for mounts"; -const char *test_author = "Pavel Tikhomirov "; - -char *dirname = "mount_complex_sharing"; -TEST_OPTION(dirname, string, "directory name", 1); - -/* - * Description for creating a single file: - * path - path to create file in (relative to mount) - * dir - true if file is a directory - * content - if file is not a directory, this string is written into the file - 
*/ -struct file { - char *path; - bool dir; - char *content; -}; - -/* - * Description for creating a single mount: - * mountpoint - path to create mount on (relative to dirname) - * bind - id of bind source if any or -1 - * bind_root - root offset from bind source - * fstype - needed for non-binds, always tmpfs - * source - source for mounting - * flags - array of sharing options or mount flags applied after - * mounting (ending with -1) - * mounted - identifies implicitly propagated mounts - * files - array of files we need to create on mount (ending with zeroed file) - */ -struct mountinfo { - char *mountpoint; - int bind; - char *bind_root; - char *fstype; - char *source; - int flags[3]; - bool mounted; - struct file files[10]; -}; - -/* clang-format off */ -struct mountinfo mounts[] = { - {"", -1, "", "tmpfs", "tmpfs-dirname", {MS_PRIVATE, -1}, false, - { - {"shared-bind-1", true}, - {"shared-bind-2", true}, - {"shared-bind-3", true}, - {"shared-bind-4", true}, - {"private-mnt", true}, - {"shared-mnt", true}, - {"slave-mnt", true}, - {"slave-shared-mnt", true}, - {"testfile", false, "TESTFILE"}, - {NULL} - } - }, - - {"shared-bind-1", -1, "", "tmpfs", "tmpfs-shared-bind", {MS_SHARED, -1}, false, - { - {"prop-private", true}, - {"prop-shared", true}, - {"prop-slave", true}, - {"prop-slave-shared", true}, - {"prop-mount-flags", true}, - {NULL} - } - }, - {"shared-bind-2", 1, "", NULL, NULL, {-1}, false}, - {"shared-bind-3", 1, "", NULL, NULL, {-1}, false}, - {"shared-bind-4", 1, "", NULL, NULL, {-1}, false}, - - {"private-mnt", -1, "", "tmpfs", "tmpfs-mnt", {MS_PRIVATE, -1}, false, - { - {"subdir", true}, - {NULL} - } - }, - {"shared-mnt", 5, "", NULL, NULL, {MS_SHARED, -1}, false}, - {"slave-mnt", 6, "", NULL, NULL, {MS_SLAVE, -1}, false}, - {"slave-shared-mnt", 7, "", NULL, NULL, {MS_SHARED, -1}, false}, - - {"shared-bind-1/prop-private", 5, "subdir", NULL, NULL, {-1}, false}, - {"shared-bind-1/prop-shared", 6, "subdir", NULL, NULL, {-1}, false}, - 
{"shared-bind-1/prop-slave", 7, "subdir", NULL, NULL, {-1}, false}, - {"shared-bind-1/prop-slave-shared", 8, "subdir", NULL, NULL, {-1}, false}, - - {"shared-bind-2/prop-private", -1, NULL, NULL, NULL, {MS_PRIVATE, -1}, true}, - {"shared-bind-2/prop-shared", -1, NULL, NULL, NULL, {MS_PRIVATE, -1}, true}, - {"shared-bind-2/prop-slave", -1, NULL, NULL, NULL, {MS_PRIVATE, -1}, true}, - {"shared-bind-2/prop-slave-shared", -1, NULL, NULL, NULL, {MS_PRIVATE, -1}, true}, - - {"shared-bind-3/prop-private", -1, NULL, NULL, NULL, {MS_SLAVE, -1}, true}, - {"shared-bind-3/prop-shared", -1, NULL, NULL, NULL, {MS_SLAVE, -1}, true}, - {"shared-bind-3/prop-slave", -1, NULL, NULL, NULL, {MS_SLAVE, -1}, true}, - {"shared-bind-3/prop-slave-shared", -1, NULL, NULL, NULL, {MS_SLAVE, -1}, true}, - - {"shared-bind-4/prop-private", -1, NULL, NULL, NULL, {MS_PRIVATE, MS_SHARED, -1}, true}, - {"shared-bind-4/prop-shared", -1, NULL, NULL, NULL, {MS_PRIVATE, MS_SHARED, -1}, true}, - {"shared-bind-4/prop-slave", -1, NULL, NULL, NULL, {MS_PRIVATE, MS_SHARED, -1}, true}, - {"shared-bind-4/prop-slave-shared", -1, NULL, NULL, NULL, {MS_PRIVATE, MS_SHARED, -1}, true}, - - {"shared-bind-1/prop-mount-flags", 5, "subdir", NULL, NULL, {MS_RDONLY|MS_REMOUNT|MS_BIND, -1}, false}, - {"shared-bind-2/prop-mount-flags", -1, NULL, NULL, NULL, {MS_RDONLY|MS_REMOUNT|MS_BIND, -1}, true}, - {"shared-bind-3/prop-mount-flags", -1, NULL, NULL, NULL, {-1}, true}, - {"shared-bind-4/prop-mount-flags", -1, NULL, NULL, NULL, {-1}, true}, -}; -/* clang-format on */ - -static int fill_content(struct mountinfo *mi) -{ - struct file *file = &mi->files[0]; - char path[PATH_MAX]; - - while (file->path != NULL) { - snprintf(path, sizeof(path), "%s/%s/%s", dirname, mi->mountpoint, file->path); - - if (file->dir) { - test_msg("Mkdir %s\n", path); - if (mkdir(path, 0700)) { - pr_perror("Failed to create dir %s", path); - return -1; - } - } else { - int fd, len = strlen(file->content); - - test_msg("Create file %s with content 
%s\n", path, file->content); - fd = open(path, O_WRONLY | O_CREAT, 0777); - if (fd < 0) { - pr_perror("Failed to create file %s", path); - return -1; - } - - if (write(fd, file->content, len) != len) { - pr_perror("Failed to write %s to file %s", file->content, path); - close(fd); - return -1; - } - close(fd); - } - - file++; - } - - return 0; -} - -static int mount_one(struct mountinfo *mi) -{ - char source[PATH_MAX], target[PATH_MAX]; - int *flags = mi->flags, mflags = 0; - char *fstype = NULL; - - test_msg("Mounting %s %d %s %s %d\n", mi->mountpoint, mi->bind, mi->fstype, mi->source, mi->mounted); - - snprintf(target, sizeof(target), "%s/%s", dirname, mi->mountpoint); - - if (mi->mounted) - goto apply_flags; - - if (mi->bind != -1) { - snprintf(source, sizeof(source), "%s/%s/%s", dirname, mounts[mi->bind].mountpoint, mi->bind_root); - fstype = NULL; - mflags = MS_BIND; - } else { - snprintf(source, sizeof(source), "%s", mi->source); - fstype = mi->fstype; - } - - if (mount(source, target, fstype, mflags, NULL)) { - pr_perror("Failed to mount %s %s %s", source, target, fstype); - return -1; - } - - if (fill_content(mi)) - return -1; - -apply_flags: - while (flags[0] != -1) { - test_msg("Making mount %s 0x%x\n", target, flags[0]); - if (mount(NULL, target, NULL, flags[0], NULL)) { - pr_perror("Failed to make mount %s 0x%x", target, flags[0]); - return -1; - } - flags++; - } - - return 0; -} - -static int mount_loop(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mounts); i++) { - if (mount_one(&mounts[i])) - return 1; - } - - return 0; -} - -int main(int argc, char **argv) -{ - MNTNS_ZDTM(mntns_before); - MNTNS_ZDTM(mntns_after); - int ret = 1; - - test_init(argc, argv); - - if (mkdir(dirname, 0700) && errno != EEXIST) { - pr_perror("Failed to create %s", dirname); - goto err; - } - - if (mount_loop()) - goto err; - - if (mntns_parse_mountinfo(&mntns_before)) - goto err; - - test_daemon(); - test_waitsig(); - - if (mntns_parse_mountinfo(&mntns_after)) - goto err; 
- - if (mntns_compare(&mntns_before, &mntns_after)) - goto err; - - pass(); - ret = 0; -err: - mntns_free_all(&mntns_before); - mntns_free_all(&mntns_after); - if (ret) - fail(); - return ret; -} diff --git a/test/zdtm/static/mount_complex_sharing.desc b/test/zdtm/static/mount_complex_sharing.desc deleted file mode 100644 index 8910f4684..000000000 --- a/test/zdtm/static/mount_complex_sharing.desc +++ /dev/null @@ -1,4 +0,0 @@ -{ 'flavor': 'ns uns', - 'flags': 'suid', - 'feature': 'move_mount_set_group', - 'ropts': '--no-mntns-compat-mode'} diff --git a/test/zdtm/static/mprotect00.c b/test/zdtm/static/mprotect00.c index 717b7ddcf..006b64772 100644 --- a/test/zdtm/static/mprotect00.c +++ b/test/zdtm/static/mprotect00.c @@ -44,12 +44,10 @@ static int check_prot(char *ptr, int prot) fail("PROT_READ bypassed"); return -1; } - } else { - /* we come here on return from SIGSEGV handler */ + } else /* we come here on return from SIGSEGV handler */ if (prot & PROT_READ) { - fail("PROT_READ rejected"); - return -1; - } + fail("PROT_READ rejected"); + return -1; } if (!sigsetjmp(segv_ret, 1)) { @@ -58,12 +56,10 @@ static int check_prot(char *ptr, int prot) fail("PROT_WRITE bypassed"); return -1; } - } else { - /* we come here on return from SIGSEGV handler */ + } else /* we come here on return from SIGSEGV handler */ if (prot & PROT_WRITE) { - fail("PROT_WRITE rejected"); - return -1; - } + fail("PROT_WRITE rejected"); + return -1; } if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) { diff --git a/test/zdtm/static/mtime_mmap.c b/test/zdtm/static/mtime_mmap.c index 4de8438ee..f9a595864 100644 --- a/test/zdtm/static/mtime_mmap.c +++ b/test/zdtm/static/mtime_mmap.c @@ -1,5 +1,4 @@ #include -#include #include #include #include @@ -12,7 +11,7 @@ #include "zdtmtst.h" -const char *test_doc = "file mmapped for write and being written should change mtime\n" +const char *test_doc = "file mmaped for write and being written should change mtime\n" "and be migrated with correct new data"; char 
*filename; @@ -78,7 +77,7 @@ int main(int argc, char **argv) mtime_new = fst.st_mtime; /* time of last modification */ if (mtime_new <= mtime_old) { - fail("mtime %" PRId64 " wasn't updated on mmapped %s file", (int64_t)mtime_new, filename); + fail("mtime %ld wasn't updated on mmapped %s file", mtime_new, filename); goto failed; } @@ -99,7 +98,7 @@ int main(int argc, char **argv) /* time of last modification */ if (fst.st_mtime != mtime_new) { - fail("After migration, mtime changed to %" PRId64, (int64_t)fst.st_mtime); + fail("After migration, mtime changed to %ld", fst.st_mtime); goto failed; } diff --git a/test/zdtm/static/net_lock_socket_iptables.desc b/test/zdtm/static/net_lock_socket_iptables.desc index cb622536f..936ff8702 100644 --- a/test/zdtm/static/net_lock_socket_iptables.desc +++ b/test/zdtm/static/net_lock_socket_iptables.desc @@ -1,6 +1,5 @@ { 'flavor': 'h', - 'feature': 'has_ipt_legacy', 'flags': 'suid excl reqrst', 'dopts': '--tcp-established --network-lock iptables', 'ropts': '--tcp-established', diff --git a/test/zdtm/static/net_lock_socket_iptables.hook b/test/zdtm/static/net_lock_socket_iptables.hook index e9fcd7350..0ee147eb2 100755 --- a/test/zdtm/static/net_lock_socket_iptables.hook +++ b/test/zdtm/static/net_lock_socket_iptables.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python import socket import time diff --git a/test/zdtm/static/net_lock_socket_iptables6.desc b/test/zdtm/static/net_lock_socket_iptables6.desc index cb622536f..936ff8702 100644 --- a/test/zdtm/static/net_lock_socket_iptables6.desc +++ b/test/zdtm/static/net_lock_socket_iptables6.desc @@ -1,6 +1,5 @@ { 'flavor': 'h', - 'feature': 'has_ipt_legacy', 'flags': 'suid excl reqrst', 'dopts': '--tcp-established --network-lock iptables', 'ropts': '--tcp-established', diff --git a/test/zdtm/static/netns-dev.c b/test/zdtm/static/netns-dev.c index f268f2fec..e220daa7f 100644 --- a/test/zdtm/static/netns-dev.c +++ b/test/zdtm/static/netns-dev.c @@ -55,36 +55,36 @@ 
struct range { }; struct range rand_range4[] = { - { 0, 1 }, /* accept_local */ - { -1, 0 }, /* accept_source_route */ - { 0, 1 }, /* arp_accept */ - { 0, 2 }, /* arp_announce */ - { 0, 1 }, /* arp_filter */ - { 0, 8 }, /* arp_ignore */ - { 0, 1 }, /* arp_notify */ - { 0, 1 }, /* bootp_relay */ - { 0, 1 }, /* disable_policy */ - { 0, 1 }, /* disable_xfrm */ - { 0, 1 }, /* drop_gratuitous_arp */ - { 0, 1 }, /* drop_unicast_in_l2_multicast */ - { 0, INT_MAX }, /* force_igmp_version */ - { 0, 1 }, /* forwarding */ - { 0, 1 }, /* accept_redirects */ - { 0, INT_MAX }, /* igmpv2_unsolicited_report_interval */ - { 0, INT_MAX }, /* igmpv3_unsolicited_report_interval */ - { 0, 1 }, /* ignore_routes_with_linkdown */ - { 0, 1 }, /* log_martians */ - { 0, 1 }, /* mc_forwarding */ - { -1, INT_MAX }, /* medium_id */ - { 0, 1 }, /* promote_secondaries */ - { 0, 1 }, /* proxy_arp */ - { 0, 1 }, /* proxy_arp_pvlan */ - { 0, 1 }, /* route_localnet */ - { 0, 2 }, /* rp_filter */ - { 0, 1 }, /* secure_redirects */ - { 0, 1 }, /* send_redirects */ - { 0, 1 }, /* shared_media */ - { 0, 1 }, /* src_valid_mark */ + { 0, 1 }, /* accept_local */ + { -1, 0 }, /* accept_source_route */ + { 0, 1 }, /* arp_accept */ + { 0, 2 }, /* arp_announce */ + { 0, 1 }, /* arp_filter */ + { 0, 8 }, /* arp_ignore */ + { 0, 1 }, /* arp_notify */ + { 0, 1 }, /* bootp_relay */ + { 0, 1 }, /* disable_policy */ + { 0, 1 }, /* disable_xfrm */ + { 0, 1 }, /* drop_gratuitous_arp */ + { 0, 1 }, /* drop_unicast_in_l2_multicast */ + { 0, INT_MAX }, /* force_igmp_version */ + { 0, 1 }, /* forwarding */ + { 0, 1 }, /* accept_redirects */ + { 0, INT_MAX }, /* igmpv2_unsolicited_report_interval */ + { 0, INT_MAX }, /* igmpv3_unsolicited_report_interval */ + { 0, 1 }, /* ignore_routes_with_linkdown */ + { 0, 1 }, /* log_martians */ + { 0, 1 }, /* mc_forwarding */ + { -1, INT_MAX }, /* medium_id */ + { 0, 1 }, /* promote_secondaries */ + { 0, 1 }, /* proxy_arp */ + { 0, 1 }, /* proxy_arp_pvlan */ + { 0, 1 }, /* 
route_localnet */ + { 0, 2 }, /* rp_filter */ + { 0, 1 }, /* secure_redirects */ + { 0, 1 }, /* send_redirects */ + { 0, 1 }, /* shared_media */ + { 0, 1 }, /* src_valid_mark */ { INT_MIN, INT_MAX }, /* tag */ }; @@ -139,47 +139,47 @@ char *devconfs6[] = { #define MAX_ADDRESSES 128 struct range rand_range6[] = { - { 0, 2 }, /* accept_dad */ - { 0, 2 }, /* accept_ra */ - { 0, 1 }, /* accept_ra_defrtr */ - { 0, 1 }, /* accept_ra_from_local */ - { 0, INT_MAX }, /* accept_ra_min_hop_limit */ - { 0, 1 }, /* accept_ra_mtu */ - { 0, 1 }, /* accept_ra_pinfo */ - { 0, INT_MAX }, /* accept_ra_rt_info_max_plen */ - { 0, 1 }, /* accept_ra_rtr_pref */ - { -1, 0 }, /* accept_source_route */ - { 0, 1 }, /* autoconf */ - { 0, INT_MAX }, /* dad_transmits */ - { 0, 1 }, /* disable_ipv6 */ - { 0, 1 }, /* drop_unicast_in_l2_multicast */ - { 0, 1 }, /* drop_unsolicited_na */ - { 0, 2 }, /* force_mld_version */ - { 0, 1 }, /* force_tllao */ - { 0, 1 }, /* forwarding */ - { 0, 1 }, /* accept_redirects */ - { 1, 255 }, /* hop_limit */ - { 0, 1 }, /* ignore_routes_with_linkdown */ - { -1, 1 }, /* keep_addr_on_down */ - { 0, MAX_ADDRESSES }, /* max_addresses */ - { 0, INT_MAX }, /* max_desync_factor */ - { 0, INT_MAX }, /* mldv1_unsolicited_report_interval */ - { 0, INT_MAX }, /* mldv2_unsolicited_report_interval */ + { 0, 2 }, /* accept_dad */ + { 0, 2 }, /* accept_ra */ + { 0, 1 }, /* accept_ra_defrtr */ + { 0, 1 }, /* accept_ra_from_local */ + { 0, INT_MAX }, /* accept_ra_min_hop_limit */ + { 0, 1 }, /* accept_ra_mtu */ + { 0, 1 }, /* accept_ra_pinfo */ + { 0, INT_MAX }, /* accept_ra_rt_info_max_plen */ + { 0, 1 }, /* accept_ra_rtr_pref */ + { -1, 0 }, /* accept_source_route */ + { 0, 1 }, /* autoconf */ + { 0, INT_MAX }, /* dad_transmits */ + { 0, 1 }, /* disable_ipv6 */ + { 0, 1 }, /* drop_unicast_in_l2_multicast */ + { 0, 1 }, /* drop_unsolicited_na */ + { 0, 2 }, /* force_mld_version */ + { 0, 1 }, /* force_tllao */ + { 0, 1 }, /* forwarding */ + { 0, 1 }, /* accept_redirects */ + { 
1, 255 }, /* hop_limit */ + { 0, 1 }, /* ignore_routes_with_linkdown */ + { -1, 1 }, /* keep_addr_on_down */ + { 0, MAX_ADDRESSES }, /* max_addresses */ + { 0, INT_MAX }, /* max_desync_factor */ + { 0, INT_MAX }, /* mldv1_unsolicited_report_interval */ + { 0, INT_MAX }, /* mldv2_unsolicited_report_interval */ { IPV6_MIN_MTU, IPV6_MIN_MTU }, /* mtu */ - { 0, 1 }, /* ndisc_notify */ - { 0, 1 }, /* optimistic_dad */ - { 0, 1 }, /* proxy_ndp */ - { 0, INT_MAX }, /* regen_max_retry */ - { 0, ROUTER_MAX }, /* router_probe_interval */ - { 0, ROUTER_MAX }, /* router_solicitation_delay */ - { 0, ROUTER_MAX }, /* router_solicitation_interval */ - { 0, ROUTER_MAX }, /* router_solicitations */ - { 0, 1 }, /* suppress_frag_ndisc */ - { 0, INT_MAX }, /* temp_prefered_lft */ - { 0, INT_MAX }, /* temp_valid_lft */ - { 0, 1 }, /* use_oif_addrs_only */ - { 0, 1 }, /* use_optimistic */ - { 0, 2 }, /* use_tempaddr */ + { 0, 1 }, /* ndisc_notify */ + { 0, 1 }, /* optimistic_dad */ + { 0, 1 }, /* proxy_ndp */ + { 0, INT_MAX }, /* regen_max_retry */ + { 0, ROUTER_MAX }, /* router_probe_interval */ + { 0, ROUTER_MAX }, /* router_solicitation_delay */ + { 0, ROUTER_MAX }, /* router_solicitation_interval */ + { 0, ROUTER_MAX }, /* router_solicitations */ + { 0, 1 }, /* suppress_frag_ndisc */ + { 0, INT_MAX }, /* temp_prefered_lft */ + { 0, INT_MAX }, /* temp_valid_lft */ + { 0, 1 }, /* use_oif_addrs_only */ + { 0, 1 }, /* use_optimistic */ + { 0, 2 }, /* use_tempaddr */ }; struct test_conf { @@ -414,7 +414,7 @@ static int check_stable_secret(struct test_conf *tc) return -1; } - ret = fscanf(fp, "%200s", val); + ret = fscanf(fp, "%s", val); if (ret != 1) { pr_perror("fscanf"); fclose(fp); diff --git a/test/zdtm/static/netns-nf.desc b/test/zdtm/static/netns-nf.desc index 58c23e8ba..e7e73b1ae 100644 --- a/test/zdtm/static/netns-nf.desc +++ b/test/zdtm/static/netns-nf.desc @@ -1,7 +1,6 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - 
'/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', '/usr/bin/diff'], 'flags': 'suid', - 'feature': 'has_ipt_legacy', 'flavor': 'ns uns'} diff --git a/test/zdtm/static/netns-nft-ipt.desc b/test/zdtm/static/netns-nft-ipt.desc index 6d04589b3..4120f74d6 100644 --- a/test/zdtm/static/netns-nft-ipt.desc +++ b/test/zdtm/static/netns-nft-ipt.desc @@ -2,7 +2,7 @@ 'deps': [ '/bin/sh', '/usr/sbin/nft', '/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/iptables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', '/usr/bin/diff'], 'flags': 'suid', 'flavor': 'ns uns'} diff --git 
a/test/zdtm/static/netns_lock_iptables.desc b/test/zdtm/static/netns_lock_iptables.desc index b465706b8..69020f34e 100644 --- a/test/zdtm/static/netns_lock_iptables.desc +++ b/test/zdtm/static/netns_lock_iptables.desc @@ -1,7 +1,6 @@ { 'flavor': 'h', 'flags': 'suid excl reqrst', - 'feature': 'has_ipt_legacy', 'opts': '--tcp-established', 'dopts': '--network-lock iptables', 'ropts': '--join-ns net:/var/run/netns/criu-net-lock-test' diff --git a/test/zdtm/static/netns_lock_iptables.hook b/test/zdtm/static/netns_lock_iptables.hook index b51d3c2cc..b5508a7cb 100755 --- a/test/zdtm/static/netns_lock_iptables.hook +++ b/test/zdtm/static/netns_lock_iptables.hook @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python import subprocess import socket @@ -30,7 +30,7 @@ if sys.argv[1] == "--post-start": nsenter() # Enter test netns subprocess.Popen(["ip", "link", "set", "up", "dev", "lo"]).wait() - # Lets test know that the netns is initialized successfully + # Lets test know that the netns is initilized successfully # by checking the access of SYNCFILE create_sync_file() @@ -67,7 +67,7 @@ if sys.argv[1] == "--post-start": cln, addr = srv.accept() cln.sendall(str.encode("--post-restore")) cln.close() - + # Server will be closed when zdtm sends SIGKILL if sys.argv[1] == "--pre-dump": diff --git a/test/zdtm/static/netns_sub_sysctl.c b/test/zdtm/static/netns_sub_sysctl.c index 03b478b7d..545a17308 100644 --- a/test/zdtm/static/netns_sub_sysctl.c +++ b/test/zdtm/static/netns_sub_sysctl.c @@ -1,38 +1,20 @@ #include -#include -#include #include "zdtmtst.h" #include "sysctl.h" -const char *test_doc = "Check dump and restore of sysctls in subns"; +const char *test_doc = "Check dump and restore a net.unix.max_dgram_qlen sysctl parameter in subns"; const char *test_author = "Alexander Mikhalitsyn "; -#define MAX_STR_SYSCTL_LEN 200 - -enum { - SYSCTL_INT, - SYSCTL_STR, -}; - typedef struct { const char *path; - int type; int old; int new; - char s_old[MAX_STR_SYSCTL_LEN]; - char 
s_new[MAX_STR_SYSCTL_LEN]; - bool set; } sysctl_opt_t; #define CONF_UNIX_BASE "/proc/sys/net/unix" -#define IPV4_SYSCTL_BASE "/proc/sys/net/ipv4" -static sysctl_opt_t net_unix_params[] = { - {CONF_UNIX_BASE "/max_dgram_qlen", SYSCTL_INT}, - {IPV4_SYSCTL_BASE "/ping_group_range", SYSCTL_STR, 0, 0, "40000\t50000\n"}, - {NULL, 0, 0} -}; +static sysctl_opt_t net_unix_params[] = { { CONF_UNIX_BASE "/max_dgram_qlen", 0, 0 }, { NULL, 0, 0 } }; int main(int argc, char **argv) { @@ -41,22 +23,10 @@ int main(int argc, char **argv) test_init(argc, argv); for (p = net_unix_params; p->path != NULL; p++) { - if (access(p->path, W_OK) != 0) { - test_msg("%s doesn't exist\n", p->path); - continue; - } - p->set = true; - if (p->type == SYSCTL_INT) { - p->old = (((unsigned)lrand48()) % 1023) + 1; - if (sysctl_write_int(p->path, p->old)) { - pr_perror("Can't change %s", p->path); - return -1; - } - } else if (p->type == SYSCTL_STR) { - if (sysctl_write_str(p->path, p->s_old)) { - pr_perror("Can't change %s", p->path); - return -1; - } + p->old = (((unsigned)lrand48()) % 1023) + 1; + if (sysctl_write_int(p->path, p->old)) { + pr_perror("Can't change %s", p->path); + return -1; } } @@ -64,27 +34,13 @@ int main(int argc, char **argv) test_waitsig(); for (p = net_unix_params; p->path != NULL; p++) { - if (!p->set) - continue; - if (p->type == SYSCTL_INT) { - if (sysctl_read_int(p->path, &p->new)) - ret = 1; + if (sysctl_read_int(p->path, &p->new)) + ret = 1; - if (p->old != p->new) { - errno = EINVAL; - pr_perror("%s changed: %d ---> %d", p->path, p->old, p->new); - ret = 1; - } - } else if (p->type == SYSCTL_STR) { - if (sysctl_read_str(p->path, p->s_new, MAX_STR_SYSCTL_LEN)) { - ret = 1; - } else { - if (strcmp(p->s_old, p->s_new)) { - errno = EINVAL; - pr_perror("%s changed: %s ---> %s", p->path, p->s_old, p->s_new); - ret = 1; - } - } + if (p->old != p->new) { + errno = EINVAL; + pr_perror("%s changed: %d ---> %d", p->path, p->old, p->new); + ret = 1; } } diff --git 
a/test/zdtm/static/netns_sub_sysctl.desc b/test/zdtm/static/netns_sub_sysctl.desc index 0c357aefe..535842668 100644 --- a/test/zdtm/static/netns_sub_sysctl.desc +++ b/test/zdtm/static/netns_sub_sysctl.desc @@ -1,4 +1,4 @@ { - 'flavor': 'ns uns', + 'flavor': 'ns', 'flags': 'suid' } diff --git a/test/zdtm/static/ofd_file_locks.c b/test/zdtm/static/ofd_file_locks.c index a68fa38ee..68b6f22f5 100644 --- a/test/zdtm/static/ofd_file_locks.c +++ b/test/zdtm/static/ofd_file_locks.c @@ -16,7 +16,7 @@ static int parse_ofd_lock(char *buf, struct flock *lck) if (strncmp(buf, "lock:\t", 6) != 0) return 1; /* isn't lock, skip record */ - num = sscanf(buf, "%*s %*d: %9s %14s %9s %*d %*x:%*x:%*d %lld %31s", fl_flag, fl_type, fl_option, &start, fl_end); + num = sscanf(buf, "%*s %*d: %s %s %s %*d %*x:%*x:%*d %lld %s", fl_flag, fl_type, fl_option, &start, fl_end); if (num < 4) { pr_err("Invalid lock info %s\n", buf); diff --git a/test/zdtm/static/packet_sock.c b/test/zdtm/static/packet_sock.c index c1c94ac21..4a9078f81 100644 --- a/test/zdtm/static/packet_sock.c +++ b/test/zdtm/static/packet_sock.c @@ -5,7 +5,7 @@ const char *test_author = "Pavel Emelyanov "; /* * Description: - * Create and bind several packet sockets, check that getname + * Create and bind several packet sockets, check thet getname * reports same result before and after c/r cycle. This is enough * for _basic_ packet functionality only, but still. 
*/ diff --git a/test/zdtm/static/pidfd_child.c b/test/zdtm/static/pidfd_child.c deleted file mode 100644 index ec559605d..000000000 --- a/test/zdtm/static/pidfd_child.c +++ /dev/null @@ -1,66 +0,0 @@ -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Checks pidfd sends signal to child process after restore\n"; -const char *test_author = "Bhavik Sachdev "; - -static int pidfd_open(pid_t pid, unsigned int flags) -{ - return syscall(__NR_pidfd_open, pid, flags); -} - -static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) -{ - return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); -} - -int main(int argc, char* argv[]) -{ - int pidfd, status; - pid_t child; - - test_init(argc, argv); - - child = fork(); - if (child < 0) { - pr_perror("Unable to fork a new process"); - return 1; - } else if (child == 0) { - test_waitsig(); - return 0; - } - - pidfd = pidfd_open(child, 0); - if (pidfd < 0) { - pr_perror("pidfd_open failed"); - return 1; - } - - test_daemon(); - test_waitsig(); - - if (pidfd_send_signal(pidfd, SIGTERM, NULL, 0)) { - fail("Could not send signal"); - goto err_close; - } - - if (waitpid(child, &status, 0) != child) { - pr_perror("waitpid()"); - goto err_close; - } - - if (status != 0) { - fail("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), WIFSIGNALED(status), WTERMSIG(status)); - goto err_close; - } - - pass(); - close(pidfd); - return 0; -err_close: - close(pidfd); - return 1; -} diff --git a/test/zdtm/static/pidfd_dead.c b/test/zdtm/static/pidfd_dead.c deleted file mode 100644 index 9c825899d..000000000 --- a/test/zdtm/static/pidfd_dead.c +++ /dev/null @@ -1,244 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Check C/R of pidfds that point to dead processes\n"; -const char *test_author = "Bhavik Sachdev "; - -#ifndef PID_FS_MAGIC -#define PID_FS_MAGIC 0x50494446 -#endif - -/* - * main - * `- child - * `- 
grandchild - * - * main opens a pidfd for both child and grandchild. - * Before C/R we kill both child and grandchild. - * We end up with two unique dead pidfds. - */ - -static long get_fs_type(int lfd) -{ - struct statfs fst; - - if (fstatfs(lfd, &fst)) { - return -1; - } - return fst.f_type; -} - -static int pidfd_open(pid_t pid, unsigned int flags) -{ - return syscall(__NR_pidfd_open, pid, flags); -} - -static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) -{ - return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); -} - -static int open_pidfd_pair(int pidfd[2], int pid) -{ - pidfd[0] = pidfd_open(pid, 0); - if (pidfd[0] < 0) { - pr_perror("pidfd_open() failed"); - return 1; - } - - pidfd[1] = pidfd_open(pid, 0); - if (pidfd[1] < 0) { - close(pidfd[0]); - pr_perror("pidfd_open() failed"); - return 1; - } - return 0; -} - -static int compare_pidfds(int pidfd[2]) -{ - /* - * After linux 6.9 we can compare inode numbers - * to determine if two pidfds point to the same process. - * While the inode number may change before and after C/R - * pidfds pointing to the same pid should have the same inode number. - */ - struct statx stats[2]; - statx(pidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[0]); - statx(pidfd[1], "", AT_EMPTY_PATH, STATX_ALL, &stats[1]); - if (stats[0].stx_ino != stats[1].stx_ino) - return 1; - return 0; -} - -static int check_for_pidfs(void) -{ - long type; - int pidfd = pidfd_open(getpid(), 0); - if (pidfd < 0) { - pr_perror("pidfd open() failed"); - return -1; - } - type = get_fs_type(pidfd); - close(pidfd); - return type == PID_FS_MAGIC; -} - -int main(int argc, char* argv[]) -{ - #define READ 0 - #define WRITE 1 - - int child, ret, gchild, p[2], status; - int cpidfd[2], gpidfd[2]; - struct statx stats[2]; - - test_init(argc, argv); - - ret = check_for_pidfs(); - if (ret < 0) - return 1; - - if (ret == 0) { - test_daemon(); - test_waitsig(); - skip("Test requires pidfs. 
skipping..."); - pass(); - return 0; - } - - if (pipe(p)) { - pr_perror("pipe"); - return 1; - } - - child = test_fork(); - if (child < 0) { - pr_perror("fork"); - return 1; - } else if (child == 0) { - int gchild = test_fork(); - close(p[READ]); - if (gchild < 0) { - pr_perror("fork"); - return 1; - } else if (gchild == 0) { - close(p[WRITE]); - while(1) - sleep(1000); - } else { - if (write(p[WRITE], &gchild, sizeof(int)) != sizeof(int)) { - pr_perror("write"); - return 1; - } - close(p[WRITE]); - if (waitpid(gchild, &status, 0) != gchild) { - pr_perror("waitpid"); - return 1; - } - - if (!WIFSIGNALED(status)) { - fail("Expected grandchild to be terminated by a signal"); - return 1; - } - - if (WTERMSIG(status) != SIGKILL) { - fail("Expected grandchild to be terminated by SIGKILL"); - return 1; - } - - return 0; - } - } - - ret = open_pidfd_pair(cpidfd, child); - if (ret) - return 1; - - close(p[WRITE]); - if (read(p[READ], &gchild, sizeof(int)) != sizeof(int)) { - pr_perror("write"); - return 1; - } - close(p[READ]); - - ret = open_pidfd_pair(gpidfd, gchild); - if (ret) - return 1; - - /* - * We kill grandchild and child processes only after opening pidfds. 
- */ - if (pidfd_send_signal(gpidfd[0], SIGKILL, NULL, 0)) { - pr_perror("pidfd_send_signal"); - goto fail_close; - } - - if (waitpid(child, &status, 0) != child) { - pr_perror("waitpid"); - goto fail_close; - } - - if (!WIFEXITED(status)) { - fail("Expected child to exit normally"); - goto fail_close; - } - - if (WEXITSTATUS(status) != 0) { - fail("Expected child to exit with 0"); - goto fail_close; - } - usleep(1000); - - if (kill(gchild, 0) != -1 && errno != ESRCH) { - fail("Expected grand child to not exist"); - goto fail_close; - } - - if (kill(child, 0) != -1 && errno != ESRCH) { - fail("Expected child to not exist"); - goto fail_close; - } - - test_daemon(); - test_waitsig(); - - ret = compare_pidfds(cpidfd); - if (ret) { - fail("inodes not same for same pid"); - goto fail_close; - } - - ret = compare_pidfds(gpidfd); - if (ret) { - fail("inodes not same for same pid"); - goto fail_close; - } - - statx(cpidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[0]); - statx(gpidfd[0], "", AT_EMPTY_PATH, STATX_ALL, &stats[1]); - if (stats[0].stx_ino == stats[1].stx_ino) { - fail("pidfds pointing to diff pids should have diff inodes"); - goto fail_close; - } - - pass(); - close(cpidfd[0]); - close(cpidfd[1]); - close(gpidfd[0]); - close(gpidfd[1]); - return 0; - -fail_close: - close(cpidfd[0]); - close(cpidfd[1]); - close(gpidfd[0]); - close(gpidfd[1]); - return 1; -} diff --git a/test/zdtm/static/pidfd_diffdead.c b/test/zdtm/static/pidfd_diffdead.c deleted file mode 100644 index 5bc1911a5..000000000 --- a/test/zdtm/static/pidfd_diffdead.c +++ /dev/null @@ -1,228 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Check C/R of processes that point to a common dead pidfd\n"; -const char *test_author = "Bhavik Sachdev "; - -#ifndef PID_FS_MAGIC -#define PID_FS_MAGIC 0x50494446 -#endif - -/* - * main - * `- child - * `- grandchild - * - * main and child open a pidfd for grandchild. 
- * Before C/R we kill grandchild. - * We end up with two pidfds in two diff processes that point to the same dead process. - */ - -static long get_fs_type(int lfd) -{ - struct statfs fst; - - if (fstatfs(lfd, &fst)) { - return -1; - } - return fst.f_type; -} - -static int pidfd_open(pid_t pid, unsigned int flags) -{ - return syscall(__NR_pidfd_open, pid, flags); -} - -static int pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags) -{ - return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); -} - -static int check_for_pidfs(void) -{ - long type; - int pidfd = pidfd_open(getpid(), 0); - if (pidfd < 0) { - pr_perror("pidfd open() failed"); - return -1; - } - type = get_fs_type(pidfd); - close(pidfd); - return type == PID_FS_MAGIC; -} - -int main(int argc, char *argv[]) -{ -#define READ 0 -#define WRITE 1 - - int child, ret, gchild, status; - struct statx stat; - task_waiter_t t; - unsigned long long ino; - - /* - * We use the inop pipe to send the inode number of the - * pidfd opened in the child to the main process for - * comparison. - */ - int p[2]; - int pidfd; - - test_init(argc, argv); - task_waiter_init(&t); - - ret = check_for_pidfs(); - if (ret < 0) - return 1; - - if (ret == 0) { - test_daemon(); - test_waitsig(); - skip("Test requires pidfs. 
skipping..."); - pass(); - return 0; - } - - if (pipe(p)) { - pr_perror("pipe"); - return 1; - } - - child = test_fork(); - if (child < 0) { - pr_perror("fork"); - return 1; - } else if (child == 0) { - int gchild; - gchild = test_fork(); - if (gchild < 0) { - pr_perror("fork"); - return 1; - } else if (gchild == 0) { - close(p[READ]); - close(p[WRITE]); - while (1) - sleep(1000); - } else { - if (write(p[WRITE], &gchild, sizeof(int)) != sizeof(int)) { - pr_perror("write"); - return 1; - } - - pidfd = pidfd_open(gchild, 0); - if (pidfd < 0) { - pr_perror("pidfd_open"); - return 1; - } - - if (waitpid(gchild, &status, 0) != gchild) { - pr_perror("waitpid"); - return 1; - } - - if (!WIFSIGNALED(status)) { - fail("Expected grandchild to be terminated by a signal"); - return 1; - } - - if (WTERMSIG(status) != SIGKILL) { - fail("Expected grandchild to be terminated by SIGKILL"); - return 1; - } - task_waiter_complete(&t, 1); - - test_waitsig(); - - if (statx(pidfd, "", AT_EMPTY_PATH, STATX_ALL, &stat) < 0) { - pr_perror("statx"); - return 1; - } - - close(p[WRITE]); - if (read(p[READ], &ino, sizeof(ino)) != sizeof(ino)) { - pr_perror("read"); - return 1; - } - close(p[READ]); - close(pidfd); - - /* ino number should be same because both pidfds were for the same process */ - if (ino != stat.stx_ino) { - exit(1); - } - exit(0); - } - } - - if (read(p[READ], &gchild, sizeof(int)) != sizeof(int)) { - pr_perror("write"); - return 1; - } - - pidfd = pidfd_open(gchild, 0); - if (pidfd < 0) { - pr_perror("pidfd_open"); - return 1; - } - - /* - * We kill grandchild process only after opening pidfd. 
- */ - if (pidfd_send_signal(pidfd, SIGKILL, NULL, 0)) { - pr_perror("pidfd_send_signal"); - return 1; - } - - /* Wait for child to waitpid on gchild */ - task_waiter_wait4(&t, 1); - - test_daemon(); - test_waitsig(); - - close(p[READ]); - if (statx(pidfd, "", AT_EMPTY_PATH, STATX_ALL, &stat) < 0) { - pr_perror("statx"); - goto err; - } - - /* Send inode number of pidfd to child for comparison */ - if (write(p[WRITE], &stat.stx_ino, sizeof(stat.stx_ino)) != sizeof(stat.stx_ino)) { - pr_perror("write"); - goto err; - } - close(p[WRITE]); - - if (kill(child, SIGTERM)) { - pr_perror("kill"); - goto err; - } - - if (waitpid(child, &status, 0) != child) { - pr_perror("waitpid"); - goto err; - } - - if (!WIFEXITED(status)) { - fail("Expected child to terminate normally"); - goto err; - } - - if (WEXITSTATUS(status) != 0) { - fail("Child failed"); - goto err; - } - - pass(); - close(pidfd); - return 0; -err: - close(pidfd); - return 1; -} diff --git a/test/zdtm/static/pidfd_kill.c b/test/zdtm/static/pidfd_kill.c deleted file mode 100644 index 6232d033a..000000000 --- a/test/zdtm/static/pidfd_kill.c +++ /dev/null @@ -1,128 +0,0 @@ -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Kill child and grandchild process using pidfds\n"; -const char *test_author = "Bhavik Sachdev "; - -static int pidfd_open(pid_t pid, unsigned int flags) -{ - return syscall(__NR_pidfd_open, pid, flags); -} - -static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) -{ - return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); -} - -static int wait_for_child(int child) -{ - int status; - if (waitpid(child, &status, 0) != child) { - pr_perror("waitpid()"); - return 1; - } - - if (status != 0) { - test_msg("%d:%d:%d:%d", WIFEXITED(status), WEXITSTATUS(status), - WIFSIGNALED(status), WTERMSIG(status)); - } - - return 0; -} - -int main(int argc, char* argv[]) -{ - #define READ 0 - #define WRITE 1 - - int child, gchild, cpidfd, gpidfd, 
gchild_pid, ret; - int p[2]; - - if (pipe(p)) { - pr_perror("pipe"); - return 1; - } - - test_init(argc, argv); - - child = fork(); - if (child < 0) { - pr_perror("fork"); - return 1; - } - - if (child == 0) { - gchild = fork(); - if (gchild < 0) { - pr_perror("fork"); - return 1; - } - - if (gchild == 0) { - test_waitsig(); - return 0; - } - - close(p[READ]); - if (write(p[WRITE], &gchild, sizeof(gchild)) - != sizeof(gchild)) { - pr_perror("write"); - return 1; - } - close(p[WRITE]); - - test_waitsig(); - return wait_for_child(gchild); - } - - cpidfd = pidfd_open(child, 0); - if (cpidfd < 0) { - pr_perror("pidfd_open"); - return 1; - } - - close(p[WRITE]); - if (read(p[READ], &gchild_pid, sizeof(gchild_pid)) - != sizeof(gchild_pid)) { - pr_perror("read"); - return 1; - } - close(p[READ]); - - gpidfd = pidfd_open(gchild_pid, 0); - if (gpidfd < 0) { - pr_perror("pidfd_open"); - return 1; - } - - test_daemon(); - test_waitsig(); - - if (pidfd_send_signal(gpidfd, SIGKILL, NULL, 0)) { - pr_perror("Could not send signal"); - goto fail_close; - } - - if (pidfd_send_signal(cpidfd, SIGKILL, NULL, 0)) { - pr_perror("Could not send signal"); - goto fail_close; - } - - ret = wait_for_child(child); - if (ret) - goto fail_close; - - pass(); - close(cpidfd); - close(gpidfd); - return 0; - -fail_close: - fail(); - close(cpidfd); - close(gpidfd); - return 1; -} diff --git a/test/zdtm/static/pidfd_of_thread.c b/test/zdtm/static/pidfd_of_thread.c deleted file mode 100644 index d232c7ac1..000000000 --- a/test/zdtm/static/pidfd_of_thread.c +++ /dev/null @@ -1,114 +0,0 @@ -#include -#include -#include -#include - -#include "zdtmtst.h" -#include "lock.h" - -const char *test_doc = "Check C/R of pidfds that point to threads\n"; -const char *test_author = "Bhavik Sachdev "; - -/* see also: https://codebrowser.dev/glibc/glibc/sysdeps/unix/sysv/linux/tst-clone3.c.html */ - -#ifndef PIDFD_THREAD -#define PIDFD_THREAD O_EXCL -#endif - -#ifndef PIDFD_SIGNAL_THREAD -#define PIDFD_SIGNAL_THREAD 
(1UL << 0) -#endif - -#ifndef PID_FS_MAGIC -#define PID_FS_MAGIC 0x50494446 -#endif - -static long get_fs_type(int lfd) -{ - struct statfs fst; - - if (fstatfs(lfd, &fst)) { - return -1; - } - return fst.f_type; -} - -static int pidfd_open(pid_t pid, unsigned int flags) -{ - return syscall(__NR_pidfd_open, pid, flags); -} - -static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) -{ - return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); -} - -static int thread_func(void *a) -{ - test_waitsig(); - return 0; -} - -#define CTID_INIT_VAL 1 - -int main(int argc, char* argv[]) -{ - char st[64 * 1024] __attribute__ ((aligned)); - pid_t tid; - int pidfd, test_pidfd; - futex_t exited; - - int clone_flags = CLONE_THREAD; - clone_flags |= CLONE_VM | CLONE_SIGHAND; - clone_flags |= CLONE_CHILD_CLEARTID; - - test_init(argc, argv); - - test_pidfd = pidfd_open(getpid(), 0); - if (test_pidfd < 0) { - pr_perror("pidfd_open() failed"); - return 1; - } - - /* PIDFD_THREAD, PIDFD_SIGNAL_THREAD are supported only with pidfs */ - if (get_fs_type(test_pidfd) != PID_FS_MAGIC) { - test_daemon(); - test_waitsig(); - skip("pidfs not supported."); - close(test_pidfd); - return 0; - } - close(test_pidfd); - - futex_set(&exited, CTID_INIT_VAL); - - tid = clone(thread_func, st + sizeof(st), clone_flags, NULL, NULL, NULL, &(exited.raw)); - if (tid == -1) { - pr_perror("clone() failed"); - return 1; - } - - test_msg("Successfully created a thread with tid: %d\n", tid); - pidfd = pidfd_open(tid, PIDFD_THREAD); - if (pidfd < 0) { - pr_perror("pidfd_open() failed"); - return 1; - } - - test_daemon(); - test_waitsig(); - - if (pidfd_send_signal(pidfd, SIGTERM, NULL, PIDFD_SIGNAL_THREAD)) { - pr_perror("pidfd_send_signal() failed"); - fail(); - close(pidfd); - return 1; - } - - test_msg("Waiting for thread to exit\n"); - futex_wait_until(&exited, 0); - - pass(); - close(pidfd); - return 0; -} diff --git a/test/zdtm/static/pidfd_of_thread.desc 
b/test/zdtm/static/pidfd_of_thread.desc deleted file mode 100644 index 802caed65..000000000 --- a/test/zdtm/static/pidfd_of_thread.desc +++ /dev/null @@ -1 +0,0 @@ -{'flags': 'noauto crfail'} diff --git a/test/zdtm/static/pidfd_self.c b/test/zdtm/static/pidfd_self.c deleted file mode 100644 index 2730ee123..000000000 --- a/test/zdtm/static/pidfd_self.c +++ /dev/null @@ -1,140 +0,0 @@ -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Check pidfd /proc/self/fdinfo/ entry remains consistent after checkpoint/restore\n"; -const char *test_author = "Bhavik Sachdev "; - -struct pidfd_status { - unsigned int flags; - pid_t pid; -}; - -static int pidfd_open(pid_t pid, unsigned int flags) -{ - return syscall(__NR_pidfd_open, pid, flags); -} - -static int pidfd_send_signal(int pidfd, int sig, siginfo_t* info, unsigned int flags) -{ - return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); -} - -static void show_pidfd(char *prefix, struct pidfd_status *s) -{ - test_msg("\n\t%s\n\tflags: 0%o\n\tpid: %d\n", prefix, s->flags, s->pid); -} - -static int parse_self_fdinfo(int pidfd, struct pidfd_status *s) -{ - char buf[256]; - int ret = -1; - FILE *f; - - sprintf(buf, "/proc/self/fdinfo/%d", pidfd); - f = fopen(buf, "r"); - if (!f) { - perror("Can't open /proc/self/fdinfo/ to parse"); - return -1; - } - - memset(s, 0, sizeof(*s)); - - /* - * flags: file access mode (octal) 02000002 => [O_RDWR | O_CLOEXEC] - * pid: the pid to which we have pidfd open - */ - while (fgets(buf, sizeof(buf), f)) { - if (!fgets(buf, sizeof(buf), f)) - goto parse_err; - - if (sscanf(buf, "flags: 0%o", &s->flags) != 1) { - goto parse_err; - } - - if (!fgets(buf, sizeof(buf), f)) - goto parse_err; - if (!fgets(buf, sizeof(buf), f)) - goto parse_err; - - if (!fgets(buf, sizeof(buf), f)) - goto parse_err; - - if (sscanf(buf, "Pid: %d", &s->pid) != 1) - goto parse_err; - ret = 0; - break; - } - - if (ret) - goto parse_err; -err: - fclose(f); - return ret; - -parse_err: - 
pr_perror("Format error"); - goto err; -} - -static int check_pidfd(int fd, struct pidfd_status *old) -{ - struct pidfd_status new; - - if (parse_self_fdinfo(fd, &new)) - return -1; - - show_pidfd("restored", &new); - - if (old->flags != new.flags || old->pid != new.pid) - return -1; - - return 0; -} - -int main(int argc, char* argv[]) -{ - struct pidfd_status old; - int pidfd, ret; - - test_init(argc, argv); - - pidfd = pidfd_open(getpid(), 0); - if (pidfd < 0) { - pr_perror("pidfd_open failed"); - return 1; - } - - parse_self_fdinfo(pidfd, &old); - - show_pidfd("old", &old); - - if (pidfd_send_signal(pidfd, 0, NULL, 0)) { - pr_perror("Could not send signal"); - return 1; - } - - test_daemon(); - test_waitsig(); - - ret = check_pidfd(pidfd, &old); - if (ret) { - fail(); - goto err; - } - - if (pidfd_send_signal(pidfd, 0, NULL, 0)) { - pr_perror("Could not send signal"); - fail(); - goto err; - } - - pass(); - close(pidfd); - return 0; -err: - close(pidfd); - return 1; -} diff --git a/test/zdtm/static/pipe00.c b/test/zdtm/static/pipe00.c index 2c21fbca5..492722573 100644 --- a/test/zdtm/static/pipe00.c +++ b/test/zdtm/static/pipe00.c @@ -79,7 +79,7 @@ int main(int argc, char **argv) goto err; } if (strcmp(TEST_STRING, buf)) { - pr_perror("data corruption"); + pr_perror("data curruption"); goto err; } @@ -109,7 +109,7 @@ int main(int argc, char **argv) } close(11); if (strcmp(TEST_STRING, buf)) { - pr_perror("data corruption"); + pr_perror("data curruption"); return 1; } } diff --git a/test/zdtm/static/pthread00-pac.c b/test/zdtm/static/pthread00-pac.c deleted file mode 120000 index 3ee8dc1f1..000000000 --- a/test/zdtm/static/pthread00-pac.c +++ /dev/null @@ -1 +0,0 @@ -pthread00.c \ No newline at end of file diff --git a/test/zdtm/static/pthread_timers.c b/test/zdtm/static/pthread_timers.c index b1b2a9a23..5246a985f 100644 --- a/test/zdtm/static/pthread_timers.c +++ b/test/zdtm/static/pthread_timers.c @@ -1,6 +1,5 @@ #include #include -#include #include #include 
#include @@ -70,8 +69,7 @@ int main(int argc, char **argv) } if (itimerspec.it_interval.tv_nsec != TEST_INTERVAL_NSEC || itimerspec.it_interval.tv_sec) { - pr_perror("wrong interval: %" PRId64 ":%" PRId64, - (int64_t)itimerspec.it_interval.tv_sec, (int64_t)itimerspec.it_interval.tv_nsec); + pr_perror("wrong interval: %ld:%ld", itimerspec.it_interval.tv_sec, itimerspec.it_interval.tv_nsec); return 1; } diff --git a/test/zdtm/static/pty-console.desc b/test/zdtm/static/pty-console.desc index 3b3e3f73d..fba5e8749 100644 --- a/test/zdtm/static/pty-console.desc +++ b/test/zdtm/static/pty-console.desc @@ -1 +1 @@ -{'flags': 'suid', 'flavor' : 'ns uns', 'ropts': '--mntns-compat-mode'} +{'flags': 'suid', 'flavor' : 'ns uns'} diff --git a/test/zdtm/static/pty03.c b/test/zdtm/static/pty03.c index e57f54557..59672d697 100644 --- a/test/zdtm/static/pty03.c +++ b/test/zdtm/static/pty03.c @@ -40,7 +40,7 @@ int main(int argc, char *argv[]) } if (ioctl(slave, TIOCSCTTY, 1)) { - pr_perror("Can't set a control terminal"); + pr_perror("Can't set a controll terminal"); return 1; } @@ -51,7 +51,7 @@ int main(int argc, char *argv[]) slave = open("/dev/tty", O_RDWR); if (slave == -1) { - pr_perror("Can't open the control terminal"); + pr_perror("Can't open the controll terminal"); return -1; } diff --git a/test/zdtm/static/pty04.c b/test/zdtm/static/pty04.c index 43a880fb4..406fbee4d 100644 --- a/test/zdtm/static/pty04.c +++ b/test/zdtm/static/pty04.c @@ -44,7 +44,7 @@ int main(int argc, char *argv[]) } if (ioctl(slave1, TIOCSCTTY, 1)) { - pr_perror("Can't set a control terminal"); + pr_perror("Can't set a controll terminal"); return 1; } diff --git a/test/zdtm/static/route_rules b/test/zdtm/static/route_rules index 97edf3622..9a735c6a9 100755 --- a/test/zdtm/static/route_rules +++ b/test/zdtm/static/route_rules @@ -31,7 +31,7 @@ do_or_fail() do_start() { - [ -f "$statefile" ] && die "state file $statefile already exists" + [ -f "$statefile" ] && die "state file $statefile aleady exists" 
# Get default route dev_name=`ip route list match 0.0.0.0/0 | sed 's/.*dev \([^ ]*\).*/\1/'` @@ -66,7 +66,7 @@ tmpargs="$(../lib/parseargs.sh --name=$0 \ die "can't parse command line" eval "$tmpargs" -[ -f "$outfile" ] && die "out file $outfile already exists" +[ -f "$outfile" ] && die "out file $outfile aleady exists" # expect "start" or "stop" action=${1:?Specify action$(die 'Specify action')} diff --git a/test/zdtm/static/rseq00.c b/test/zdtm/static/rseq00.c deleted file mode 100644 index 7add7801e..000000000 --- a/test/zdtm/static/rseq00.c +++ /dev/null @@ -1,146 +0,0 @@ -/* - * test for rseq() syscall - * See also https://www.efficios.com/blog/2019/02/08/linux-restartable-sequences/ - * https://github.com/torvalds/linux/commit/d7822b1e24f2df5df98c76f0e94a5416349ff759 - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "zdtmtst.h" - -#ifdef __has_include -#if __has_include("sys/rseq.h") -#include -#endif -#endif - -#if defined(__x86_64__) - -#if defined(RSEQ_SIG) -static inline void *__criu_thread_pointer(void) -{ -#if __GNUC_PREREQ(11, 1) - return __builtin_thread_pointer(); -#else - void *__result; -#ifdef __x86_64__ - __asm__("mov %%fs:0, %0" : "=r"(__result)); -#else - __asm__("mov %%gs:0, %0" : "=r"(__result)); -#endif /* __x86_64__ */ - return __result; -#endif /* !GCC 11 */ -} - -static inline void unregister_glibc_rseq(void) -{ - struct rseq *rseq = (struct rseq *)((char *)__criu_thread_pointer() + __rseq_offset); - unsigned int size = __rseq_size; - - /* hack: mark glibc rseq structure as failed to register */ - rseq->cpu_id = RSEQ_CPU_ID_REGISTRATION_FAILED; - - /* unregister rseq */ - if (__rseq_size < 32) - size = 32; - syscall(__NR_rseq, (void *)rseq, size, 1, RSEQ_SIG); -} -#else -static inline void unregister_glibc_rseq(void) -{ -} -#endif /* defined(RSEQ_SIG) */ - -const char *test_doc = "Check that rseq() basic C/R works"; -const char *test_author = 
"Alexander Mikhalitsyn "; - -/* some useful definitions from kernel uapi */ -#ifndef RSEQ_SIG - -enum rseq_flags { - RSEQ_FLAG_UNREGISTER = (1 << 0), -}; - -struct rseq { - uint32_t cpu_id_start; - uint32_t cpu_id; - uint64_t rseq_cs; - uint32_t flags; -} __attribute__((aligned(4 * sizeof(uint64_t)))); - -#define RSEQ_SIG 0x53053053 - -#endif /* RSEQ_SIG */ - -#ifndef __NR_rseq -#define __NR_rseq 334 -#endif -/* EOF */ - -static __thread volatile struct rseq __rseq_abi; - -static int sys_rseq(volatile struct rseq *rseq_abi, uint32_t rseq_len, int flags, uint32_t sig) -{ - return syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig); -} - -static void register_thread(void) -{ - int rc; - unregister_glibc_rseq(); - rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), 0, RSEQ_SIG); - if (rc) { - fail("Failed to register rseq"); - exit(1); - } -} - -static void check_thread(void) -{ - int rc; - rc = sys_rseq(&__rseq_abi, sizeof(struct rseq), 0, RSEQ_SIG); - if (!(rc && errno == EBUSY)) { - fail("Failed to check rseq %d", rc); - exit(1); - } -} - -int main(int argc, char *argv[]) -{ - test_init(argc, argv); - - register_thread(); - - test_daemon(); - test_waitsig(); - - check_thread(); - - pass(); - return 0; -} - -#else /* #if defined(__x86_64__) */ - -int main(int argc, char *argv[]) -{ - test_init(argc, argv); - skip("Unsupported arch"); - test_daemon(); - test_waitsig(); - pass(); - return 0; -} - -#endif /* #if defined(__x86_64__) */ \ No newline at end of file diff --git a/test/zdtm/static/rseq00.desc b/test/zdtm/static/rseq00.desc deleted file mode 100644 index 0324fa39c..000000000 --- a/test/zdtm/static/rseq00.desc +++ /dev/null @@ -1 +0,0 @@ -{'flavor': 'h', 'arch': 'x86_64', 'feature': 'get_rseq_conf'} diff --git a/test/zdtm/static/s390x_regs_check.c b/test/zdtm/static/s390x_regs_check.c index 82dca0519..8d6b47997 100644 --- a/test/zdtm/static/s390x_regs_check.c +++ b/test/zdtm/static/s390x_regs_check.c @@ -40,13 +40,13 @@ const char *test_author = "Michael Holzheu "; 
* * - Verify that "criu restore" sets the correct register sets * from "criu dump": - * $ zdtm.py run -t zdtm/static/s390x_regs_check + * $ zdtmp.py run -t zdtm/static/s390x_regs_check * * - Verify that dumpee continues running with correct registers after * parasite injection: - * $ zdtm.py run --norst -t zdtm/static/s390x_regs_check - * $ zdtm.py run --norst --pre 2 -t zdtm/static/s390x_regs_check - * $ zdtm.py run --check-only -t zdtm/static/s390x_regs_check + * $ zdtmp.py run --norst -t zdtm/static/s390x_regs_check + * $ zdtmp.py run --norst --pre 2 -t zdtm/static/s390x_regs_check + * $ zdtmp.py run --check-only -t zdtm/static/s390x_regs_check */ #define NR_THREADS 2 #define NR_THREADS_ALL (NR_THREADS + 1) @@ -59,11 +59,11 @@ static int pipefd[2]; */ struct reg_set { const char *name; /* Name of regset */ - int nr; /* Number of regset */ - void *data; /* Test data */ - int len; /* Number of bytes of test data */ - bool optional; /* Not all kernels/machines have this reg set */ - bool available; /* Current kernel/machine has this reg set */ + int nr; /* Number of regset */ + void *data; /* Test data */ + int len; /* Number of bytes of test data */ + bool optional; /* Not all kernels/machines have this reg set */ + bool available; /* Current kernel/machine has this reg set */ }; /* @@ -397,8 +397,8 @@ static inline void send_tid_and_loop(int fd) asm volatile("lgr 2,%0\n" /* Arg 1: fd */ "la 3,%1\n" /* Arg 2: &tid */ - "lghi 4,4\n" /* Arg 3: sizeof(int) */ - "svc 4\n" /* __NR_write SVC: */ + "lghi 4,4\n" /* Arg 3: sizeof(int) */ + "svc 4\n" /* __NR_write SVC: */ /* After SVC no more registers are changed */ "0: j 0b\n" /* Loop here */ : diff --git a/test/zdtm/static/sched_policy00.c b/test/zdtm/static/sched_policy00.c index a35135050..794c11af2 100644 --- a/test/zdtm/static/sched_policy00.c +++ b/test/zdtm/static/sched_policy00.c @@ -13,7 +13,7 @@ const char *test_doc = "Check sched policy to be preserved"; const char *test_author = "Pavel Emelyanov "; -static 
const int param = 3; +static const int parm = 3; static int do_nothing(void) { @@ -50,8 +50,8 @@ int main(int argc, char **argv) } } - p.sched_priority = param; - if (sched_setscheduler(pid, SCHED_RR | SCHED_RESET_ON_FORK, &p)) { + p.sched_priority = parm; + if (sched_setscheduler(pid, SCHED_RR, &p)) { pr_perror("Can't set policy"); kill(pid, SIGKILL); return -1; @@ -61,13 +61,13 @@ int main(int argc, char **argv) test_waitsig(); ret = sched_getscheduler(pid); - if (ret != (SCHED_RR | SCHED_RESET_ON_FORK)) { + if (ret != SCHED_RR) { fail("Broken/No policy"); err++; } ret = sched_getparam(pid, &p); - if (ret < 0 || p.sched_priority != param) { + if (ret < 0 || p.sched_priority != parm) { fail("Broken prio"); err++; } diff --git a/test/zdtm/static/sched_prio00.c b/test/zdtm/static/sched_prio00.c index abdf33983..4bc878786 100644 --- a/test/zdtm/static/sched_prio00.c +++ b/test/zdtm/static/sched_prio00.c @@ -34,20 +34,20 @@ static void kill_all(int *pid, int n) int main(int argc, char **argv) { - int pid[NRTASKS], i, param[NRTASKS], ret; + int pid[NRTASKS], i, parm[NRTASKS], ret; test_init(argc, argv); - param[0] = -20; - param[1] = 19; - param[2] = 1; + parm[0] = -20; + parm[1] = 19; + parm[2] = 1; for (i = 0; i < NRTASKS; i++) { pid[i] = fork(); if (!pid[i]) return do_nothing(); - if (setpriority(PRIO_PROCESS, pid[i], param[i])) { + if (setpriority(PRIO_PROCESS, pid[i], parm[i])) { pr_perror("Can't set prio %d", i); kill_all(pid, i); return -1; @@ -65,7 +65,7 @@ int main(int argc, char **argv) break; } - if (ret != param[i]) { + if (ret != parm[i]) { fail("Broken nice for %d", i); break; } diff --git a/test/zdtm/static/scm00.c b/test/zdtm/static/scm00.c index 670e6fd6a..d66975582 100644 --- a/test/zdtm/static/scm00.c +++ b/test/zdtm/static/scm00.c @@ -105,9 +105,6 @@ int main(int argc, char **argv) p[1] = p[0]; p[0] = -1; #endif -#endif -#ifdef CLOSE_SENDER_FD - close(sk[0]); #endif test_daemon(); diff --git a/test/zdtm/static/scm03-seqpacket.c 
b/test/zdtm/static/scm03-seqpacket.c deleted file mode 120000 index f1f86dd8b..000000000 --- a/test/zdtm/static/scm03-seqpacket.c +++ /dev/null @@ -1 +0,0 @@ -scm03.c \ No newline at end of file diff --git a/test/zdtm/static/scm03.c b/test/zdtm/static/scm03.c index 4453f7e93..a40fc0101 100644 --- a/test/zdtm/static/scm03.c +++ b/test/zdtm/static/scm03.c @@ -9,12 +9,6 @@ const char *test_doc = "Check that SCM_RIGHTS are preserved"; const char *test_author = "Pavel Emelyanov "; -#ifdef ZDTM_UNIX_SEQPACKET -#define SOCK_TYPE SOCK_SEQPACKET -#else -#define SOCK_TYPE SOCK_DGRAM -#endif - static int send_fd(int via, int fd1, int fd2) { struct msghdr h = {}; @@ -111,7 +105,7 @@ int main(int argc, char **argv) test_init(argc, argv); - if (socketpair(PF_UNIX, SOCK_TYPE, 0, sk) < 0) { + if (socketpair(PF_UNIX, SOCK_DGRAM, 0, sk) < 0) { pr_perror("Can't make unix pair"); exit(1); } diff --git a/test/zdtm/static/scm06.desc b/test/zdtm/static/scm06.desc index 38cc3be51..2eac7e654 100644 --- a/test/zdtm/static/scm06.desc +++ b/test/zdtm/static/scm06.desc @@ -1,4 +1 @@ -# This test isn't executed in the host flavor (in the same network namespace, -# because the kernel releases a test socket asynchronously, so the restore -# can fail if it is executed before the kernel actually destroys the socket. 
-{'flags': 'suid', 'flavor': 'ns uns'} +{'flags': 'suid'} diff --git a/test/zdtm/static/scm09.c b/test/zdtm/static/scm09.c deleted file mode 120000 index 4cab0edd2..000000000 --- a/test/zdtm/static/scm09.c +++ /dev/null @@ -1 +0,0 @@ -scm00.c \ No newline at end of file diff --git a/test/zdtm/static/seccomp_filter_inheritance.c b/test/zdtm/static/seccomp_filter_inheritance.c index 5afcb3f84..7a86cd85e 100644 --- a/test/zdtm/static/seccomp_filter_inheritance.c +++ b/test/zdtm/static/seccomp_filter_inheritance.c @@ -100,7 +100,7 @@ int main(int argc, char **argv) if (filter_syscall(__NR_ptrace) < 0) _exit(1); - if (filter_syscall(__NR_statx) < 0) + if (filter_syscall(__NR_fstat) < 0) _exit(1); zdtm_seccomp = 1; diff --git a/test/zdtm/static/seccomp_no_new_privs.c b/test/zdtm/static/seccomp_no_new_privs.c deleted file mode 100644 index 95f9501ed..000000000 --- a/test/zdtm/static/seccomp_no_new_privs.c +++ /dev/null @@ -1,42 +0,0 @@ -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Check that NO_NEW_PRIVS attribute is restored"; -const char *test_author = "Michał Mirosław "; - -int main(int argc, char **argv) -{ - int ret; - - test_init(argc, argv); - - ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); - if (ret < 0) { - pr_perror("Can't read NO_NEW_PRIVS attribute"); - return 1; - } - if (ret != 0) - fail("initial NO_NEW_PRIVS = %d != 0", ret); - - ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); - if (ret) { - pr_perror("Can't set NO_NEW_PRIVS attribute"); - return 1; - } - - test_daemon(); - test_waitsig(); - - ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); - if (ret < 0) { - pr_perror("Can't read NO_NEW_PRIVS attribute"); - return 1; - } - if (ret != 1) - fail("restored NO_NEW_PRIVS = %d != 1", ret); - - pass(); - return 0; -} diff --git a/test/zdtm/static/selinux00.checkskip b/test/zdtm/static/selinux00.checkskip index 4c85647d1..8d946a75e 100755 --- a/test/zdtm/static/selinux00.checkskip +++ b/test/zdtm/static/selinux00.checkskip @@ -2,19 +2,6 @@ 
test -d /sys/fs/selinux || exit 1 -# check if necessary commands are installed -if ! command -v setenforce &>/dev/null; then - exit 1 -fi - -if ! command -v setsebool &>/dev/null; then - exit 1 -fi - -if ! command -v getsebool &>/dev/null; then - exit 1 -fi - # See selinux00.hook for details getsebool unconfined_dyntrans_all > /dev/null 2>&1 diff --git a/test/zdtm/static/session01.c b/test/zdtm/static/session01.c index f6834787e..0f727a9a6 100644 --- a/test/zdtm/static/session01.c +++ b/test/zdtm/static/session01.c @@ -40,22 +40,22 @@ enum { static struct testcase *testcases; static futex_t *fstate; static struct testcase __testcases[] = { - { 2, 1, 2, 1, 2, 1 }, /* session00 */ - { 4, 2, 4, 2, 4, 1 }, /* |\_session00 */ - { 15, 4, 4, 4, 15, 1 }, /* | |\_session00 */ - { 16, 4, 4, 4, 15, 1 }, /* | \_session00 */ - { 17, 4, 4, 4, 17, 0 }, /* | |\_session00 */ - { 18, 4, 4, 4, 17, 1 }, /* | \_session00 */ - { 5, 2, 2, 2, 2, 1 }, /* |\_session00 */ - { 8, 2, 8, 2, 8, 1 }, /* |\_session00 */ - { 9, 8, 2, 2, 2, 1 }, /* | \_session00 */ - { 10, 2, 10, 2, 10, 1 }, /* |\_session00 */ + { 2, 1, 2, 1, 2, 1 }, /* session00 */ + { 4, 2, 4, 2, 4, 1 }, /* |\_session00 */ + { 15, 4, 4, 4, 15, 1 }, /* | |\_session00 */ + { 16, 4, 4, 4, 15, 1 }, /* | \_session00 */ + { 17, 4, 4, 4, 17, 0 }, /* | |\_session00 */ + { 18, 4, 4, 4, 17, 1 }, /* | \_session00 */ + { 5, 2, 2, 2, 2, 1 }, /* |\_session00 */ + { 8, 2, 8, 2, 8, 1 }, /* |\_session00 */ + { 9, 8, 2, 2, 2, 1 }, /* | \_session00 */ + { 10, 2, 10, 2, 10, 1 }, /* |\_session00 */ { 11, 10, 11, 2, 11, 1 }, /* | \_session00 */ - { 12, 11, 2, 2, 2, 1 }, /* | \_session00 */ - { 13, 2, 2, 2, 2, 0 }, /* \_session00 */ - { 3, 13, 2, 2, 2, 1 }, /* session00 */ - { 6, 2, 6, 2, 6, 0 }, /* \_session00 */ - { 14, 6, 6, 6, 6, 1 }, /* session00 */ + { 12, 11, 2, 2, 2, 1 }, /* | \_session00 */ + { 13, 2, 2, 2, 2, 0 }, /* \_session00 */ + { 3, 13, 2, 2, 2, 1 }, /* session00 */ + { 6, 2, 6, 2, 6, 0 }, /* \_session00 */ + { 14, 6, 6, 6, 6, 1 }, /* 
session00 */ }; #define TESTS (sizeof(__testcases) / sizeof(struct testcase)) @@ -154,7 +154,7 @@ static int child(const int c) continue; if (testcases[i].alive) continue; - test_msg("Wait process %d (pid %d)\n", i, testcases[i].master.pid); + test_msg("Wait porcess %d (pid %d)\n", i, testcases[i].master.pid); waitpid(testcases[i].master.pid, NULL, 0); } @@ -244,7 +244,7 @@ int main(int argc, char **argv) continue; if (testcases[i].alive) continue; - test_msg("Wait process %d (pid %d)\n", i, testcases[i].master.pid); + test_msg("Wait porcess %d (pid %d)\n", i, testcases[i].master.pid); waitpid(testcases[i].master.pid, NULL, 0); } diff --git a/test/zdtm/static/shm-hugetlb.c b/test/zdtm/static/shm-hugetlb.c deleted file mode 120000 index 7e1c916a3..000000000 --- a/test/zdtm/static/shm-hugetlb.c +++ /dev/null @@ -1 +0,0 @@ -shm.c \ No newline at end of file diff --git a/test/zdtm/static/shm-hugetlb.checkskip b/test/zdtm/static/shm-hugetlb.checkskip deleted file mode 100755 index df2370815..000000000 --- a/test/zdtm/static/shm-hugetlb.checkskip +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -# will fail with EOPNOTSUPP -cat /proc/sys/vm/nr_hugepages &> /dev/null diff --git a/test/zdtm/static/shm-hugetlb.desc b/test/zdtm/static/shm-hugetlb.desc deleted file mode 100644 index dfe829b84..000000000 --- a/test/zdtm/static/shm-hugetlb.desc +++ /dev/null @@ -1 +0,0 @@ -{'flavor': 'ns', 'flags': 'suid'} diff --git a/test/zdtm/static/shm-mp.c b/test/zdtm/static/shm-mp.c index c95f3d84c..1929dac19 100644 --- a/test/zdtm/static/shm-mp.c +++ b/test/zdtm/static/shm-mp.c @@ -33,12 +33,10 @@ static int check_prot(char *ptr, char val, int prot) fail("PROT_READ bypassed"); return -1; } - } else { - /* we come here on return from SIGSEGV handler */ + } else /* we come here on return from SIGSEGV handler */ if (prot & PROT_READ) { - fail("PROT_READ rejected"); - return -1; - } + fail("PROT_READ rejected"); + return -1; } if (!sigsetjmp(segv_ret, 1)) { @@ -47,12 +45,10 @@ static int 
check_prot(char *ptr, char val, int prot) fail("PROT_WRITE bypassed"); return -1; } - } else { - /* we come here on return from SIGSEGV handler */ + } else /* we come here on return from SIGSEGV handler */ if (prot & PROT_WRITE) { - fail("PROT_WRITE rejected"); - return -1; - } + fail("PROT_WRITE rejected"); + return -1; } if (signal(SIGSEGV, SIG_DFL) == SIG_ERR) { diff --git a/test/zdtm/static/shm.c b/test/zdtm/static/shm.c index 423658397..3da29daeb 100644 --- a/test/zdtm/static/shm.c +++ b/test/zdtm/static/shm.c @@ -56,17 +56,13 @@ static int get_shm_seg(int key, size_t size, unsigned int flags) static int prepare_shm(int key, size_t size) { - int id, flag = 0; + int id; -#ifdef ZDTM_HUGETLB - flag = SHM_HUGETLB; -#endif - - id = get_shm_seg(key, size, IPC_CREAT | IPC_EXCL | flag); + id = get_shm_seg(key, shmem_size, IPC_CREAT | IPC_EXCL); if (id == -1) { return -1; } - if (fill_shm_seg(id, size) < 0) + if (fill_shm_seg(id, shmem_size) < 0) return -1; return id; } @@ -109,7 +105,6 @@ int main(int argc, char **argv) int shm; int fail_count = 0; int ret = -1; - dev_t dev; void *mem; uint32_t crc = INIT_CRC; @@ -138,12 +133,6 @@ int main(int argc, char **argv) goto out; } - dev = get_mapping_dev(mem); - if (dev == (dev_t)-1) { - fail("Can't get mapping dev"); - return -1; - } - test_daemon(); test_waitsig(); @@ -166,11 +155,6 @@ int main(int argc, char **argv) return -1; } - if (dev != get_mapping_dev(mem)) { - fail("Mapping dev mismatch"); - return -1; - } - if (shmdt(mem) < 0) { pr_perror("Can't detach shm"); return -1; diff --git a/test/zdtm/static/sigpending.c b/test/zdtm/static/sigpending.c index ce03ff55c..1641fdd86 100644 --- a/test/zdtm/static/sigpending.c +++ b/test/zdtm/static/sigpending.c @@ -18,7 +18,7 @@ static int numsig; #define TESTSIG (SIGRTMAX) #define THREADSIG (SIGRTMIN) static siginfo_t share_infos[2]; -static siginfo_t self_infos[64]; /* self */ +static siginfo_t self_infos[64]; /* self */ static siginfo_t thread_infos[3]; /* thread */ static 
int share_nr; static int self_nr; diff --git a/test/zdtm/static/sigtrap.c b/test/zdtm/static/sigtrap.c index 4df80e9c0..6bd30aad4 100644 --- a/test/zdtm/static/sigtrap.c +++ b/test/zdtm/static/sigtrap.c @@ -74,7 +74,7 @@ int main(int argc, char *argv[]) } if (act.sa_handler != sigh) { - fail("unexpected sighanl handler"); + fail("unexpected sighanl hanlder"); exit(1); } diff --git a/test/zdtm/static/sk-unix-dgram-ghost.c b/test/zdtm/static/sk-unix-dgram-ghost.c index 7c116fb19..fdf8fb18b 100644 --- a/test/zdtm/static/sk-unix-dgram-ghost.c +++ b/test/zdtm/static/sk-unix-dgram-ghost.c @@ -198,11 +198,11 @@ int main(int argc, char **argv) test_msg("C/R complete\n"); - /* Let children send data to server socket. */ + /* let childs to send data to server socket */ for (i = 0; i < PROCESSES_NUM; i++) task_waiter_complete(&t[i], 2); - /* Wait for children to send data. */ + /* wait childs for send data */ for (i = 0; i < PROCESSES_NUM; i++) task_waiter_wait4(&t[i], 3); diff --git a/test/zdtm/static/sk-unix-listen01.c b/test/zdtm/static/sk-unix-listen01.c deleted file mode 100644 index 5c9274acb..000000000 --- a/test/zdtm/static/sk-unix-listen01.c +++ /dev/null @@ -1,117 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Test in-flight unix sockets with data in them\n"; -const char *test_author = "Andrei Vagin "; - -#define SK_DATA "packet" - -char *filename; -TEST_OPTION(filename, string, "socket file name", 1); - -#define TEST_MODE 0640 - -#ifdef ZDTM_UNIX_SEQPACKET -#define SOCK_TYPE SOCK_SEQPACKET -#else -#define SOCK_TYPE SOCK_STREAM -#endif - -int main(int argc, char *argv[]) -{ - struct sockaddr_un addr; - unsigned int addrlen; - int ssk, sk; - - char path[PATH_MAX]; - char *cwd; - int ret; - - test_init(argc, argv); - - cwd = get_current_dir_name(); - if (!cwd) - return pr_perror("get_current_dir_name"); - - snprintf(path, 
sizeof(path), "%s/%s", cwd, filename); - unlink(path); - - addr.sun_family = AF_UNIX; - addrlen = strlen(filename); - if (addrlen > sizeof(addr.sun_path)) - return pr_err("address is too long"); - memcpy(addr.sun_path, filename, addrlen); - addrlen += sizeof(addr.sun_family); - - ssk = socket(AF_UNIX, SOCK_TYPE, 0); - if (ssk == -1) - return pr_perror("socket"); - - sk = socket(AF_UNIX, SOCK_TYPE, 0); - if (sk < 0) - return pr_perror("socket"); - - ret = bind(ssk, (struct sockaddr *)&addr, addrlen); - if (ret) - return pr_perror("bind"); - - ret = listen(ssk, 16); - if (ret) - return pr_perror("listen"); - - if (connect(sk, (struct sockaddr *)&addr, addrlen)) - return pr_perror("connect"); - -#ifdef SK_UNIX_LISTEN02 - { - char buf[64]; - memset(buf, 0, sizeof(buf)); - write(sk, SK_DATA, sizeof(SK_DATA)); - } -#endif - -#ifdef SK_UNIX_LISTEN03 - close(sk); - sk = -1; -#endif - - test_daemon(); - test_waitsig(); - - if (sk != -1) - close(sk); - - ret = accept(ssk, NULL, NULL); - if (ret < 0) - return fail("accept"); - -#ifdef SK_UNIX_LISTEN02 - { - char buf[64]; - if (read(ret, &buf, sizeof(buf)) != sizeof(SK_DATA)) - return pr_perror("read"); - - if (strcmp(buf, SK_DATA)) - return fail("data corrupted"); - } -#endif - - close(ssk); - unlink(path); - - pass(); - return 0; -} diff --git a/test/zdtm/static/sk-unix-listen02.c b/test/zdtm/static/sk-unix-listen02.c deleted file mode 120000 index 1211f4666..000000000 --- a/test/zdtm/static/sk-unix-listen02.c +++ /dev/null @@ -1 +0,0 @@ -sk-unix-listen01.c \ No newline at end of file diff --git a/test/zdtm/static/sk-unix-listen03.c b/test/zdtm/static/sk-unix-listen03.c deleted file mode 120000 index 1211f4666..000000000 --- a/test/zdtm/static/sk-unix-listen03.c +++ /dev/null @@ -1 +0,0 @@ -sk-unix-listen01.c \ No newline at end of file diff --git a/test/zdtm/static/sk-unix-listen03.desc b/test/zdtm/static/sk-unix-listen03.desc deleted file mode 100644 index ded89879a..000000000 --- a/test/zdtm/static/sk-unix-listen03.desc 
+++ /dev/null @@ -1 +0,0 @@ -{'flags': 'crfail'} diff --git a/test/zdtm/static/sk-unix-listen04.c b/test/zdtm/static/sk-unix-listen04.c deleted file mode 120000 index 1211f4666..000000000 --- a/test/zdtm/static/sk-unix-listen04.c +++ /dev/null @@ -1 +0,0 @@ -sk-unix-listen01.c \ No newline at end of file diff --git a/test/zdtm/static/sk-unix-listen04.desc b/test/zdtm/static/sk-unix-listen04.desc deleted file mode 100644 index ded89879a..000000000 --- a/test/zdtm/static/sk-unix-listen04.desc +++ /dev/null @@ -1 +0,0 @@ -{'flags': 'crfail'} diff --git a/test/zdtm/static/sk-unix-rel-seqpacket.c b/test/zdtm/static/sk-unix-rel-seqpacket.c deleted file mode 120000 index 1f98e3845..000000000 --- a/test/zdtm/static/sk-unix-rel-seqpacket.c +++ /dev/null @@ -1 +0,0 @@ -sk-unix-rel.c \ No newline at end of file diff --git a/test/zdtm/static/sk-unix-rel.c b/test/zdtm/static/sk-unix-rel.c index 7e4aeafe6..10c19080a 100644 --- a/test/zdtm/static/sk-unix-rel.c +++ b/test/zdtm/static/sk-unix-rel.c @@ -25,12 +25,6 @@ TEST_OPTION(filename, string, "socket file name", 1); #define TEST_MODE 0640 -#ifdef ZDTM_UNIX_SEQPACKET -#define SOCK_TYPE SOCK_SEQPACKET -#else -#define SOCK_TYPE SOCK_STREAM -#endif - int main(int argc, char *argv[]) { struct sockaddr_un addr; @@ -60,8 +54,8 @@ int main(int argc, char *argv[]) memcpy(addr.sun_path, filename, addrlen); addrlen += sizeof(addr.sun_family); - sock[0] = socket(AF_UNIX, SOCK_TYPE, 0); - sock[1] = socket(AF_UNIX, SOCK_TYPE, 0); + sock[0] = socket(AF_UNIX, SOCK_STREAM, 0); + sock[1] = socket(AF_UNIX, SOCK_STREAM, 0); if (sock[0] < 0 || sock[1] < 0) { fail("socket"); exit(1); diff --git a/test/zdtm/static/sk-unix-restore-fs-share.c b/test/zdtm/static/sk-unix-restore-fs-share.c deleted file mode 100644 index d4f6dde75..000000000 --- a/test/zdtm/static/sk-unix-restore-fs-share.c +++ /dev/null @@ -1,196 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Test 
non-empty process group with terminated parent and unix socket"; -const char *test_author = "Qiao Ma "; - -char *filename; -TEST_OPTION(filename, string, "socket file name", 1); - -static int create_and_connect(void) -{ - struct sockaddr_un addr; - int client_fd; - - client_fd = socket(AF_UNIX, SOCK_STREAM, 0); - if (client_fd == -1) { - pr_perror("socket"); - return -1; - } - - memset(&addr, 0, sizeof(addr)); - addr.sun_family = AF_UNIX; - if (snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", filename) >= (int)sizeof(addr.sun_path)) { - pr_err("Socket path too long\n"); - close(client_fd); - return -1; - } - - if (connect(client_fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { - pr_perror("connect"); - close(client_fd); - return -1; - } - - return 0; -} - -static int child(int ready_fd) -{ - int listen_fd; - struct sockaddr_un addr; - int ret = EXIT_FAILURE; - - listen_fd = socket(AF_UNIX, SOCK_STREAM, 0); - if (listen_fd == -1) { - pr_perror("socket"); - return EXIT_FAILURE; - } - - memset(&addr, 0, sizeof(addr)); - addr.sun_family = AF_UNIX; - if (strlen(filename) >= sizeof(addr.sun_path)) { - pr_err("Socket path too long\n"); - goto cleanup; - } - strncpy(addr.sun_path, filename, sizeof(addr.sun_path)); - - unlink(filename); /* Ignore error if file doesn't exist */ - - if (bind(listen_fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { - pr_perror("bind"); - goto cleanup; - } - - if (listen(listen_fd, 5) == -1) { - pr_perror("listen"); - goto cleanup; - } - - if (create_and_connect() != 0) { - pr_err("Failed to create and connect\n"); - goto cleanup; - } - - /* Signal parent that socket is ready */ - if (write(ready_fd, "1", 1) != 1) { - pr_perror("write ready_fd"); - goto cleanup; - } - - /* Wait indefinitely */ - pause(); - - ret = EXIT_SUCCESS; -cleanup: - if (listen_fd != -1) - close(listen_fd); - unlink(filename); - - return ret; -} - -static int zombie_leader(int *cpid) -{ - char buf; - pid_t pid; - int pipefd[2]; - - if (pipe(pipefd) == -1) { - 
pr_perror("pipe"); - return EXIT_FAILURE; - } - - if (setpgid(0, 0) == -1) { - pr_perror("setpgid"); - return EXIT_FAILURE; - } - - pid = fork(); - if (pid < 0) { - pr_perror("Failed to fork child"); - return EXIT_FAILURE; - } - - if (pid == 0) { - /* Close read end */ - close(pipefd[0]); - exit(child(pipefd[1])); - } - - /* Close write end in parent */ - close(pipefd[1]); - - /* Wait for child to set up socket */ - if (read(pipefd[0], &buf, 1) != 1) { - pr_err("Failed to receive readiness signal from child\n"); - close(pipefd[0]); - return EXIT_FAILURE; - } - close(pipefd[0]); - - *cpid = pid; - return EXIT_SUCCESS; -} - -int main(int argc, char **argv) -{ - int ret = EXIT_FAILURE, status; - pid_t pid; - int *cpid; - - test_init(argc, argv); - - cpid = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); - if (cpid == MAP_FAILED) { - pr_perror("mmap"); - return EXIT_FAILURE; - } - *cpid = 0; - - pid = fork(); - if (pid < 0) { - pr_perror("Failed to fork zombie"); - goto out; - } - - if (pid == 0) - exit(zombie_leader(cpid)); - - if (waitpid(pid, &status, 0) < 0) { - pr_perror("Failed to waitpid zombie"); - goto out; - } - - if (!WIFEXITED(status) || WEXITSTATUS(status) != EXIT_SUCCESS) { - pr_err("Unexpected exit code: %d\n", WEXITSTATUS(status)); - goto out; - } - - if (!*cpid) { - pr_err("Don't know grandchild's pid\n"); - goto out; - } - - test_daemon(); - test_waitsig(); - - ret = EXIT_SUCCESS; - pass(); -out: - /* Clean up */ - if (*cpid) - kill(*cpid, SIGKILL); - - munmap(cpid, sizeof(int)); - - return ret; -} diff --git a/test/zdtm/static/sk-unix-restore-fs-share.desc b/test/zdtm/static/sk-unix-restore-fs-share.desc deleted file mode 100644 index 6c4afe5f0..000000000 --- a/test/zdtm/static/sk-unix-restore-fs-share.desc +++ /dev/null @@ -1 +0,0 @@ -{'flavor': 'ns uns'} diff --git a/test/zdtm/static/sk-unix-unconn-seqpacket.c b/test/zdtm/static/sk-unix-unconn-seqpacket.c deleted file mode 120000 index f5c276186..000000000 --- 
a/test/zdtm/static/sk-unix-unconn-seqpacket.c +++ /dev/null @@ -1 +0,0 @@ -sk-unix-unconn.c \ No newline at end of file diff --git a/test/zdtm/static/sk-unix-unconn.c b/test/zdtm/static/sk-unix-unconn.c index 62e48247f..caad3d315 100644 --- a/test/zdtm/static/sk-unix-unconn.c +++ b/test/zdtm/static/sk-unix-unconn.c @@ -9,12 +9,6 @@ const char *test_doc = "Check unconnected unix sockets"; const char *test_author = "Vagin Andrew "; -#ifdef ZDTM_UNIX_SEQPACKET -#define SOCK_TYPE SOCK_SEQPACKET -#else -#define SOCK_TYPE SOCK_STREAM -#endif - int main(int argc, char **argv) { int sk, skc; @@ -25,13 +19,13 @@ int main(int argc, char **argv) test_init(argc, argv); - sk = socket(AF_UNIX, SOCK_TYPE, 0); + sk = socket(AF_UNIX, SOCK_STREAM, 0); if (sk == -1) { pr_perror("socket"); return 1; } - skc = socket(AF_UNIX, SOCK_TYPE, 0); + skc = socket(AF_UNIX, SOCK_STREAM, 0); if (skc == -1) { pr_perror("socket"); return 1; diff --git a/test/zdtm/static/sk-unix01-seqpacket.c b/test/zdtm/static/sk-unix01-seqpacket.c deleted file mode 120000 index bef734ed6..000000000 --- a/test/zdtm/static/sk-unix01-seqpacket.c +++ /dev/null @@ -1 +0,0 @@ -sk-unix01.c \ No newline at end of file diff --git a/test/zdtm/static/sk-unix01-seqpacket.desc b/test/zdtm/static/sk-unix01-seqpacket.desc deleted file mode 120000 index 7a30da25c..000000000 --- a/test/zdtm/static/sk-unix01-seqpacket.desc +++ /dev/null @@ -1 +0,0 @@ -sk-unix01.desc \ No newline at end of file diff --git a/test/zdtm/static/sk-unix01.c b/test/zdtm/static/sk-unix01.c index 5146c027f..c2bb8b9ed 100644 --- a/test/zdtm/static/sk-unix01.c +++ b/test/zdtm/static/sk-unix01.c @@ -24,12 +24,6 @@ const char *test_author = "Cyrill Gorcunov "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); -#ifdef ZDTM_UNIX_SEQPACKET -#define SOCK_TYPE SOCK_SEQPACKET -#else -#define SOCK_TYPE SOCK_STREAM -#endif - static int sk_alloc_bind(int type, struct sockaddr_un *addr) { int sk; @@ -262,7 +256,7 @@ int main(int argc, char **argv) 
unlink(addr.sun_path); - sk_st[0] = sk_alloc_bind(SOCK_TYPE, &addr); + sk_st[0] = sk_alloc_bind(SOCK_STREAM, &addr); if (sk_st[0] < 0) return 1; test_msg("sk-st: alloc/bind/listen %d\n", sk_st[0]); @@ -272,7 +266,7 @@ int main(int argc, char **argv) return 1; } - sk_st[1] = sk_alloc_connect(SOCK_TYPE, &addr); + sk_st[1] = sk_alloc_connect(SOCK_STREAM, &addr); if (sk_st[1] < 0) return 1; test_msg("sk-st: alloc/connect %d\n", sk_st[1]); @@ -285,7 +279,7 @@ int main(int argc, char **argv) } test_msg("sk-st: accept %d\n", sk_st[2]); - sk_st[3] = sk_alloc_connect(SOCK_TYPE, &addr); + sk_st[3] = sk_alloc_connect(SOCK_STREAM, &addr); if (sk_st[3] < 0) return 1; test_msg("sk-st: alloc/connect %d\n", sk_st[3]); diff --git a/test/zdtm/static/sock_ip_opts00.c b/test/zdtm/static/sock_ip_opts00.c deleted file mode 100644 index cb464365d..000000000 --- a/test/zdtm/static/sock_ip_opts00.c +++ /dev/null @@ -1,114 +0,0 @@ -#include -#include -#include - -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Check that different ip socket options are restored"; -const char *test_author = "Pavel Tikhomirov "; - -#ifdef ZDTM_VAL_ZERO -#define IP_OPT_VAL 0 -#else -#define IP_OPT_VAL 1 -#endif - -struct sk_opt { - int level; - int opt; - int val; -}; - -struct sk_opt sk_opts_v4[] = { - { SOL_IP, IP_FREEBIND, IP_OPT_VAL }, - { SOL_IP, IP_PKTINFO, IP_OPT_VAL }, - { SOL_IP, IP_TTL, 32 }, - { SOL_IP, IP_TOS, IPTOS_TOS(IPTOS_THROUGHPUT) }, -}; - -#ifndef IPV6_FREEBIND -#define IPV6_FREEBIND 78 -#endif - -struct sk_opt sk_opts_v6[] = { - { SOL_IPV6, IPV6_FREEBIND, IP_OPT_VAL }, - { SOL_IPV6, IPV6_RECVPKTINFO, IP_OPT_VAL }, -}; - -struct sk_conf { - int domain; - int type; - int protocol; - int sk; -} sk_confs[] = { - { AF_INET, SOCK_DGRAM, IPPROTO_UDP }, - { AF_INET, SOCK_RAW, IPPROTO_UDP }, - { AF_INET6, SOCK_DGRAM, IPPROTO_UDP }, - { AF_INET6, SOCK_RAW, IPPROTO_UDP }, -}; - -int main(int argc, char **argv) -{ - struct sk_opt *opts; - int exit_code = 1; - int i, j, 
val; - socklen_t len; - int n_opts; - - test_init(argc, argv); - - for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { - sk_confs[i].sk = socket(sk_confs[i].domain, sk_confs[i].type, sk_confs[i].protocol); - if (sk_confs[i].sk == -1) { - pr_perror("socket(%d,%d,%d) failed", sk_confs[i].domain, sk_confs[i].type, - sk_confs[i].protocol); - goto close; - } - } - - for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { - opts = sk_confs[i].domain == AF_INET ? sk_opts_v4 : sk_opts_v6; - n_opts = sk_confs[i].domain == AF_INET ? ARRAY_SIZE(sk_opts_v4) : ARRAY_SIZE(sk_opts_v6); - - for (j = 0; j < n_opts; j++) { - val = opts[j].val; - if (setsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, sizeof(int)) == -1) { - pr_perror("setsockopt(%d, %d) failed", opts[j].level, opts[j].opt); - goto close; - } - } - } - - test_daemon(); - test_waitsig(); - - for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { - opts = sk_confs[i].domain == AF_INET ? sk_opts_v4 : sk_opts_v6; - n_opts = sk_confs[i].domain == AF_INET ? ARRAY_SIZE(sk_opts_v4) : ARRAY_SIZE(sk_opts_v6); - - for (j = 0; j < n_opts; j++) { - len = sizeof(int); - if (getsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, &len) == -1) { - pr_perror("getsockopt(%d, %d) failed", opts[j].level, opts[j].opt); - goto close; - } - - if (val != opts[j].val) { - fail("Unexpected value socket(%d,%d,%d) opts(%d,%d)", sk_confs[i].domain, - sk_confs[i].type, sk_confs[i].protocol, opts[j].level, opts[j].opt); - goto close; - } - } - } - - pass(); - exit_code = 0; -close: - for (i = 0; i < ARRAY_SIZE(sk_confs); i++) - close(sk_confs[i].sk); - return exit_code; -} diff --git a/test/zdtm/static/sock_ip_opts00.desc b/test/zdtm/static/sock_ip_opts00.desc deleted file mode 100644 index 2201f0298..000000000 --- a/test/zdtm/static/sock_ip_opts00.desc +++ /dev/null @@ -1 +0,0 @@ -{'flags': 'suid', 'feature': 'ipv6_freebind'} diff --git a/test/zdtm/static/sock_ip_opts01.c b/test/zdtm/static/sock_ip_opts01.c deleted file mode 120000 index 15526f808..000000000 
--- a/test/zdtm/static/sock_ip_opts01.c +++ /dev/null @@ -1 +0,0 @@ -sock_ip_opts00.c \ No newline at end of file diff --git a/test/zdtm/static/sock_ip_opts01.desc b/test/zdtm/static/sock_ip_opts01.desc deleted file mode 120000 index e2c29ca25..000000000 --- a/test/zdtm/static/sock_ip_opts01.desc +++ /dev/null @@ -1 +0,0 @@ -sock_ip_opts00.desc \ No newline at end of file diff --git a/test/zdtm/static/sock_opts00.c b/test/zdtm/static/sock_opts00.c index 854aaa591..5b4624f6d 100644 --- a/test/zdtm/static/sock_opts00.c +++ b/test/zdtm/static/sock_opts00.c @@ -12,28 +12,22 @@ const char *test_author = "Pavel Emelyanov "; #define TEST_PORT 59687 #define TEST_ADDR INADDR_ANY +#define NOPTS 8 + int main(int argc, char **argv) { - #define OPT(x) { x, #x } - static const struct { - int opt; - const char *name; - } vname[] = { - OPT(SO_PRIORITY), - OPT(SO_RCVLOWAT), - OPT(SO_MARK), - OPT(SO_PASSCRED), - OPT(SO_PASSSEC), - OPT(SO_DONTROUTE), - OPT(SO_NO_CHECK), - OPT(SO_OOBINLINE), - }; - static const int NOPTS = sizeof(vname) / sizeof(*vname); - #undef OPT - - int sock, usock, sk, ret = 0, val[NOPTS], rval, i; + int sock, ret = 0, vname[NOPTS], val[NOPTS], rval, i; socklen_t len = sizeof(int); + vname[0] = SO_PRIORITY; + vname[1] = SO_RCVLOWAT; + vname[2] = SO_MARK; + vname[3] = SO_PASSCRED; + vname[4] = SO_PASSSEC; + vname[5] = SO_DONTROUTE; + vname[6] = SO_NO_CHECK; + vname[7] = SO_OOBINLINE; + test_init(argc, argv); sock = socket(PF_INET, SOCK_STREAM, 0); @@ -42,37 +36,30 @@ int main(int argc, char **argv) return 1; } - usock = socket(AF_UNIX, SOCK_STREAM, 0); - if (usock < 0) { - pr_perror("can't create unix socket"); - return 1; - } - for (i = 0; i < NOPTS; i++) { - sk = vname[i].opt == SO_PASSCRED || vname[i].opt == SO_PASSSEC ? 
usock : sock; - ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &val[i], &len); + ret = getsockopt(sock, SOL_SOCKET, vname[i], &val[i], &len); if (ret) { - pr_perror("can't get %s", vname[i].name); + pr_perror("can't get option %d", i); return 1; } val[i]++; - ret = setsockopt(sk, SOL_SOCKET, vname[i].opt, &val[i], len); + ret = setsockopt(sock, SOL_SOCKET, vname[i], &val[i], len); if (ret) { - pr_perror("can't set %s = %d", vname[i].name, val[i]); + pr_perror("can't set option %d", i); return 1; } - ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &rval, &len); + ret = getsockopt(sock, SOL_SOCKET, vname[i], &rval, &len); if (ret) { - pr_perror("can't re-get %s", vname[i].name); + pr_perror("can't get option %d 2", i); return 1; } if (rval != val[i]) { if (rval + 1 == val[i]) { - pr_perror("failed to set %s: want %d have %d", vname[i].name, val[i], rval); + pr_perror("can't reset option %d want %d have %d", i, val[i], rval); return 1; } @@ -85,23 +72,20 @@ int main(int argc, char **argv) test_waitsig(); for (i = 0; i < NOPTS; i++) { - sk = vname[i].opt == SO_PASSCRED || vname[i].opt == SO_PASSSEC ? 
usock : sock; - ret = getsockopt(sk, SOL_SOCKET, vname[i].opt, &rval, &len); + ret = getsockopt(sock, SOL_SOCKET, vname[i], &rval, &len); if (ret) { - pr_perror("can't verify %s", vname[i].name); + pr_perror("can't get option %d again", i); return 1; } if (val[i] != rval) { - errno = 0; - fail("%s changed: %d -> %d", vname[i].name, val[i], rval); + fail("option %d changed", i); return 1; } } pass(); close(sock); - close(usock); return 0; } diff --git a/test/zdtm/static/sock_opts02.c b/test/zdtm/static/sock_opts02.c deleted file mode 100644 index 58c6d4712..000000000 --- a/test/zdtm/static/sock_opts02.c +++ /dev/null @@ -1,118 +0,0 @@ -#include -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Check that SO_BUF_LOCK option dumped"; -const char *test_author = "Pavel Tikhomirov "; - -#ifndef SO_BUF_LOCK -#define SO_BUF_LOCK 72 -#endif - -#ifndef SOCK_SNDBUF_LOCK -#define SOCK_SNDBUF_LOCK 1 -#endif -#ifndef SOCK_RCVBUF_LOCK -#define SOCK_RCVBUF_LOCK 2 -#endif - -#define BUFSIZE 16384 - -struct sk_opt { - int type; - uint32_t val; - uint32_t lock; -} sk_opts[] = { { SO_BUF_LOCK, 0, 0 }, - { SO_BUF_LOCK, SOCK_SNDBUF_LOCK, SOCK_SNDBUF_LOCK }, - { SO_BUF_LOCK, SOCK_RCVBUF_LOCK, SOCK_RCVBUF_LOCK }, - { SO_BUF_LOCK, SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK, SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK }, - { SO_SNDBUF, BUFSIZE, SOCK_SNDBUF_LOCK }, - { SO_RCVBUF, BUFSIZE, SOCK_RCVBUF_LOCK } }; - -#define NSOCK ARRAY_SIZE(sk_opts) - -char *type_to_str(int type) -{ - switch (type) { - case SO_BUF_LOCK: - return "SO_BUF_LOCK"; - case SO_SNDBUFFORCE: - return "SO_SNDBUFFORCE"; - case SO_RCVBUFFORCE: - return "SO_RCVBUFFORCE"; - } - return NULL; -} - -int main(int argc, char **argv) -{ - int sock[NSOCK]; - int ret, i; - int exit_code = 1; - - test_init(argc, argv); - - for (i = 0; i < NSOCK; i++) - sock[i] = -1; - - for (i = 0; i < NSOCK; i++) { - uint32_t tmp; - socklen_t len; - - sock[i] = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); - if (sock[i] < 0) { 
- pr_perror("can't create socket %d", i); - goto err; - } - - ret = setsockopt(sock[i], SOL_SOCKET, sk_opts[i].type, &sk_opts[i].val, sizeof(sk_opts[i].val)); - if (ret < 0) { - pr_perror("can't set %s (%u) on socket %d", type_to_str(sk_opts[i].type), sk_opts[i].val, i); - goto err; - } - - len = sizeof(tmp); - ret = getsockopt(sock[i], SOL_SOCKET, SO_BUF_LOCK, &tmp, &len); - if (ret < 0) { - pr_perror("can't get SO_BUF_LOCK from socket %d", i); - goto err; - } - - if (tmp != sk_opts[i].lock) { - fail("SO_BUF_LOCK mismatch %u != %u", tmp, sk_opts[i].lock); - goto err; - } - } - - test_daemon(); - test_waitsig(); - - for (i = 0; i < NSOCK; i++) { - uint32_t tmp; - socklen_t len; - - len = sizeof(tmp); - ret = getsockopt(sock[i], SOL_SOCKET, SO_BUF_LOCK, &tmp, &len); - if (ret < 0) { - pr_perror("can't get SO_BUF_LOCK from socket %d", i); - goto err; - } - - if (tmp != sk_opts[i].lock) { - fail("SO_BUF_LOCK mismatch %u != %u", tmp, sk_opts[i].lock); - goto err; - } - } - - pass(); - exit_code = 0; -err: - for (i = 0; i < NSOCK; i++) - close(sock[i]); - - return exit_code; -} diff --git a/test/zdtm/static/sock_opts02.desc b/test/zdtm/static/sock_opts02.desc deleted file mode 100644 index 37d3a6354..000000000 --- a/test/zdtm/static/sock_opts02.desc +++ /dev/null @@ -1 +0,0 @@ -{'flags': 'suid', 'feature': 'sockopt_buf_lock'} diff --git a/test/zdtm/static/sock_tcp_opts00.c b/test/zdtm/static/sock_tcp_opts00.c deleted file mode 100644 index 8061bc9ea..000000000 --- a/test/zdtm/static/sock_tcp_opts00.c +++ /dev/null @@ -1,96 +0,0 @@ -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Check that different tcp socket options are restored"; -const char *test_author = "Juntong Deng "; - -#ifdef ZDTM_VAL_ZERO -#define TCP_OPT_VAL 0 -#else -#define TCP_OPT_VAL 1 -#endif - -#ifndef SOL_TCP -#define SOL_TCP 6 -#endif - -struct sk_opt { - int level; - int opt; - int val; -}; - -struct sk_opt tcp_sk_opts[] = { - { SOL_TCP, TCP_CORK, 
TCP_OPT_VAL }, - { SOL_TCP, TCP_NODELAY, TCP_OPT_VAL }, -}; - -struct sk_conf { - int domain; - int type; - int protocol; - int sk; -} sk_confs[] = { - { AF_INET, SOCK_STREAM, IPPROTO_TCP }, - { AF_INET6, SOCK_STREAM, IPPROTO_TCP }, -}; - -int main(int argc, char **argv) -{ - struct sk_opt *opts = tcp_sk_opts; - int n_opts = ARRAY_SIZE(tcp_sk_opts); - int exit_code = 1; - int i, j, val; - socklen_t len; - - test_init(argc, argv); - - for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { - sk_confs[i].sk = socket(sk_confs[i].domain, sk_confs[i].type, sk_confs[i].protocol); - if (sk_confs[i].sk == -1) { - pr_perror("socket(%d,%d,%d) failed", sk_confs[i].domain, sk_confs[i].type, - sk_confs[i].protocol); - goto close; - } - } - - for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { - for (j = 0; j < n_opts; j++) { - val = opts[j].val; - if (setsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, sizeof(int)) == -1) { - pr_perror("setsockopt(%d, %d) failed", opts[j].level, opts[j].opt); - goto close; - } - } - } - - test_daemon(); - test_waitsig(); - - for (i = 0; i < ARRAY_SIZE(sk_confs); i++) { - for (j = 0; j < n_opts; j++) { - len = sizeof(int); - if (getsockopt(sk_confs[i].sk, opts[j].level, opts[j].opt, &val, &len) == -1) { - pr_perror("getsockopt(%d, %d) failed", opts[j].level, opts[j].opt); - goto close; - } - - if (val != opts[j].val) { - fail("Unexpected value socket(%d,%d,%d) opts(%d,%d)", sk_confs[i].domain, - sk_confs[i].type, sk_confs[i].protocol, opts[j].level, opts[j].opt); - goto close; - } - } - } - - pass(); - exit_code = 0; -close: - for (i = 0; i < ARRAY_SIZE(sk_confs); i++) - close(sk_confs[i].sk); - return exit_code; -} diff --git a/test/zdtm/static/sock_tcp_opts00.desc b/test/zdtm/static/sock_tcp_opts00.desc deleted file mode 100644 index 2eac7e654..000000000 --- a/test/zdtm/static/sock_tcp_opts00.desc +++ /dev/null @@ -1 +0,0 @@ -{'flags': 'suid'} diff --git a/test/zdtm/static/sock_tcp_opts01.c b/test/zdtm/static/sock_tcp_opts01.c deleted file mode 120000 
index 5219c2e98..000000000 --- a/test/zdtm/static/sock_tcp_opts01.c +++ /dev/null @@ -1 +0,0 @@ -./sock_tcp_opts00.c \ No newline at end of file diff --git a/test/zdtm/static/sock_tcp_opts01.desc b/test/zdtm/static/sock_tcp_opts01.desc deleted file mode 120000 index fb1dfdcd1..000000000 --- a/test/zdtm/static/sock_tcp_opts01.desc +++ /dev/null @@ -1 +0,0 @@ -./sock_tcp_opts00.desc \ No newline at end of file diff --git a/test/zdtm/static/socket-tcp-close2.c b/test/zdtm/static/socket-tcp-close2.c deleted file mode 100644 index 697c99f39..000000000 --- a/test/zdtm/static/socket-tcp-close2.c +++ /dev/null @@ -1,67 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Check both dump and restore with tcp_close on TCP_CLOSE sockets"; -const char *test_author = "Bui Quang Minh "; - -static int port = 8880; - -int main(int argc, char **argv) -{ - int fd_s, fd, client; - char c; - - test_init(argc, argv); - signal(SIGPIPE, SIG_IGN); - - fd_s = tcp_init_server(AF_INET, &port); - if (fd_s < 0) { - pr_err("Server initializations failed\n"); - return 1; - } - - client = tcp_init_client(AF_INET, "localhost", port); - if (client < 0) { - pr_err("Client initializations failed\n"); - return 1; - } - - fd = tcp_accept_server(fd_s); - if (fd < 0) { - pr_err("Can't accept client\n"); - return 1; - } - close(fd_s); - - shutdown(client, SHUT_WR); - shutdown(fd, SHUT_WR); - - test_daemon(); - test_waitsig(); - - if (read(fd, &c, 1) != 0) { - fail("read server"); - return 1; - } - if (read(client, &c, 1) != 0) { - fail("read client"); - return 1; - } - if (write(client, &c, 1) != -1) { - fail("write client"); - return 1; - } - if (write(fd, &c, 1) != -1) { - fail("write server"); - return 1; - } - - pass(); - return 0; -} diff --git a/test/zdtm/static/socket-tcp-close2.desc b/test/zdtm/static/socket-tcp-close2.desc deleted file mode 100644 index c53a1f315..000000000 --- a/test/zdtm/static/socket-tcp-close2.desc +++ 
/dev/null @@ -1 +0,0 @@ -{'opts': '--tcp-close', 'flags': 'reqrst '} diff --git a/test/zdtm/static/socket-tcp-closed-last-ack.desc b/test/zdtm/static/socket-tcp-closed-last-ack.desc index c77d58477..d4cfe5064 100644 --- a/test/zdtm/static/socket-tcp-closed-last-ack.desc +++ b/test/zdtm/static/socket-tcp-closed-last-ack.desc @@ -1,10 +1,10 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_tcp.so', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', - 'feature' : 'tcp_half_closed has_ipt_legacy', + 'feature' : 'tcp_half_closed', 'flavor': 'ns uns', } diff --git a/test/zdtm/static/socket-tcp-closing.c b/test/zdtm/static/socket-tcp-closing.c index 
df291d446..87e1d7533 100644 --- a/test/zdtm/static/socket-tcp-closing.c +++ b/test/zdtm/static/socket-tcp-closing.c @@ -31,13 +31,10 @@ static int port = 8880; int fill_sock_buf(int fd) { - char zdtm[512]; int flags; int size; int ret; - memset(zdtm, 5, sizeof(zdtm)); - flags = fcntl(fd, F_GETFL, 0); if (flags == -1) { pr_perror("Can't get flags"); @@ -50,6 +47,7 @@ int fill_sock_buf(int fd) size = 0; while (1) { + char zdtm[] = "zdtm test packet"; ret = write(fd, zdtm, sizeof(zdtm)); if (ret == -1) { if (errno == EAGAIN) diff --git a/test/zdtm/static/socket-tcp-fin-wait1.hook b/test/zdtm/static/socket-tcp-fin-wait1.hook index 30f8ce071..9504557da 100755 --- a/test/zdtm/static/socket-tcp-fin-wait1.hook +++ b/test/zdtm/static/socket-tcp-fin-wait1.hook @@ -1,7 +1,7 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python import sys -sys.path.append("../lib") +sys.path.append("../crit") import pycriu import os, os.path diff --git a/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc b/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc deleted file mode 100644 index 53dd82285..000000000 --- a/test/zdtm/static/socket-tcp-ipt-nfconntrack.desc +++ /dev/null @@ -1,6 +0,0 @@ -{ - 'feature': 'has_ipt_legacy', - 'flavor': 'h', - 'opts': '--tcp-established', - 'flags': 'suid' -} diff --git a/test/zdtm/static/socket-tcp-ipt-nfconntrack.c b/test/zdtm/static/socket-tcp-nfconntrack.c similarity index 100% rename from test/zdtm/static/socket-tcp-ipt-nfconntrack.c rename to test/zdtm/static/socket-tcp-nfconntrack.c diff --git a/test/zdtm/static/socket-tcp-nfconntrack.desc b/test/zdtm/static/socket-tcp-nfconntrack.desc new file mode 100644 index 000000000..add2513f8 --- /dev/null +++ b/test/zdtm/static/socket-tcp-nfconntrack.desc @@ -0,0 +1 @@ +{'flavor': 'h', 'opts': '--tcp-established', 'flags': 'suid'} diff --git a/test/zdtm/static/socket-tcp-nft-nfconntrack.c b/test/zdtm/static/socket-tcp-nft-nfconntrack.c deleted file mode 120000 index 8cb60dd03..000000000 --- 
a/test/zdtm/static/socket-tcp-nft-nfconntrack.c +++ /dev/null @@ -1 +0,0 @@ -socket-tcp.c \ No newline at end of file diff --git a/test/zdtm/static/socket-tcp-nft-nfconntrack.desc b/test/zdtm/static/socket-tcp-nft-nfconntrack.desc deleted file mode 100644 index 38a4eb389..000000000 --- a/test/zdtm/static/socket-tcp-nft-nfconntrack.desc +++ /dev/null @@ -1,7 +0,0 @@ -{ - 'flavor': 'h', - 'feature': 'network_lock_nftables', - 'opts': '--tcp-established', - 'dopts': '--network-lock nftables', - 'flags': 'suid' -} diff --git a/test/zdtm/static/socket-tcp-reseted.c b/test/zdtm/static/socket-tcp-reseted.c index 4a328e3e0..ad382e38e 100644 --- a/test/zdtm/static/socket-tcp-reseted.c +++ b/test/zdtm/static/socket-tcp-reseted.c @@ -18,7 +18,7 @@ #define ZDTM_SRV_FAMILY AF_INET #endif -const char *test_doc = "Check, that a reset TCP connection can be restored\n"; +const char *test_doc = "Check, that a reseted TCP connection can be restored\n"; const char *test_author = "Andrey Vagin diff --git a/test/zdtm/static/socket-tcp-reseted.desc b/test/zdtm/static/socket-tcp-reseted.desc index ff92e9f9f..3ebdfeef8 100644 --- a/test/zdtm/static/socket-tcp-reseted.desc +++ b/test/zdtm/static/socket-tcp-reseted.desc @@ -1,10 +1,10 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_tcp.so', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', 
- '/lib/xtables/libipt_REJECT.so|/usr/lib64/xtables/libipt_REJECT.so|/usr/lib/powerpc64le-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/x86_64-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/xtables/libipt_REJECT.so|/usr/lib/s390x-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/aarch64-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/riscv64-linux-gnu/xtables/libipt_REJECT.so', + '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libipt_REJECT.so|/usr/lib64/xtables/libipt_REJECT.so|/usr/lib/powerpc64le-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/x86_64-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/xtables/libipt_REJECT.so|/usr/lib/s390x-linux-gnu/xtables/libipt_REJECT.so|/usr/lib/aarch64-linux-gnu/xtables/libipt_REJECT.so', ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', - 'feature' : 'tcp_half_closed has_ipt_legacy' + 'feature' : 'tcp_half_closed' } diff --git a/test/zdtm/static/socket-tcp-syn-sent.desc b/test/zdtm/static/socket-tcp-syn-sent.desc index 52382414b..4cc23c8fc 100644 --- a/test/zdtm/static/socket-tcp-syn-sent.desc +++ b/test/zdtm/static/socket-tcp-syn-sent.desc @@ -1,9 +1,9 @@ { 'deps': [ '/bin/sh', '/sbin/iptables|/usr/sbin/iptables', - 
'/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_tcp.so', - '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so|/usr/lib/riscv64-linux-gnu/xtables/libxt_standard.so', + '/lib/xtables/libxt_tcp.so|/usr/lib64/xtables/libxt_tcp.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_tcp.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_tcp.so|/usr/lib/xtables/libxt_tcp.so|/usr/lib/s390x-linux-gnu/xtables/libxt_tcp.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_tcp.so', + '/lib/xtables/libxt_standard.so|/usr/lib64/xtables/libxt_standard.so|/usr/lib/powerpc64le-linux-gnu/xtables/libxt_standard.so|/usr/lib/x86_64-linux-gnu/xtables/libxt_standard.so|/usr/lib/xtables/libxt_standard.so|/usr/lib/s390x-linux-gnu/xtables/libxt_standard.so|/usr/lib/aarch64-linux-gnu/xtables/libxt_standard.so', ], 'opts': '--tcp-established', 'flags': 'suid nouser samens', - 'feature' : 'tcp_half_closed has_ipt_legacy' + 'feature' : 'tcp_half_closed' } diff --git a/test/zdtm/static/socket-tcp.c b/test/zdtm/static/socket-tcp.c index bc2075496..f6ef47385 100644 --- a/test/zdtm/static/socket-tcp.c +++ b/test/zdtm/static/socket-tcp.c @@ -67,38 +67,17 @@ int main(int argc, char **argv) int val; socklen_t optlen; -#ifdef ZDTM_IPT_CONNTRACK +#ifdef ZDTM_CONNTRACK if (unshare(CLONE_NEWNET)) { pr_perror("unshare"); return 1; } if (system("ip link set up dev lo")) return 1; - - if (system("iptables-legacy -w -A INPUT -i lo -p tcp -m state --state NEW,ESTABLISHED -j ACCEPT")) + if 
(system("iptables -w -A INPUT -i lo -p tcp -m state --state NEW,ESTABLISHED -j ACCEPT")) return 1; - if (system("iptables-legacy -w -A INPUT -j DROP")) + if (system("iptables -w -A INPUT -j DROP")) return 1; - -#endif - -#ifdef ZDTM_NFT_CONNTRACK - if (unshare(CLONE_NEWNET)) { - pr_perror("unshare"); - return 1; - } - if (system("ip link set up dev lo")) - return 1; - - if (system("nft add table ip filter")) - return 1; - if (system("nft 'add chain ip filter INPUT { type filter hook input priority 0 ; }'")) - return 1; - if (system("nft add rule ip filter INPUT iifname \"lo\" ip protocol tcp ct state new,established counter accept")) - return 1; - if (system("nft add rule ip filter INPUT counter drop")) - return 1; - #endif #ifdef ZDTM_TCP_LOCAL diff --git a/test/zdtm/static/socket6_icmp.c b/test/zdtm/static/socket6_icmp.c deleted file mode 120000 index 24d8fd806..000000000 --- a/test/zdtm/static/socket6_icmp.c +++ /dev/null @@ -1 +0,0 @@ -socket_icmp.c \ No newline at end of file diff --git a/test/zdtm/static/socket_aio.c b/test/zdtm/static/socket_aio.c index 4d228361f..a545483e1 100644 --- a/test/zdtm/static/socket_aio.c +++ b/test/zdtm/static/socket_aio.c @@ -51,7 +51,7 @@ int main(int argc, char **argv) if (pid == 0) { /* - * Child is client of TCP connection. 
+ * Chiled is client of TCP connection */ close(fd_s); fd = tcp_init_client(AF_INET, "127.0.0.1", port); diff --git a/test/zdtm/static/socket_icmp.c b/test/zdtm/static/socket_icmp.c deleted file mode 100644 index f72e348bf..000000000 --- a/test/zdtm/static/socket_icmp.c +++ /dev/null @@ -1,128 +0,0 @@ -#include "zdtmtst.h" - -const char *test_doc = "static test for ICMP socket\n"; -const char *test_author = "समीर सिंह Sameer Singh \n"; - -/* Description: - * Send a ping to localhost using ICMP socket - */ - -#include -#include -#include -#include -#if defined(ZDTM_IPV6) -#include -#else -#include -#endif -#include -#include -#include - -#include "sysctl.h" - -#define PACKET_SIZE 64 -#define RECV_TIMEOUT 1 - -static int echo_id = 1234; - -#if defined(ZDTM_IPV6) -#define TEST_ICMP_ECHOREPLY ICMP6_ECHOREPLY -#else -#define TEST_ICMP_ECHOREPLY ICMP_ECHOREPLY -#endif -int main(int argc, char **argv) -{ - int ret, sock, seq = 0; - char packet[PACKET_SIZE], recv_packet[PACKET_SIZE]; - - struct timeval tv; -#if defined(ZDTM_IPV6) - struct sockaddr_in6 addr, recv_addr; -#else - struct icmphdr icmp_header, *icmp_reply; -#endif - struct sockaddr_in addr, recv_addr; - socklen_t addr_len; - - // Allow GIDs 0-58468 to open an unprivileged ICMP socket - if (sysctl_write_str("/proc/sys/net/ipv4/ping_group_range", "0 58468")) - return -1; - - test_init(argc, argv); - -#if defined(ZDTM_IPV6) - sock = socket(PF_INET6, SOCK_DGRAM, IPPROTO_ICMPV6); -#else - sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP); -#endif - if (sock < 0) { - pr_perror("Can't create socket"); - return 1; - } - - tv.tv_sec = RECV_TIMEOUT; - tv.tv_usec = 0; - if (setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0) { - pr_perror("Can't set socket option"); - return 1; - } - - memset(&addr, 0, sizeof(addr)); - memset(&icmp_header, 0, sizeof(icmp_header)); -#if defined(ZDTM_IPV6) - addr.sin6_family = AF_INET6; - inet_pton(AF_INET6, "::1", &addr.sin6_addr); - - icmp_header.icmp6_type = ICMP6_ECHO_REQUEST; 
- icmp_header.icmp6_code = 0; - icmp_header.icmp6_id = echo_id; - icmp_header.icmp6_seq = seq; -#else - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = inet_addr("127.0.0.1"); - - icmp_header.type = ICMP_ECHO; - icmp_header.code = 0; - icmp_header.un.echo.id = echo_id; - icmp_header.un.echo.sequence = seq; -#endif - - memcpy(packet, &icmp_header, sizeof(icmp_header)); - memset(packet + sizeof(icmp_header), 0xa5, - PACKET_SIZE - sizeof(icmp_header)); - - test_daemon(); - test_waitsig(); - - ret = sendto(sock, packet, PACKET_SIZE, 0, - (struct sockaddr *)&addr, sizeof(addr)); - - if (ret < 0) { - fail("Can't send"); - return 1; - } - - addr_len = sizeof(recv_addr); - - ret = recvfrom(sock, recv_packet, sizeof(recv_packet), 0, - (struct sockaddr *)&recv_addr, &addr_len); - - if (ret < 0) { - fail("Can't recv"); - return 1; - } - - icmp_reply = (struct icmphdr *)recv_packet; - - if (icmp_reply->type != ICMP_ECHOREPLY) { - fail("Got no ICMP_ECHO_REPLY"); - return 1; - } - - close(sock); - - pass(); - return 0; -} diff --git a/test/zdtm/static/socket_listen.c b/test/zdtm/static/socket_listen.c index 14e321439..519107d01 100644 --- a/test/zdtm/static/socket_listen.c +++ b/test/zdtm/static/socket_listen.c @@ -70,7 +70,7 @@ int main(int argc, char **argv) if (pid == 0) { /* - * Child is client of TCP connection. 
+ * Chiled is client of TCP connection */ close(fd_s); fd = tcp_init_client(ZDTM_FAMILY, "localhost", port); diff --git a/test/zdtm/static/socket_queues.c b/test/zdtm/static/socket_queues.c index 44495f06b..e30bca0e1 100644 --- a/test/zdtm/static/socket_queues.c +++ b/test/zdtm/static/socket_queues.c @@ -24,12 +24,6 @@ const char *test_author = "Stanislav Kinsbursky \n"; #define SK_DATA_D1 "packet dgram left" #define SK_DATA_D2 "packet dgram right" -#ifdef ZDTM_UNIX_SEQPACKET -#define SOCK_TYPE SOCK_SEQPACKET -#else -#define SOCK_TYPE SOCK_STREAM -#endif - int main(int argc, char *argv[]) { int ssk_pair_d[2]; @@ -38,7 +32,7 @@ int main(int argc, char *argv[]) test_init(argc, argv); - if (socketpair(AF_UNIX, SOCK_TYPE, 0, ssk_pair_s) == -1) { + if (socketpair(AF_UNIX, SOCK_STREAM, 0, ssk_pair_s) == -1) { fail("socketpair"); exit(1); } diff --git a/test/zdtm/static/socket_queues_seqpacket.c b/test/zdtm/static/socket_queues_seqpacket.c deleted file mode 120000 index 0f3f93ea6..000000000 --- a/test/zdtm/static/socket_queues_seqpacket.c +++ /dev/null @@ -1 +0,0 @@ -socket_queues.c \ No newline at end of file diff --git a/test/zdtm/static/socket_udp_shutdown.c b/test/zdtm/static/socket_udp_shutdown.c index a7658b9dd..91dc8f30a 100644 --- a/test/zdtm/static/socket_udp_shutdown.c +++ b/test/zdtm/static/socket_udp_shutdown.c @@ -28,8 +28,8 @@ int main(int argc, char **argv) test_init(argc, argv); - sk1 = socket(PF_INET, SOCK_DGRAM | SOCK_NONBLOCK, IPPROTO_UDP); - sk2 = socket(PF_INET, SOCK_DGRAM | SOCK_NONBLOCK, IPPROTO_UDP); + sk1 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + sk2 = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); if (sk1 < 0 || sk2 < 0) { pr_perror("Can't create socket"); exit(1); diff --git a/test/zdtm/static/sockets00-seqpacket.c b/test/zdtm/static/sockets00-seqpacket.c deleted file mode 120000 index 4bce9fc31..000000000 --- a/test/zdtm/static/sockets00-seqpacket.c +++ /dev/null @@ -1 +0,0 @@ -sockets00.c \ No newline at end of file diff --git 
a/test/zdtm/static/sockets00-seqpacket.desc b/test/zdtm/static/sockets00-seqpacket.desc deleted file mode 120000 index 4beea2642..000000000 --- a/test/zdtm/static/sockets00-seqpacket.desc +++ /dev/null @@ -1 +0,0 @@ -sockets00.desc \ No newline at end of file diff --git a/test/zdtm/static/sockets00.c b/test/zdtm/static/sockets00.c index ac5d7d6fe..53890077b 100644 --- a/test/zdtm/static/sockets00.c +++ b/test/zdtm/static/sockets00.c @@ -25,12 +25,6 @@ TEST_OPTION(filename, string, "socket file name", 1); #define TEST_MODE 0640 -#ifdef ZDTM_UNIX_SEQPACKET -#define SOCK_TYPE SOCK_SEQPACKET -#else -#define SOCK_TYPE SOCK_STREAM -#endif - int main(int argc, char *argv[]) { int ssk_icon[4]; @@ -64,9 +58,9 @@ int main(int argc, char *argv[]) memcpy(addr.sun_path, path, addrlen); addrlen += sizeof(addr.sun_family); - ssk_icon[0] = socket(AF_UNIX, SOCK_TYPE, 0); - ssk_icon[1] = socket(AF_UNIX, SOCK_TYPE, 0); - ssk_icon[2] = socket(AF_UNIX, SOCK_TYPE, 0); + ssk_icon[0] = socket(AF_UNIX, SOCK_STREAM, 0); + ssk_icon[1] = socket(AF_UNIX, SOCK_STREAM, 0); + ssk_icon[2] = socket(AF_UNIX, SOCK_STREAM, 0); if (ssk_icon[0] < 0 || ssk_icon[1] < 0 || ssk_icon[2] < 0) { fail("socket"); exit(1); diff --git a/test/zdtm/static/sockets01-seqpacket.c b/test/zdtm/static/sockets01-seqpacket.c deleted file mode 120000 index 8d51121e1..000000000 --- a/test/zdtm/static/sockets01-seqpacket.c +++ /dev/null @@ -1 +0,0 @@ -sockets01.c \ No newline at end of file diff --git a/test/zdtm/static/sockets01.c b/test/zdtm/static/sockets01.c index f56cd219e..e35a31fec 100644 --- a/test/zdtm/static/sockets01.c +++ b/test/zdtm/static/sockets01.c @@ -30,12 +30,6 @@ const char *test_author = "Pavel Emelyanov "; #define TEST_MSG "test-message" static char buf[sizeof(TEST_MSG)]; -#ifdef ZDTM_UNIX_SEQPACKET -#define SOCK_TYPE SOCK_SEQPACKET -#else -#define SOCK_TYPE SOCK_STREAM -#endif - int main(int argc, char *argv[]) { int spu[2], spb[2], dpu[2], dpb[2], dpd[2]; @@ -46,14 +40,14 @@ int main(int argc, char 
*argv[]) signal(SIGPIPE, SIG_IGN); /* spu -- stream pair, unidirectional shutdown */ - if (socketpair(PF_UNIX, SOCK_TYPE, 0, spu) < 0) + if (socketpair(PF_UNIX, SOCK_STREAM, 0, spu) < 0) fin("no stream pair 1"); if (shutdown(spu[0], SHUT_RD) < 0) fin("no stream shutdown 1"); /* spb -- stream pair, bidirectional shutdown */ - if (socketpair(PF_UNIX, SOCK_TYPE, 0, spb) < 0) + if (socketpair(PF_UNIX, SOCK_STREAM, 0, spb) < 0) fin("no stream pair 2"); if (shutdown(spb[0], SHUT_RDWR) < 0) diff --git a/test/zdtm/static/sockets02-seqpacket.c b/test/zdtm/static/sockets02-seqpacket.c deleted file mode 120000 index b95831599..000000000 --- a/test/zdtm/static/sockets02-seqpacket.c +++ /dev/null @@ -1 +0,0 @@ -sockets02.c \ No newline at end of file diff --git a/test/zdtm/static/sockets02.c b/test/zdtm/static/sockets02.c index d7d84d815..2729ade2c 100644 --- a/test/zdtm/static/sockets02.c +++ b/test/zdtm/static/sockets02.c @@ -16,12 +16,6 @@ const char *test_doc = "Test semi-closed unix stream connection\n"; const char *test_author = "Pavel Emelyanov \n"; -#ifdef ZDTM_UNIX_SEQPACKET -#define SOCK_TYPE SOCK_SEQPACKET -#else -#define SOCK_TYPE SOCK_STREAM -#endif - int main(int argc, char *argv[]) { int ssk_pair[2], ret; @@ -31,7 +25,7 @@ int main(int argc, char *argv[]) data = (char)lrand48(); - if (socketpair(AF_UNIX, SOCK_TYPE, 0, ssk_pair) == -1) { + if (socketpair(AF_UNIX, SOCK_STREAM, 0, ssk_pair) == -1) { fail("socketpair"); exit(1); } diff --git a/test/zdtm/static/sockets03-seqpacket.c b/test/zdtm/static/sockets03-seqpacket.c deleted file mode 120000 index 997cce673..000000000 --- a/test/zdtm/static/sockets03-seqpacket.c +++ /dev/null @@ -1 +0,0 @@ -sockets03.c \ No newline at end of file diff --git a/test/zdtm/static/sockets03-seqpacket.desc b/test/zdtm/static/sockets03-seqpacket.desc deleted file mode 120000 index 3798a8242..000000000 --- a/test/zdtm/static/sockets03-seqpacket.desc +++ /dev/null @@ -1 +0,0 @@ -sockets03.desc \ No newline at end of file diff --git 
a/test/zdtm/static/sockets03.c b/test/zdtm/static/sockets03.c index 6b0915aaa..cd6f60831 100644 --- a/test/zdtm/static/sockets03.c +++ b/test/zdtm/static/sockets03.c @@ -22,12 +22,6 @@ const char *test_author = "Andrey Ryabinin "; char *filename; TEST_OPTION(filename, string, "socket file name", 1); -#ifdef ZDTM_UNIX_SEQPACKET -#define SOCK_TYPE SOCK_SEQPACKET -#else -#define SOCK_TYPE SOCK_STREAM -#endif - int main(int argc, char *argv[]) { int sk[3]; @@ -58,8 +52,8 @@ int main(int argc, char *argv[]) memcpy(addr.sun_path, path, addrlen); addrlen += sizeof(addr.sun_family); - sk[0] = socket(AF_UNIX, SOCK_TYPE, 0); - sk[1] = socket(AF_UNIX, SOCK_TYPE, 0); + sk[0] = socket(AF_UNIX, SOCK_STREAM, 0); + sk[1] = socket(AF_UNIX, SOCK_STREAM, 0); if (sk[0] < 0 || sk[1] < 0) { fail("socket"); exit(1); diff --git a/test/zdtm/static/sockets_spair.c b/test/zdtm/static/sockets_spair.c index 202c2e790..2dbb132aa 100644 --- a/test/zdtm/static/sockets_spair.c +++ b/test/zdtm/static/sockets_spair.c @@ -18,12 +18,6 @@ const char *test_author = "Cyrill Gorcunov -#include -#include - -#include "zdtmtst.h" -#include "lock.h" - -const char *test_doc = "Check, that stopped by SIGTSTP tasks are restored correctly"; -const char *test_author = "Yuriy Vasiliev "; - -#define STOP_SIGNO SIGTSTP -const char *stop_sigstr = "SIGTSTP"; -enum { - FUTEX_INITIALIZED = 0, - TEST_CRIU, - TEST_CHECK, - TEST_DONE, - TEST_EXIT, - TEST_EMERGENCY_ABORT, -}; - -struct shared { - futex_t fstate; - int status; - int code; -} *sh; - -static int new_pgrp(void) -{ - siginfo_t infop; - int ret = 1; - pid_t pid; - - /* - * Set the PGID to avoid creating an orphaned process group, - * which is not to be affected by terminal-generated stop signals. 
- */ - setpgid(0, 0); - - pid = test_fork(); - if (pid < 0) - goto err_cr; - - if (pid == 0) { - /* wait for TEST_EXIT or TEST_EMERGENCY_ABORT*/ - futex_wait_while_lt(&sh->fstate, TEST_EXIT); - exit(0); - } - - if (kill(pid, STOP_SIGNO)) { - pr_perror("Unable to send %s", stop_sigstr); - goto err_cr; - } - - if (waitid(P_PID, pid, &infop, WNOWAIT | WSTOPPED) < 0) { - pr_perror("Unable to waitid %d", pid); - goto err_cont; - } - - sh->code = infop.si_code; - sh->status = infop.si_status; - - /* Return the control back to MAIN worker to do C/R */ - futex_set_and_wake(&sh->fstate, TEST_CRIU); - futex_wait_while_lt(&sh->fstate, TEST_CHECK); - - infop.si_code = 0; - infop.si_status = 0; - - if (waitid(P_PID, pid, &infop, WNOWAIT | WSTOPPED) < 0) { - pr_perror("Unable to waitid %d", pid); - goto err_cont; - } - - sh->code = infop.si_code; - sh->status = infop.si_status; - - futex_set_and_wake(&sh->fstate, TEST_DONE); - futex_wait_while_lt(&sh->fstate, TEST_EXIT); - - ret = 0; -err_cont: - kill(pid, SIGCONT); -err_cr: - if (ret) - futex_set_and_wake(&sh->fstate, TEST_EMERGENCY_ABORT); - if (pid > 0) - wait(NULL); - - return ret; -} - -int main(int argc, char **argv) -{ - int fail = 0; - pid_t pid; - - test_init(argc, argv); - - sh = mmap(NULL, sizeof(struct shared), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); - if (sh == MAP_FAILED) { - pr_perror("Failed to alloc shared region"); - return 1; - } - - futex_set(&sh->fstate, FUTEX_INITIALIZED); - - pid = test_fork(); - if (pid < 0) { - fail = 1; - goto out; - } - - if (pid == 0) - exit(new_pgrp()); - - /* Wait until pgrp is ready to C/R */ - futex_wait_while_lt(&sh->fstate, TEST_CRIU); - if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) { - pr_err("Fail in child worker before C/R\n"); - fail = 1; - goto out; - } - - if (sh->code != CLD_STOPPED || sh->status != STOP_SIGNO) { - pr_err("Process is not in correct state before C/R." - " Expected stop signo: %d. 
Get stop signo: %d\n", - STOP_SIGNO, sh->status); - fail = 1; - goto out; - } - - test_daemon(); - test_waitsig(); - - futex_set_and_wake(&sh->fstate, TEST_CHECK); - futex_wait_while_lt(&sh->fstate, TEST_DONE); - if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) { - pr_err("Fail in child worker after C/R\n"); - goto out; - } - - if (sh->code != CLD_STOPPED || sh->status != STOP_SIGNO) { - fail = 1; - pr_err("Process is not in correct state after C/R." - " Expected stop signo: %d. Get stop signo: %d\n", - STOP_SIGNO, sh->status); - } - - if (!fail) - pass(); - - futex_set_and_wake(&sh->fstate, TEST_EXIT); -out: - if (pid > 0) - wait(NULL); - - munmap(sh, sizeof(struct shared)); - - return fail; -} diff --git a/test/zdtm/static/stopped04.c b/test/zdtm/static/stopped04.c deleted file mode 100644 index 9bd968aa2..000000000 --- a/test/zdtm/static/stopped04.c +++ /dev/null @@ -1,135 +0,0 @@ -#include -#include -#include - -#include "zdtmtst.h" -#include "lock.h" - -const char *test_doc = "Check, that stopped by SIGTSTP tasks are restored correctly"; -const char *test_author = "Yuriy Vasiliev "; - -const char *stop_sigstr = "SIGTSTP"; -enum { - FUTEX_INITIALIZED = 0, - TEST_CRIU, - TEST_DONE, - TEST_EXIT, - TEST_EMERGENCY_ABORT, -}; - -struct shared { - futex_t fstate; - int status; - int code; -} *sh; - -static int new_pgrp(void) -{ - sigset_t sigset; - siginfo_t infop; - int ret = 1; - pid_t pid; - - /* - * Set the PGID to avoid creating an orphaned process group, - * which is not to be affected by terminal-generated stop signals. 
- */ - setpgid(0, 0); - - sigemptyset(&sigset); - sigaddset(&sigset, SIGTSTP); - sigprocmask(SIG_BLOCK, &sigset, NULL); - - pid = test_fork(); - if (pid < 0) - goto err_cr; - - if (pid == 0) { - /* wait for TEST_EXIT or TEST_EMERGENCY_ABORT*/ - futex_wait_while_lt(&sh->fstate, TEST_EXIT); - exit(0); - } - - if (kill(pid, SIGSTOP)) { - pr_perror("Unable to send %s", stop_sigstr); - goto err_cr; - } - - if (waitid(P_PID, pid, &infop, WNOWAIT | WSTOPPED) < 0) { - pr_perror("Unable to waitid %d", pid); - goto err_cont; - } - - if (kill(pid, SIGTSTP)) { - pr_perror("Unable to send %s", stop_sigstr); - goto err_cr; - } - - /* Return the control back to MAIN worker to do C/R */ - futex_set_and_wake(&sh->fstate, TEST_CRIU); - futex_wait_while_lt(&sh->fstate, TEST_EXIT); - - ret = 0; -err_cont: - kill(pid, SIGCONT); -err_cr: - if (ret) - futex_set_and_wake(&sh->fstate, TEST_EMERGENCY_ABORT); - if (pid > 0) - wait(NULL); - - return ret; -} - -int main(int argc, char **argv) -{ - int fail = 0; - pid_t pid; - - test_init(argc, argv); - - sh = mmap(NULL, sizeof(struct shared), PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); - if (sh == MAP_FAILED) { - pr_perror("Failed to alloc shared region"); - return 1; - } - - futex_set(&sh->fstate, FUTEX_INITIALIZED); - - pid = test_fork(); - if (pid < 0) { - fail = 1; - goto out; - } - - if (pid == 0) - exit(new_pgrp()); - - /* Wait until pgrp is ready to C/R */ - futex_wait_while_lt(&sh->fstate, TEST_CRIU); - if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) { - pr_err("Fail in child worker before C/R\n"); - fail = 1; - goto out; - } - - test_daemon(); - test_waitsig(); - - if (futex_get(&sh->fstate) == TEST_EMERGENCY_ABORT) { - pr_err("Fail in child worker after C/R\n"); - goto out; - } - - if (!fail) - pass(); - - futex_set_and_wake(&sh->fstate, TEST_EXIT); -out: - if (pid > 0) - wait(NULL); - - munmap(sh, sizeof(struct shared)); - - return fail; -} diff --git a/test/zdtm/static/tempfs_subns.c 
b/test/zdtm/static/tempfs_subns.c index 490fdad6e..ed3ef9a3a 100644 --- a/test/zdtm/static/tempfs_subns.c +++ b/test/zdtm/static/tempfs_subns.c @@ -20,7 +20,7 @@ int main(int argc, char **argv) { int fds[2], i; pid_t pid; - int status, fd = -1; + int fd, status; test_init(argc, argv); diff --git a/test/zdtm/static/thp_disable.c b/test/zdtm/static/thp_disable.c index 55609f260..ab88120c2 100644 --- a/test/zdtm/static/thp_disable.c +++ b/test/zdtm/static/thp_disable.c @@ -17,7 +17,6 @@ int main(int argc, char **argv) unsigned long orig_flags = 0, new_flags = 0; unsigned long orig_madv = 0, new_madv = 0; void *area; - int ret; test_init(argc, argv); @@ -36,46 +35,9 @@ int main(int argc, char **argv) return -1; } - ret = prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); - if (ret < 0) { - pr_perror("Getting THP-disabled flag failed"); - return -1; - } - if (ret != 1) { - errno = 0; - fail("prctl(GET_THP_DISABLE) returned unexpected value: %d != 1", ret); - return -1; - } - - test_msg("Fetch pre-migration flags/adv\n"); - if (get_smaps_bits((unsigned long)area, &new_flags, &new_madv)) - return -1; - - errno = 0; - if (orig_flags != new_flags) { - fail("Flags changed %lx -> %lx", orig_flags, new_flags); - return -1; - } - - if (orig_madv != new_madv) { - fail("Madvs changed %lx -> %lx", orig_madv, new_madv); - return -1; - } - test_daemon(); test_waitsig(); - ret = prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); - if (ret < 0) { - pr_perror("Getting post-migration THP-disabled flag failed"); - return -1; - } - if (ret != 1) { - errno = 0; - fail("post-migration prctl(GET_THP_DISABLE) returned unexpected value: %d != 1", ret); - return -1; - } - if (prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0)) { pr_perror("Enabling THP failed"); return -1; @@ -85,14 +47,15 @@ int main(int argc, char **argv) if (get_smaps_bits((unsigned long)area, &new_flags, &new_madv)) return -1; - errno = 0; if (orig_flags != new_flags) { - fail("Flags changed %lx -> %lx", orig_flags, new_flags); + pr_err("Flags are changed %lx 
-> %lx\n", orig_flags, new_flags); + fail(); return -1; } if (orig_madv != new_madv) { - fail("Madvs changed %lx -> %lx", orig_madv, new_madv); + pr_err("Madvs are changed %lx -> %lx\n", orig_madv, new_madv); + fail(); return -1; } diff --git a/test/zdtm/static/thread_different_uid_gid.c b/test/zdtm/static/thread_different_uid_gid.c index 88f99659b..3a0b6291b 100644 --- a/test/zdtm/static/thread_different_uid_gid.c +++ b/test/zdtm/static/thread_different_uid_gid.c @@ -130,7 +130,7 @@ int main(int argc, char **argv) ret = syscall(SYS_setresgid, maingroup, maingroup, maingroup); if (ret >= 0) { ret = syscall(SYS_setresuid, mainuser, mainuser, mainuser); - } else { + } else if (ret < 0) { pr_perror("Failed to drop privileges"); exit(1); } diff --git a/test/zdtm/static/timers01.c b/test/zdtm/static/timers01.c deleted file mode 100644 index 10ecc3481..000000000 --- a/test/zdtm/static/timers01.c +++ /dev/null @@ -1,74 +0,0 @@ -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Checks non-periodic timers\n"; -const char *test_author = "Andrei Vagin "; - -static struct { - const int timer_type; - const int signal; - volatile sig_atomic_t count; -} timer_tests[] = { - /* from slowest to fastest */ - { ITIMER_VIRTUAL, SIGVTALRM }, - { ITIMER_PROF, SIGPROF }, - { ITIMER_REAL, SIGALRM }, -}; - -#define NUM_TIMERS (sizeof(timer_tests) / sizeof(timer_tests[0])) -#define TIMER_TIMEOUT 3600 -#define TIMER_ALLOWED_DELTA 300 - -static void setup_timers(void) -{ - int i; - struct itimerval tv = { - .it_interval = { .tv_sec = 0, .tv_usec = 0 }, - .it_value = { .tv_sec = TIMER_TIMEOUT, .tv_usec = 0 }, - }; - - for (i = 0; i < NUM_TIMERS; i++) { - if (setitimer(timer_tests[i].timer_type, &tv, NULL) < 0) { - pr_perror("can't set timer %d", i); - exit(1); - } - } -} - -static void check_timers(void) -{ - int i; - - for (i = 0; i < NUM_TIMERS; i++) { - struct itimerval tv = {}; - - if (getitimer(timer_tests[i].timer_type, &tv)) { - pr_perror("gettimer"); - 
exit(1); - } - if (tv.it_value.tv_sec > TIMER_TIMEOUT || - tv.it_value.tv_sec < TIMER_TIMEOUT - TIMER_ALLOWED_DELTA) { - fail("%ld isn't in [%d, %d]", (long)tv.it_value.tv_sec, - TIMER_TIMEOUT, - TIMER_TIMEOUT - TIMER_ALLOWED_DELTA); - exit(1); - } - } - pass(); -} - -int main(int argc, char **argv) -{ - test_init(argc, argv); - - setup_timers(); - - test_daemon(); - test_waitsig(); - - check_timers(); - return 0; -} diff --git a/test/zdtm/static/uffd-events.c b/test/zdtm/static/uffd-events.c index edd6c09ca..c811bcf4c 100644 --- a/test/zdtm/static/uffd-events.c +++ b/test/zdtm/static/uffd-events.c @@ -153,30 +153,28 @@ int main(int argc, char **argv) return 1; } - test_msg("For a child process\n"); pid = fork(); if (pid < 0) { fail("Can't fork"); return 1; } - test_msg("Check madvise(MADV_DONTNEED)\n"); + /* check madvise(MADV_DONTNEED) */ if (check_madv_dn(1)) return 1; - test_msg("Check growing mremap\n"); + /* check growing mremap */ if (check_mremap_grow(2)) return 1; - test_msg("Check swapped mappings\n"); + /* check swapped mappings */ if (check_swapped_mappings(3)) return 1; if (pid) { - int status = -1; + int status; - test_msg("Wait for the child %d\n", pid); - waitpid(pid, &status, 0); + waitpid(-1, &status, 0); if (status) { fail("child failed"); return status; diff --git a/test/zdtm/static/sk-unix-listen02.desc b/test/zdtm/static/unlink_largefile.desc similarity index 100% rename from test/zdtm/static/sk-unix-listen02.desc rename to test/zdtm/static/unlink_largefile.desc diff --git a/test/zdtm/static/unlink_mmap00.c b/test/zdtm/static/unlink_mmap00.c index 37cbc6b70..405157dac 100644 --- a/test/zdtm/static/unlink_mmap00.c +++ b/test/zdtm/static/unlink_mmap00.c @@ -9,7 +9,7 @@ #include "zdtmtst.h" -const char *test_doc = "Test mmapped and unlinked files"; +const char *test_doc = "Test mmaped and unlinked files"; char *filename; TEST_OPTION(filename, string, "file name", 1); diff --git a/test/zdtm/static/unlink_mmap01.c b/test/zdtm/static/unlink_mmap01.c 
index 13e808cac..d43246b79 100644 --- a/test/zdtm/static/unlink_mmap01.c +++ b/test/zdtm/static/unlink_mmap01.c @@ -11,7 +11,7 @@ #include "zdtmtst.h" -const char *test_doc = "Test mmapped and unlinked files (2, with hard links)"; +const char *test_doc = "Test mmaped and unlinked files (2, with hard links)"; char *filename; TEST_OPTION(filename, string, "file name", 1); diff --git a/test/zdtm/static/unlink_mmap02.c b/test/zdtm/static/unlink_mmap02.c index ca2ec18ed..a6b1841b3 100644 --- a/test/zdtm/static/unlink_mmap02.c +++ b/test/zdtm/static/unlink_mmap02.c @@ -9,7 +9,7 @@ #include "zdtmtst.h" -const char *test_doc = "Test mmapped, opened and unlinked files"; +const char *test_doc = "Test mmaped, opened and unlinked files"; char *filename; TEST_OPTION(filename, string, "file name", 1); diff --git a/test/zdtm/static/uprobes.c b/test/zdtm/static/uprobes.c deleted file mode 100644 index 6ef9a56bc..000000000 --- a/test/zdtm/static/uprobes.c +++ /dev/null @@ -1,295 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zdtmtst.h" - -const char *test_doc = "Test the --allow-uprobes option"; -const char *test_author = "Shashank Balaji "; - -#define UPROBE_GROUP_NAME "zdtm" -#define UPROBE_EVENT_NAME "uprobes_test" -#define UPROBED_FUNCTION uprobe_target - -/* - * A uprobe can be set at the start of a function, but not all instructions - * will trigger the creation of a uprobes vma. 
- * - * Examples: - * - aarch64: if the function is a single `ret`, then no vma creation - * - x64: if the function is `nop; ret`, then no vma creation - * - * So to guarantee vma creation, create a volatile dummy variable (to prevent - * compiler optimization) and use it (to prevent "unused variable" warning) - */ -void UPROBED_FUNCTION(void) { - volatile int dummy __maybe_unused = 0; - dummy += 1; -} -/* Calling via volatile function pointer ensures noinline at callsite */ -typedef void (*func_ptr)(void); -volatile func_ptr uprobe_target_alias = UPROBED_FUNCTION; - -struct uprobe_context { - struct tracefs_instance *instance; - struct tracefs_dynevent *uprobe; -}; - -volatile bool got_sigtrap = false; - -/* - * Returns the file offset of a symbol in the executable of this program - * Returns 0 on failure -*/ -uint64_t calc_sym_offset(const char *sym_name) -{ - GElf_Shdr section_header; - Elf_Scn *section = NULL; - Elf_Data *symtab_data; - uint64_t offset = 0; - char buf[PATH_MAX]; - GElf_Sym symbol; - ssize_t n_bytes; - int n_entries; - Elf *elf; - int fd; - int i; - - if (elf_version(EV_CURRENT) == EV_NONE) { - pr_err("ELF version of libelf is lower than that of the program\n"); - return 0; - } - - n_bytes = readlink("/proc/self/exe", buf, sizeof(buf)); - if (n_bytes < 0) { - pr_perror("Failed to readlink /proc/self/exe"); - return 0; - } - buf[n_bytes] = '\0'; - - fd = open(buf, O_RDONLY); - if (fd < 0) { - pr_perror("Failed to open self-executable"); - return 0; - } - - elf = elf_begin(fd, ELF_C_READ, NULL); - if (!elf) { - pr_err("%s\n", elf_errmsg(elf_errno())); - goto out_fd; - } - - /* Look for the symbol table section and its header */ - while ((section = elf_nextscn(elf, section)) != NULL) { - gelf_getshdr(section, §ion_header); - if (section_header.sh_type == SHT_SYMTAB) - break; - } - if (!section) { - pr_err("Failed to find symbol table\n"); - goto out_elf; - } - symtab_data = elf_getdata(section, NULL); - n_entries = section_header.sh_size / 
section_header.sh_entsize; - - /* Look for a symbol with the required name */ - for (i = 0; i < n_entries; i++) { - gelf_getsym(symtab_data, i, &symbol); - /* Symbol table's sh_link is the index of the string table section header */ - if (!strcmp(sym_name, - elf_strptr(elf, section_header.sh_link, symbol.st_name))) - break; - } - if (i == n_entries) { - pr_err("Failed to find symbol \"%s\"\n", sym_name); - goto out_elf; - } - - /* Get the section the symbol belongs to (mostly .text) */ - section = elf_getscn(elf, symbol.st_shndx); - gelf_getshdr(section, §ion_header); - offset = symbol.st_value - section_header.sh_addr + section_header.sh_offset; - -out_elf: - elf_end(elf); -out_fd: - close(fd); - return offset; -} - -/* - * Set and enable a uprobe on the file at the given offset - * Returns struct uprobe_context with members set to NULL on failure -*/ -struct uprobe_context enable_uprobe(const char *file, uint64_t offset) -{ - struct tracefs_instance *trace_instance; - struct tracefs_dynevent *uprobe; - struct uprobe_context context = {}; - - trace_instance = tracefs_instance_create("zdtm_uprobes_test"); - if (!trace_instance) { - pr_perror("Failed to create tracefs instance"); - return context; - } - tracefs_instance_reset(trace_instance); - - uprobe = tracefs_uprobe_alloc(UPROBE_GROUP_NAME, UPROBE_EVENT_NAME, file, offset, NULL); - if (!uprobe) { - pr_perror("Failed to allocate uprobe"); - goto instance_destroy; - } - - if (tracefs_dynevent_create(uprobe)) { - pr_perror("Failed to create uprobe"); - goto uprobe_free; - } - - if (tracefs_event_enable(trace_instance, UPROBE_GROUP_NAME, UPROBE_EVENT_NAME)) { - pr_perror("Failed to enable uprobe"); - goto uprobe_destroy; - } - - context.instance = trace_instance; - context.uprobe = uprobe; - return context; - -uprobe_destroy: - tracefs_dynevent_destroy(uprobe, false); -uprobe_free: - tracefs_dynevent_free(uprobe); -instance_destroy: - tracefs_instance_destroy(trace_instance); - tracefs_instance_free(trace_instance); 
- return context; -} - -void destroy_uprobe(struct uprobe_context context) -{ - tracefs_dynevent_destroy(context.uprobe, true); - tracefs_dynevent_free(context.uprobe); - tracefs_instance_destroy(context.instance); - tracefs_instance_free(context.instance); -} - -/* - * Check for the existence of the "[uprobes]" vma in /proc/self/maps - * Returns -1 on failure, 0 if not found, 1 if found -*/ -int uprobes_vma_exists(void) -{ - FILE *f; - char buf[LINE_MAX]; - int ret = 0; - - f = fopen("/proc/self/maps", "r"); - if (!f) { - pr_perror("Failed to open /proc/self/maps"); - return -1; - } - - while (fgets(buf, sizeof(buf), f)) { - if (strstr(buf, "[uprobes]")) { - ret = 1; - break; - } - } - if (ret == 0 && !feof(f)) { - pr_err("Failed to finish reading /proc/self/maps\n"); - ret = -1; - } - - fclose(f); - return ret; -} - -/* - * SIGTRAP is sent if execution reaches a previously set uprobed location, and - * the corresponding uprobe is not active. We don't want this to happen on restore -*/ -void sigtrap_handler(int signo, siginfo_t *info, void* context) -{ - if (info->si_code == SI_KERNEL) { - got_sigtrap = true; - fail("SIGTRAP on attempting to call uprobed function"); - } -} - -int main(int argc, char **argv) -{ - struct uprobe_context context; - struct sigaction sa; - char buf[PATH_MAX]; - uint64_t offset; - int n_bytes; - int ret = 1; - - test_init(argc, argv); - - offset = calc_sym_offset(__stringify(UPROBED_FUNCTION)); - if (!offset) - return 1; - - n_bytes = readlink("/proc/self/exe", buf, sizeof(buf)); - if (n_bytes < 0) { - pr_perror("Failed to readlink /proc/self/exe"); - return 1; - } - buf[n_bytes] = '\0'; - - sa.sa_flags = SA_SIGINFO; - sa.sa_sigaction = sigtrap_handler; - sigemptyset(&sa.sa_mask); - if (sigaction(SIGTRAP, &sa, NULL)) { - pr_perror("Failed to set SIGTRAP handler"); - return 1; - } - - context = enable_uprobe(buf, offset); - if (!context.instance) - return 1; - - /* - * Execution must reach the uprobed location at least once - * for the 
kernel to create the uprobes vma - */ - uprobe_target_alias(); - - switch (uprobes_vma_exists()) { - case -1: - goto out_uprobe; - break; - case 0: - pr_err("uprobes vma does not exist\n"); - goto out_uprobe; - break; - case 1: - test_msg("Found uprobes vma\n"); - break; - } - - test_daemon(); - test_waitsig(); - - /* - * Calling the uprobed function after restore should not cause - * a SIGTRAP, since the uprobe is still active - */ - uprobe_target_alias(); - if (!got_sigtrap) { - pass(); - ret = 0; - } - -out_uprobe: - destroy_uprobe(context); - return ret; -} diff --git a/test/zdtm/static/uprobes.desc b/test/zdtm/static/uprobes.desc deleted file mode 100644 index 6eab1f498..000000000 --- a/test/zdtm/static/uprobes.desc +++ /dev/null @@ -1,6 +0,0 @@ -{ - 'feature': 'cgroupns', - 'flags': 'suid nouser', - 'flavor': 'h', - 'opts': '--allow-uprobes' -} diff --git a/test/zdtm/static/vdso-proxy.c b/test/zdtm/static/vdso-proxy.c index a53e6cdc0..43334974f 100644 --- a/test/zdtm/static/vdso-proxy.c +++ b/test/zdtm/static/vdso-proxy.c @@ -70,7 +70,6 @@ static int parse_maps(struct vm_area *vmas) #endif v->is_vvar_or_vdso |= strstr(buf, "[vdso]") != NULL; v->is_vvar_or_vdso |= strstr(buf, "[vvar]") != NULL; - v->is_vvar_or_vdso |= strstr(buf, "[vvar_vclock]") != NULL; test_msg("[NOTE]\tVMA: [%#" PRIx64 ", %#" PRIx64 "]\n", v->start, v->end); } @@ -87,35 +86,42 @@ static int parse_maps(struct vm_area *vmas) return i; } -static int check_vvar_vdso(struct vm_area *before, int nr_before, struct vm_area *after, int nr_after) +int compare_vmas(struct vm_area *vmax, struct vm_area *vmay) +{ + if (vmax->start > vmay->start) + return 1; + if (vmax->start < vmay->start) + return -1; + if (vmax->end > vmay->end) + return 1; + if (vmax->end < vmay->end) + return -1; + + return 0; +} + +static int check_vvar_vdso(struct vm_area *before, struct vm_area *after) { int i, j = 0; - for (i = 0, j = 0; i < nr_before || j < nr_after;) { - if (j == nr_after || before[i].start < after[j].start) 
{ + for (i = 0; i < MAX_VMAS && j < MAX_VMAS; i++, j++) { + int cmp = compare_vmas(&before[i], &after[j]); + + if (cmp == 0) + continue; + + if (cmp < 0) { /* Lost mapping */ test_msg("[NOTE]\tLost mapping: %#" PRIx64 "-%#" PRIx64 "\n", before[i].start, before[i].end); + j--; if (before[i].is_vvar_or_vdso) { fail("Lost vvar/vdso mapping"); return -1; } - i++; continue; } - if (i == nr_before || before[i].start > after[j].start) { - test_msg("[NOTE]\tNew mapping appeared: %#" PRIx64 "-%#" PRIx64 "\n", after[j].start, after[j].end); - j++; - continue; - } - if (before[i].end == after[j].end) { - i++; - j++; - } else if (before[i].end > after[j].end) { - before[i].start = after[j].end; - j++; - } else { - after[j].start = before[i].end; - i++; - } + + test_msg("[NOTE]\tNew mapping appeared: %#" PRIx64 "-%#" PRIx64 "\n", after[j].start, after[j].end); + i--; } return 0; @@ -123,10 +129,11 @@ static int check_vvar_vdso(struct vm_area *before, int nr_before, struct vm_area static struct vm_area vmas_before[MAX_VMAS]; static struct vm_area vmas_after[MAX_VMAS]; -static int nr_before, nr_after; int main(int argc, char *argv[]) { + int nr_before, nr_after; + test_init(argc, argv); test_msg("[NOTE]\tMappings before:\n"); @@ -147,7 +154,7 @@ int main(int argc, char *argv[]) } /* After restore vDSO/VVAR blobs must remain in the old place. 
*/ - if (check_vvar_vdso(vmas_before, nr_before, vmas_after, nr_after)) + if (check_vvar_vdso(vmas_before, vmas_after)) return -1; if (nr_before + 2 < nr_after) { diff --git a/test/zdtm/static/vdso00.c b/test/zdtm/static/vdso00.c index 69123a203..a9bef4dbd 100644 --- a/test/zdtm/static/vdso00.c +++ b/test/zdtm/static/vdso00.c @@ -1,6 +1,6 @@ #include #include -#include + #include #include @@ -19,14 +19,14 @@ int main(int argc, char *argv[]) test_msg("%s pid %d\n", argv[0], getpid()); gettimeofday(&tv, &tz); - test_msg("%d time: %10" PRId64 "\n", getpid(), (int64_t)tv.tv_sec); + test_msg("%d time: %10li\n", getpid(), tv.tv_sec); test_daemon(); test_waitsig(); /* this call will fail if vDSO is corrupted */ gettimeofday(&tv, &tz); - test_msg("%d time: %10" PRId64 "\n", getpid(), (int64_t)tv.tv_sec); + test_msg("%d time: %10li\n", getpid(), tv.tv_sec); pass(); diff --git a/test/zdtm/static/vdso01.c b/test/zdtm/static/vdso01.c index d8b3c94d5..d8d64155a 100644 --- a/test/zdtm/static/vdso01.c +++ b/test/zdtm/static/vdso01.c @@ -1,6 +1,5 @@ #include #include -#include #include #include #include @@ -325,8 +324,7 @@ static int vdso_clock_gettime_handler(void *func) clock_gettime(CLOCK_REALTIME, &ts1); vdso_clock_gettime(CLOCK_REALTIME, &ts2); - test_msg("clock_gettime: tv_sec %" PRId64 " vdso_clock_gettime: tv_sec %" PRId64 "\n", - (int64_t)ts1.tv_sec, (int64_t)ts2.tv_sec); + test_msg("clock_gettime: tv_sec %li vdso_clock_gettime: tv_sec %li\n", ts1.tv_sec, ts2.tv_sec); if (labs(ts1.tv_sec - ts2.tv_sec) > TIME_DELTA_SEC) { pr_perror("Delta is too big"); @@ -356,8 +354,7 @@ static int vdso_gettimeofday_handler(void *func) gettimeofday(&tv1, &tz); vdso_gettimeofday(&tv2, &tz); - test_msg("gettimeofday: tv_sec %" PRId64 " vdso_gettimeofday: tv_sec %" PRId64 "\n", - (int64_t)tv1.tv_sec, (int64_t)tv2.tv_sec); + test_msg("gettimeofday: tv_sec %li vdso_gettimeofday: tv_sec %li\n", tv1.tv_sec, tv2.tv_sec); if (labs(tv1.tv_sec - tv2.tv_sec) > TIME_DELTA_SEC) { pr_perror("Delta is 
too big"); @@ -375,7 +372,7 @@ static int vdso_time_handler(void *func) t1 = time(NULL); t2 = vdso_time(NULL); - test_msg("time: %li vdso_time: %li\n", (long)t1, (long)t2); + test_msg("time: %li vdso_time: %li\n", (long)t1, (long)t1); if (labs(t1 - t2) > TIME_DELTA_SEC) { pr_perror("Delta is too big"); diff --git a/test/zdtm/static/vdso02.c b/test/zdtm/static/vdso02.c index 5779b7fd6..ea28c4453 100644 --- a/test/zdtm/static/vdso02.c +++ b/test/zdtm/static/vdso02.c @@ -29,8 +29,7 @@ static int parse_vm_area(char *buf, struct vm_area *vma) return -1; } -static int find_blobs(pid_t pid, struct vm_area *vdso, - struct vm_area *vvar, struct vm_area *vvar_vclock) +static int find_blobs(pid_t pid, struct vm_area *vdso, struct vm_area *vvar) { char buf[BUF_SZ]; int ret = -1; @@ -40,8 +39,6 @@ static int find_blobs(pid_t pid, struct vm_area *vdso, vdso->end = VDSO_BAD_ADDR; vvar->start = VVAR_BAD_ADDR; vvar->end = VVAR_BAD_ADDR; - vvar_vclock->start = VVAR_BAD_ADDR; - vvar_vclock->end = VVAR_BAD_ADDR; if (snprintf(buf, BUF_SZ, "/proc/%d/maps", pid) < 0) { pr_perror("snprintf() failure for path"); @@ -60,18 +57,12 @@ static int find_blobs(pid_t pid, struct vm_area *vdso, if (strstr(buf, "[vvar]") && parse_vm_area(buf, vvar)) goto err; - if (strstr(buf, "[vvar_vclock]") && - parse_vm_area(buf, vvar_vclock)) - goto err; } if (vdso->start != VDSO_BAD_ADDR) test_msg("[vdso] %lx-%lx\n", vdso->start, vdso->end); if (vvar->start != VVAR_BAD_ADDR) test_msg("[vvar] %lx-%lx\n", vvar->start, vvar->end); - if (vvar_vclock->start != VVAR_BAD_ADDR) - test_msg("[vvar_vclock] %lx-%lx\n", - vvar_vclock->start, vvar_vclock->end); ret = 0; err: fclose(maps); @@ -80,7 +71,7 @@ err: #ifdef __i386__ /* - * On i386 syscalls for speed are optimized through vdso, + * On i386 syscalls for speed are optimized trough vdso, * call raw int80 as vdso is unmapped. 
*/ #define __NR32_munmap 91 @@ -152,10 +143,10 @@ void sys_exit(int status) static int unmap_blobs(void) { - struct vm_area vdso, vvar, vvar_vclock; + struct vm_area vdso, vvar; int ret; - if (find_blobs(getpid(), &vdso, &vvar, &vvar_vclock)) + if (find_blobs(getpid(), &vdso, &vvar)) return -1; if (vdso.start != VDSO_BAD_ADDR) { @@ -168,19 +159,13 @@ static int unmap_blobs(void) if (ret) return ret; } - if (vvar_vclock.start != VVAR_BAD_ADDR) { - ret = sys_munmap((void *)vvar_vclock.start, - vvar_vclock.end - vvar_vclock.start); - if (ret) - return ret; - } return 0; } int main(int argc, char *argv[]) { - struct vm_area vdso, vvar, vvar_vclock; + struct vm_area vdso, vvar; pid_t child; int status, ret = -1; @@ -216,11 +201,9 @@ int main(int argc, char *argv[]) goto out_kill; } - if (find_blobs(child, &vdso, &vvar, &vvar_vclock)) + if (find_blobs(child, &vdso, &vvar)) goto out_kill; - if (vdso.start != VDSO_BAD_ADDR || - vvar.start != VVAR_BAD_ADDR || - vvar_vclock.start != VVAR_BAD_ADDR) { + if (vdso.start != VDSO_BAD_ADDR || vvar.start != VVAR_BAD_ADDR) { pr_err("Found vvar or vdso blob(s) in child, which should have unmapped them\n"); goto out_kill; } @@ -228,7 +211,7 @@ int main(int argc, char *argv[]) test_daemon(); test_waitsig(); - if (find_blobs(child, &vdso, &vvar, &vvar_vclock)) + if (find_blobs(child, &vdso, &vvar)) goto out_kill; if (vdso.start != VDSO_BAD_ADDR || vvar.start != VVAR_BAD_ADDR) { pr_err("Child without vdso got it after C/R\n"); diff --git a/test/zdtm/transition/Makefile b/test/zdtm/transition/Makefile index ddf2faaad..9388157e8 100644 --- a/test/zdtm/transition/Makefile +++ b/test/zdtm/transition/Makefile @@ -23,9 +23,6 @@ TST_NOFILE = \ lazy-thp \ pid_reuse \ pidfd_store_sk \ - rseq01 \ - rseq02 \ - stack \ TST_FILE = \ @@ -84,9 +81,6 @@ ptrace: LDFLAGS += -pthread fork2: CFLAGS += -D FORK2 thread-bomb.o: CFLAGS += -pthread thread-bomb: LDFLAGS += -pthread -rseq01: LDLIBS += -pthread -rseq02: CFLAGS += -D NORESTART -rseq02: LDLIBS += 
-pthread %: %.sh cp $< $@ diff --git a/test/zdtm/transition/epoll.c b/test/zdtm/transition/epoll.c index 803e50541..fdd492ab2 100644 --- a/test/zdtm/transition/epoll.c +++ b/test/zdtm/transition/epoll.c @@ -158,11 +158,9 @@ int main(int argc, char **argv) exit(1); } for (i = 0; i < rv; i++) { - int ret; - - while ((ret = read(events[i].data.fd, buf, buf_size)) > 0) + while (read(events[i].data.fd, buf, buf_size) > 0) ; - if (ret < 0 && errno != EAGAIN) { + if (errno != EAGAIN && errno != 0 && errno) { pr_perror("read error"); killall(); exit(1); diff --git a/test/zdtm/transition/ipc.c b/test/zdtm/transition/ipc.c index 7660f70af..0f16dbc68 100644 --- a/test/zdtm/transition/ipc.c +++ b/test/zdtm/transition/ipc.c @@ -178,7 +178,7 @@ int main(int argc, char **argv) pr_perror("Child 2 was killed"); } else if (WEXITSTATUS(ret)) { fail_count++; - pr_perror("Child 2 couldn't initialise"); + pr_perror("Child 2 couldn't inititalise"); } out_child: kill(pid1, SIGTERM); @@ -188,7 +188,7 @@ out_child: pr_perror("Child 1 was killed"); } else if (WEXITSTATUS(ret)) { fail_count++; - pr_perror("Child 1 couldn't initialise"); + pr_perror("Child 1 couldn't inititalise"); } out_shdt: shmdt(mem); diff --git a/test/zdtm/transition/lazy-thp.c b/test/zdtm/transition/lazy-thp.c index 2e9722b96..2bf99dc4c 100644 --- a/test/zdtm/transition/lazy-thp.c +++ b/test/zdtm/transition/lazy-thp.c @@ -25,7 +25,7 @@ int main(int argc, char **argv) test_init(argc, argv); - /* we presume that malloc returns not page aligned address */ + /* we presume that malloc returns not page aliged address */ mem = malloc(PAGE_SIZE * N_PAGES); org = malloc(PAGE_SIZE); if (!mem || !org) { diff --git a/test/zdtm/transition/maps007.c b/test/zdtm/transition/maps007.c index 35c196bc4..8a605cfe0 100644 --- a/test/zdtm/transition/maps007.c +++ b/test/zdtm/transition/maps007.c @@ -38,7 +38,7 @@ int main(int argc, char **argv) struct { futex_t delta; futex_t stop; - } *shm; + } * shm; uint32_t v; unsigned long long count = 
0; int i; diff --git a/test/zdtm/transition/ptrace.c b/test/zdtm/transition/ptrace.c index ee10c8004..bf6344f1c 100644 --- a/test/zdtm/transition/ptrace.c +++ b/test/zdtm/transition/ptrace.c @@ -31,7 +31,7 @@ int main(int argc, char **argv) { int pid, status, i, stopped; #define PT_REGS_SIZE 4096 /* big enough for any arch */ -#define PT_REGS_ALIGN 16 /* big enough for any arch */ +#define PT_REGS_ALIGN 16 /* big enough for any arch */ char regs[PT_REGS_SIZE] __attribute__((aligned(PT_REGS_ALIGN))); int *pids; diff --git a/test/zdtm/transition/rseq01.c b/test/zdtm/transition/rseq01.c deleted file mode 100644 index 08a7a8e1a..000000000 --- a/test/zdtm/transition/rseq01.c +++ /dev/null @@ -1,310 +0,0 @@ -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "zdtmtst.h" - -#ifdef __has_include -#if __has_include("sys/rseq.h") -#include -#endif -#endif - -#if defined(__x86_64__) - -#if defined(__x86_64__) && defined(RSEQ_SIG) -static inline void *thread_pointer(void) -{ - void *result; - asm("mov %%fs:0, %0" : "=r"(result)); - return result; -} - -static inline void unregister_old_rseq(void) -{ - /* unregister rseq */ - unsigned int size = __rseq_size; - if (__rseq_size < 32) - size = 32; - syscall(__NR_rseq, (void *)((char *)thread_pointer() + __rseq_offset), size, 1, RSEQ_SIG); -} -#else -static inline void unregister_old_rseq(void) -{ -} -#endif - -const char *test_doc = "rseq() transition test"; -const char *test_author = "Alexander Mikhalitsyn "; -/* - * Thanks to Mathieu Desnoyers (rseq author) - * who helped me with review and debugging the problem on the Alpine Linux. 
- * - * parts of code borrowed from - * https://www.efficios.com/blog/2019/02/08/linux-restartable-sequences/ - */ - -/* some useful definitions from kernel uapi */ -#ifndef RSEQ_SIG - -enum rseq_flags { - RSEQ_FLAG_UNREGISTER = (1 << 0), -}; - -enum rseq_cs_flags_bit { - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0, - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1, - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2, -}; - -enum rseq_cs_flags { - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT), - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), -}; - -struct rseq { - uint32_t cpu_id_start; - uint32_t cpu_id; - uint64_t rseq_cs; - uint32_t flags; -} __attribute__((aligned(4 * sizeof(uint64_t)))); - -#define RSEQ_SIG 0x53053053 - -#endif - -#ifndef __NR_rseq -#define __NR_rseq 334 -#endif -/* EOF */ - -static __thread volatile struct rseq *rseq_ptr; -static __thread volatile struct rseq __rseq_abi; - -static int sys_rseq(volatile struct rseq *rseq_abi, uint32_t rseq_len, int flags, uint32_t sig) -{ - return syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig); -} - -static void register_thread(void) -{ - int rc; - unregister_old_rseq(); - rc = sys_rseq(rseq_ptr, sizeof(struct rseq), 0, RSEQ_SIG); - if (rc) { - fail("Failed to register rseq"); - exit(1); - } -} - -static void check_thread(void) -{ - int rc; - rc = sys_rseq(rseq_ptr, sizeof(struct rseq), 0, RSEQ_SIG); - if (!(rc && errno == EBUSY)) { - fail("Failed to check rseq %d", rc); - exit(1); - } -} - -#define RSEQ_ACCESS_ONCE(x) (*(__volatile__ __typeof__(x) *)&(x)) - -#define rseq_after_asm_goto() asm volatile("" : : : "memory") - -static int rseq_addv(intptr_t *v, intptr_t count, int cpu, bool ignore_abort, const char *id) -{ - double a = 10000000000000000.0; - double b = -1; - uint64_t rseq_cs1 = 0, rseq_cs2 = 0; - - /* clang-format off */ - __asm__ __volatile__ goto( 
- ".pushsection __rseq_table, \"aw\"\n\t" - ".balign 32\n\t" - "cs_obj:\n\t" - /* version, flags */ - ".long 0, 0\n\t" - /* start_ip, post_commit_offset, abort_ip */ - ".quad 1f, (2f-1f), 4f\n\t" - ".popsection\n\t" - "fldl %[x]\n\t" /* we have st clobbered */ - "leaq cs_obj(%%rip), %%rax\n\t" - "1:\n\t" - "movq %%rax, %[rseq_cs]\n\t" - "cmpl %[cpu_id], %[current_cpu_id]\n\t" - "jnz 4f\n\t" - "addq %[count], %[v]\n\t" /* final store */ - "mov $10000000, %%rcx\n\t" - "5:\n\t" - "fsqrt\n\t" /* heavy instruction */ - "dec %%rcx\n\t" - "jnz 5b\n\t" - "movq %%rax, %[rseq_cs_check2]\n\t" - "movq %[rseq_cs], %%rax\n\t" - "movq %%rax, %[rseq_cs_check1]\n\t" - "fstpl %[y]\n\t" - "2:\n\t" - ".pushsection __rseq_failure, \"ax\"\n\t" - /* Disassembler-friendly signature: nopl (%rip). */ - ".byte 0x0f, 0xb9, 0x3d\n\t" - ".long 0x53053053\n\t" /* RSEQ_FLAGS */ - "4:\n\t" - "fstpl %[y]\n\t" - "jmp %l[abort]\n\t" - /*"jmp 1b\n\t"*/ - ".popsection\n\t" - : /* gcc asm goto does not allow outputs */ - : [cpu_id] "r" (cpu), - [current_cpu_id] "m" (rseq_ptr->cpu_id), - [rseq_cs] "m" (rseq_ptr->rseq_cs), - [rseq_cs_check1] "m" (rseq_cs1), - [rseq_cs_check2] "m" (rseq_cs2), - /* final store input */ - [v] "m" (*v), - [count] "er" (count), - [x] "m" (a), - [y] "m" (b) - : "memory", "cc", "rax", "rcx", "st" - : abort - ); - /* clang-format on */ - rseq_after_asm_goto(); - test_msg("exit %s, %lx %lx %f %f\n", id, rseq_cs1, rseq_cs2, a, b); - if (rseq_cs1 != rseq_cs2) { - /* - * It means that we finished critical section - * *normally* (haven't jumped to abort) but the kernel had cleaned up - * rseq_ptr->rseq_cs before we left critical section - * and CRIU didn't restore it correctly. - * That's a bug picture. 
- */ - return -1; - } - - return 0; -abort: - rseq_after_asm_goto(); - test_msg("abort %s, %lx %lx %f %f\n", id, rseq_cs1, rseq_cs2, a, b); - if (ignore_abort) - return 0; - return -1; -} - -static task_waiter_t waiter; -static intptr_t *cpu_data; -bool ignore_abort = true; -int thread_ret; - -void *thread_routine(void *args) -{ - int cpu; - - rseq_ptr = &__rseq_abi; - memset((void *)rseq_ptr, 0, sizeof(struct rseq)); - register_thread(); - task_waiter_complete(&waiter, 1); - task_waiter_wait4(&waiter, 2); - - while (test_go()) { - cpu = RSEQ_ACCESS_ONCE(rseq_ptr->cpu_id_start); - thread_ret = rseq_addv(&cpu_data[cpu], 2, cpu, ignore_abort, "thread"); - - if (thread_ret) - break; - } - - check_thread(); - return NULL; -} - -int main(int argc, char *argv[]) -{ - int cpu = 0; - int ret; - long nr_cpus; - pthread_t thread; - - rseq_ptr = &__rseq_abi; - memset((void *)rseq_ptr, 0, sizeof(struct rseq)); - - test_init(argc, argv); - nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - - cpu_data = calloc(nr_cpus, sizeof(*cpu_data)); - if (!cpu_data) { - fail("calloc"); - exit(EXIT_FAILURE); - } - - register_thread(); - - /* - * We want to test that RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL - * is handled properly by CRIU, but that flag can be used - * only with all another flags set. 
- * Please, refer to - * https://github.com/torvalds/linux/blob/ce522ba9/kernel/rseq.c#L192 - */ -#ifdef NORESTART - ignore_abort = false; - rseq_ptr->flags = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE; -#endif - - task_waiter_init(&waiter); - if (pthread_create(&thread, NULL, thread_routine, NULL)) { - fail("pthread_create"); - exit(EXIT_FAILURE); - } - task_waiter_wait4(&waiter, 1); - - test_daemon(); - task_waiter_complete(&waiter, 2); - - while (test_go()) { - cpu = RSEQ_ACCESS_ONCE(rseq_ptr->cpu_id_start); - ret = rseq_addv(&cpu_data[cpu], 2, cpu, ignore_abort, "task"); - - if (ret) - break; - } - - check_thread(); - - if (pthread_join(thread, NULL)) { - fail("pthread_join"); - exit(EXIT_FAILURE); - } - - if (ret || thread_ret) - fail(); - else - pass(); - - return 0; -} - -#else /* #if defined(__x86_64__) */ - -int main(int argc, char *argv[]) -{ - test_init(argc, argv); - skip("Unsupported arch"); - test_daemon(); - test_waitsig(); - pass(); - return 0; -} - -#endif /* #if defined(__x86_64__) */ diff --git a/test/zdtm/transition/rseq01.desc b/test/zdtm/transition/rseq01.desc deleted file mode 100644 index 0324fa39c..000000000 --- a/test/zdtm/transition/rseq01.desc +++ /dev/null @@ -1 +0,0 @@ -{'flavor': 'h', 'arch': 'x86_64', 'feature': 'get_rseq_conf'} diff --git a/test/zdtm/transition/rseq02.c b/test/zdtm/transition/rseq02.c deleted file mode 120000 index d56491719..000000000 --- a/test/zdtm/transition/rseq02.c +++ /dev/null @@ -1 +0,0 @@ -rseq01.c \ No newline at end of file diff --git a/test/zdtm/transition/rseq02.desc b/test/zdtm/transition/rseq02.desc deleted file mode 100644 index 1ce7f240f..000000000 --- a/test/zdtm/transition/rseq02.desc +++ /dev/null @@ -1 +0,0 @@ -{'flavor': 'h', 'arch': 'x86_64', 'feature': 'get_rseq_conf', 'flags': 'noauto'} diff --git a/test/zdtm/transition/stack.c b/test/zdtm/transition/stack.c deleted file mode 100644 index 9548b9182..000000000 --- 
a/test/zdtm/transition/stack.c +++ /dev/null @@ -1,16 +0,0 @@ -#include "zdtmtst.h" - -const char *test_doc = "Tests that parasite code does not write past the start of the stack"; -const char *test_author = "Younes Manton "; - -int main(int argc, char **argv) -{ - test_init(argc, argv); - - test_daemon(); - test_waitsig(); - - pass(); - - return 0; -} diff --git a/test/zdtm_ct.c b/test/zdtm_ct.c index 44316893d..e8d45a9e7 100644 --- a/test/zdtm_ct.c +++ b/test/zdtm_ct.c @@ -41,7 +41,7 @@ static inline int _settime(clockid_t clk_id, time_t offset) return 0; } -static int create_timens(void) +static int create_timens() { struct utsname buf; unsigned major, minor; @@ -61,7 +61,7 @@ static int create_timens(void) if (sscanf(buf.release, "%u.%u", &major, &minor) != 2) return -1; - if ((major < 5) || (major == 5 && minor < 11)) { + if ((major <= 5) || (major == 5 && minor < 11)) { fprintf(stderr, "timens isn't supported on %s\n", buf.release); return 0; } @@ -93,50 +93,44 @@ static int create_timens(void) int main(int argc, char **argv) { - uid_t uid; pid_t pid; int status; - uid = getuid(); - /* * pidns is used to avoid conflicts * mntns is used to mount /proc - * net is used to avoid conflicts between network tests + * net is used to avoid conflicts of parasite sockets */ - if (!uid) - if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC)) - return 1; + if (unshare(CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC)) + return 1; pid = fork(); if (pid == 0) { - if (!uid) { - if (create_timens()) - exit(1); - if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL)) { - fprintf(stderr, "mount(/, S_REC | MS_SLAVE)): %m"); - return 1; - } - umount2("/proc", MNT_DETACH); - umount2("/dev/pts", MNT_DETACH); - if (mount("zdtm_proc", "/proc", "proc", 0, NULL)) { - fprintf(stderr, "mount(/proc): %m"); - return 1; - } - if (mount("zdtm_devpts", "/dev/pts", "devpts", 0, "newinstance,ptmxmode=0666")) { - fprintf(stderr, "mount(pts): %m"); - return 1; - } - if 
(mount("zdtm_binfmt", "/proc/sys/fs/binfmt_misc", "binfmt_misc", 0, NULL)) { - fprintf(stderr, "mount(binfmt_misc): %m"); - return 1; - } - if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL)) { - fprintf(stderr, "mount(ptmx): %m"); - return 1; - } - if (system("ip link set up dev lo")) - return 1; + if (create_timens()) + exit(1); + if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL)) { + fprintf(stderr, "mount(/, S_REC | MS_SLAVE)): %m"); + return 1; } + umount2("/proc", MNT_DETACH); + umount2("/dev/pts", MNT_DETACH); + if (mount("zdtm_proc", "/proc", "proc", 0, NULL)) { + fprintf(stderr, "mount(/proc): %m"); + return 1; + } + if (mount("zdtm_devpts", "/dev/pts", "devpts", 0, "newinstance,ptmxmode=0666")) { + fprintf(stderr, "mount(pts): %m"); + return 1; + } + if (mount("zdtm_binfmt", "/proc/sys/fs/binfmt_misc", "binfmt_misc", 0, NULL)) { + fprintf(stderr, "mount(binfmt_misc): %m"); + return 1; + } + if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL)) { + fprintf(stderr, "mount(ptmx): %m"); + return 1; + } + if (system("ip link set up dev lo")) + return 1; execv(argv[1], argv + 1); fprintf(stderr, "execve: %m"); return 1; diff --git a/test/zdtm_mount_cgroups b/test/zdtm_mount_cgroups index a99e16361..34e0e85ca 100755 --- a/test/zdtm_mount_cgroups +++ b/test/zdtm_mount_cgroups @@ -4,15 +4,13 @@ # Error (cgroup.c:768): cg: Set 3 is not subset of 2 # so lets create all test controllers before executing tests. 
-uuid=$1 - cat /proc/self/cgroup | grep -q zdtmtst.defaultroot && exit tdir=`mktemp -d zdtm.XXXXXX` for i in "zdtmtst" "zdtmtst.defaultroot"; do mount -t cgroup -o none,name=$i zdtm $tdir && # a fake group prevents destroying of a controller - mkdir -p $tdir/holder.$uuid && + mkdir -p $tdir/holder && umount -l $tdir || exit 1 done rmdir $tdir diff --git a/test/zdtm_umount_cgroups b/test/zdtm_umount_cgroups index decd70ff0..75a8ea28f 100755 --- a/test/zdtm_umount_cgroups +++ b/test/zdtm_umount_cgroups @@ -4,14 +4,12 @@ cat /proc/self/cgroup | grep -q zdtmtst.defaultroot || exit 0 -uuid=$1 - tdir=`mktemp -d zdtm.XXXXXX` for i in "zdtmtst" "zdtmtst.defaultroot"; do mount -t cgroup -o none,name=$i zdtm $tdir || { rmdir $tdir; exit 1; } # remove a fake group if exists - if [ -d "$tdir/holder.$uuid" ]; then - rmdir $tdir/holder.$uuid || { umount -l $tdir && rmdir $tdir; exit 1; } + if [ -d "$tdir/holder" ]; then + rmdir $tdir/holder || { umount -l $tdir && rmdir $tdir; exit 1; } fi umount -l $tdir || exit 1; done